In [10]:
 import kagglehub

 # Download latest version
 path = kagglehub.dataset_download("rmisra/news-category-dataset")
 print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/news-category-dataset


In [11]:
#1.Combine headline and short_description
df = df[["headline", "short_description", "category"]].dropna()
df["text"] = df["headline"] + " " + df["short_description"]

In [12]:
# 2. Features and labels
X = df["text"]
y = df["category"]

In [14]:
# 3. Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# 4. TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [16]:
# 5. Train logistic regression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=200)
model.fit(X_train_tfidf, y_train)

In [17]:
# 6. Evaluate model
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

                precision    recall  f1-score   support

          ARTS       0.40      0.19      0.25       293
ARTS & CULTURE       0.33      0.10      0.16       275
  BLACK VOICES       0.45      0.30      0.36       889
      BUSINESS       0.47      0.44      0.46      1216
       COLLEGE       0.45      0.30      0.36       202
        COMEDY       0.54      0.37      0.44      1022
         CRIME       0.51      0.52      0.52       713
CULTURE & ARTS       0.58      0.22      0.32       202
       DIVORCE       0.79      0.65      0.71       664
     EDUCATION       0.46      0.29      0.36       209
 ENTERTAINMENT       0.52      0.74      0.61      3419
   ENVIRONMENT       0.66      0.19      0.30       313
         FIFTY       0.33      0.08      0.13       263
  FOOD & DRINK       0.59      0.71      0.65      1270
     GOOD NEWS       0.41      0.13      0.19       270
         GREEN       0.38      0.30      0.33       532
HEALTHY LIVING       0.39      0.20      0.27  

In [18]:

def predict_category(news_text):
    text_vector = vectorizer.transform([news_text])
    prediction = model.predict(text_vector)
    return prediction[0]

# Example
print(predict_category(" "))

ENTERTAINMENT


In [19]:
print(sorted(df['category'].unique()))

['ARTS', 'ARTS & CULTURE', 'BLACK VOICES', 'BUSINESS', 'COLLEGE', 'COMEDY', 'CRIME', 'CULTURE & ARTS', 'DIVORCE', 'EDUCATION', 'ENTERTAINMENT', 'ENVIRONMENT', 'FIFTY', 'FOOD & DRINK', 'GOOD NEWS', 'GREEN', 'HEALTHY LIVING', 'HOME & LIVING', 'IMPACT', 'LATINO VOICES', 'MEDIA', 'MONEY', 'PARENTING', 'PARENTS', 'POLITICS', 'QUEER VOICES', 'RELIGION', 'SCIENCE', 'SPORTS', 'STYLE', 'STYLE & BEAUTY', 'TASTE', 'TECH', 'THE WORLDPOST', 'TRAVEL', 'U.S. NEWS', 'WEDDINGS', 'WEIRD NEWS', 'WELLNESS', 'WOMEN', 'WORLD NEWS', 'WORLDPOST']


In [20]:
import pickle

with open("news_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)