In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\z035793\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\z035793\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\z035793\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Load your news dataset (Assuming it's in a CSV file)
df = pd.read_csv('news_data.csv')
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,news_headline,news_article,news_category
0,0,0,50-year-old problem of biology solved by Artif...,DeepMind's AI system 'AlphaFold' has been reco...,technology
1,1,1,Microsoft Teams to stop working on Internet Ex...,Microsoft Teams will stop working on Internet ...,technology
2,2,2,Hope US won't erect barriers to cooperation: C...,"China, in response to reports of US adding Chi...",technology
3,3,3,Global smartphone sales in Q3 falls 5.7% to 36...,The global smartphone sales in the third quart...,technology
4,4,4,EU hoping Biden will clarify US position on di...,The European Union (EU) is hoping that US Pres...,technology


In [5]:
df["news_category"].value_counts()

news_category
world            2067
entertainment    2036
sports           1900
technology       1791
politics         1596
science          1437
automobile       1293
Name: count, dtype: int64

In [6]:

# Split dataset into training and testing sets
X = df['news_article']
y = df['news_category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Text preprocessing and vectorization
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=word_tokenize)

X_train = [' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(text.lower()) if word.isalnum()]) for text in X_train]
X_test = [' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(text.lower()) if word.isalnum()]) for text in X_test]

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)



In [7]:
# Train a Multinomial Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9245049504950495

Classification Report:
                precision    recall  f1-score   support

   automobile       0.86      0.99      0.92       248
entertainment       0.95      0.96      0.96       417
     politics       0.96      0.96      0.96       302
      science       0.95      0.96      0.95       282
       sports       0.99      0.93      0.96       390
   technology       0.91      0.77      0.83       379
        world       0.86      0.94      0.90       406

     accuracy                           0.92      2424
    macro avg       0.93      0.93      0.93      2424
 weighted avg       0.93      0.92      0.92      2424



In [8]:
import joblib

# Save the trained model to a file
joblib.dump(classifier, 'news_classifier_model.pkl')

['news_classifier_model.pkl']

In [22]:
# Function to load the model and make predictions
def predict_category(input_text):
    # Load the saved model
    saved_model = joblib.load('news_classifier_model.pkl')
    
    # Preprocess the input text
    input_text = ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(input_text.lower()) if word.isalnum()])
    
    # Vectorize the input text
    input_text_tfidf = vectorizer.transform([input_text])
    
    # Predict the category
    predicted_category = saved_model.predict(input_text_tfidf)[0]
    
    return predicted_category

# Example usage
input_text = "US Federal Communications Commission Chairman Ajit Pai has announced that he will step down from his position on January 20. He will be leaving office on the same day as President-elect Joe Biden's inauguration. Previously, former FCC Chairman Tom Wheeler had left office on 20th January 2017, the day President Donald Trump was sworn in."
predicted_category = predict_category(input_text)
print("Predicted Category:", predicted_category)

Predicted Category: world
