1. Load the Preprocessed Dataset

In [None]:
import os
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import MaxAbsScaler
import pickle 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
# Configure the HTTP(S) proxy used by the NLTK downloader
# The proxy URL (which may embed a username and password) is read from the
# environment so that no credentials are stored in the notebook itself.
# Set NLTK_PROXY, or the conventional https_proxy / http_proxy variables,
# before starting the kernel when a proxy is required.
proxy = (
    os.environ.get("NLTK_PROXY")
    or os.environ.get("https_proxy")
    or os.environ.get("http_proxy")
)
if proxy:
    os.environ["http_proxy"] = proxy
    os.environ["https_proxy"] = proxy
    # The NLTK downloader keeps its own proxy setting, so configure it explicitly.
    nltk.set_proxy(proxy)

# Fetch the stopwords corpus only when it is not already installed, so a
# re-run of the notebook does not need network access.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")
stop_words = list(stopwords.words("english"))

# Load the preprocessed dataset.
df = pd.read_csv("../data/processed/preprocessed_news.csv")
# Missing texts are replaced by empty strings so the vectorizer receives
# only str values.
df["text"] = df["text"].fillna("")

# Show the first few rows.
display(df.head())

2. Use TF-IDF for Feature Extraction

In [None]:
# import os
# import nltk
# from nltk.corpus import stopwords
# from sklearn.preprocessing import MaxAbsScaler

# Set proxy credentials for NLTK
# proxy = "http://edcguest:edcguest@172.31.102.14:3128"
# os.environ["http_proxy"] = proxy
# os.environ["https_proxy"] = proxy

# # Manually set proxy for nltk downloader
# nltk.set_proxy(proxy)

# # Now download required datasets
# nltk.download("stopwords")
# stop_words = list(stopwords.words("english"))

# from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
# tfidf_vectorizer = TfidfVectorizer(
#     max_features=5000,  # Use top 5000 words
#     stop_words=stop_words,  # Remove common words
#     ngram_range=(1,2)  # Include unigrams and bigrams
# )
  # Use top 5000 words

# Fit & transform the text data
# X = tfidf_vectorizer.fit_transform(df["text"])

# # Convert to DataFrame for better visualization
# X_df = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# scaler = MaxAbsScaler()
# X_scaled = scaler.fit_transform(X)

# # Display feature matrix shape
# print("Feature matrix shape:", X_df.shape)

# TF-IDF feature extraction over the "text" column.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split performed later in the notebook, so vocabulary and IDF
# weights are learned from the test rows too — confirm this is acceptable.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,      # keep only the 5000 highest-ranked terms
    stop_words=stop_words,  # drop the NLTK English stop words
    ngram_range=(1, 2),     # unigrams and bigrams
)

# Fit the vocabulary and transform the corpus into a sparse document-term matrix.
X = tfidf_vectorizer.fit_transform(df["text"])

# Labelled view of the features for inspection. It is built as a sparse-backed
# DataFrame: calling X.toarray() would allocate a dense
# (n_documents x 5000) float array just to display its shape.
X_df = pd.DataFrame.sparse.from_spmatrix(
    X, columns=tfidf_vectorizer.get_feature_names_out()
)

# Max-abs scaling keeps the matrix sparse.
# NOTE(review): X_scaled is not referenced by the cells visible below — confirm
# whether it is still needed.
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X)

# Display feature matrix shape
print("Feature matrix shape:", X_df.shape)


3. Prepare Labels and Save the Vectorizer, Features, and Labels

In [None]:
# Labels (target variable)
y = df["label"]

# Create the output directories up front: open(..., "wb") raises
# FileNotFoundError when the parent directory is missing (e.g. fresh checkout).
for output_dir in ("../models", "../data/processed"):
    os.makedirs(output_dir, exist_ok=True)

# Persist the fitted TF-IDF vectorizer so new text can be transformed later
# with the same vocabulary and IDF weights.
with open("../models/tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf_vectorizer, f)
print("✅ TF-IDF Vectorizer saved successfully!")

# Persist the sparse feature matrix and the labels for the training cells.
with open("../data/processed/tfidf_features.pkl", "wb") as f:
    pickle.dump(X, f)

with open("../data/processed/labels.pkl", "wb") as f:
    pickle.dump(y, f)

print("✅ Features and labels saved!")

4. Top 20 Words with the Highest Mean TF-IDF Scores

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Vocabulary terms and, for each term, its TF-IDF weight averaged over all
# documents (X.mean(axis=0) is 2-D, hence the flatten to 1-D).
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())
mean_tfidf_scores = np.asarray(X.mean(axis=0)).flatten()

# Indices of the top_n terms, highest mean score first.
top_n = 20
sorted_indices = np.argsort(mean_tfidf_scores)[::-1][:top_n]

# Horizontal bar chart; the y-axis is inverted so the best-scoring term
# appears at the top.
fig, ax = plt.subplots(figsize=(12, 5))
ax.barh(
    feature_names[sorted_indices],
    mean_tfidf_scores[sorted_indices],
    color="blue",
)
ax.set_xlabel("Mean TF-IDF Score")
ax.set_ylabel("Words")
ax.set_title(f"Top {top_n} Words with Highest TF-IDF Scores")
ax.invert_yaxis()
plt.show()

5. TF-IDF Word Cloud

In [None]:
from wordcloud import WordCloud

# Map every vocabulary term to its mean TF-IDF score; the cloud sizes each
# word according to that score.
word_scores = {term: score for term, score in zip(feature_names, mean_tfidf_scores)}
cloud_renderer = WordCloud(width=800, height=400, background_color="white")
wordcloud = cloud_renderer.generate_from_frequencies(word_scores)

# Render the cloud as an image without axis ticks.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
ax.set_title("TF-IDF Word Cloud")
plt.show()

6. Load Features and Labels for Training

In [None]:
# import pickle
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.svm import SVC
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, classification_report

def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# TF-IDF feature matrix and labels written earlier in this notebook.
X = _read_pickle("../data/processed/tfidf_features.pkl")
y = _read_pickle("../data/processed/labels.pkl")

print("✅ Features and labels loaded!")

7. Split Data into Train and Test Sets

In [None]:
# Split the dataset: 80% for training, 20% held out for testing.
# stratify=y keeps the class proportions the same in both parts, and the
# fixed random_state makes the split reproducible.
_split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = _split_parts

print(f"Training Data: {X_train.shape}, Test Data: {X_test.shape}")

# Per-class sample counts for each part of the split.
for _caption, _part_labels in (
    ("Train Labels Distribution:", y_train),
    ("Test Labels Distribution:", y_test),
):
    print(_caption, np.bincount(_part_labels))

8. Train and Compare All Models

In [None]:
# Candidate classifiers, keyed by the display name used in the report.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, solver="liblinear"),
    "Naïve Bayes": MultinomialNB(),
    "SVM": SVC(kernel="linear"),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
}

# Test-set accuracy of every candidate, keyed by the same display name.
results = {}

for name, model in models.items():
    print(f"\n🔵 Training {name}...")

    # Fit on the training part, then predict the held-out part.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    accuracy = results[name] = accuracy_score(y_test, y_pred)

    print(f" {name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))

# The winner is the candidate with the highest test accuracy
# (the first one encountered wins a tie).
best_model_name = max(results, key=lambda candidate: results[candidate])
best_model = models[best_model_name]

print(f"\n Best Model: {best_model_name} with Accuracy: {results[best_model_name]:.4f}")

# Persist the fitted winner for later use.
with open("../models/best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

print("Best model saved successfully!")

9. Load the Saved Model and Classify a News Article

In [None]:
# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising.
# Only load the model and vectorizer files written by this notebook; never
# point these paths at files from an untrusted source.
with open("../models/best_model.pkl", "rb") as f:
    best_model = pickle.load(f)

# Load the saved TF-IDF vectorizer
with open("../models/tfidf_vectorizer.pkl", "rb") as f:
    tfidf_vectorizer = pickle.load(f)

# Take user input
news_text = input("Enter the news article to check if it's Fake or True:\n")

if not news_text.strip():
    # Blank input would be turned into an all-zero feature vector and still
    # receive a verdict, which would be meaningless.
    print("\n⚠️ No text entered, nothing to classify.")
else:
    # NOTE(review): the model was trained on text from preprocessed_news.csv;
    # the raw input is not passed through that preprocessing here — confirm
    # whether the same cleaning should be applied before vectorising.
    news_tfidf = tfidf_vectorizer.transform([news_text])

    # predict returns an array with one entry per input row; exactly one row
    # was passed, so compare its single element rather than the whole array.
    prediction = best_model.predict(news_tfidf)
    predicted_label = prediction[0]

    # Display result (this cell treats label 1 as fake and anything else as
    # true — presumably matching the dataset's encoding; verify against the
    # preprocessing step).
    if predicted_label == 1:
        print("\n🛑 The news is FAKE 🛑")
    else:
        print("\n✅ The news is TRUE ✅")