In [1]:
import pandas as pd
import re, string, joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Load the two source files and tag every row with its label:
# 0 for rows read from Fake.csv, 1 for rows read from True.csv.
fake = pd.read_csv("Fake.csv")
true = pd.read_csv("True.csv")
fake["class"] = 0
true["class"] = 1

# Stack both frames into a single one with a fresh 0..n-1 index.
data = pd.concat([fake, true], ignore_index=True)

# Drop unused cols if they exist. errors="ignore" skips any column that is
# absent, which replaces the manual membership check, and the non-inplace
# form avoids mutating the concatenated frame behind the reader's back.
data = data.drop(columns=["title", "subject", "date"], errors="ignore")

In [3]:
# Ordered (pattern, replacement) pairs applied by clean_text. The order is
# significant: each substitution runs on the output of the previous one
# (e.g. URLs and tags must be removed before punctuation is stripped).
_CLEAN_STEPS = (
    (r'\[.*?\]', ''),                               # [bracketed] spans
    (r'https?://\S+|www\.\S+', ''),                 # URLs
    (r'<.*?>+', ''),                                # <tag>-like spans
    (f"[{re.escape(string.punctuation)}]", ''),     # ASCII punctuation
    (r'\n', ' '),                                   # newlines -> spaces
    (r'\w*\d\w*', ''),                              # tokens containing a digit
)


def clean_text(text):
    """Return a lower-cased, de-noised copy of ``text``.

    After lower-casing, the following are applied in order: square-bracketed
    spans, URLs, and angle-bracket tags are deleted; every character in
    ``string.punctuation`` is deleted; newlines become single spaces; and any
    word-character run containing a digit is deleted.

    Whitespace is not collapsed, so a removed token can leave consecutive
    spaces behind.

    Args:
        text: The string to clean. Must support ``.lower()``.

    Returns:
        The cleaned string.
    """
    cleaned = text.lower()
    for pattern, replacement in _CLEAN_STEPS:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Clean every article body with clean_text. Missing values are replaced by an
# empty string first: depending on the pandas version, astype(str) either turns
# NaN into the literal text "nan" or leaves it missing, and neither is a usable
# document for the vectorizer.
data["text"] = data["text"].fillna("").astype(str).apply(clean_text)

In [4]:
# Features are the cleaned article bodies; targets are the 0/1 labels.
X, y = data["text"], data["class"]

# Hold out 25% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
xtr, xte, ytr, yte = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Word unigram + bigram TF-IDF with English stop words removed. min_df=5 and
# max_df=0.8 are the document-frequency cut-offs for very rare and very common
# terms respectively.
tfidf_vect = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    lowercase=True,
    min_df=5,
    max_df=0.8,
)

# The vectorizer is fitted on the training texts only; the held-out texts are
# projected with that already-fitted vectorizer.
xtr_tfidf = tfidf_vect.fit_transform(xtr)
xte_tfidf = tfidf_vect.transform(xte)

In [5]:
# Train logistic regression on the TF-IDF training matrix (fit returns the
# estimator itself) and score it on the held-out matrix.
lr = LogisticRegression(random_state=42, max_iter=1000).fit(xtr_tfidf, ytr)
lr_pred = lr.predict(xte_tfidf)
lr_accuracy = accuracy_score(yte, lr_pred)

# Header, overall accuracy, then the per-class precision/recall/F1 table,
# one item per line.
print(
    "=== Logistic Regression ===",
    f"Accuracy: {lr_accuracy:.4f}",
    classification_report(yte, lr_pred),
    sep="\n",
)

=== Logistic Regression ===
Accuracy: 0.9881
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      5871
           1       0.98      0.99      0.99      5354

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



In [6]:
# Train a 100-tree random forest on the TF-IDF training matrix with a fixed
# seed, then predict labels for the held-out matrix.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(xtr_tfidf, ytr)
rf_pred = rf.predict(xte_tfidf)

# Assemble the report (header, overall accuracy, per-class table) and emit it
# in one go, one section per line.
rf_summary = [
    "=== Random Forest ===",
    f"Accuracy: {accuracy_score(yte, rf_pred):.4f}",
    classification_report(yte, rf_pred),
]
print("\n".join(rf_summary))


=== Random Forest ===
Accuracy: 0.9922
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5871
           1       0.99      0.99      0.99      5354

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



In [7]:
# Persist both fitted classifiers and the fitted vectorizer. The vectorizer is
# saved alongside the models because new text must be transformed with the
# same fitted vectorizer before either model can score it.
artifacts = (
    (lr, "LR_TFIDF_model.jb"),
    (rf, "RF_TFIDF_model.jb"),
    (tfidf_vect, "tfidf_vectorizer.jb"),
)
for fitted_obj, out_path in artifacts:
    joblib.dump(fitted_obj, out_path)

print("✅ Saved LR_TFIDF_model.jb, RF_TFIDF_model.jb, tfidf_vectorizer.jb")


✅ Saved LR_TFIDF_model.jb, RF_TFIDF_model.jb, tfidf_vectorizer.jb
