In [None]:
# Install necessary libraries silently
!pip install nltk scikit-learn pandas --quiet

# Imports
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load the English stop-word list, downloading the corpus only when it is
# not already installed (NLTK raises LookupError for a missing resource).
# This avoids a network round-trip on every run.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Load and prepare dataset
def _load_labeled_news(csv_path, label):
    """Read one CSV file and return a two-column frame: ``text`` and ``label``.

    The article body is taken from the ``text`` column when present,
    otherwise from ``content`` (renamed to ``text``).  The check is done
    per file, so the two input files do not need to share a schema.

    Args:
        csv_path: Path of the CSV file to read.
        label: Value written to the ``label`` column for every row.

    Returns:
        A DataFrame with exactly the columns ``text`` and ``label``.

    Raises:
        ValueError: If the file has neither a ``text`` nor a ``content`` column.
    """
    frame = pd.read_csv(csv_path)
    if 'text' not in frame.columns:
        if 'content' not in frame.columns:
            raise ValueError("No text/content column found.")
        frame = frame.rename(columns={'content': 'text'})
    # Select into a fresh frame so adding the label never touches a view.
    frame = frame[['text']].copy()
    frame['label'] = label
    return frame


true_df = _load_labeled_news("True.csv", 1)
fake_df = _load_labeled_news("Fake.csv", 0)

# Stack both classes, then drop rows with missing values and exact
# (text, label) duplicates.
df = pd.concat([true_df, fake_df], ignore_index=True).dropna().drop_duplicates()

# Fast text cleaning (no stemming)
def clean_text(text, stopword_set=None):
    """Lower-case *text*, strip punctuation and drop stop words.

    Args:
        text: The string to clean.
        stopword_set: Optional container of words to remove.  When omitted,
            the module-level ``stop_words`` set is used, which matches the
            original single-argument behaviour.

    Returns:
        The remaining words joined by single spaces (possibly an empty string).
    """
    if stopword_set is None:
        stopword_set = stop_words
    lowered = text.lower()
    # Remove every character that is neither a word character nor whitespace.
    # NOTE: this happens before the stop-word lookup, so a contraction such as
    # "don't" is compared as "dont" and is only filtered if the stop-word
    # container holds that stripped form.
    without_punct = re.sub(r'[^\w\s]', '', lowered)
    return ' '.join(
        word for word in without_punct.split() if word not in stopword_set
    )

df['clean_text'] = df['text'].astype(str).apply(clean_text)

# TF-IDF + Logistic Regression
# X holds the cleaned documents; the TF-IDF matrices are built after the
# split so that the held-out documents never influence the vocabulary or
# the IDF weights (fitting on the full corpus leaks test data).
X = df['clean_text']
y = df['label']

X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the vectorizer on the training documents only and reuse it for the
# test documents.  The result is kept as a sparse matrix: LogisticRegression
# accepts sparse input, and densifying with .toarray() would allocate a full
# (n_documents x 5000) float array for no benefit.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Show final report
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


              precision    recall  f1-score   support

           0       0.99      0.98      0.99      3486
           1       0.99      0.99      0.99      4244

    accuracy                           0.99      7730
   macro avg       0.99      0.99      0.99      7730
weighted avg       0.99      0.99      0.99      7730

