In [1]:
import os

import pandas as pd

# Location of the dataset. The default is the original local path; set the
# FAKE_NEWS_CSV environment variable to run the notebook on another machine
# without editing this cell.
DATA_PATH = os.environ.get(
    "FAKE_NEWS_CSV",
    "C:/Users/Prasad/Downloads/fake_news_Navyclassification.csv",
)

# Load the dataset and preview the first rows instead of dumping the whole frame.
df = pd.read_csv(DATA_PATH)
df.head(10)

Unnamed: 0,Article_ID,Title,Content_Snippet,Source,Date_Published,Label
0,1,Govt launches new health scheme,Official announcement by health ministry,Times of India,05-01-2025,Real
1,2,Actor spotted on Mars!,Viral post claims Bollywood star seen on Mars,Facebook,14-02-2025,Fake
2,3,RBI cuts repo rate by 0.25%,Central bank decision impacts loan rates,Economic Times,01-03-2025,Real
3,4,Drinking coffee cures cancer,Forwarded WhatsApp message with false claims,WhatsApp,20-01-2025,Fake
4,5,Record rainfall in Mumbai,IMD report confirms heavy downpour,The Hindu,22-07-2025,Real
5,6,Alien spacecraft found in Himalayas,Local villager claims UFO crash landed,YouTube Channel,10-04-2025,Fake
6,7,India wins cricket world cup,Match report with score details,ESPN,30-06-2025,Real
7,8,Eating garlic makes you invisible,Social media rumor spreads quickly,Twitter,05-02-2025,Fake
8,9,Stock markets hit record high,Sensex and Nifty cross all-time peak,Business Standard,12-05-2025,Real
9,10,Water turns into petrol in Rajasthan,Viral hoax video shared widely,WhatsApp,19-03-2025,Fake


In [2]:
import numpy as np
import re

# load df if needed
# df = pd.read_csv("fake_news_dataset.csv")

# Basic checks: frame dimensions and per-column missing-value counts
print(df.shape)
print(df.isnull().sum())

# Drop rows with no text or no label (if any)
df = df.dropna(subset=['Title', 'Content_Snippet', 'Label']).reset_index(drop=True)

# Normalize Label to binary: Real -> 1, Fake -> 0.
# The mapping is skipped when the column already holds only 0/1, so re-running
# this cell no longer turns every label into NaN. Labels are compared
# case-insensitively with surrounding whitespace removed, and any value that is
# still unrecognized raises instead of silently becoming NaN.
label_map = {'real': 1, 'fake': 0}
if not df['Label'].isin(list(label_map.values())).all():
    label_text = df['Label'].astype(str).str.strip().str.lower()
    unknown_labels = sorted(set(label_text) - set(label_map))
    if unknown_labels:
        raise ValueError(f"Unexpected Label values: {unknown_labels}")
    df['Label'] = label_text.map(label_map).astype(int)

# Combine title + snippet into one text field (common practice)
df['text'] = (df['Title'].fillna('') + ' . ' + df['Content_Snippet'].fillna('')).str.strip()

# Basic text cleaning function
def clean_text(s):
    """Return a lower-cased, URL-free, punctuation-free version of ``s``.

    The steps run in order: lower-case the text, delete URLs
    (``http://``, ``https://`` or ``www.`` prefixed tokens), replace every
    character that is not ``a-z``, ``0-9`` or whitespace with a space, then
    collapse whitespace runs to a single space and trim both ends.

    Parameters
    ----------
    s : str
        Raw text to normalize.

    Returns
    -------
    str
        The normalized text; may be empty.
    """
    # (pattern, replacement) pairs, applied in this exact order
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),   # remove urls
        (r'[^a-z0-9\s]', ' '),            # remove punctuation (keep spaces)
        (r'\s+', ' '),                    # collapse whitespace runs
    )
    cleaned = s.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Run the cleaner element-wise over the combined title + snippet text.
df['text_clean'] = df['text'].map(clean_text)


(30, 6)
Article_ID         0
Title              0
Content_Snippet    0
Source             0
Date_Published     0
Label              0
dtype: int64


In [3]:
# Feature engineering
# Character-length features for the raw title, the raw snippet and the cleaned text.
df['title_len'] = df['Title'].fillna('').map(len)
df['snippet_len'] = df['Content_Snippet'].fillna('').map(len)
df['text_len'] = df['text_clean'].map(len)
# Exclamation marks are counted on the uncleaned text, since cleaning strips punctuation.
df['num_exclamations'] = df['text'].str.count('!')

# Source credibility flag (toy example): 1 when the source appears in the
# hand-picked list below, 0 otherwise.
trusted_sources = [
    'Times of India',
    'The Hindu',
    'Economic Times',
    'NDTV',
    'Business Standard',
    'Indian Express',
    'ISRO Website',
    'ESPN',
]
is_trusted = df['Source'].isin(trusted_sources)
df['source_trusted'] = is_trusted.astype(int)


In [4]:
# Feature scaling
from sklearn.preprocessing import StandardScaler

# Numeric meta features used alongside the text features for GaussianNB
num_feats = ['title_len','snippet_len','text_len','num_exclamations','source_trusted']

# The scaler is intentionally NOT fit here. Fitting it on the full frame before
# the train/test split leaks test-row statistics into the features and
# overwrites the raw feature columns in df. The GaussianNB cell fits its own
# StandardScaler on the training split only; standardization is unaffected by
# a prior rescaling, so that cell's result is unchanged by removing the fit.
scaler = StandardScaler()


In [5]:
from sklearn.model_selection import train_test_split

# Model inputs: cleaned text, numeric meta features, and the binary target.
X_text, X_meta, y = df['text_clean'], df[num_feats], df['Label']

# One stratified split (20% held out, fixed seed) applied to all three inputs
# in a single call so the text, meta and label rows stay aligned.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
(
    X_text_train, X_text_test,
    X_meta_train, X_meta_test,
    y_train, y_test,
) = train_test_split(X_text, X_meta, y, **split_options)


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# TF-IDF over unigrams and bigrams feeding a Multinomial Naive Bayes classifier.
tfidf_step = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=1)
pipe_mnb = Pipeline(steps=[('tfidf', tfidf_step), ('clf', MultinomialNB())])

# Pipeline.fit returns the fitted pipeline, so fit and predict are chained.
y_pred_mnb = pipe_mnb.fit(X_text_train, y_train).predict(X_text_test)

# Held-out evaluation: accuracy, per-class report, confusion matrix.
mnb_accuracy = accuracy_score(y_test, y_pred_mnb)
print("MultinomialNB Accuracy:", mnb_accuracy)
print(classification_report(y_test, y_pred_mnb))
print(confusion_matrix(y_test, y_pred_mnb))


MultinomialNB Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         3

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6

[[3 0]
 [0 3]]


In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer

# Binary bag-of-words (unigram + bigram presence/absence) feeding Bernoulli Naive Bayes.
binary_counts = CountVectorizer(ngram_range=(1,2), binary=True, min_df=1)
pipe_bnb = Pipeline(steps=[('cv', binary_counts), ('clf', BernoulliNB())])

# Train on the training text, then predict the held-out text.
y_pred_bnb = pipe_bnb.fit(X_text_train, y_train).predict(X_text_test)

# Held-out evaluation: accuracy, per-class report, confusion matrix.
bnb_accuracy = accuracy_score(y_test, y_pred_bnb)
print("BernoulliNB Accuracy:", bnb_accuracy)
print(classification_report(y_test, y_pred_bnb))
print(confusion_matrix(y_test, y_pred_bnb))


In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer

# Dense TF-IDF features (unigrams + bigrams). The vectorizer is fit on the
# training text only and then reused to transform the test text.
tfidf = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=1)
X_train_tfidf = tfidf.fit_transform(X_text_train).toarray()   # dense is fine for a small dataset
X_test_tfidf = tfidf.transform(X_text_test).toarray()

# Append the numeric meta features as extra columns next to the TF-IDF columns.
import numpy as np
X_train_gauss = np.concatenate([X_train_tfidf, X_meta_train.to_numpy()], axis=1)
X_test_gauss = np.concatenate([X_test_tfidf, X_meta_test.to_numpy()], axis=1)

# Standardize every column (important for GaussianNB) using statistics learned
# from the training split only, then apply the same transform to the test split.
scaler = StandardScaler().fit(X_train_gauss)
X_train_gauss = scaler.transform(X_train_gauss)
X_test_gauss = scaler.transform(X_test_gauss)

# Fit Gaussian Naive Bayes and predict the held-out rows.
gnb = GaussianNB(var_smoothing=1e-9)
y_pred_gnb = gnb.fit(X_train_gauss, y_train).predict(X_test_gauss)

# Held-out evaluation: accuracy, per-class report, confusion matrix.
gnb_accuracy = accuracy_score(y_test, y_pred_gnb)
print("GaussianNB Accuracy:", gnb_accuracy)
print(classification_report(y_test, y_pred_gnb))
print(confusion_matrix(y_test, y_pred_gnb))


GaussianNB Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         3

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6

[[3 0]
 [0 3]]


In [8]:
from sklearn.model_selection import GridSearchCV

# Search space for the MultinomialNB pipeline; each key is
# '<pipeline step name>__<parameter name>'.
param_grid_mnb = dict(
    tfidf__ngram_range=[(1,1),(1,2)],
    tfidf__max_df=[0.8, 0.9, 1.0],
    clf__alpha=[0.1, 0.5, 1.0],
)

# 5-fold cross-validated grid search on the training split, scored by F1.
grid_mnb = GridSearchCV(
    estimator=pipe_mnb,
    param_grid=param_grid_mnb,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid_mnb.fit(X_text_train, y_train)
print("Best params MNB:", grid_mnb.best_params_)
print("Best CV score (F1):", grid_mnb.best_score_)

# Evaluate the best pipeline found by the search on the held-out test text.
best_mnb = grid_mnb.best_estimator_
y_pred_best = best_mnb.predict(X_text_test)
print("Tuned MNB report:")
print(classification_report(y_test, y_pred_best))


Best params MNB: {'clf__alpha': 0.1, 'tfidf__max_df': 0.8, 'tfidf__ngram_range': (1, 1)}
Best CV score (F1): 0.8933333333333333
Tuned MNB report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         3

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6

