In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the 'jainpooja/fake-news-detection' dataset through kagglehub and keep
# the returned value (presumably the local directory holding the files —
# confirm against the kagglehub documentation).
jainpooja_fake_news_detection_path = kagglehub.dataset_download('jainpooja/fake-news-detection')

# Printed once the download call above has returned.
print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath the input directory,
# then print them one per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Introduction**
In an era when news spreads rapidly through social media and online outlets, fake news poses a threat to public perception and social stability. This project uses natural language processing (NLP) and machine learning to build a model that automatically classifies a news article as true or fake based on its content (title and body text).

# **Data Processing**

In [None]:
#import the packages
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string

In [None]:
# Load the data.
# "../input/..." only exists inside Kaggle's own runtime. When it is absent
# (e.g. the notebook runs elsewhere), fall back to the directory reported by
# the kagglehub download in the first cell.
data_dir = "../input/fake-news-detection"
if not os.path.isdir(data_dir):
    data_dir = jainpooja_fake_news_detection_path

df_fake = pd.read_csv(os.path.join(data_dir, "Fake.csv"))
df_true = pd.read_csv(os.path.join(data_dir, "True.csv"))
# Preview the fake-news dataset
df_fake.head()

In [None]:
# Preview the first rows of the true-news dataset
df_true.head()

In [None]:
# Report the (rows, columns) shape of the fake and true datasets
print(df_fake.shape, df_true.shape)

In [None]:
# Attach the target column: 0 marks fake news, 1 marks true news.
for frame, label in ((df_fake, 0), (df_true, 1)):
    frame["class"] = label

In [None]:
# Re-check the shapes; each frame should now include the extra "class" column
print(df_fake.shape, df_true.shape)

In [None]:
# Stack the fake and true frames into a single dataset.
df = pd.concat([df_fake, df_true], axis=0)
# Shuffle the rows. The fixed seed keeps the row order — and therefore every
# downstream split and metric — identical across "Restart & Run All".
# reset_index(drop=True) discards the pre-shuffle index labels.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Preview the combined, shuffled dataset
df.head()

# Text data pre-processing

In [None]:
# Define a text cleaning function
def clean_text(text):
    """Normalize one raw news string before vectorization.

    Steps, applied in order: lowercase; drop ``[...]`` spans; drop URLs
    (``http(s)://...`` and ``www....``); drop ``<...>`` tags; drop ASCII
    punctuation; turn newlines into spaces; drop every token that contains
    a digit. Whitespace left behind by removed tokens is not collapsed.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float NaN (pandas' missing-value marker) are
        treated as empty text; any other non-string is converted with ``str``.

    Returns
    -------
    str
        The cleaned text.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise fail on .lower() below.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = text.lower()
    # Raw strings keep regex escapes such as \[ \S \w \d from being parsed as
    # (invalid) Python string escapes, which newer interpreters warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Run the cleaner over both free-text columns, replacing each in place.
for column in ("title", "text"):
    df[column] = df[column].apply(clean_text)

# Feature merging and modeling preparation

In [None]:
# Join headline and body (separated by one space) into a single text feature.
title_col, body_col = df["title"], df["text"]
df["content"] = title_col + " " + body_col

In [None]:
# Model input is the merged text; the target is the 0/1 "class" label.
X, y = df["content"], df["class"]

In [None]:
# Split the data (80% training, 20% test) with a fixed seed
from sklearn.model_selection import train_test_split

content_split = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = content_split

**TF-IDF**

TF-IDF is a text vectorization technique used in NLP (Natural Language Processing) for feature extraction.

1. `stop_words='english'`: automatically filters out common, low-information English words ("the", "and", "is", etc.).
2. `max_df`: ignores terms that appear in more than the given fraction of documents (e.g. `0.7` = more than 70%).
3. `min_df`: ignores terms that appear in fewer than the given number of documents.
4. `ngram_range=(x, y)`: extracts n-grams of length x through y; for example, `(1, 2)` extracts single words and two-word combinations at the same time.

In [None]:
# Turn the raw text into TF-IDF feature matrices
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop words are removed and terms present in more than 70% of the
# documents are ignored.
vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# The vocabulary is learned from the training split only; the test split is
# projected onto that same vocabulary.
Xv_train, Xv_test = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

# **Model training and evaluation**

TF-IDF features work well with Logistic Regression, Random Forest, Naive Bayes, and SVM; here we train and evaluate Logistic Regression and Random Forest.

In [None]:
# Logistic Regression on the TF-IDF features
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# fit() returns the estimator itself, so construction and training can be chained.
lr = LogisticRegression().fit(Xv_train, y_train)
pred_lr = lr.predict(Xv_test)

# Per-class precision / recall / F1 on the held-out split
report_lr = classification_report(y_test, pred_lr)
print("Logistic Regression Results:\n")
print(report_lr)

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# random_state pins the bootstrap samples and feature sub-sampling so the
# fitted forest, its metrics, and its feature importances are reproducible
# across runs; n_jobs=-1 trains the trees on all available cores.
rfc = RandomForestClassifier(random_state=42, n_jobs=-1)
rfc.fit(Xv_train, y_train)
pred_rfc = rfc.predict(Xv_test)

# Per-class precision / recall / F1 on the held-out split
print("Random Forest Results:\n")
print(classification_report(y_test, pred_rfc))

# Model comparison visualization

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix for the Random Forest predictions on the test split.
# Label order follows the 0 = fake, 1 = real encoding of the target.
class_names = ['Fake', 'Real']
cm = confusion_matrix(y_test, pred_rfc)
disp = ConfusionMatrixDisplay(cm, display_labels=class_names)
disp.plot(cmap='Blues')
# Title the axes the display just drew on.
disp.ax_.set_title('Confusion Matrix - Random Forest')
plt.show()

In [None]:
# Title-only baseline: the model sees only the cleaned headline.
X_title = df['title']
# Use the same test fraction (0.2) and seed as the full-content split so this
# baseline is scored on a hold-out of the same size, selected the same way,
# which keeps its metrics comparable with the models above.
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(X_title, y, test_size=0.2, random_state=42)

# Separate vectorizer: the title vocabulary is learned from training titles only.
vectorizer_title = TfidfVectorizer()
Xv_train_t = vectorizer_title.fit_transform(X_train_t)
Xv_test_t = vectorizer_title.transform(X_test_t)

lr_title = LogisticRegression()
lr_title.fit(Xv_train_t, y_train_t)
pred_title = lr_title.predict(Xv_test_t)

print("Title-only Logistic Regression:\n")
print(classification_report(y_test_t, pred_title))

# Analyzing Common Keywords in Fake News

**Logistic Regression**

In [None]:
# Vocabulary terms, aligned with the columns of the TF-IDF matrix
feature_names = vectorizer.get_feature_names_out()

# One weight per term; a positive weight pushes toward class 1 (real news),
# a negative weight toward class 0 (fake news)
coefs = lr.coef_[0]

# Pair every term with its weight
coef_df = pd.DataFrame(dict(term=feature_names, coef=coefs))

# Ascending sort puts the most negative (most fake-leaning) weights first
top_fake_words = coef_df.sort_values('coef', ascending=True).iloc[:20]

print("Top 20 keywords that indicate FAKE news:", top_fake_words, sep="\n")


In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the 20 most fake-leaning terms. The rows are
# reversed so the most negative coefficient ends up at the top of the chart.
fig, ax = plt.subplots(figsize=(10, 6))
terms_reversed = top_fake_words['term'].iloc[::-1]
coefs_reversed = top_fake_words['coef'].iloc[::-1]
ax.barh(terms_reversed, coefs_reversed, color='crimson')
ax.set_xlabel("Logistic Regression Coefficient")
ax.set_title("Top 20 Keywords Indicating FAKE News")
fig.tight_layout()
plt.show()

****

**Random Forest**

In [None]:
# Vocabulary terms, aligned with the columns of the TF-IDF matrix
feature_names = vectorizer.get_feature_names_out()

# Importance score of each term in the trained Random Forest
importances = rfc.feature_importances_

# Pair every term with its importance
rf_feature_df = pd.DataFrame(dict(term=feature_names, importance=importances))

# Keep the 20 highest-importance terms
rf_top_words = rf_feature_df.sort_values('importance', ascending=False).iloc[:20]

# Display
print("Top 20 important words in Random Forest (TF-IDF features):", rf_top_words, sep="\n")

# Use SHAP (SHapley Additive exPlanations) to "explain model predictions"

In [None]:
import shap

In [None]:
import random
# Get feature names from the TF-IDF vectorizer
feature_names = vectorizer.get_feature_names_out()

# One seeded generator drives both random choices below, so the same
# background rows and the same test article are picked on every run.
sampler = random.Random(42)

# Densify only a small background sample of the training matrix.
# Calling .toarray() on the whole TF-IDF training matrix allocates
# n_rows x n_features floats, which can exhaust memory with a large vocabulary.
BACKGROUND_SIZE = 100
n_train = Xv_train.shape[0]
background_rows = sorted(sampler.sample(range(n_train), min(BACKGROUND_SIZE, n_train)))
X_train_dense = Xv_train[background_rows].toarray()

# Create SHAP explainer for the logistic regression model, using the dense
# sample as background data
explainer = shap.Explainer(lr, X_train_dense, feature_names=feature_names)

# Select one sample from the test set (reproducibly, via the seeded generator)
random_index = sampler.randint(0, Xv_test.shape[0] - 1)
x_sample = Xv_test[random_index].toarray()
y_sample = y_test.iloc[random_index]

# Explain the prediction for the selected sample
shap_values = explainer(x_sample)

# Show SHAP contribution bar plot
shap.plots.bar(shap_values)

In [None]:
# Compare the sampled article's ground-truth label with the model's prediction
y_pred = lr.predict(x_sample)[0]
true_line = f"True label: {y_sample} → {'REAL' if y_sample == 1 else 'FAKE'}"
pred_line = f"Predicted label: {y_pred} → {'REAL' if y_pred == 1 else 'FAKE'}"
print(true_line, pred_line, sep="\n")

In [None]:
# Waterfall plot of each word's SHAP contribution for the sampled article.
# NOTE(review): for this logistic-regression explainer the contributions are
# presumably toward class 1 (REAL), so positive bars push toward REAL and
# negative bars toward FAKE — confirm against the SHAP documentation.
shap.plots.waterfall(shap_values[0])