In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the dataset through kagglehub and keep the value it returns
# (presumably the local directory that holds the CSV files — confirm against
# the kagglehub documentation).
razanaqvi14_real_and_fake_news_path = kagglehub.dataset_download('razanaqvi14/real-and-fake-news')

print('Data source import complete.')


---

##  **Dataset Overview: Fake and Real News Dataset**

###  **Description:**

This dataset consists of two separate CSV files containing news articles:

* `True.csv`: Real and factual news stories
* `Fake.csv`: Fabricated or misleading news stories

Each file contains news entries with metadata and full article content.
The goal is to build a machine learning or deep learning model that can distinguish between **real** and **fake** news.

---

##  **Dataset Structure After Merge:**

After preprocessing and merging, the final dataset has **two main columns**:

| Column Name | Type   | Description                                         |
| ----------- | ------ | --------------------------------------------------- |
| `content`   | string | Combination of `title` and `text` (full news text)  |
| `label`     | int    | Target label — `1` for real news, `0` for fake news |

---

##  **Original Columns (Before Merging):**

| Column Name | Type   | Description                                                                    |
| ----------- | ------ | ------------------------------------------------------------------------------ |
| `title`     | string | The headline of the article                                                    |
| `text`      | string | The body/content of the news article                                           |
| `subject`   | string | *(Only in some versions)* Indicates the topic category (e.g., politics, world) |
| `date`      | string | The date when the article was published                                        |

---

#  Required Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import warnings
# Silence every Python warning for the rest of the session. This also hides
# deprecation notices and pandas copy warnings raised by later cells.
warnings.filterwarnings("ignore")
import os
# "-1" hides all CUDA devices, presumably to force TensorFlow onto the CPU.
# NOTE(review): this runs after the tensorflow imports above — confirm it
# still takes effect in the target environment.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for file_path in input_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Preparation

## Load and Combine the Datasets

### Load Datasets

In [None]:
# Prefer the directory Kaggle mounts for the dataset. Outside Kaggle that
# directory does not exist, so fall back to the location returned by the
# kagglehub download in the first cell of the notebook.
DATA_DIR = "/kaggle/input/real-and-fake-news"
if not os.path.isdir(DATA_DIR):
    DATA_DIR = razanaqvi14_real_and_fake_news_path

true_df = pd.read_csv(os.path.join(DATA_DIR, "True.csv"))
fake_df = pd.read_csv(os.path.join(DATA_DIR, "Fake.csv"))

### Add Labels

In [None]:
# Binary target convention used by every later cell: 1 = real, 0 = fake.
true_df["label"] = 1  # Real news
fake_df["label"] = 0  # Fake news

### Combine Datasets

In [None]:
# Stack the two frames vertically; ignore_index=True renumbers the rows so the
# combined frame has one continuous index instead of two overlapping ones.
df = pd.concat([true_df, fake_df], ignore_index=True)

### Check the Structure

In [None]:
# (rows, columns) of the combined frame.
print(f"Shape: {df.shape}")

In [None]:
# Column names present after concatenation.
print(f"Columns: {df.columns}")

In [None]:
# Class balance check: number of rows per label (1 = real, 0 = fake).
print(df["label"].value_counts())

## Drop Unnecessary Columns and Preprocess

### Drop 'subject' and 'date'; keep 'title', 'text' and 'label'

In [None]:
# Keep only the columns used for modelling. The explicit .copy() makes `df` an
# independent frame, so the column assignment and in-place dropna in the next
# cells act on it directly instead of on a slice pandas flags with
# SettingWithCopyWarning.
df = df[["title", "text", "label"]].copy()

### Create a new column 'content' combining title and text

In [None]:
# Full article text: headline, one space, then the body. A missing title or
# text makes the whole result missing for that row.
df["content"] = df["title"] + " " + df["text"]

### Drop rows with null values

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna()

### Shuffle the dataset

In [None]:
# frac=1 draws every row once, i.e. a full shuffle; the fixed random_state
# keeps the order reproducible, and reset_index(drop=True) discards the old
# row numbers.
df = df.sample(frac=1, random_state=537).reset_index(drop=True)

### Final Columns

In [None]:
# Keep only the model input and the target. .copy() detaches the result from
# the wider frame so the columns added later (clean_text, text_length) are
# written to a real frame rather than a flagged slice.
df = df[["content", "label"]].copy()

### Preview

In [None]:
# Show the first rows of the two-column frame.
df.head()

In [None]:
# Summary of the frame: columns, dtypes and non-null counts.
df.info()

In [None]:
# Missing values per column (expected to be zero after dropping nulls).
df.isna().sum()

# Text Cleaning & EDA

## Text Cleaning

### Text Cleaning Function

In [None]:
# Fetch the NLTK resources this cell depends on; packages that are already
# installed are not downloaded again. "punkt_tab" is what newer NLTK releases
# load for word_tokenize, "punkt" covers older ones.
for _resource in ("stopwords", "wordnet", "omw-1.4", "punkt", "punkt_tab"):
    nltk.download(_resource, quiet=True)

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
# Translation table that deletes every ASCII punctuation character. Built once
# here instead of on every call to clean_text.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

def clean_text(text):
    """Normalise one document for vectorisation.

    Steps: lowercase, delete ASCII punctuation, tokenise with NLTK, drop
    English stopwords and tokens that are not purely alphabetic, lemmatise.

    Parameters
    ----------
    text : str
        Raw document.

    Returns
    -------
    str
        Remaining lemmas joined by single spaces. Non-string input
        (for example NaN) yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    # Lowercase and remove punctuation
    text = text.lower().translate(_PUNCT_TABLE)
    # Tokenize
    words = nltk.word_tokenize(text)
    # Remove stopwords / non-alphabetic tokens and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and word.isalpha()]
    return " ".join(words)


### Apply cleaning

In [None]:
# Run clean_text on every article. Each row is tokenised and lemmatised
# individually, so this is likely one of the slower cells in the notebook.
df["clean_text"] = df["content"].apply(clean_text)

### Show sample

In [None]:
# Raw text next to its cleaned version for the first two rows.
df[["content", "clean_text", "label"]].head(2)

## WordCloud: Real vs Fake News

### Split by label

In [None]:
# One long string per class, used as input for the word clouds.
real_text = " ".join(df.loc[df["label"] == 1, "clean_text"])
fake_text = " ".join(df.loc[df["label"] == 0, "clean_text"])

### WordCloud for Real News

In [None]:
# Build the cloud first, then draw it on an explicit axes.
real_cloud = WordCloud(width=800, height=400, background_color="white").generate(real_text)

fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title("WordCloud - Real News", fontsize=16)
ax.imshow(real_cloud)
ax.axis("off")
plt.show()

### WordCloud for Fake News

In [None]:
# Build the cloud first, then draw it on an explicit axes.
fake_cloud = WordCloud(width=800, height=400, background_color="white").generate(fake_text)

fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title("WordCloud - Fake News", fontsize=16)
ax.imshow(fake_cloud)
ax.axis("off")
plt.show()

## Distribution of News Length

### Add length column

In [None]:
# Number of whitespace-separated tokens in each cleaned article.
df["text_length"] = df["clean_text"].str.split().str.len()

### Plot distributions

In [None]:
# Map the numeric target to readable names and pass that as the hue, so
# seaborn builds the legend itself. Relabelling afterwards with
# plt.legend(["Fake", "Real"]) attaches the strings to artists in draw order,
# which need not match the 0/1 hue order and can swap the two class names.
label_names = df["label"].map({0: "Fake", 1: "Real"}).rename("News type")

plt.figure(figsize=(10,6))
sns.histplot(data=df, x="text_length", hue=label_names, hue_order=["Fake", "Real"],
             bins=50, kde=True, palette="Set1")
plt.title("Distribution of News Length (Real vs Fake)", fontsize=14)
plt.xlabel("Number of Words")
plt.ylabel("Frequency")
plt.show()

# Feature Extraction

## TF-IDF Vectorization

### TF-IDF Vectorizer

In [None]:
# TF-IDF over unigrams and bigrams (ngram_range=(1, 2)), with the feature
# count capped at 5000 via max_features.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))

### Fit-transform the clean text

In [None]:
# Learn the vocabulary and weights, and vectorise, in one step.
# NOTE(review): this fits on every row of df, including rows that later land
# in the test split.
X = tfidf.fit_transform(df["clean_text"])

### Target variable

In [None]:
# Target vector: 1 = real, 0 = fake.
y = df["label"]

In [None]:
# (documents, features) of the TF-IDF matrix.
print(f"TF-IDF shape: {X.shape}")

## Train/Test Split

### Split the dataset (80% train, 20% test)

In [None]:
# Split the cleaned text first, then fit TF-IDF on the training portion only.
# Fitting the vectorizer on the whole corpus lets test documents influence the
# vocabulary and IDF weights, which leaks test information into training and
# inflates the reported scores. test_size and random_state are unchanged, so
# the same rows end up in each split as before.
text_train, text_test, y_train, y_test = train_test_split(
    df["clean_text"], y, test_size=0.2, random_state=537)

X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

# ML Model Training & Evaluation

## Model Initialization

### Dictionary of ML models

In [None]:
# Display name -> unfitted estimator. Estimators that accept a seed are
# seeded so repeated runs of the notebook produce the same models.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": LinearSVC(random_state=42)
}

## Model Training, Prediction, Evaluation

### List to store results

In [None]:
# Collects one (model name, test accuracy) tuple per trained model.
results = []

### Loop through each model

In [None]:
class_names = ["Fake", "Real"]

for name, model in models.items():
    print(f"Training model: {name}")

    # Fit on the training split, then predict the held-out split.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Headline metric plus the per-class report.
    acc = accuracy_score(y_test, y_pred)
    print(f"\n Accuracy of {name}: {acc:.4f}")
    print(classification_report(y_test, y_pred, target_names=class_names))

    # Remember the score for the comparison chart.
    results.append((name, acc))

    # Confusion matrix drawn on its own axes.
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title(f"Confusion Matrix - {name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()

## ROC Curves for Models with Probability or Decision Scores

In [None]:
# All ROC drawing lives in a single cell. With the inline backend each cell
# renders and closes its own figure, so the baseline, axis labels and legend
# must be added before this cell ends to appear on the same plot as the curves.
fig, ax = plt.subplots(figsize=(10, 6))

for name, clf in models.items():
    try:
        # Use class-1 probabilities when the model offers them, otherwise the
        # signed distance to the decision boundary.
        if hasattr(clf, "predict_proba"):
            y_scores = clf.predict_proba(X_test)[:, 1]
        elif hasattr(clf, "decision_function"):
            y_scores = clf.decision_function(X_test)
        else:
            print(f"Skipping {name}: no predict_proba or decision_function")
            continue

        fpr, tpr, _thresholds = roc_curve(y_test, y_scores)
        auc_score = auc(fpr, tpr)
        ax.plot(fpr, tpr, label=f"{name} (AUC = {auc_score:.2f})")
    except Exception as exc:
        # Still best-effort, but say which model was left out and why instead
        # of dropping it from the chart silently.
        print(f"Skipping {name}: {exc}")

# Baseline: the diagonal of a classifier that guesses at random.
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curves for ML Models")
ax.legend()
ax.grid(True)
plt.show()

## Accuracy Comparison Chart

### Create dataframe for results

In [None]:
# Tabulate the collected scores, best model first.
results_df = (
    pd.DataFrame(results, columns=["Model", "Accuracy"])
    .sort_values(by="Accuracy", ascending=False)
)

### Plot

In [None]:
# Horizontal bars, one per model; the x-axis is clipped to 0.8-1.0.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=results_df, x="Accuracy", y="Model", palette="mako", ax=ax)
ax.set_title("Model Accuracy Comparison")
ax.set_xlim(0.8, 1.0)
ax.set_xlabel("Accuracy")
ax.set_ylabel("Model")
ax.grid(True)
plt.show()

### Display results

In [None]:
# Accuracy table, sorted from best to worst.
results_df

# Deep Learning Model with LSTM

## Text Tokenization & Padding

### Parameters

In [None]:
# Hyper-parameters shared by the tokenizer, the padding step and the model.
MAX_VOCAB = 10_000  # vocabulary size cap
MAX_LEN = 300  # words kept per input sequence

### Tokenizer

In [None]:
# Word-level tokenizer capped at MAX_VOCAB words, with "<OOV>" as the
# placeholder token (presumably used for words outside that vocabulary —
# confirm against the Keras Tokenizer documentation).
# NOTE(review): the vocabulary is fitted on every row of df, including rows
# that later land in the test split.
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(df["clean_text"])

### Convert text to sequences

In [None]:
# Convert every cleaned article into its sequence of tokenizer word indices.
sequences = tokenizer.texts_to_sequences(df["clean_text"])

### Pad sequences to fixed length

In [None]:
# Bring every sequence to MAX_LEN entries: short ones are padded at the end,
# long ones are cut at the end (padding="post", truncating="post").
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN, padding="post", truncating="post")

### Labels

In [None]:
# Target as a plain NumPy array for Keras.
labels = df["label"].to_numpy()

## Train-Test Split

### Split the padded sequences and labels

In [None]:
# 80/20 split of the padded sequences, using the same test_size and
# random_state as the split used for the classical models.
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=537)

## Define the LSTM Model

### Define the model

In [None]:
# Embedding -> LSTM -> small dense head with a sigmoid output.
# mask_zero=True marks index 0 (the value pad_sequences appends) as padding so
# the LSTM skips it. Without the mask, a short text is followed by a long run
# of padding steps and the final LSTM state mostly reflects that padding
# rather than the words.
model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=128, input_length=MAX_LEN, mask_zero=True),
    LSTM(128, return_sequences=False),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # binary classification
])

### Compile the model

In [None]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# tracked as the reported metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

### Model summary

In [None]:
# Print the model summary.
model.summary()

## Train the Model

In [None]:
# Train for 5 epochs with batches of 256 and keep the per-epoch metrics.
# NOTE(review): validation_data is the test split, so the "validation" curves
# and the later test accuracy are computed on the same rows.
history = model.fit(
    X_train_dl, y_train_dl,
    validation_data=(X_test_dl, y_test_dl),
    epochs=5,
    batch_size=256,
    verbose=1
)

## Visualize Accuracy and Loss

### Plot accuracy

In [None]:
# Training vs validation accuracy per epoch.
fig, ax = plt.subplots(figsize=(10, 4))
for metric_key, curve_label in (("accuracy", "Train Accuracy"),
                                ("val_accuracy", "Validation Accuracy")):
    ax.plot(history.history[metric_key], label=curve_label)
ax.set_title("LSTM Model Accuracy")
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.legend()
ax.grid(True)
plt.show()

### Plot loss

In [None]:
# Training vs validation loss per epoch.
fig, ax = plt.subplots(figsize=(10, 4))
for metric_key, curve_label in (("loss", "Train Loss"),
                                ("val_loss", "Validation Loss")):
    ax.plot(history.history[metric_key], label=curve_label)
ax.set_title("LSTM Model Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend()
ax.grid(True)
plt.show()

## Evaluate the Model on Test Data

### Evaluate performance

In [None]:
# evaluate returns the loss followed by the compiled metrics; only accuracy
# was compiled, so two values are unpacked.
loss, accuracy = model.evaluate(X_test_dl, y_test_dl)
print(f"\nFinal Test Accuracy: {accuracy:.4f}")

# Predicting on New Inputs

## Prediction Function

In [None]:
def predict_news(text, model, tokenizer, max_len=300):
    """Classify one piece of text with the trained model and print the verdict.

    Parameters
    ----------
    text : str
        Raw headline or article.
    model
        Trained Keras model whose single sigmoid output is the score for
        the "real" class.
    tokenizer
        The fitted tokenizer used to build the training sequences.
    max_len : int, default 300
        Padded sequence length; should equal the length used for training.

    Returns
    -------
    tuple[str, float]
        ("REAL" or "FAKE", confidence of that label between 0.5 and 1.0).
        The same information is also printed.
    """
    # Reuse the notebook's clean_text so inference applies the same
    # preprocessing as training, instead of keeping a second copy of it here.
    cleaned = clean_text(text)
    sequence = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')

    # verbose=0 keeps the Keras progress bar out of the printed result.
    prob = float(model.predict(padded, verbose=0)[0][0])
    label = "REAL" if prob > 0.5 else "FAKE"
    # Confidence is the score of whichever class was chosen.
    confidence = prob if prob > 0.5 else 1 - prob

    print(f" Text: {text}")
    print(f" Prediction: {label} ({confidence:.2f} confidence)")
    return label, confidence

## Example Predictions

In [None]:
# Hand-written headlines used as a quick sanity check of the classifier.
examples = [
    "The CDC approves a new vaccine that prevents aging by 40%.",
    "NASA's James Webb Telescope captures image of the first galaxies.",
    "Apple announces the iPhone 25 will be implanted into the brain.",
    "UN releases a global climate report showing record-breaking heat levels in 2024.",
    "Scientists confirm that drinking bleach can boost your immune system.",
    "The World Bank predicts economic recovery in developing countries by 2026.",
]

# Number the examples from 1 and print the model's verdict for each.
for idx, headline in enumerate(examples, start=1):
    print(f"\n Example {idx}")
    predict_news(headline, model, tokenizer)

# Final Evaluation – ML vs DL Comparison

## Visual Comparison

### Add DL model result

In [None]:
loss, lstm_accuracy = model.evaluate(X_test_dl, y_test_dl, verbose=0)
print(f"LSTM Model Accuracy: {lstm_accuracy:.4f}")
lstm_result = pd.DataFrame([{"Model": "LSTM", "Accuracy": lstm_accuracy}])
# Remove any "LSTM" row left by an earlier run of this cell before appending,
# so re-running the cell does not add a duplicate row to the comparison.
results_df = pd.concat(
    [results_df[results_df["Model"] != "LSTM"], lstm_result], ignore_index=True)

### Plot

In [None]:
# Same comparison chart as before, now including the LSTM row.
ranked_results = results_df.sort_values(by="Accuracy", ascending=False)

fig, ax = plt.subplots(figsize=(9, 5))
sns.barplot(data=ranked_results, x="Accuracy", y="Model", palette="crest", ax=ax)
ax.set_title("Model Accuracy Comparison (ML vs DL)")
ax.set_xlim(0.5, 1.00)
ax.grid(True)
plt.show()

---

##  **Project Summary: Fake News Detection Using Optimized LSTM**

---

###  1. **Data Preparation**

* Loaded two datasets: `True.csv` (real news) and `Fake.csv` (fake news).
* Added a `label` column: `1` for real news, `0` for fake news.
* Combined `title` and `text` into a single column: `content`.
* Cleaned null values, shuffled the dataset, and kept only `content` and `label` columns for modeling.

---

###  2. **Text Cleaning and Exploratory Data Analysis (EDA)**

* Converted text to lowercase, removed punctuation, stopwords, and applied lemmatization.
* Generated WordClouds to visualize most common words in real vs fake news.
* Compared article lengths with histograms to observe structural differences.

---

###  3. **Feature Extraction with TF-IDF**

* Applied TF-IDF vectorization with:

  * `max_features = 5000`
  * `ngram_range = (1, 2)` for unigrams and bigrams.
* Converted text into numerical format.
* Split data into training (80%) and testing (20%).

---

###  4. **ML Models (Optional Comparison)**

* Trained Logistic Regression, Naive Bayes, Random Forest, and SVM.
* Evaluated with confusion matrices and ROC curves.
* Though ML models performed well, the final classification relied on LSTM.

---

###  5. **Final LSTM Architecture**

* LSTM handles the **sequential structure** of language better than traditional models.
* Dropout layers help **prevent overfitting**.
* Efficient architecture for both CPU and GPU environments.
* Strikes a balance between **simplicity and performance**.

---

###  6. **Training and Evaluation**

* Model trained for 5 epochs.
* Accuracy and loss visualized for both training and validation sets.
* Final accuracy retrieved using `model.evaluate()` (e.g., `LSTM Accuracy: 0.9732`)

---

###  7. **Prediction on Mixed News Samples**

* Tested the model on 6 real-world inspired news headlines (some fake, some real).
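* The model classified them confidently (reported confidence above 90%); these headlines carry no ground-truth labels in the notebook, so this is a qualitative check rather than an accuracy measurement.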
* Each result included predicted class (REAL/FAKE) and confidence score.

---

# Thank you for taking the time to review my work. I would be very happy if you could upvote! 😊

---