In [1]:
# Colab-only setup: mount Google Drive at /content/drive so the dataset CSVs
# referenced under /content/drive/MyDrive are reachable from this runtime.
from google.colab import drive
# NOTE(review): force_remount=True presumably re-mounts even when a mount
# already exists — confirm against the Colab documentation.
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [2]:
import os, glob

# Recursively search Drive for each CSV, then show at most ten matches apiece.
true_hits = glob.glob('/content/drive/MyDrive/**/True.csv', recursive=True)
fake_hits = glob.glob('/content/drive/MyDrive/**/Fake.csv', recursive=True)
print("True.csv hits:", true_hits[:10])
print("Fake.csv hits:", fake_hits[:10])

# Locations actually used by the rest of the notebook (not derived from the
# search results above).
TRUE_PATH = '/content/drive/MyDrive/True.csv'
FAKE_PATH = '/content/drive/MyDrive/Fake.csv'

# Report whether each chosen path exists before anything tries to read it.
true_found = os.path.exists(TRUE_PATH)
fake_found = os.path.exists(FAKE_PATH)
print("TRUE exists? ", true_found, "->", TRUE_PATH)
print("FAKE exists? ", fake_found, "->", FAKE_PATH)


True.csv hits: ['/content/drive/MyDrive/True.csv']
Fake.csv hits: ['/content/drive/MyDrive/Fake.csv']
TRUE exists?  True -> /content/drive/MyDrive/True.csv
FAKE exists?  True -> /content/drive/MyDrive/Fake.csv


In [3]:
from pathlib import Path

# Wrap the string paths from the previous cell in Path objects; these are what
# the loader call further down receives.
TRUE_CSV = Path(TRUE_PATH)
FAKE_CSV = Path(FAKE_PATH)

# Fail fast with a readable message if either CSV is missing, instead of
# hitting a read error later in the loader.
assert TRUE_CSV.exists(), f"Missing: {TRUE_CSV}"
assert FAKE_CSV.exists(), f"Missing: {FAKE_CSV}"

print("Using:", TRUE_CSV, FAKE_CSV)


Using: /content/drive/MyDrive/True.csv /content/drive/MyDrive/Fake.csv


**Imports and basic settings:**

In [4]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import matplotlib.pyplot as plt

# Single seed for the notebook: it seeds NumPy's global RNG here and is also
# passed as random_state to both train_test_split calls in the split cell.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


**Load ISOT Fake/True dataset:**

In [5]:
def load_isot(fake_path, true_path):
    """
    Load the ISOT fake/true news CSVs into one labelled DataFrame.

    Parameters
    ----------
    fake_path, true_path : str, path-like or file-like
        Passed straight to ``pd.read_csv``. Each CSV must provide the
        columns ``title``, ``text``, ``subject`` and ``date``.

    Returns
    -------
    pandas.DataFrame
        Columns ``text_full`` (title + " " + body, missing parts treated as
        empty strings), ``label`` ("fake" or "true"), ``subject`` and
        ``date``. Rows are the fake articles followed by the true ones, with
        exact (title, text) duplicates removed, and the index runs 0..n-1.
    """
    # .assign returns a new frame, so the freshly read frames are not mutated.
    fake_df = pd.read_csv(fake_path).assign(label="fake")
    true_df = pd.read_csv(true_path).assign(label="true")

    # Fake rows first, then true rows.
    df = pd.concat([fake_df, true_df], ignore_index=True)

    # Drop exact duplicate articles (first occurrence wins), then renumber the
    # rows: without the reset the index keeps gaps where duplicates were
    # removed, so positions in arrays taken via .values would no longer match
    # the DataFrame's index labels.
    df = df.drop_duplicates(subset=["title", "text"]).reset_index(drop=True)

    # One text column = title + body.
    df["text_full"] = df["title"].fillna("") + " " + df["text"].fillna("")

    # Keep only the columns the rest of the notebook uses.
    return df[["text_full", "label", "subject", "date"]]


# use the Path objects we already created
# (argument order matters: load_isot takes the fake CSV first, then the true CSV)
df = load_isot(FAKE_CSV, TRUE_CSV)

# Quick sanity check: first rows plus the class balance after de-duplication.
print(df.head())
print("\nLabel counts:")
print(df["label"].value_counts())


                                           text_full label subject  \
0   Donald Trump Sends Out Embarrassing New Year’...  fake    News   
1   Drunk Bragging Trump Staffer Started Russian ...  fake    News   
2   Sheriff David Clarke Becomes An Internet Joke...  fake    News   
3   Trump Is So Obsessed He Even Has Obama’s Name...  fake    News   
4   Pope Francis Just Called Out Donald Trump Dur...  fake    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  

Label counts:
label
true    21197
fake    17908
Name: count, dtype: int64


**Train / validation / test split:**

In [6]:
# Features are the combined title+body strings; targets are the "fake"/"true"
# labels. Both are pulled out as plain NumPy arrays.
X = df["text_full"].values
y = df["label"].values

# NOTE(review): train_test_split is already imported in the imports cell;
# this second import is redundant.
from sklearn.model_selection import train_test_split

# Train 70% vs temp 30%
# stratify=y is passed so the split is stratified on the labels.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=RANDOM_STATE
)

# Validation 15% vs Test 15%
# (half of the 30% hold-out each, again stratified and using the same seed)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=RANDOM_STATE
)

# Report the size of each partition.
print("Train size:", len(X_train))
print("Valid size:", len(X_valid))
print("Test size:", len(X_test))


Train size: 27373
Valid size: 5866
Test size: 5866


**Preprocessing and vocabulary:**

In [12]:
import re
TOKEN_PATTERN = re.compile(r"[a-z]+")   # a token is a maximal run of the letters a–z


def preprocess_text(text):
    """
    Turn raw text into a list of lowercase alphabetic tokens.

    The text is lowercased, every substring starting with "http" or "www."
    (up to the next whitespace) is replaced by a space, and each remaining
    run of a–z characters becomes one token. Everything else (digits,
    punctuation, non-ASCII letters) only separates tokens.
    """
    # Lowercase first so the URL pattern and the token pattern both see
    # lowercase text, then strip URLs and tokenize in one pass.
    return TOKEN_PATTERN.findall(
        re.sub(r"http\S+|www\.\S+", " ", text.lower())
    )


def build_vocabulary(texts, min_freq=5):
    """
    Map each sufficiently frequent training token to a column index.

    Every document in ``texts`` is tokenized with ``preprocess_text`` and
    token occurrences are tallied over the whole collection. Tokens seen at
    least ``min_freq`` times are kept and numbered 0, 1, 2, ... in the order
    in which they were first encountered.

    Returns a dict ``token -> index``.
    """
    # Corpus-wide occurrence counts (total occurrences, not document counts).
    token_counts = {}
    for document in texts:
        for token in preprocess_text(document):
            token_counts[token] = token_counts.get(token, 0) + 1

    # Dicts preserve insertion order, so the kept tokens stay in
    # first-seen order and enumerate() hands out consecutive indices.
    frequent = [tok for tok, seen in token_counts.items() if seen >= min_freq]
    return {tok: position for position, tok in enumerate(frequent)}


def vectorize_tokens(tokens, vocab):
    """
    Build the bag-of-words count vector for one tokenized document.

    ``vocab`` maps token -> column index. The result is an int32 array of
    length ``len(vocab)`` whose entry ``vocab[t]`` holds how many times token
    ``t`` occurs in ``tokens``; tokens missing from ``vocab`` are skipped.
    """
    counts = np.zeros(len(vocab), dtype=np.int32)
    for token in tokens:
        column = vocab.get(token)   # None for out-of-vocabulary tokens
        if column is not None:
            counts[column] += 1
    return counts


# Build vocabulary from the training set
# (only X_train is passed, so validation/test texts do not influence which words are kept)
# NOTE(review): the full-train experiment cell repeats this exact call as
# vocab_full; one of the two could be reused.
vocab = build_vocabulary(X_train, min_freq=5)   # construct vocabulary using training data
print("Vocabulary size:", len(vocab))           # print number of unique tokens kept


Vocabulary size: 35464


**From-scratch Multinomial Naive Bayes:**

In [13]:
class MultinomialNBScratch:
    """
    Multinomial Naive Bayes text classifier built on bag-of-words counts.

    Documents are tokenized with ``preprocess_text`` and turned into count
    vectors with ``vectorize_tokens``. Training estimates a prior per class
    and a smoothed word distribution per class; prediction picks the class
    with the highest log posterior.

    Attributes set by ``fit``:
        classes_           sorted array of the distinct labels
        class_priors_      P(class), one entry per class
        feature_log_prob_  log P(word | class), shape (n_classes, len(vocab))
        vocab_             the token -> index dict used for vectorizing
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha                      # additive (Laplace) smoothing parameter
        self.classes_ = None                    # distinct class labels, filled by fit
        self.class_priors_ = None               # P(class) for each class
        self.feature_log_prob_ = None           # log P(word | class) matrix
        self.vocab_ = None                      # reference to the vocabulary used

    def fit(self, texts, labels, vocab):
        """
        Estimate class priors and per-class word log-probabilities.

        Parameters
        ----------
        texts : sequence of str
            Training documents.
        labels : sequence
            One class label per document, aligned with ``texts``.
        vocab : dict
            Token -> column index mapping used to vectorize the documents.

        Returns
        -------
        self
            The fitted model, so calls can be chained.

        Raises
        ------
        ValueError
            If ``alpha`` is negative, ``texts`` is empty, or ``texts`` and
            ``labels`` differ in length.
        """
        # Validate before touching any state so a failed fit leaves the
        # model exactly as it was.
        if self.alpha < 0:
            raise ValueError(f"alpha must be non-negative, got {self.alpha}")
        n_docs = len(texts)
        if n_docs == 0:
            raise ValueError("cannot fit on an empty training set")
        if len(labels) != n_docs:
            raise ValueError(
                f"texts and labels differ in length: {n_docs} vs {len(labels)}"
            )

        self.vocab_ = vocab

        # Distinct labels (sorted) and, for each document, the index of its label.
        self.classes_, y_indices = np.unique(labels, return_inverse=True)
        n_classes = len(self.classes_)
        n_features = len(vocab)

        # Total occurrences of every vocabulary word within each class.
        class_word_counts = np.zeros((n_classes, n_features), dtype=np.int64)
        for c_idx, text in zip(y_indices, texts):
            class_word_counts[c_idx] += vectorize_tokens(preprocess_text(text), vocab)

        # P(class) = fraction of training documents carrying that label.
        class_doc_counts = np.bincount(y_indices, minlength=n_classes)
        self.class_priors_ = class_doc_counts / n_docs

        # Additive smoothing, then normalize each class row to a distribution.
        smoothed = class_word_counts + self.alpha
        class_totals = smoothed.sum(axis=1, keepdims=True)

        # With alpha == 0 an unseen word has probability 0 and log(0) = -inf
        # is the intended value, so silence only the divide warning.
        with np.errstate(divide="ignore"):
            self.feature_log_prob_ = np.log(smoothed / class_totals)

        return self

    def _joint_log_likelihood(self, vec):
        """
        Return log P(class) + sum_w count_w * log P(w | class) for one count vector.

        Only words that actually occur in the document are scored. Multiplying
        the full matrix by the full vector would evaluate -inf * 0 = nan for
        any word with zero probability in a class (possible when alpha == 0),
        even though that word is absent from the document.
        """
        present = np.flatnonzero(vec)
        log_likelihood = (self.feature_log_prob_[:, present] * vec[present]).sum(axis=1)
        return np.log(self.class_priors_) + log_likelihood

    def _log_posterior(self, text):
        """
        Compute the (unnormalized) log posterior of every class for one document.
        """
        tokens = preprocess_text(text)
        vec = vectorize_tokens(tokens, self.vocab_)
        return self._joint_log_likelihood(vec)

    def predict(self, texts):
        """
        Predict a class label for each document in ``texts``.

        Returns a NumPy array of labels drawn from ``classes_``, in the same
        order as the input documents.
        """
        preds = [self.classes_[np.argmax(self._log_posterior(t))] for t in texts]
        return np.array(preds)


**Small subset experiment:**

In [10]:
# Smoke test on a small slice so the from-scratch model can be checked quickly
# before the full run.
train_subset_size = 1000
test_subset_size = 300

# Take the leading rows of the existing train/test splits.
# NOTE(review): this scores on a slice of X_test; X_valid/y_valid from the split
# cell are not used anywhere in the visible notebook.
X_train_small = X_train[:train_subset_size]
y_train_small = y_train[:train_subset_size]
X_test_small = X_test[:test_subset_size]
y_test_small = y_test[:test_subset_size]

# The vocabulary is rebuilt from the subset only, with a lower frequency
# threshold (3 instead of 5) than the full-data vocabulary.
vocab_small = build_vocabulary(X_train_small, min_freq=3)
print("Small vocab size:", len(vocab_small))

nb_small = MultinomialNBScratch(alpha=1.0)
nb_small.fit(X_train_small, y_train_small, vocab_small)

y_pred_small = nb_small.predict(X_test_small)

# NOTE(review): these names are already imported in the imports cell; this
# second import is redundant.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

acc_small = accuracy_score(y_test_small, y_pred_small)
print(f"Subset accuracy (scratch NB) = {acc_small:.4f}")

print("\nClassification report (subset):")
print(classification_report(y_test_small, y_pred_small))

# labels=[...] pins the row/column order so it matches the printed legend.
cm_small = confusion_matrix(y_test_small, y_pred_small, labels=["fake", "true"])
print("Confusion matrix (rows=actual, cols=pred, order=[fake, true]):\n", cm_small)


Small vocab size: 9752
Subset accuracy (scratch NB) = 0.9533

Classification report (subset):
              precision    recall  f1-score   support

        fake       0.94      0.96      0.95       139
        true       0.97      0.94      0.96       161

    accuracy                           0.95       300
   macro avg       0.95      0.95      0.95       300
weighted avg       0.95      0.95      0.95       300

Confusion matrix (rows=actual, cols=pred, order=[fake, true]):
 [[134   5]
 [  9 152]]


**Full train experiment:**

In [11]:
# Full run: vocabulary and model are both fitted on the entire training split.
# NOTE(review): this is the same call that produced `vocab` in the preprocessing
# cell (same arguments, same reported size), so the tokenization pass is repeated.
vocab_full = build_vocabulary(X_train, min_freq=5)
print("Full vocab size:", len(vocab_full))

nb_full = MultinomialNBScratch(alpha=1.0)
nb_full.fit(X_train, y_train, vocab_full)

# Score on the held-out test split.
# NOTE(review): the validation split is never consulted in the visible cells.
y_pred_test = nb_full.predict(X_test)
acc_test = accuracy_score(y_test, y_pred_test)

print(f"Test accuracy (scratch NB) = {acc_test:.4f}")
print("\nClassification report (full test):")
print(classification_report(y_test, y_pred_test))

# labels=[...] pins the row/column order so it matches the printed legend.
cm_full = confusion_matrix(y_test, y_pred_test, labels=["fake", "true"])
print("Confusion matrix (rows=actual, cols=pred, order=[fake, true]):\n", cm_full)


Full vocab size: 35464
Test accuracy (scratch NB) = 0.9535

Classification report (full test):
              precision    recall  f1-score   support

        fake       0.95      0.95      0.95      2687
        true       0.96      0.96      0.96      3179

    accuracy                           0.95      5866
   macro avg       0.95      0.95      0.95      5866
weighted avg       0.95      0.95      0.95      5866

Confusion matrix (rows=actual, cols=pred, order=[fake, true]):
 [[2552  135]
 [ 138 3041]]
