In [None]:
# Cell 4: Label encoding and train-test split
from sklearn.model_selection import train_test_split

# Map the string labels onto a binary target: FAKE -> 0, REAL -> 1
label_mapping = {'FAKE': 0, 'REAL': 1}
df['label_num'] = df['label'].map(label_mapping)

# Series.map yields NaN for every value that is not a key of the mapping
# (different casing, missing label, unexpected class). Fail loudly here instead
# of letting NaN targets flow into the stratified split and the models.
unknown_labels = df.loc[df['label_num'].isna(), 'label'].unique()
if len(unknown_labels) > 0:
    raise ValueError(
        f"Unexpected values in df['label']: {list(unknown_labels)!r}; "
        f"expected one of {sorted(label_mapping)!r}"
    )
df['label_num'] = df['label_num'].astype('int64')

# Features are the cleaned texts, targets are the numeric labels
X = df['clean_text'].values
y = df['label_num'].values

# Stratified 80/20 split with a fixed seed so the partition is repeatable
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

len(X_train_text), len(X_test_text)

In [None]:
# Cell 5: TF-IDF vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, limited to 5000 features
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)

# Fit on the training texts only, then reuse the fitted vectorizer on the
# test texts
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

tuple(matrix.shape for matrix in (X_train_tfidf, X_test_tfidf))

In [None]:
# Cell 6: Install gensim (only first time in Colab) and tokenize
!pip install gensim

import gensim
from gensim.models import Word2Vec

# Tokenize clean text into lists of words
df['tokens'] = df['clean_text'].apply(lambda x: x.split())

sentences = df['tokens'].tolist()
sentences[:2]

In [None]:
# Cell 7: Train Word2Vec (on the training split only)
# Fitting the embeddings on every document would let text from the held-out
# test rows shape the features, which inflates the evaluation. Only the
# training texts produced by the split in Cell 4 are used here; they are
# tokenized with the same whitespace split used for df['tokens'].
train_sentences = [text.split() for text in X_train_text]

w2v_model = Word2Vec(
    sentences=train_sentences,
    vector_size=100,   # embedding size
    window=5,
    min_count=2,       # ignore words that occur fewer than 2 times
    workers=4,         # NOTE(review): gensim documents multi-worker training as
                       # not bit-for-bit reproducible; use workers=1 if that matters
    sg=1               # 1 = skip-gram, 0 = CBOW
)

len(w2v_model.wv.key_to_index)  # vocabulary size

In [None]:
# Cell 8: Create document vectors by averaging word vectors
import numpy as np

def document_vector(tokens, model):
    """Average the word vectors of a document's in-vocabulary tokens.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single document.
    model : object
        Trained embedding model exposing ``wv`` (supports ``in`` checks and
        indexing by a list of words) and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``: the mean of the vectors of
        all tokens found in ``model.wv``, or a zero vector when none of the
        tokens is in the vocabulary.
    """
    # Keep only tokens that exist in the model
    known_tokens = [w for w in tokens if w in model.wv]
    if not known_tokens:
        # gensim stores embeddings as float32; return the fallback in the same
        # dtype so one empty document does not upcast the whole stacked
        # document matrix to float64.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(model.wv[known_tokens], axis=0)

# Embed every tokenized document and stack the results into a single array
doc_vectors = np.array(
    list(map(lambda doc_tokens: document_vector(doc_tokens, w2v_model), df['tokens']))
)
np.shape(doc_vectors)

In [None]:
# Cell 9: Train-test split using Word2Vec features
# Features are the averaged document embeddings; targets are the numeric labels
X_w2v, y_w2v = doc_vectors, df['label_num'].values

# Stratified 80/20 split with a fixed seed
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(
    X_w2v,
    y_w2v,
    stratify=y_w2v,
    random_state=42,
    test_size=0.2,
)

tuple(part.shape for part in (X_train_w2v, X_test_w2v))
