In [14]:
# %% [code]
# The CSV inputs are stored on Google Drive, so the Drive has to be mounted
# into the Colab filesystem before anything can be read.
from google.colab import drive

drive.mount('/content/drive')

# Locations of the two input files inside the mounted Drive
# (edit these to match your own Drive folder layout).
fake_path = '/content/drive/My Drive/data/Fake.csv'
true_path = '/content/drive/My Drive/data/True.csv'

import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Read each CSV and tag its rows with the class label in a single step:
# 1 marks rows from the true-news file, 0 rows from the fake-news file.
# Later cells read the article body from a column named "text", so both
# files are expected to provide it.
true_df = pd.read_csv(true_path).assign(label=1)
fake_df = pd.read_csv(fake_path).assign(label=0)

# Stack the two frames (true rows first) and renumber the index from zero.
data = pd.concat((true_df, fake_df), ignore_index=True)


Mounted at /content/drive


# **2. Data Cleaning and OOV Words Analysis**
First, clean the text, then analyze out-of-vocabulary (OOV) words against a reference vocabulary (here, the vocabulary built by Keras’s Tokenizer, capped at `max_words` entries, with every other word mapped to an OOV token):

In [15]:
# %% [code]
def clean_text(text):
    """Normalize one article body before tokenization.

    Steps, in order: remove URLs (anything starting with ``http`` up to the
    next whitespace), drop every character that is not an ASCII letter or
    whitespace, lowercase the result, and trim leading/trailing whitespace.
    Removed characters are deleted rather than replaced by a space, so
    internal runs of whitespace are left as they are.

    Args:
        text: Raw article text. Anything that is not a ``str`` (for example
            ``None`` or the float ``NaN`` that pandas uses for an empty CSV
            cell) is treated as missing text.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings; a single missing cell would
        # otherwise abort the whole DataFrame.apply over the text column.
        return ''
    text = re.sub(r'http\S+', '', text)         # Remove URLs
    text = re.sub(r'[^A-Za-z\s]', '', text)     # Remove punctuation and numbers
    return text.lower().strip()                 # Lowercase and trim spaces

# Run clean_text over every article body and keep the result next to the
# raw text in a new 'clean_text' column.
data['clean_text'] = data['text'].map(clean_text)

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
train_df, test_df = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)
print(f"Training samples: {len(train_df)}, Testing samples: {len(test_df)}")


Training samples: 35918, Testing samples: 8980


Now, tokenize the text and perform the OOV-word analysis:

In [16]:
# %% [code]
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 10000  # Vocabulary cap handed to the Tokenizer (num_words).
max_len = 200      # Every padded sequence ends up exactly this long.

# Fit the vocabulary on the training split only; '<OOV>' is the placeholder
# token the Tokenizer uses for out-of-vocabulary words.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=max_words)
tokenizer.fit_on_texts(train_df['clean_text'].values)

# Turn each split's cleaned texts into sequences of integer token ids.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split['clean_text'].values)
    for split in (train_df, test_df)
)

# Pad and truncate at the end of each sequence so both splits share one length.
train_padded, test_padded = (
    pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
    for seqs in (train_sequences, test_sequences)
)

# OOV analysis: look up the id assigned to the OOV placeholder.
oov_index = tokenizer.word_index.get('<OOV>')
print("OOV token index:", oov_index)

# Share of tokens in the (unpadded) training sequences that carry the OOV id.
oov_count = sum(token == oov_index for seq in train_sequences for token in seq)
total_tokens = sum(map(len, train_sequences))
print(f"Total OOV occurrences: {oov_count}")
print(f"Percentage of OOV tokens: {oov_count/total_tokens*100:.2f}%")


OOV token index: 1
Total OOV occurrences: 890627
Percentage of OOV tokens: 6.21%


Hybrid Model (BERT‑Whiting + LSTM + Transformer)

2. BERT Tokenization

In [None]:
# %% [code]
from transformers import BertTokenizer
# Tokenizer for the 'bert-base-uncased' checkpoint. The same checkpoint name
# is used when TFBertModel is loaded further down, so the token ids produced
# here are the ones that model expects.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_texts(texts, tokenizer, max_len=128):
    """Tokenize a collection of texts into fixed-length TensorFlow tensors.

    Args:
        texts: Iterable of strings; it is materialized into a list before
            being handed to the tokenizer.
        tokenizer: Callable invoked as ``tokenizer(batch, **options)``.
            Later cells read the ``'input_ids'`` and ``'attention_mask'``
            entries of its result.
        max_len: Length every sequence is truncated or padded to.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    batch = list(texts)
    options = dict(
        truncation=True,        # cut texts longer than max_len
        padding='max_length',   # pad shorter texts up to max_len
        max_length=max_len,
        return_tensors='tf',    # request TensorFlow tensors
    )
    return tokenizer(batch, **options)

# Encode both splits with identical settings (train first, then test) so the
# resulting tensors share the same fixed length of 128 tokens.
train_encodings, test_encodings = (
    encode_texts(split['clean_text'].values, bert_tokenizer, max_len=128)
    for split in (train_df, test_df)
)


# **3. Hybrid Model: BERT‑Whiting + LSTM + Transformer**
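This model integrates BERT’s contextual token embeddings, a bidirectional LSTM layer, and then a Transformer-style block (Multi-Head self-attention with a residual connection and layer normalization), followed by global average pooling and dense layers that output a single fake/true probability.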

In [11]:
import tensorflow as tf
from tensorflow.keras import layers
from transformers import TFBertModel

# Load the base BERT model from Hugging Face.
# The checkpoint name matches the one used for bert_tokenizer.
bert_base = TFBertModel.from_pretrained('bert-base-uncased')

# Define model inputs.
# Both inputs are fixed at 128 positions, the same value passed as max_len
# when the texts are encoded. The layer names "input_ids" and
# "attention_mask" are the dict keys used to feed the model in fit().
input_ids = layers.Input(shape=(128,), dtype=tf.int32, name="input_ids")
attention_mask_input = layers.Input(shape=(128,), dtype=tf.int32, name="attention_mask")

def call_bert(inputs):
    """Run the module-level ``bert_base`` model on a pair of input tensors.

    Args:
        inputs: Two-element sequence ``(input_ids, attention_mask)``.

    Returns:
        Element 0 of the ``bert_base`` output. The Lambda layer that wraps
        this function declares its shape as (batch, sequence length, 768).
    """
    token_ids, mask = inputs
    # The mask is cast explicitly so BERT always receives int32.
    return bert_base(token_ids, attention_mask=tf.cast(mask, tf.int32))[0]

# Apply the Lambda layer with an explicit output_shape.
# The declared shape reuses the batch and sequence dimensions of the first
# input (input_ids) and fixes the feature dimension at 768.
# NOTE(review): bert_base is only reached through this Lambda wrapper; check
# in the summary below whether its weights are tracked (and trainable) as part
# of hybrid_model, or whether BERT is effectively frozen here.
bert_outputs = layers.Lambda(
    call_bert,
    output_shape=lambda input_shapes: (input_shapes[0][0], input_shapes[0][1], 768)
)([input_ids, attention_mask_input])

# Pass the BERT embeddings through a Bidirectional LSTM layer.
# return_sequences=True keeps one output per position for the attention layer.
lstm_out = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(bert_outputs)

# Add a Transformer (MultiHeadAttention) layer.
# Query and value are both lstm_out, i.e. this is self-attention.
# NOTE(review): attention_mask_input is consumed only inside call_bert; it is
# not passed to the LSTM, the attention layer or the pooling layer below, so
# padded positions presumably take part in all three. Confirm this is intended.
transformer_out = layers.MultiHeadAttention(num_heads=4, key_dim=64)(lstm_out, lstm_out)
# Add a residual connection and layer normalization.
transformer_out = layers.LayerNormalization(epsilon=1e-6)(transformer_out + lstm_out)

# Global pooling and dense layers for final classification.
# The sequence is averaged over positions, then reduced to a single sigmoid
# unit, matching the 0/1 labels and the binary cross-entropy loss below.
pooling = layers.GlobalAveragePooling1D()(transformer_out)
dense = layers.Dense(64, activation='relu')(pooling)
output = layers.Dense(1, activation='sigmoid')(dense)

# Build and compile the hybrid model.
# Inputs are listed in the order (input_ids, attention_mask), the order
# call_bert unpacks them in.
hybrid_model = tf.keras.Model(inputs=[input_ids, attention_mask_input], outputs=output)
hybrid_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss='binary_crossentropy',
    metrics=['accuracy']
)
hybrid_model.summary()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Prepare Your Data for the Hybrid Model

In [None]:
# train_encodings / test_encodings were already produced by the BERT
# tokenization cell (the cell that defines encode_texts and bert_tokenizer)
# with exactly these inputs and max_len=128. They are reused here instead of
# tokenizing the whole corpus a second time.
y_train = train_df['label'].values
y_test = test_df['label'].values

# Guard against stale encodings: each split needs one encoded row per label.
assert train_encodings['input_ids'].shape[0] == len(y_train), \
    "train_encodings and y_train differ in length; re-run the BERT tokenization cell"
assert test_encodings['input_ids'].shape[0] == len(y_test), \
    "test_encodings and y_test differ in length; re-run the BERT tokenization cell"


Train the Hybrid Model

In [None]:
# Features are fed as dicts keyed by the model's input-layer names.
# The held-out test split doubles as the validation set during training.
history = hybrid_model.fit(
    x={name: train_encodings[name] for name in ('input_ids', 'attention_mask')},
    y=y_train,
    validation_data=(
        {name: test_encodings[name] for name in ('input_ids', 'attention_mask')},
        y_test,
    ),
    batch_size=16,
    epochs=2,   # Adjust number of epochs as needed
)


Epoch 1/2
[1m2245/2245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22479s[0m 10s/step - accuracy: 0.8861 - loss: 0.2791 - val_accuracy: 0.9820 - val_loss: 0.0505
Epoch 2/2
[1m2245/2245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8s/step - accuracy: 0.9882 - loss: 0.0354