## Load Libraries and Dataset


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers import AutoTokenizer
from transformers import TFBertModel

In [None]:
import pandas as pd

# Row cap for True.csv.
# Parsing was reported to fail around row 14939, so we stop a little before it.
SAFE_ROWS_TO_READ = 14900

print(f"Attempting to load data by reading the first {SAFE_ROWS_TO_READ} rows...")

# --- Fake.csv ---
# First choice: the tolerant Python parser, skipping malformed rows.
# `error_bad_lines=False` was removed in pandas 2.0; passing it raised a TypeError
# that the broad `except` swallowed, so the loader always fell through to the
# truncated 3800-row workaround. `on_bad_lines='skip'` is the supported spelling.
try:
    fake_df = pd.read_csv("/content/Fake.csv", engine='python', on_bad_lines='skip')
    print("Fake.csv loaded successfully (potentially skipping bad lines).")
except Exception as e:
    print(f"Failed to load Fake.csv even with robust settings: {e}")
    # Fallback to the known nrows fix if needed
    try:
        fake_df = pd.read_csv("/content/Fake.csv", nrows=3800)
        print("Fake.csv loaded successfully using nrows=3800 workaround.")
    except Exception:
        print("Failed to load Fake.csv.")
        # Without this re-raise `fake_df` stays undefined and a later cell
        # fails with a NameError that hides the real cause.
        raise

# --- True.csv ---
# Read only the rows before the reported EOF error row.
try:
    true_df = pd.read_csv("/content/True.csv", nrows=SAFE_ROWS_TO_READ)
    print(f"True.csv loaded successfully by reading only the first {SAFE_ROWS_TO_READ} rows.")

except Exception as e:
    # Reached when the capped read itself fails; retry with the tolerant parser.
    print(f"Failed to load True.csv using nrows. Trying without nrows: {e}")
    try:
        true_df = pd.read_csv("/content/True.csv", engine='python', on_bad_lines='skip')
        print("True.csv loaded successfully with robust engine.")
    except Exception as e_final:
        print(f"Final attempt failed for True.csv: {e_final}")
        # Same reasoning as above: do not continue with `true_df` undefined.
        raise

Attempting to load data by reading the first 14900 rows...
Failed to load Fake.csv even with robust settings: read_csv() got an unexpected keyword argument 'error_bad_lines'
Fake.csv loaded successfully using nrows=3800 workaround.
True.csv loaded successfully by reading only the first 14900 rows.


### Feature Engineering


In [None]:
# import pandas as pd

# # Load the datasets
# try:
#     fake_df = pd.read_csv("/content/Fake.csv")
#     true_df = pd.read_csv("/content/True.csv")
#     print("Files loaded successfully.")
# except FileNotFoundError as e:
#     print(f"Error: One of the files was not found. {e}")
#     # Handle error or exit

In [None]:
# Attach the target column to each source frame:
# fake articles are class 1, true articles are class 0.
fake_df['label'], true_df['label'] = 1, 0

In [None]:
# Remove the metadata columns that are not used as model input.
# Reassigning (instead of `inplace=True`) together with `errors='ignore'` keeps
# this cell re-runnable: a second execution no longer raises KeyError because
# the columns are already gone.
fake_df = fake_df.drop(columns=['date', 'subject'], errors='ignore')
true_df = true_df.drop(columns=['date', 'subject'], errors='ignore')

In [None]:
# Stack both frames, then shuffle every row with a fixed seed and renumber
# the index so the order is reproducible across runs.
combined_df = (
    pd.concat([fake_df, true_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Quick sanity check of the merged data.
print(f"Combined dataset shape: {combined_df.shape}")
print("Sample of the combined and shuffled data:")
print(combined_df.head())

Combined dataset shape: (18700, 3)
Sample of the combined and shuffled data:
                                               title  \
0   Trump National Security Pick Monica Crowley’s...   
1  Merkel heads to EU-Africa summit with eye on m...   
2  After U.S. veto, U.N. General Assembly to meet...   
3  New York mayor criticized for proposed limits ...   
4  Trump meets insurers, promises catastrophic ye...   

                                                text  label  
0  Conservative columnist Monica Crowley is set f...      1  
1  BERLIN (Reuters) - German Chancellor Angela Me...      0  
2  UNITED NATIONS (Reuters) - The 193-member Unit...      0  
3  NEW YORK (Reuters) - New York City public defe...      0  
4  WASHINGTON (Reuters) - President Donald Trump ...      0  


## Train/Test Split and Tokenization


In [None]:
# Sequence length used for tokenization. The model's Input layers are declared
# with shape (256,), so this value has to stay in sync with them.
MAX_LENGTH = 256

# Feature Combination: Create 'full_text' column
# Missing titles/bodies are replaced by empty strings first: str + NaN yields
# NaN, which would reach the tokenizer as a float instead of a string.
combined_df['full_text'] = (
    combined_df['title'].fillna('').astype(str)
    + ' '
    + combined_df['text'].fillna('').astype(str)
)

# Data Split: Separate features and targets
X = combined_df['full_text']
y = combined_df['label']

# Split into train and test sets (80% train, 20% test)
# Stratified split based on label with random_state=42
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Initialize Tokenizer for 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


def _tokenize(texts):
    """Encode a pandas Series of strings for the BERT inputs.

    Every text is truncated or padded to ``MAX_LENGTH`` tokens and the result
    is returned as TensorFlow tensors, so the train and test splits are
    always encoded with identical settings.
    """
    return tokenizer(
        texts.tolist(),
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )


# Tokenize training data
train_encodings = _tokenize(X_train)

# Tokenize testing data
test_encodings = _tokenize(X_test)



## Build, Train, and Save the Model


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from transformers import TFBertModel

# Define a custom Keras Layer to wrap the TFBertModel
class BertWrapper(tf.keras.layers.Layer):
    """Keras layer that runs a ``TFBertModel`` and returns its first-token embedding.

    Args:
        bert_model: The BERT model to call. It is not written into the layer
            config; ``from_config`` loads 'bert-base-uncased' and passes it in.
        **kwargs: Forwarded unchanged to ``tf.keras.layers.Layer``.
    """

    def __init__(self, bert_model=None, **kwargs):
        super(BertWrapper, self).__init__(**kwargs)
        # bert_model can be None when loading from config
        self.bert = bert_model

    def call(self, inputs, training=None):
        """Run BERT on the tokenized batch.

        Args:
            inputs: Mapping with the keys 'input_ids' and 'attention_mask'.
            training: Keras training flag. ``None`` is treated as inference.

        Returns:
            ``last_hidden_state[:, 0, :]`` — the hidden state at sequence
            position 0 for every item in the batch.

        Raises:
            ValueError: If the layer was created without a BERT model.
        """
        if self.bert is None:
            raise ValueError(
                "BertWrapper has no BERT model attached; pass `bert_model` "
                "or build the layer through `from_config`."
            )
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        # Forward the real training flag. The previous `training=self.trainable`
        # was True for any trainable layer, so BERT's dropout also stayed
        # active during validation and prediction.
        bert_training = False if training is None else training
        bert_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            training=bert_training,
        )
        cls_embedding = bert_output.last_hidden_state[:, 0, :]
        return cls_embedding

    def get_config(self):
        """Return the base layer config; the BERT model itself is not serialized."""
        config = super(BertWrapper, self).get_config()
        # Don't serialize bert_model, it will be loaded separately
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer, loading 'bert-base-uncased' for the encoder.

        NOTE(review): this always starts from the pretrained checkpoint; whether
        encoder weights from a saved model are restored on top of it depends on
        the saving format — confirm.
        """
        # Load BERT model when reconstructing from config
        bert_model = TFBertModel.from_pretrained('bert-base-uncased', use_safetensors=False)
        return cls(bert_model=bert_model, **config)

# --- 1. Pre-trained encoder ---
# Fetch the 'bert-base-uncased' encoder and wrap it so it can be used as a
# regular layer inside a functional Keras model.
bert_base_model = TFBertModel.from_pretrained(
    'bert-base-uncased',
    use_safetensors=False,
)
bert_encoder = BertWrapper(bert_model=bert_base_model)

# --- 2. Symbolic inputs ---
# Both inputs are int32 sequences of length 256, the same max_length used
# when the texts were tokenized.
input_ids = Input(name='input_ids', shape=(256,), dtype=tf.int32)
attention_mask = Input(name='attention_mask', shape=(256,), dtype=tf.int32)

# The wrapper layer expects its inputs as a dictionary keyed by name.
model_inputs = dict(input_ids=input_ids, attention_mask=attention_mask)

# --- 3. Encoder forward pass ---
# The wrapper returns one embedding vector per example.
pooled_output = bert_encoder(model_inputs)

# --- 4. Classification head ---
# Dropout for regularization, followed by a single sigmoid unit that emits
# the probability of the positive class.
x = Dropout(rate=0.2, name='dropout_layer')(pooled_output)
output = Dense(units=1, activation='sigmoid', name='output_layer')(x)

# --- 5. Assemble the model ---
model = Model(inputs=[input_ids, attention_mask], outputs=output)
print("BERT Functional Model successfully defined.")


# --- 6. Compile ---
# Adam with a small learning rate and binary cross-entropy to match the
# sigmoid output.
optimizer = Adam(learning_rate=3e-5)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Layer / parameter overview.
# NOTE(review): check here that the encoder's parameters are listed as
# trainable if fine-tuning of BERT itself is intended.
model.summary()


# --- 7. Model Training ---
# Only the two tensors the model consumes are taken from the tokenizer output.
X_train_inputs = {
    key: train_encodings[key] for key in ('input_ids', 'attention_mask')
}

# The held-out test split doubles as the validation set, so its loss and
# accuracy are reported after every epoch.
history = model.fit(
    x=X_train_inputs,
    y=y_train,
    validation_data=(
        {key: test_encodings[key] for key in ('input_ids', 'attention_mask')},
        y_test,
    ),
    batch_size=16,  # 16 examples per gradient step
    epochs=3,       # three passes over the training split
)

# --- 8. Save the Model ---
# The '.keras' suffix writes a single-file native Keras archive, not a
# SavedModel directory. BertWrapper.get_config() does not serialize the BERT
# model, so loading needs BertWrapper available via `custom_objects`.
model.save('bert_fakenews_model.keras')
print("\nModel trained and saved as 'bert_fakenews_model.keras' (native Keras format)")

tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training

BERT Functional Model successfully defined.


Epoch 1/3
[1m935/935[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m372s[0m 377ms/step - accuracy: 0.7975 - loss: 0.4764 - val_accuracy: 0.8203 - val_loss: 0.3660
Epoch 2/3
[1m935/935[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 343ms/step - accuracy: 0.8405 - loss: 0.3556 - val_accuracy: 0.8861 - val_loss: 0.2915
Epoch 3/3
[1m935/935[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 343ms/step - accuracy: 0.8848 - loss: 0.2935 - val_accuracy: 0.9182 - val_loss: 0.2403

Model trained and saved as 'bert_fakenews_model' (SavedModel format)


## Test the model

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFBertModel

# -----------------------------
# 1. CUSTOM BERT WRAPPER
# (MUST MATCH TRAINING)
# -----------------------------
class BertWrapper(tf.keras.layers.Layer):
    """Inference-time wrapper that exposes a ``TFBertModel`` as a Keras layer.

    The wrapped model is kept on ``self.bert``. It is not part of the layer
    config; ``from_config`` loads 'bert-base-uncased' to supply it.
    """

    def __init__(self, bert_model=None, **kwargs):
        super(BertWrapper, self).__init__(**kwargs)
        self.bert = bert_model

    def call(self, inputs):
        """Return the hidden state at sequence position 0 for each example."""
        token_ids = inputs['input_ids']
        mask = inputs['attention_mask']
        # BERT is always called in inference mode here.
        encoded = self.bert(
            input_ids=token_ids,
            attention_mask=mask,
            training=False,
        )
        hidden_states = encoded.last_hidden_state
        # Position 0 is the CLS token.
        return hidden_states[:, 0, :]

    def get_config(self):
        """Return the unmodified base-layer config."""
        base_config = super(BertWrapper, self).get_config()
        return base_config

    @classmethod
    def from_config(cls, config):
        """Recreate the layer around a freshly loaded 'bert-base-uncased' model."""
        encoder = TFBertModel.from_pretrained(
            'bert-base-uncased',
            use_safetensors=False,
        )
        return cls(bert_model=encoder, **config)


# -----------------------------
# 2. LOAD TOKENIZER
# -----------------------------
# Tokenizer for the same checkpoint name that was used for training.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


# -----------------------------
# 3. LOAD SAVED MODEL
# -----------------------------
# The custom layer class is handed to Keras so it can rebuild the
# BertWrapper layer stored in the archive.
print("Loading trained model...")
model = tf.keras.models.load_model(
    "bert_fakenews_model.keras",
    custom_objects=dict(BertWrapper=BertWrapper, TFBertModel=TFBertModel),
)

print("Model loaded successfully!\n")
model.summary()


# -----------------------------
# 4. PREDICTION FUNCTION
# -----------------------------
def predict_news(text):
    """Score a piece of news text with the loaded classifier.

    The text is tokenized with the module-level ``tokenizer`` (truncated or
    padded to 256 tokens) and passed to the module-level ``model``.

    Args:
        text: The news text to classify.

    Returns:
        A ``(prob_fake, label)`` tuple: the model output as a Python float and
        "FAKE" when that value is at least 0.5, otherwise "REAL".
    """
    tokenized = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=256,
        return_tensors="tf",
    )

    # Keep only the two tensors the model takes as input.
    features = {key: tokenized[key] for key in ("input_ids", "attention_mask")}

    scores = model.predict(features, verbose=0)
    prob_fake = float(scores[0][0])

    if prob_fake >= 0.5:
        return prob_fake, "FAKE"
    return prob_fake, "REAL"


# -----------------------------
# 5. TEST WITH SAMPLE TEXTS
# -----------------------------
# Hand-written headlines used as a quick smoke test of the loaded model.
test_samples = [
    "Breaking: Aliens landed in Mumbai last night",
    "The Prime Minister addressed the nation today",
    "Drinking hot water cures all diseases, doctors say",
    "The stock market closed higher after RBI announcement",
]

print("---- TEST RESULTS ----")
# map() is lazy, so each headline is scored right before its line is printed.
for text, (prob, label) in zip(test_samples, map(predict_news, test_samples)):
    print(f"{label} ({prob:.4f})  ->  {text}")


Loading tokenizer...
Loading trained model...


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model loaded successfully!



---- TEST RESULTS ----
REAL (0.4614)  ->  Breaking: Aliens landed in Mumbai last night
REAL (0.2521)  ->  The Prime Minister addressed the nation today
FAKE (0.5046)  ->  Drinking hot water cures all diseases, doctors say
REAL (0.1012)  ->  The stock market closed higher after RBI announcement


In [None]:
# Hand the saved model archive to Colab's file-download helper.
# `google.colab` is only importable inside a Google Colab runtime.
from google.colab import files
files.download("bert_fakenews_model.keras")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Record the Python interpreter version of this runtime for reproducibility.
import sys
print(sys.version)


3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]


In [None]:
pip show tensorflow


Name: tensorflow
Version: 2.19.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /usr/local/lib/python3.12/dist-packages
Requires: absl-py, astunparse, flatbuffers, gast, google-pasta, grpcio, h5py, keras, libclang, ml-dtypes, numpy, opt-einsum, packaging, protobuf, requests, setuptools, six, tensorboard, termcolor, typing-extensions, wrapt
Required-by: dopamine_rl, tensorflow-text, tensorflow_decision_forests, tf_keras
