### Import the required libraries

In [None]:
import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as f

import seaborn as sns
import matplotlib.pyplot as plt

from torch import optim
from wordcloud import WordCloud

import torch.nn as nn

import re
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from transformers import BertForSequenceClassification, BertTokenizer, AutoModelForSequenceClassification, AutoTokenizer, BertModel

### Get the data and remove the duplicates and NaNs

In [None]:
ls | grep csv

In [None]:
# Load the train and test CSV files from the current working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts) before any cleaning.
train_df.info()

In [None]:
def combine_columns(row):
    """Join a row's 'text' and 'title' values into a single string.

    Missing values (NaN/None) are skipped rather than rendered as the literal
    string 'nan', which would otherwise leak into the text and make a later
    ``dropna`` on the combined column a no-op.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with 'text' and
            'title' entries.

    Returns:
        ``'<text> <title>'`` when both are present, the single present value
        when one is missing, or ``None`` when both are missing.
    """
    parts = [str(row[col]) for col in ('text', 'title') if not pd.isna(row[col])]
    return ' '.join(parts) if parts else None

In [None]:
# Build the combined column by applying combine_columns to every row (axis=1).
train_df['text_title'] = train_df.apply(combine_columns, axis=1)

In [None]:
# Remove rows whose combined text is missing, then rows whose combined text
# repeats an earlier row. Both calls modify train_df in place.
train_df.dropna(subset=['text_title'], inplace=True)
train_df.drop_duplicates(subset='text_title', inplace=True)

In [None]:
# Print the summary again to compare row counts after the clean-up above.
train_df.info()

In [None]:
# Drop the raw 'title', 'author' and 'text' columns in place; the combined
# 'text_title' column is what the rest of the notebook reads.
train_df.drop(columns=['title', 'author', 'text'], inplace=True)

In [None]:
# Show the first three rows of the cleaned frame.
train_df.head(3)

In [None]:
# Lookup tables between class names and class indices (0 = real, 1 = fake).
label2id = {'real': 0, 'fake': 1}
id2label = {idx: name for name, idx in label2id.items()}

In [None]:
# Bar chart of the number of rows per value of the 'label' column.
sns.countplot(data=train_df, x='label')

### Number of Characters in the Text

In [None]:
# Side-by-side histograms of text length, in characters, for fake (label 1)
# and real (label 0) news.
fig, axs = plt.subplots(figsize=(15, 8), ncols=2)
for ax, (label_id, label_name) in zip(axs, ((1, 'fake'), (0, 'real'))):
    # .str.len() counts characters, which is what the section heading and the
    # plot titles promise; the previous len(x.split()) counted words instead.
    text_lens = train_df.loc[train_df['label'] == label_id, 'text_title'].str.len()
    sns.histplot(text_lens, ax=ax)
    ax.set_title(f'Number of characters in the news that are labeled as {label_name}')

### Word Cloud

In [None]:
# Word cloud over the combined text of every row labelled 1 (fake).
fake_mask = train_df['label'] == 1
fake_text = ' '.join(train_df.loc[fake_mask, 'text_title'])
wc = WordCloud()
wc_obj = wc.generate(fake_text)

plt.figure(figsize=(10, 8))
plt.imshow(wc_obj)
plt.axis('off')  # hide the axes; only the image is of interest
plt.show()

In [None]:
# Word cloud over the combined text of every row labelled 0 (real).
real_mask = train_df['label'] == 0
real_text = ' '.join(train_df.loc[real_mask, 'text_title'])
wc = WordCloud()
wc_obj = wc.generate(real_text)

plt.figure(figsize=(10, 8))
plt.imshow(wc_obj)
plt.axis('off')  # hide the axes; only the image is of interest
plt.show()

### Loading the dataset

In [None]:
# Matches every character that is not an ASCII letter.
_NON_LETTER = re.compile("[^a-zA-Z]")


def text_cleaning(text: str) -> str:
    """Replace each non-ASCII-letter character in *text* with a single space.

    Punctuation, digits, whitespace and non-ASCII characters are all replaced
    one-for-one, so the result has the same length as the input.
    """
    return _NON_LETTER.sub(" ", text)

In [None]:
# Store a letters-only copy of the combined text in a new 'text' column.
# NOTE(review): NewsDataset reads 'text_title' by default, so this cleaned
# column is not what gets tokenized -- confirm which one is intended.
train_df['text'] = train_df['text_title'].apply(text_cleaning)

In [None]:
# Stratified 80/20 train/validation split. random_state is fixed because
# set_seed() is only called later in the notebook, so without it this split
# (and every metric computed on it) would differ from run to run.
train_split, val_split = train_test_split(
    train_df,
    stratify=train_df['label'],
    test_size=.2,
    random_state=42,
)

In [None]:
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one news text per item.

    Args:
        tokenizer: Tokenizer callable (e.g. a ``BertTokenizer``) invoked with
            keyword arguments for every item.
        split: DataFrame-like object providing the text column and a 'label'
            column; both are materialised to Python lists up front.
        text_column: Name of the column holding the text to tokenize.
            Defaults to 'text_title'.
        max_length: Optional length to pad/truncate to. ``None`` leaves the
            choice to the tokenizer's own default.
    """

    def __init__(self, tokenizer: "BertTokenizer", split,
                 text_column='text_title', max_length=None):
        self.texts = split[text_column].tolist()
        self.labels = split['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, label)`` for item *index*.

        The encoded values are returned exactly as the tokenizer produced
        them; code elsewhere in this file applies ``squeeze(1)`` after
        batching.
        """
        # Use the tokenizer this dataset was constructed with. Relying on a
        # module-level `tokenizer` silently ignored the constructor argument
        # and failed with NameError when no such global existed.
        encoded_text = self.tokenizer(
            text=self.texts[index],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
            return_attention_mask=True,
        )
        return encoded_text['input_ids'], encoded_text['attention_mask'], self.labels[index]

    def __len__(self):
        """Return the number of texts in the split."""
        return len(self.texts)

In [None]:
# One shared tokenizer, then a dataset and a loader per split.
# Only the training loader shuffles its batches.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
train_ds, val_ds = (NewsDataset(tokenizer, part) for part in (train_split, val_split))

train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=32, shuffle=False)

In [None]:
# Peek at the first training batch only, to check the tensor shapes.
for first_batch in train_dl:
    input_ids, attention_mask, labels = first_batch
    print(*(tensor.shape for tensor in first_batch))
    break

### Model

In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
# (The file imports `torch`; the alias `th` was never defined.)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class FakeNewsClassifier(nn.Module):
    """BERT encoder followed by a small feed-forward classification head.

    Args:
        hidden_size: Width of the head's hidden layer (default 50).
        num_labels: Number of output logits (default 2).
        dropout: Dropout probability used inside the head (default 0.5).
        pretrained_name: Checkpoint name passed to
            ``BertModel.from_pretrained`` (default 'bert-base-cased').
    """

    def __init__(self, hidden_size=50, num_labels=2, dropout=0.5,
                 pretrained_name='bert-base-cased'):
        super().__init__()

        self.bert = BertModel.from_pretrained(pretrained_name)

        # Read the encoder width from the loaded config rather than
        # hard-coding 768, so other BERT checkpoints work unchanged.
        encoder_width = self.bert.config.hidden_size

        self.classifier = nn.Sequential(
            nn.Linear(encoder_width, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_labels),
        )

    def forward(self, input_ids, attention_mask):
        """Return the head's logits for a batch of token ids and masks."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Keep only the hidden state at sequence position 0 (the [CLS] slot
        # for BERT tokenizers) as the summary vector of each sequence.
        cls_embedding = outputs.last_hidden_state[:, 0, :]

        # Feed the summary vector to the classifier to compute logits.
        return self.classifier(cls_embedding)

In [None]:
# Smoke test: push the first training batch through a freshly built classifier.
sample_model = FakeNewsClassifier()
for input_ids, attention_mask, labels in train_dl:
    print(input_ids.shape, attention_mask.shape, labels.shape)
    # Remove the size-1 axis at dim 1 before calling the model.
    input_ids, attention_mask = (t.squeeze(1) for t in (input_ids, attention_mask))
    logits = sample_model(input_ids, attention_mask)
    print(logits)
    break

In [None]:
def save_checkpoint(model, save_path, valid_loss):
    """Write the model weights and validation loss to *save_path*.

    Args:
        model: Module whose ``state_dict()`` is saved.
        save_path: Destination path. When ``None`` nothing is written and a
            message is printed instead.
        valid_loss: Validation loss stored alongside the weights.

    Returns:
        None.
    """
    if save_path is None:
        print('Path to save the checkpoint is not valid!')
        return

    state_dict = {
        # Call state_dict(): storing the bound method itself would pickle the
        # method object and make the checkpoint unusable by load_state_dict.
        'model_state_dict': model.state_dict(),
        'valid_loss': valid_loss,
    }

    torch.save(state_dict, save_path)
    print('Model saved to ==> {}'.format(save_path))

In [None]:
def load_checkpoint(load_path, model):
    """Restore model weights from a checkpoint written by ``save_checkpoint``.

    Args:
        load_path: Checkpoint path. When ``None`` nothing is loaded.
        model: Module that receives the stored weights via
            ``load_state_dict``.

    Returns:
        The validation loss stored in the checkpoint, or ``None`` when
        *load_path* is ``None``.
    """
    if load_path is None:
        return None

    # map_location moves the stored tensors onto the module-level `device`.
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')

    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']

In [None]:
import random
import time

# Specify loss function: cross-entropy, applied in train() and evaluate() to
# the model's logits and the batch labels.
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Seed Python's, NumPy's and PyTorch's (CPU and CUDA) random generators.

    Args:
        seed_value: Seed handed to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_generator in seeders:
        seed_generator(seed_value)

def train(model, train_dataloader, val_dataloader, epochs=5, evaluation=False,
          optimizer=None, scheduler=None):
    """Train the BertClassifier model.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning logits.
        train_dataloader: Iterable of ``(input_ids, attention_mask, labels)``
            batches used for optimisation.
        val_dataloader: Iterable of validation batches; only used when
            *evaluation* is true.
        epochs: Number of passes over *train_dataloader*.
        evaluation: When true, run ``evaluate`` after every epoch.
        optimizer: Optimizer to step. When ``None`` the module-level
            ``optimizer`` is used (the original behaviour).
        scheduler: Learning-rate scheduler stepped after every batch. When
            ``None`` the module-level ``scheduler`` is used if it exists;
            otherwise no scheduler is stepped.

    Returns:
        ``(train_accuracy_list, train_loss_list, val_accuracy_list,
        val_loss_list)`` with one entry per epoch; the validation lists stay
        empty when *evaluation* is false.

    Raises:
        NameError: If no optimizer is passed and no module-level
            ``optimizer`` exists.
    """
    # Resolve the optimizer/scheduler once, up front, so a missing one fails
    # before any forward pass instead of in the middle of the first batch.
    module_globals = globals()
    if optimizer is None:
        optimizer = module_globals.get('optimizer')
    if scheduler is None:
        scheduler = module_globals.get('scheduler')
    if optimizer is None:
        raise NameError(
            "train() needs an optimizer: pass optimizer=... or define a "
            "module-level 'optimizer'"
        )

    val_accuracy_list = []
    train_accuracy_list = []
    val_loss_list = []
    train_loss_list = []

    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):

        train_epoch_accuracy_list = []
        train_epoch_loss_list = []

        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode (evaluate() switches it to
        # eval mode, so this must be redone every epoch).
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1

            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
            # Remove the size-1 axis at dim 1 before calling the model.
            b_input_ids = b_input_ids.squeeze(1)
            b_attn_mask = b_attn_mask.squeeze(1)

            # Clear gradients left over from the previous step.
            model.zero_grad()

            logits = model(b_input_ids, b_attn_mask)

            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1).flatten()

            # Per-batch accuracy in percent.
            accuracy = (preds == b_labels).cpu().numpy().mean() * 100
            train_epoch_accuracy_list.append(accuracy)
            train_epoch_loss_list.append(loss.item())

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            if scheduler is not None:
                scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Time elapsed since the previous progress line
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)
        train_accuracy_list.append(np.mean(train_epoch_accuracy_list))
        train_loss_list.append(np.mean(train_epoch_loss_list))

        print("-"*70)

        if evaluation:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            val_accuracy_list.append(val_accuracy)
            val_loss_list.append(val_loss)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

    print("Training complete!")
    return train_accuracy_list, train_loss_list, val_accuracy_list, val_loss_list


def evaluate(model, val_dataloader):
    """Measure the model's loss and accuracy on the validation set.

    Metrics are aggregated per sample, not per batch, so a smaller final
    batch does not get the same weight as a full one.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning logits.
        val_dataloader: Iterable of ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        ``(val_loss, val_accuracy)``: mean loss per sample and accuracy in
        percent. Both are ``nan`` when the loader yields no samples.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Running totals over all samples seen so far
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    # No gradients are needed for the forward pass or the loss here.
    with torch.no_grad():
        # For each batch in our validation set...
        for batch in val_dataloader:
            # Load batch onto the module-level device
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
            b_input_ids = b_input_ids.squeeze(1)
            b_attn_mask = b_attn_mask.squeeze(1)

            # Compute logits
            logits = model(b_input_ids, b_attn_mask)
            batch_size = b_labels.size(0)

            # loss_fn returns the batch mean (assuming its default 'mean'
            # reduction); scale it back to a per-batch sum before accumulating.
            total_loss += loss_fn(logits, b_labels).item() * batch_size

            # Get the predictions and count the correct ones
            preds = torch.argmax(logits, dim=1).flatten()
            total_correct += (preds == b_labels).sum().item()
            total_samples += batch_size

    if total_samples == 0:
        return float('nan'), float('nan')

    # Compute the average loss and accuracy over the validation set.
    val_loss = total_loss / total_samples
    val_accuracy = 100.0 * total_correct / total_samples

    return val_loss, val_accuracy

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=5, lr=5e-5, eps=1e-8):
    """Build the classifier, its optimizer and its learning-rate scheduler.

    Args:
        epochs: Number of training epochs; used to size the scheduler.
        lr: AdamW learning rate (default 5e-5).
        eps: AdamW epsilon (default 1e-8).

    Returns:
        ``(model, optimizer, scheduler)`` with the model already moved to the
        module-level ``device``.
    """
    model = FakeNewsClassifier()
    model.to(device)

    # Use PyTorch's AdamW: the transformers implementation is deprecated in
    # favour of it. weight_decay=0.0 keeps the deprecated class's default
    # (torch's own default is 0.01).
    optimizer = optim.AdamW(model.parameters(),
                            lr=lr,
                            eps=eps,
                            weight_decay=0.0)

    # One scheduler step is taken per training batch, so the schedule spans
    # every batch of every epoch of the module-level train_dl.
    total_steps = len(train_dl) * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return model, optimizer, scheduler

In [None]:
# Seed every RNG first so that model initialisation and batch order repeat
# across runs, then build the model and train it with per-epoch validation.
set_seed(42)
model, optimizer, scheduler = initialize_model(epochs=5)
(
    train_accuracy_list,
    train_loss_list,
    val_accuracy_list,
    val_loss_list,
) = train(model, train_dl, val_dl, epochs=5, evaluation=True)