Source: https://www.kaggle.com/code/abrafey/fake-news-transformers

# **Step 1: Importing the Relevant Libraries**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import re
import os

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, recall_score

from wordcloud import WordCloud, STOPWORDS
from collections import Counter, defaultdict
from PIL import Image
import spacy
# import en_core_web_sm

import random
import warnings
import time
import datetime

import pandas as pd
import random, time
from babel.dates import format_date, format_datetime, format_time
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, accuracy_score


import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup, \
LongformerTokenizerFast, LongformerForSequenceClassification, TextClassificationPipeline
from accelerate import Accelerator
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

# **Step 2: Configurations**

In [None]:
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value for consistent results.
seed_val = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(seed_val)

In [None]:
# Pick the device every tensor and the model are moved to: a CUDA GPU when
# PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    # The original message mixed a %-style placeholder with str.format, so it
    # printed a literal "%d" and put the count after the sentence.
    print('There are {} GPU(s) available.'.format(torch.cuda.device_count()))
    print('We will use the GPU: {}'.format(torch.cuda.get_device_name(0)))
else:
    # Without a GPU, training takes place on the CPU instead.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# **Step 3: Data Inspection**

In [None]:
# Load the train and test CSV files into DataFrames.
# The paths are relative to the notebook's working directory.
train = pd.read_csv('../data/kaggle/train.csv')
test = pd.read_csv('../data/kaggle/test.csv')

In [None]:
train.head()

In [None]:
train.info()

In [None]:
train.shape,test.shape

In [None]:
train.label.value_counts()

In [None]:
label = train["label"].value_counts()
sns.barplot(x=label.index, y=label)
plt.title('Target Count', fontsize=14)

In [None]:
#ratio of null values
train.isnull().sum()/train.shape[0] *100

In [None]:
seq_len_premise = [len(str(i).split()) for i in train['title']]

pd.Series(seq_len_premise).hist(bins = 25)

In [None]:
seq_len_premise = [len(str(i).split()) for i in train['text']]
pd.Series(seq_len_premise).hist(bins = 25)

# **Step 4: Data Cleaning**

In [None]:
print(train.isna().sum(),'\n')
print(train.dropna(how='all').isna().sum())

In [None]:
train.shape

In [None]:
# Replace every missing value with an empty string so the string cleaning and
# concatenation steps below never operate on NaN.
train = train.fillna('')
test = test.fillna('')

In [None]:
def text_cleaning(text):
    """Return *text* reduced to space-separated alphabetic words.

    The value is converted with ``str`` first, so non-string input (numbers,
    ``None``) is accepted. Steps, in order:

    1. Drop URLs (``http://``, ``https://`` or ``www.`` up to the next space).
    2. Replace every non-letter (digits, punctuation, symbols) with a space.
    3. Drop stand-alone single letters.
    4. Collapse runs of whitespace and trim both ends.

    URLs must be removed before punctuation is stripped: once ``://`` and
    ``.`` are gone a URL can no longer be recognised as one token.
    """
    text = str(text)
    # 1. URLs first, while their punctuation is still intact.
    text = re.sub(r"(?:https?://|www\.)\S+", " ", text)
    # 2. Everything that is not an ASCII letter becomes a separator.
    text = re.sub(r"[^a-zA-Z]", " ", text)
    # 3. Single letters are replaced by a space (not by nothing) so the
    #    neighbouring words are not glued together.
    text = re.sub(r"\b[a-zA-Z]\b", " ", text)
    # 4. Normalise whitespace.
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [None]:
# Run text_cleaning over the body and the headline of both frames; the cleaned
# strings are stored in new columns next to the raw ones.
for _frame in (train, test):
    _frame['tokenized_text'] = _frame['text'].apply(text_cleaning)
    _frame['tokenized_title'] = _frame['title'].apply(text_cleaning)

In [None]:
stop_words = stopwords.words('english')

# A list membership test scans the whole list for every word; a frozenset
# makes each lookup constant-time. `stop_words` itself is left as the list
# returned by NLTK.
_stop_word_set = frozenset(stop_words)


def _drop_stop_words(text):
    """Return *text* without its stop words (case-sensitive match, as before)."""
    return ' '.join(word for word in text.split() if word not in _stop_word_set)


# Same four column updates as before, in the same order per frame.
for _frame in (train, test):
    for _column in ('tokenized_text', 'tokenized_title'):
        _frame[_column] = _frame[_column].apply(_drop_stop_words)

## Save / load interim data

In [None]:
# Save interim data
# Save the cleaned frames so the slow cleaning step can be skipped on re-runs.
# The directory is created first; to_csv does not create missing directories.
interim_dir = '../data/kaggle/interim'
os.makedirs(interim_dir, exist_ok=True)
train.to_csv(os.path.join(interim_dir, 'train_cleaned.csv'), index=False)
test.to_csv(os.path.join(interim_dir, 'test_cleaned.csv'), index=False)

In [None]:
# Load interim data
# Reload the cleaned frames written by the save cell.
# fillna('') is applied again — presumably because empty strings come back
# from CSV as missing values; verify against the pandas read_csv defaults.
train = pd.read_csv('../data/kaggle/interim/train_cleaned.csv').fillna('')
test = pd.read_csv('../data/kaggle/interim/test_cleaned.csv').fillna('')

## Finalize train/test inputs

In [None]:
# Maximum sequence length handed to tokenize_map (and used to size batches).
max_length=100

# Training labels as a NumPy array, taken from the 'label' column.
labels = train['label'].values

# Model input per row: cleaned title, author and cleaned body, joined by single spaces.
train_data = (train['tokenized_title'] + ' ' + train['author'] + ' ' + train['tokenized_text']).values
test_data = (test['tokenized_title'] + ' ' + test['author'] + ' ' + test['tokenized_text']).values

# Alternative full-text input for Longformer (disabled).
# NOTE(review): no 'tokenized' column is created in the visible notebook — confirm the intended column before enabling.
# train_data = (train['title'] + ' ' + train['author'] + ' ' + train['tokenized']).values
# test_data = (test['title'] + ' ' + test['author'] + ' ' + test['tokenized']).values

In [None]:
# To speed up tokenization + mapping phase, already truncate the text based on max_length * 1.5 (multiplied for some leeway)
# Cut each input down to its first max_length * 1.5 space-separated words
# before tokenization, to speed up the tokenization + mapping phase (the extra
# 50% is leeway).
_word_budget = int(max_length*1.5)


def _keep_leading_words(txt):
    """Return the first `_word_budget` single-space-separated pieces of *txt*."""
    return ' '.join(txt.split(' ')[:_word_budget])


train_data = np.array([_keep_leading_words(txt) for txt in train_data])
test_data = np.array([_keep_leading_words(txt) for txt in test_data])

# **Step 4: Initializing the model**

**4.1 Tokenization**

In [None]:
USE_LONGFORMER = False

In [None]:
# Load the tokenizer that matches the checkpoint selected by USE_LONGFORMER.
# `gradient_checkpointing` is a model option, not a tokenizer option, so it is
# no longer passed here.
if USE_LONGFORMER:
    tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')
else:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Load the pretrained encoder with a 2-class classification head and move it
# to the selected device.
if USE_LONGFORMER:
    model = LongformerForSequenceClassification.from_pretrained(
    'allenai/longformer-base-4096', # Longformer base checkpoint (the name indicates a 4096-token context).
    num_labels = 2, 
    output_attentions = False, 
    output_hidden_states = False, 
    )
else:
    model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', # BERT base, uncased vocab (12-layer, 768-hidden, 12-heads per the model card — confirm).
    num_labels = 2, 
    output_attentions = False, 
    output_hidden_states = False,
    )

model.to(device)

In [None]:
# Show the first training example as raw text, as word pieces and as vocabulary IDs.
_sample_text = train_data[0]
_sample_tokens = tokenizer.tokenize(_sample_text)
print(' Original: ', _sample_text)
print('Tokenized: ', _sample_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(_sample_tokens))

Since the sample above has 651 tokens, which exceeds the 512-token limit, a maximum length must be set so that inputs are truncated.

For Longformer, `max_length` can be set as high as *2200*.

In [None]:
# Longest test example, measured in token IDs with the special tokens included.
_encoded_lengths = [
    len(tokenizer.encode(text, add_special_tokens=True))
    for text in test_data
]
max_len = max(_encoded_lengths, default=0)
print('Max sentence length: ', max_len)

In [None]:
train_data.shape, test_data.shape

In [None]:
def tokenize_map(sentence, max_length, labs=None):
    """Tokenize every text and map the tokens to their vocabulary IDs.

    Args:
        sentence: Iterable of strings to encode.
        max_length: Length every encoded sequence is truncated or padded to.
        labs: Optional labels aligned with *sentence*. ``None`` (or the legacy
            string ``'None'``) means "no labels", e.g. for test data.

    Returns:
        ``(input_ids, attention_masks)`` when no labels are given, otherwise
        ``(input_ids, attention_masks, labels)`` with the labels converted to
        a tensor. The first two are the per-text tensors concatenated along
        dimension 0.

    Uses the module-level ``tokenizer``.
    """
    input_ids = []
    attention_masks = []
    for text in sentence:
        encoded_dict = tokenizer.encode_plus(
            text,                          # Sentence to encode.
            add_special_tokens=True,       # Add '[CLS]' and '[SEP]'.
            truncation='longest_first',    # Truncate anything beyond max_length.
            max_length=max_length,
            padding='max_length',          # Replaces the deprecated pad_to_max_length=True.
            return_attention_mask=True,    # Construct attention masks.
            return_tensors='pt',           # Return PyTorch tensors.
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # The labels passed by the caller are used; the function no longer reads or
    # rebinds the global `labels`. The check avoids `labs != 'None'`, which is
    # an element-wise comparison when `labs` is an array.
    if labs is None or (isinstance(labs, str) and labs == 'None'):
        return input_ids, attention_masks
    return input_ids, attention_masks, torch.as_tensor(labs)

In [None]:
# Encode both datasets to fixed-length ID/mask tensors; only the training call
# passes labels, so only it returns a third value.
input_ids, attention_masks, labels = tokenize_map(train_data, max_length, labels)
test_input_ids, test_attention_masks = tokenize_map(test_data, max_length)

**4.2 Train Validation split**

In [None]:
# Bundle IDs, masks and labels so each dataset item is an (ids, mask, label) triple.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 80% of the examples for training; the remainder for validation.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Random, non-overlapping split into the two sizes computed above.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [None]:
train_size, val_size

**4.3 DataLoaders**

In [None]:
# Scale the batch size inversely with the sequence length (32 at 50 tokens,
# 16 at 100, ...). Clamp to at least 1: for long sequences the formula rounds
# down to 0, which DataLoader rejects.
batch_size = max(1, int(32 / (max_length/50)))

# Training batches are drawn in random order.
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size,
)

# Validation batches are read in order.
validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size,
)

In [None]:
# Test-set loader. It is read sequentially so predictions keep the row order
# of the test data.
prediction_data = TensorDataset(test_input_ids, test_attention_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

**4.4 Optimizer & Learning Rate Scheduler**

In [None]:
# Optimizer over all model parameters.
# NOTE(review): this AdamW is the one imported from `transformers`, which is
# believed to be deprecated in favour of torch.optim.AdamW — confirm against
# the installed transformers version.
optimizer = AdamW(model.parameters(),
                  lr = 6e-6, # args.learning_rate
                  eps = 1e-8 # args.adam_epsilon
                )

In [None]:
# Number of passes over the training set.
epochs = 2

# One scheduler step is planned per training batch across all epochs.
total_steps = len(train_dataloader) * epochs

# Linear learning-rate schedule over total_steps with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [None]:
def flat_accuracy(preds, labels):
    """Return the accuracy of the arg-max class of each row of *preds* against *labels*.

    *preds* is indexed as (example, class); *labels* is flattened before comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return accuracy_score(labels.flatten(), predicted_classes)

def flat_f1(preds, labels):
    """Return the F1 score (default positive label) of the arg-max class of each row of *preds*.

    *preds* is indexed as (example, class); *labels* is flattened before scoring.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes)

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: 2-D array of counts; rows are true labels, columns are predictions.
        classes: Tick labels, one per class, in matrix order.
        normalize: When true, each row is divided by its sum so cells show the
            fraction of that true class. Previously this flag was ignored.
        title: Plot title.
        cmap: Matplotlib colour map for the cells.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows with no samples stay at 0 instead of dividing by zero.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
def format_time(elapsed):
    """Return *elapsed* (a duration in seconds) as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second first.
    """
    whole_seconds = int(round(elapsed))
    span = datetime.timedelta(seconds=whole_seconds)
    return str(span)

# **Step 5: Training and Validation**

In [None]:
# Fine-tune the model with gradient accumulation over 2 batches.
#
# `cpu=` keeps Accelerate on the same device family as the `device` chosen in
# the configuration cell, which the later cells keep using.
accelerator = Accelerator(gradient_accumulation_steps=2, cpu=not torch.cuda.is_available())

# accumulate() only defers optimizer.step()/zero_grad() for objects that went
# through prepare(); without it every batch triggered an optimizer step on a
# loss that accelerator.backward() had already divided by 2.
# The scheduler is deliberately NOT prepared: it was sized with one step per
# batch (total_steps = len(train_dataloader) * epochs), so it is stepped once
# per batch below to reach the end of its schedule on the last batch.
model, optimizer, train_dataloader = accelerator.prepare(model, optimizer, train_dataloader)

total_t0 = time.time()
for epoch_i in range(0, epochs):
    print('')
    print('Training...')
    print('----- Epoch {:} / {:} -----'.format(epoch_i + 1, epochs))

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        with accelerator.accumulate(model):
            b_input_ids = batch[0].to(device).to(torch.int64)
            b_input_mask = batch[1].to(device).to(torch.int64)
            b_labels = batch[2].to(device).to(torch.int64)

            # One forward pass per batch; the original ran the model twice
            # (once for the loss, once for logits that were never used).
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss = outputs[0]

            total_train_loss += loss.item()
            accelerator.backward(loss)

            # Clip only when the accumulated gradient is complete.
            if accelerator.sync_gradients:
                accelerator.clip_grad_norm_(model.parameters(), 1.0)

            # Both calls are skipped by Accelerate on non-final micro-batches,
            # so gradients accumulate until the second batch of each pair.
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print('')
    print('  Average training loss: {0:.2f}'.format(avg_train_loss))
    print('  Training epoch took: {:}'.format(training_time))

In [None]:
# Evaluate the fine-tuned model on the validation split.
#
# NOTE(review): this runs after training has finished, so every pass of the
# `epochs` loop evaluates the same weights and reports the same numbers. The
# loop shape is kept because the evaluation cells index validations_labels_ep
# by epoch; only the last pass collects predictions and ground truth.
training_stats = []
validations_labels_ep = []
actual_labels_ep = []
actual = np.zeros(shape=(0,0))
total_t0 = time.time()
for epoch_i in range(0, epochs):
    print('')
    print('Validation...')

    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    total_eval_f1 = 0

    validations = []          # per-batch logits (last pass only)
    validations_labels = []   # one flat array of predicted classes (last pass only)
    actual_labels = []        # never filled; kept so actual_labels_ep keeps its shape

    is_last_pass = epoch_i == epochs - 1
    last_step = len(validation_dataloader) - 1

    for step, batch in enumerate(validation_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # One forward pass yields both the loss and the logits; the original
        # ran the model twice per batch.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
        loss, logits = outputs[0], outputs[1]

        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        if is_last_pass:
            actual = np.append(actual, label_ids)
            validations.append(logits)

            # After the final batch, flatten all logits into predicted classes.
            if step == last_step:
                flat_validations = [item for sublist in validations for item in sublist]
                flat_validations = np.argmax(flat_validations, axis=1).flatten()
                validations_labels.append(flat_validations)

        # Metrics are computed per batch and averaged over batches below.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        total_eval_f1 += flat_f1(logits, label_ids)

    validations_labels_ep.append(validations_labels)
    actual_labels_ep.append(actual_labels)

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print('  Accuracy: {0:.2f}'.format(avg_val_accuracy))

    avg_val_f1 = total_eval_f1 / len(validation_dataloader)
    print('  F1: {0:.2f}'.format(avg_val_f1))

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print('  Validation Loss: {0:.2f}'.format(avg_val_loss))
    print('  Validation took: {:}'.format(validation_time))

    # avg_train_loss / training_time are the values left by the training cell.
    training_stats.append(
        {
            'EPOCH': epoch_i + 1,
            'Train. Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Valid. F1' : avg_val_f1,
            'Train. Time': training_time,
            'Valid. Time': validation_time
        }
    )

print('')
# total_t0 is reset at the top of this cell, so this is validation time only.
print('Total validation took {:} (h:mm:ss)'.format(format_time(time.time()-total_t0)))

# **Step 5: Evaluation**

In [None]:
pd.set_option("display.precision", 3)
df_stats = pd.DataFrame(data=training_stats)
df_stats = df_stats.set_index('EPOCH')
display(df_stats)

In [None]:
# `actual` holds the ground-truth validation labels; validations_labels_ep
# holds the arg-max model outputs. The original assignment had them swapped,
# which transposed the confusion matrix and exchanged precision with recall.
orig = [int(round(x)) for x in actual.tolist()]
# Only the last pass stores predictions; [-1] also works when epochs != 2.
pred = validations_labels_ep[-1][0].tolist()

In [None]:
cm = confusion_matrix(orig, pred, labels=[0,1])

In [None]:
plot_confusion_matrix(cm, classes=['TRUE','FAKE'], title ='Confusion matrix')

In [None]:
# Per-class F1, precision and recall, treating each class in turn as positive.
# The 'recall_score' entries were previously computed with precision_score.
class_based = [
    {
        'class': class_id,
        'f1_score': f1_score(orig, pred, pos_label=class_id),
        'precision_score': precision_score(orig, pred, pos_label=class_id),
        'recall_score': recall_score(orig, pred, pos_label=class_id),
    }
    for class_id in (0, 1)
]

In [None]:
pd.set_option("display.precision", 3)
df_stats = pd.DataFrame(data=class_based)
df_stats = df_stats.set_index('class')
display(df_stats)

**5.1 Test Data**

In [None]:
# Run the model over the test set and keep the arg-max class for every row.
print('Predicting label len {}'.format(len(test_input_ids)))

model.eval()

batch_logits = []

for batch in prediction_dataloader:
    # Each batch is an (input_ids, attention_mask) pair; move both to the device.
    b_input_ids, b_input_mask = (t.to(device) for t in batch)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    # No labels were passed, so the first output holds the logits.
    batch_logits.append(outputs[0].detach().cpu().numpy())

predictions = np.argmax(np.concatenate(batch_logits, axis=0), axis=1)

In [None]:
# prediction_dataloader used a SequentialSampler, so the predictions are in the
# same row order as the test dataset and can be assigned to it directly.
# 1 is unreliable, 0 is reliable
test['pred'] = predictions

In [None]:
from transformers import TextClassificationPipeline

In [None]:
import pandas as pd
sanity_check = [("True", "Justice department finds more classified documents at Joe Biden’s home. The Guardian. New search turns up six more items from tenures as vice-president and in the Senate"),
     ("True", "Taliban bans contraception calling use a ‘western conspiracy’ The Guardian"),
     ("True", "Man arrested after plowing car into protesters at anti-immigration rally in Dublin InfoWars"),
     ("True", "Man (30s) arrested after anti-immigration protester struck by car Irish Times")]

In [None]:
mapping = {'LABEL_1': 'Fake', 'LABEL_0': 'True'}

In [None]:
# test a single prediction
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=False, device=torch.cuda.current_device())
res = pipe([t[1] for t in sanity_check])

In [None]:
# Print each sanity-check text with its expected label, the predicted label
# and the model's confidence. The loop variable is named `result` so it does
# not overwrite the notebook-level `pred` list used by the evaluation cells.
for (truth, text), result in zip(sanity_check, res):
    prediction = mapping[result['label']]
    score = result['score']
    print(f'Text: {text}\n\tLabel: {truth}\n\tPredict: {prediction}\n\tConf: {score}\n')