<a href="https://colab.research.google.com/github/tekliyetamiru/Fake-News-Detections-By-Using-DistilBERT-MobileBERT-and-TinyBERT-Model/blob/main/Fake_News_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Fine Tuning DistilBERT, MobileBERT and TinyBERT for Fake News Detection**

In [None]:
# !pip install -U transformers
# !pip install -U accelerate
# !pip install -U datasets
# !pip install -U bertviz
# !pip install -U Umap-learn
# !pip install seaborn --upgrade

# !pip install -U openpyxl

# Silence every Python warning so cell outputs stay readable while following along.
# Do not do this in production: the 'ignore' filter hides all warnings, not just noisy ones.
import warnings
warnings.filterwarnings('ignore')


## **Data Loading**

In [None]:
import pandas as pd

# Read the news spreadsheet directly from the project repository,
# then preview the first rows.
datafile = pd.read_excel(
    "https://github.com/tekliyetamiru/Fake-News-Detections-By-Using-DistilBERT-MobileBERT-and-TinyBERT-Model/raw/main/fake_news.xlsx"
)
datafile.head()

In [None]:
datafile.isnull().sum()

In [None]:
datafile = datafile.dropna()
datafile.isnull().sum()

In [None]:
datafile['label'].value_counts()

## **Dataset Analysis**

In [None]:
import matplotlib.pyplot as plt


In [None]:
# Class balance: one horizontal bar per label, sorted by ascending count.
label_counts = datafile['label'].value_counts(ascending=True)
label_counts.plot(kind="barh")
plt.title("Frequency of Classes")
plt.show()

In [None]:
# Heuristic token estimate: about 1.5 tokens per whitespace-separated word.
datafile['title_tokens'] = datafile['title'].map(lambda title: len(title.split())*1.5)
datafile['text_tokens'] = datafile['text'].map(lambda body: len(body.split())*1.5)

# Side-by-side histograms of the estimated lengths (left: titles, right: article text).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

ax[0].hist(datafile['title_tokens'], bins=50, color="skyblue")
ax[1].hist(datafile['text_tokens'], bins=50, color="orange")

ax[0].set_title("Title Tokens")
ax[1].set_title("Text Tokens")


### **Split Dataset into train, test and validation**

In [None]:
from sklearn.model_selection import train_test_split

# 70% for training, 20% for test and 10% for validation.
# A fixed random_state makes the splits identical after a kernel restart, so
# metrics from different runs and different models are computed on the same rows.
train, test = train_test_split(datafile, test_size=0.3, stratify=datafile['label'], random_state=42)
# The 30% hold-out is divided 2:1, giving 20% test and 10% validation of the full data.
test, validation = train_test_split(test, test_size=1/3, stratify=test['label'], random_state=42)

datafile.shape,train.shape, test.shape, validation.shape

In [None]:
from datasets import Dataset, DatasetDict

# Wrap each pandas split in a Dataset (dropping the DataFrame index) and
# collect them under the split names used by the rest of the notebook.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in (
        ("train", train),
        ("test", test),
        ("validation", validation),
    )
})

In [None]:
dataset

### **Data Tokenization**

In [None]:
from transformers import AutoTokenizer
# Sample sentence tokenized by each checkpoint's tokenizer for comparison.
text = "machine learning is awesome!! Thanks KG take."

# DistilBERT tokenizer and its tokens for the sample sentence.
model_ckpt = "distilbert-base-uncased"
distilbert_tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
distilbert_tokens = distilbert_tokenizer.tokenize(text)

# MobileBERT tokenizer and its tokens for the sample sentence.
model_ckpt = "google/mobilebert-uncased"
mobilebert_tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
mobilebert_tokens = mobilebert_tokenizer.tokenize(text)

# TinyBERT tokenizer and its tokens for the sample sentence.
# Note: model_ckpt is rebound each time, so after this cell it holds the TinyBERT id.
model_ckpt = "huawei-noah/TinyBERT_General_4L_312D"
tinybert_tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
tinybert_tokens = tinybert_tokenizer.tokenize(text)

In [None]:
distilbert_tokenizer, mobilebert_tokenizer, tinybert_tokenizer

In [None]:
def tokenize(batch):
  """Tokenize the 'title' entries of `batch` with the DistilBERT tokenizer.

  Padding and truncation are both enabled; the tokenizer output is returned as is.
  """
  return distilbert_tokenizer(batch['title'], truncation=True, padding=True)

print(tokenize(dataset['train'][:2]))

In [None]:
# batched=True with batch_size=None passes each split to `tokenize` as a single batch,
# so padding=True pads every example in a split to that split's longest title.
encoded_dataset = dataset.map(tokenize, batch_size=None, batched=True)

## **Model Building**

In [None]:
from transformers import AutoModelForSequenceClassification,AutoConfig
import torch

# Mapping between class names and integer ids, stored in the model config.
# NOTE(review): assumes the dataset's 'label' column uses 0 for real and 1 for fake news — confirm.
label2id = {"Real":0,"Fake":1}
id2label = {0:"Real",1:"Fake"}


# Checkpoint to fine-tune; switch by uncommenting one of the alternatives below.
model_ckpt = "distilbert-base-uncased"
# model_ckpt = "huawei-noah/TinyBERT_General_4L_312D"
# model_ckpt = "google/mobilebert-uncased"

# NOTE(review): num_labels is not referenced by any other visible cell.
num_labels = len(label2id)
# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the checkpoint's config with the label mappings attached, then build the
# sequence-classification model from it and move it to the selected device.
config = AutoConfig.from_pretrained(model_ckpt,label2id=label2id, id2label=id2label)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt,config=config).to(device)

In [None]:
model.config

## **Model Training**

In [None]:
!pip install evaluate

In [None]:
# Build compute metrics function
# !pip install evaluate

import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics_evaluate(eval_pred):
  """Compute accuracy for a `(predictions, labels)` pair.

  The predicted class of each row is the index of its largest score along
  axis 1; the result is whatever `accuracy.compute` returns.
  """
  scores, references = eval_pred
  predicted_ids = np.argmax(scores, axis=1)
  return accuracy.compute(references=references, predictions=predicted_ids)

In [None]:
from transformers import TrainingArguments

batch_size = 32
training_dir = "train_dir"

training_args = TrainingArguments(
    # Output location
    output_dir=training_dir,
    overwrite_output_dir=True,
    # Optimisation settings
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate once per epoch and keep progress bars visible
    eval_strategy='epoch',
    disable_tqdm=False,
)

In [None]:
training_args

In [None]:
from transformers import Trainer

# `args=training_args` must be passed explicitly. Without it Trainer falls back
# to default TrainingArguments, and the epochs, learning rate, batch sizes,
# weight decay and per-epoch evaluation configured above are ignored.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics_evaluate,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    tokenizer=distilbert_tokenizer
)

In [None]:
trainer.train()

## **Model Evaluation**

In [None]:
preds_output=trainer.predict(encoded_dataset['test'])

In [None]:
preds_output.metrics

In [None]:
# Predicted class id = index of the highest score in each prediction row.
y_pred = preds_output.predictions.argmax(axis=1)
# Reference labels taken from the tokenized test split.
y_true = encoded_dataset['test'][:]['label']

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_true,y_pred, target_names=list(label2id)))

## **Benchmarking**

In [None]:
# use sklearn to build compute metrics
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
  """Return accuracy and weighted F1 for a prediction object.

  `pred` must expose `label_ids` (reference labels) and `predictions`
  (per-class scores); the predicted class is the argmax over the last axis.
  """
  y_true = pred.label_ids
  y_hat = pred.predictions.argmax(-1)

  return {
      "accuracy": accuracy_score(y_true, y_hat),
      "f1": f1_score(y_true, y_hat, average="weighted"),
  }

In [None]:

# Short model name -> Hugging Face checkpoint id for every model in the benchmark.
model_dict = {
    "bert-base": "bert-base-uncased",
    "distilbert": "distilbert-base-uncased",
    "mobilebert": "google/mobilebert-uncased",
    "tinybert": "huawei-noah/TinyBERT_General_4L_312D",
}

def train_model(model_name):
  """Fine-tune one checkpoint from `model_dict` and evaluate it on the test split.

  Args:
    model_name: key of `model_dict` selecting the checkpoint to load.

  Returns:
    The `metrics` attribute of `trainer.predict` run on the tokenized test split.

  Raises:
    KeyError: if `model_name` is not a key of `model_dict`.
  """
  model_ckpt = model_dict[model_name]
  tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
  config = AutoConfig.from_pretrained(model_ckpt, label2id=label2id, id2label=id2label)
  model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config).to(device)

  def local_tokenizer(batch):
    # The data is re-tokenized with the tokenizer loaded for this checkpoint.
    return tokenizer(batch['title'], padding=True, truncation=True)

  encoded_dataset = dataset.map(local_tokenizer, batched=True, batch_size=None)

  # `args=training_args` must be passed explicitly. Without it Trainer uses
  # default TrainingArguments, so the benchmark would not train the models with
  # the hyperparameters configured in `training_args`.
  trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    tokenizer=tokenizer
    )
  trainer.train()

  preds = trainer.predict(encoded_dataset['test'])

  return preds.metrics

import time
# Train and evaluate every model in `model_dict`, recording its test metrics
# together with the wall-clock seconds spent in `train_model`.
model_performance = {}
for model_name in model_dict:
  print("\n\n")
  print("Training Model: ", model_name)

  started_at = time.time()
  metrics = train_model(model_name)
  elapsed = time.time() - started_at
  model_performance[model_name] = {model_name: metrics, "time taken": elapsed}

In [None]:
model_performance