# Project: Fine-tune the Base DistilBERT Model to Classify COVID-19 Rumors

# 1.  Import Data

In [None]:
# data from https://github.com/MickeysClubhouse/COVID-19-rumor-dataset

import pandas as pd
from google.colab import drive
drive.mount('/content/drive/')

# The CSV is read without a header row; only its first four columns are kept
# and given explicit names.
df = pd.read_csv('/content/drive/My Drive/Data/data_sentiment.csv', header=None)
df = df.iloc[:, 0:4]
df.columns = ['i', 'label_chr', 'text', 'label_int']
print(df.shape)

# Keep only rows whose label is one of T / F / U.
# .copy() gives later cells an independent frame, so adding columns to `df`
# does not write into a slice of the unfiltered data (SettingWithCopyWarning).
mask = df.label_chr.isin(['T', 'F', 'U'])
df = df[mask].copy()
print(df.label_chr.value_counts())

Mounted at /content/drive/
(6149, 4)
F    3461
T    1434
U    1253
Name: label_chr, dtype: int64


# 2.  Train Test Split

In [None]:
# train test split
from sklearn.model_selection import train_test_split

# Stratified 85/15 split. The split is done on the frame's row labels (not on
# the text itself) so the assignment can be written back into `df` below.
row_labels = df.index.values
class_labels = df.label_chr.values

X_train, X_test, Y_train, Y_test = train_test_split(
    row_labels,
    class_labels,
    test_size=0.15,
    random_state=17,
    stratify=class_labels,
)

# Tag every row with the split it belongs to; 'not_set' would remain only on
# rows that ended up in neither split.
df['data_type'] = 'not_set'
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_test, 'data_type'] = 'test'

# Per-class row counts in each split (displayed as the cell output).
df.groupby(['label_chr', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,i,text,label_int
label_chr,data_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F,test,520,520,520
F,train,2941,2941,2941
T,test,215,215,215
T,train,1219,1219,1219
U,test,188,188,188
U,train,1065,1065,1065


# 3.  Tokenize Text and set up datasets/dataloaders

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Show how the first text in the frame is split into tokens.
first_text = df.text.iloc[0]
encoded_text = tokenizer(first_text)
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(first_text)
print(tokens)

# Confirm that the training texts come out as a plain Python list.
train_texts = df[df.data_type == 'train'].text.values.tolist()
print(type(train_texts))

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

The lie that coronavirus came from a bat or a pangolin is concocted by the Chinese state with the tacit support of the U.S. deep state and its friends in the European Union, Russia and Australia, and spread by the docile media in all of those countries
['[CLS]', 'the', 'lie', 'that', 'corona', '##virus', 'came', 'from', 'a', 'bat', 'or', 'a', 'pang', '##olin', 'is', 'con', '##co', '##cted', 'by', 'the', 'chinese', 'state', 'with', 'the', 'ta', '##cit', 'support', 'of', 'the', 'u', '.', 's', '.', 'deep', 'state', 'and', 'its', 'friends', 'in', 'the', 'european', 'union', ',', 'russia', 'and', 'australia', ',', 'and', 'spread', 'by', 'the', 'doc', '##ile', 'media', 'in', 'all', 'of', 'those', 'countries', '[SEP]']
<class 'list'>


In [None]:
import torch

def _encode_texts(texts):
    """Tokenize a list of strings into PyTorch tensors.

    Uses the same settings for every split: special tokens added, attention
    mask returned, padding enabled, truncation at 256 tokens.
    """
    return tokenizer(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding=True,
        max_length=256,
        return_tensors='pt',
        truncation=True,
    )


train_rows = df[df.data_type == 'train']
test_rows = df[df.data_type == 'test']

encoded_data_train = _encode_texts(train_rows.text.values.tolist())
encoded_data_test = _encode_texts(test_rows.text.values.tolist())

# One-hot encode the labels once per split.
one_hot_train = pd.get_dummies(train_rows.label_chr.values)
one_hot_test = pd.get_dummies(test_rows.label_chr.values)

# Both splits must produce the same label columns in the same order, otherwise
# a class index would mean different things in training and evaluation.
assert list(one_hot_train.columns) == list(one_hot_test.columns), (
    f'label columns differ between splits: '
    f'{list(one_hot_train.columns)} vs {list(one_hot_test.columns)}'
)

input_ids_train = encoded_data_train['input_ids']
attention_mask_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(one_hot_train.to_numpy()).float()  # in order, F T U

input_ids_test = encoded_data_test['input_ids']
attention_mask_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(one_hot_test.to_numpy()).float()    # in order, F T U

# Class names indexed by one-hot column position; used when reporting
# per-class accuracy.
cols = one_hot_test.columns

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Bundle (input ids, attention mask, one-hot labels) per split.
dataset_train = TensorDataset(input_ids_train, attention_mask_train, labels_train)
dataset_test = TensorDataset(input_ids_test, attention_mask_test, labels_test)
for split_dataset in (dataset_train, dataset_test):
    print(len(split_dataset))  # number of examples

# Training batches are reshuffled on every pass; test batches keep their order.
dataloader_train = DataLoader(dataset_train, batch_size=32, shuffle=True)
dataloader_test = DataLoader(dataset_test, batch_size=32, shuffle=False)
for split_loader in (dataloader_train, dataloader_test):
    print(len(split_loader))  # number of batches

5225
923
164
29


# 4. Import DistilBert Base Model

In [None]:
from transformers import AutoModel

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

# Load the pretrained base model and move it to the selected device.
model = AutoModel.from_pretrained('distilbert-base-uncased')
model = model.to(device)

cuda


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# 5. Classification Head

In [None]:
import torch.nn as nn
import torch

class classifier_head(nn.Module):
    """A wrapped encoder followed by a single linear classification layer.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` whose
            result can be indexed with ``'last_hidden_state'``.
        input_dims: size of the encoder's hidden vectors.
        output_dims: number of output logits.
        freeze: when truthy, the encoder is run under ``torch.no_grad()`` so no
            gradients flow into it.
    """

    def __init__(self, model, input_dims, output_dims, freeze):
        super().__init__()
        self.model = model
        self.classification_head = nn.Linear(input_dims, output_dims)
        self.freeze = freeze

    def _encode(self, input_ids, attention_mask):
        """Run the wrapped encoder, without gradient tracking if frozen."""
        if not self.freeze:
            return self.model(input_ids, attention_mask)
        with torch.no_grad():
            return self.model(input_ids, attention_mask)

    def forward(self, input_ids, attention_mask):
        """Return raw logits computed from the first position's hidden state."""
        encoder_out = self._encode(input_ids, attention_mask)
        # Only position 0 of the sequence is fed to the linear layer.
        first_position = encoder_out['last_hidden_state'][:, 0]
        return self.classification_head(first_position)


def count_parameters(model):
    """Return the total element count of the parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

def freeze_parameters(model, layer_name):
    """Disable gradients for every parameter whose name lacks `layer_name`.

    Parameters whose qualified name contains `layer_name` as a substring are
    left untouched; all others get ``requires_grad = False``. The module is
    modified in place and nothing is returned.
    """
    for param_name, param in model.named_parameters():
        if layer_name in param_name:
            continue
        param.requires_grad = False


# 6. Performance Metrics

In [None]:
import numpy as np
from sklearn.metrics import f1_score

def evaluate(dataloader_val, my_classifier, criterion, device=None):
    """Run the classifier over a dataloader and collect loss, outputs and labels.

    Args:
        dataloader_val: iterable of batches; each batch is indexed as
            (input_ids, attention_mask, labels). Must support ``len()``.
        my_classifier: module called as
            ``my_classifier(input_ids=..., attention_mask=...)``.
        criterion: callable ``criterion(outputs, labels)`` returning a scalar
            tensor.
        device: device the batch tensors are moved to. When omitted, the device
            of the classifier's first parameter is used (CPU if it has none),
            so the function no longer depends on a notebook-global `device`.

    Returns:
        Tuple ``(average loss per batch, outputs, labels)`` where outputs and
        labels are NumPy arrays concatenated over all batches along axis 0.

    Raises:
        ValueError: if the dataloader yields no batches.

    Note:
        The classifier is switched to eval mode and is left in that mode.
    """
    if device is None:
        first_param = next(my_classifier.parameters(), None)
        device = first_param.device if first_param is not None else 'cpu'

    n_batches = len(dataloader_val)
    if n_batches == 0:
        raise ValueError('evaluate() received an empty dataloader')

    my_classifier.eval()
    loss_val_total = 0.0
    predictions, true_vals = [], []

    # No gradients are needed anywhere in evaluation.
    with torch.no_grad():
        for batch in dataloader_val:
            batch = tuple(b.to(device) for b in batch)
            labels = batch[2]

            outputs = my_classifier(input_ids=batch[0], attention_mask=batch[1])
            loss_val_total += criterion(outputs, labels).item()

            predictions.append(outputs.cpu().numpy())
            true_vals.append(labels.cpu().numpy())

    loss_val_avg = loss_val_total / n_batches
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals
    


def f1_score_func(preds, labels):
    """Weighted F1 between the arg-max class of `preds` and of `labels`.

    Both inputs are reduced with ``argmax`` along axis 1 before being passed
    to ``f1_score`` with ``average='weighted'``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = np.argmax(labels, axis=1).flatten()
    score = f1_score(true_classes, predicted_classes, average='weighted')
    return score

def accuracy_per_class(preds, labels, names):
    """Print and return the fraction of correct predictions for each true class.

    Args:
        preds: 2-D scores; the predicted class is the arg-max along axis 1.
        labels: 2-D targets; the true class is the arg-max along axis 1.
        names: sequence indexed by class id, used only for the printed report.

    Returns:
        Dict mapping each class id present in `labels` to its accuracy.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    actual = np.argmax(labels, axis=1).flatten()

    def _report(class_id):
        # Restrict both arrays to the rows whose true class is `class_id`.
        in_class = actual == class_id
        y = actual[in_class]
        yhat = predicted[in_class]
        n_correct = np.sum(y == yhat)
        n_total = y.shape[0]
        ratio = n_correct / n_total
        print(f'class: {class_id}')
        print(f'class name: {names[class_id]}')
        print(f'accuracy: {n_correct} / {n_total} ')
        print(f'accuracy: {ratio}')
        print()
        return ratio

    return {class_id: _report(class_id) for class_id in np.unique(actual)}



# 7. Training loop

In [None]:
def train(epochs, my_classifier, optimizer, criterion, device, dataloader_train,
          dataloader_test, chkpt_filename='DistillBert_Covid_Misinfo_frozen',
          chkpt_dir='/content/drive/My Drive/Models'):
    """Train the classifier, checkpointing and evaluating after every epoch.

    Expects `tqdm`, `evaluate` and `f1_score_func` to be defined in the
    notebook before this function is called.

    Args:
        epochs: number of passes over `dataloader_train`.
        my_classifier: module called as
            ``my_classifier(input_ids=..., attention_mask=...)``.
        optimizer: optimizer stepped once per batch.
        criterion: callable ``criterion(outputs, labels)`` returning the loss.
        device: device the batch tensors are moved to.
        dataloader_train: batches of (input_ids, attention_mask, labels).
        dataloader_test: batches passed to `evaluate` after each epoch.
        chkpt_filename: checkpoint file stem; the epoch number and ``.chkpt``
            are appended.
        chkpt_dir: directory the checkpoints are written to.

    Side effects:
        Saves ``my_classifier.state_dict()`` once per epoch and writes the
        average training loss, test loss and weighted test F1 via
        ``tqdm.write``.
    """
    for epoch in tqdm(range(1, epochs + 1)):
        my_classifier.train()
        loss_train_total = 0

        progress_bar = tqdm(dataloader_train, desc=f'epoch: {epoch}', leave=False, disable=False)
        for batch in progress_bar:
            my_classifier.zero_grad()

            batch = tuple(b.to(device) for b in batch)
            labels = batch[2]

            outputs = my_classifier(input_ids=batch[0], attention_mask=batch[1])
            loss = criterion(outputs, labels)
            loss_train_total += loss.item()
            loss.backward()
            optimizer.step()

            # Show the batch loss as returned by the criterion. It was
            # previously divided by len(batch), which is the number of tensors
            # in the batch tuple (3), not the number of examples.
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

        torch.save(my_classifier.state_dict(), f'{chkpt_dir}/{chkpt_filename}_{epoch}.chkpt')

        tqdm.write(f'\nEpoch: {epoch}')
        loss_train_avg = loss_train_total / len(dataloader_train)
        # The original message started with an invalid '\A' escape, which
        # printed a literal backslash.
        tqdm.write(f'Avg Training Loss: {loss_train_avg}')

        test_loss, predictions, true_vals = evaluate(dataloader_test, my_classifier, criterion)
        test_f1 = f1_score_func(predictions, true_vals)
        tqdm.write(f'Test loss: {test_loss}')
        tqdm.write(f'Test weighted F1: {test_f1}')


# 8. Training the Classifier with a Frozen Base Model


In [None]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
import random
from tqdm.notebook import tqdm
import numpy as np

# Seed every random number generator used here (Python, NumPy, PyTorch CPU
# and CUDA) with the same value.
myseed = 42
random.seed(myseed)
np.random.seed(myseed)
torch.manual_seed(myseed)
torch.cuda.manual_seed_all(myseed)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# Wrap the base model loaded in section 4: 768 input features, 3 output logits,
# freeze=True so the base model runs under torch.no_grad() in forward().
my_classifier_frozen_base = classifier_head(model, 768, 3, True).to(device)
print(f'Differentiable parameters before freezing: {count_parameters(my_classifier_frozen_base)}')
# NOTE: this sets requires_grad=False in place on the shared `model` object too.
freeze_parameters(my_classifier_frozen_base, 'classification_head') # freezes everything but this layer
print(f'Differentiable parameters after freezing: {count_parameters(my_classifier_frozen_base)}')

# Sanity check: push one training batch through the classifier and print the
# input and output shapes.
my_iter = iter(dataloader_train)
batch = next(my_iter)
inputs = {
    'input_ids': batch[0].to(device),
    'attention_mask': batch[1].to(device)
}
print(f'check input shape: {batch[0].shape}')
y_hat = my_classifier_frozen_base(**inputs)
print(f'check output shape: {y_hat.shape}')

# All parameters are handed to the optimizer, including the frozen ones.
optimizer = AdamW(
    my_classifier_frozen_base.parameters(),
    lr=1e-4,
    eps=1e-8,
)
epochs=15
criterion = nn.CrossEntropyLoss()

# NOTE(review): the test dataloader is also used for the per-epoch metrics
# inside train(); there is no separate validation split.
train(epochs, my_classifier_frozen_base, optimizer, criterion, device, dataloader_train, 
      dataloader_test, chkpt_filename='DistillBert_Covid_Misinfo_frozen')

# Final per-class accuracy on the test split, using the weights from the last epoch.
print("\n\n")
_, predictions, true_vals = evaluate(dataloader_test, my_classifier_frozen_base, criterion)
accuracy_per_class(predictions, true_vals, cols)

cuda
Differentiable parameters before freezing: 66365187
Differentiable parameters after freezing: 2307
check input shape: torch.Size([32, 84])
check output shape: torch.Size([32, 3])


  0%|          | 0/15 [00:00<?, ?it/s]

epoch: 1:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 1
\Avg Training Loss: 1.006382052491351
Test loss: 0.9430552885450166
Test weighted F1: 0.40603984265956095


epoch: 2:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 2
\Avg Training Loss: 0.9145725438507591
Test loss: 0.8893791879045552
Test weighted F1: 0.4349655022229798


epoch: 3:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 3
\Avg Training Loss: 0.8663062042579418
Test loss: 0.8496218950584017
Test weighted F1: 0.5046029753427494


epoch: 4:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 4
\Avg Training Loss: 0.829579637544911
Test loss: 0.8195395233302281
Test weighted F1: 0.5611689493868522


epoch: 5:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 5
\Avg Training Loss: 0.8025407307758564
Test loss: 0.7964080109678465
Test weighted F1: 0.5818399965319535


epoch: 6:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 6
\Avg Training Loss: 0.7778913840288069
Test loss: 0.7792429163538176
Test weighted F1: 0.6008638758243436


epoch: 7:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 7
\Avg Training Loss: 0.7607855796813965
Test loss: 0.7649267839974371
Test weighted F1: 0.607739010131912


epoch: 8:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 8
\Avg Training Loss: 0.7483366248084278
Test loss: 0.7543104630092095
Test weighted F1: 0.617420044748254


epoch: 9:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 9
\Avg Training Loss: 0.737586680345419
Test loss: 0.7462208949286362
Test weighted F1: 0.6335498590920952


epoch: 10:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 10
\Avg Training Loss: 0.7302543241076354
Test loss: 0.73791127266555
Test weighted F1: 0.6418944960938886


epoch: 11:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 11
\Avg Training Loss: 0.7251891479259585
Test loss: 0.7324638212549275
Test weighted F1: 0.6505882591170814


epoch: 12:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 12
\Avg Training Loss: 0.7179995255499352
Test loss: 0.7265296342044041
Test weighted F1: 0.6589040876559089


epoch: 13:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 13
\Avg Training Loss: 0.7113045283207079
Test loss: 0.7230243847287935
Test weighted F1: 0.6590015613084356


epoch: 14:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 14
\Avg Training Loss: 0.705655520827305
Test loss: 0.716723425634976
Test weighted F1: 0.670463071576355


epoch: 15:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 15
\Avg Training Loss: 0.6963585262618414
Test loss: 0.7150229595858475
Test weighted F1: 0.6667277744264448



class: 0
class name: F
accuracy: 470 / 520 
accuracy: 0.9038461538461539

class: 1
class name: T
accuracy: 116 / 215 
accuracy: 0.5395348837209303

class: 2
class name: U
accuracy: 51 / 188 
accuracy: 0.2712765957446808



{0: 0.9038461538461539, 1: 0.5395348837209303, 2: 0.2712765957446808}

# 9. Fine-tuning the full classifier

In [None]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
import random
from tqdm.notebook import tqdm
import numpy as np

# Re-seed every random number generator so this run starts from the same
# state as the frozen-base run.
myseed = 42
random.seed(myseed)
np.random.seed(myseed)
torch.manual_seed(myseed)
torch.cuda.manual_seed_all(myseed)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# Reload the pretrained weights: the previous section set requires_grad=False
# in place on the shared `model` object, so it cannot be reused here.
model = AutoModel.from_pretrained('distilbert-base-uncased').to(device)

# freeze=False: gradients flow through the base model as well as the head.
my_classifier_unfrozen_base = classifier_head(model, 768, 3, False).to(device)
# Nothing is frozen in this section, so the count is reported once (it used to
# be printed twice under "before freezing" / "after freezing" labels).
print(f'Differentiable parameters (nothing frozen): {count_parameters(my_classifier_unfrozen_base)}')

# Sanity check: push one training batch through the classifier and print the
# input and output shapes. Only the shape is needed, so skip gradient tracking.
my_iter = iter(dataloader_train)
batch = next(my_iter)
inputs = {
    'input_ids': batch[0].to(device),
    'attention_mask': batch[1].to(device)
}
print(f'check input shape: {batch[0].shape}')
with torch.no_grad():
    y_hat = my_classifier_unfrozen_base(**inputs)
print(f'check output shape: {y_hat.shape}')

# NOTE(review): in the recorded run the test loss rises from epoch 2 onward
# while the training loss keeps falling, i.e. the model overfits with these
# settings. Consider a smaller learning rate, fewer epochs, or evaluating the
# best checkpoint instead of the last one.
optimizer = AdamW(
    my_classifier_unfrozen_base.parameters(),
    lr=1e-4,
    eps=1e-8,
)
epochs=7
criterion = nn.CrossEntropyLoss()

train(epochs, my_classifier_unfrozen_base, optimizer, criterion, device, dataloader_train,
      dataloader_test, chkpt_filename='DistillBert_Covid_Misinfo_unfrozen')

# Final per-class accuracy on the test split, using the weights from the last epoch.
_, predictions, true_vals = evaluate(dataloader_test, my_classifier_unfrozen_base, criterion)
accuracy_per_class(predictions, true_vals, cols)

cuda


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Differentiable parameters before freezing: 66365187
Differentiable parameters after freezing: 66365187
check input shape: torch.Size([32, 84])
check output shape: torch.Size([32, 3])


  0%|          | 0/7 [00:00<?, ?it/s]

epoch: 1:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 1
\Avg Training Loss: 0.6856921480559721
Test loss: 0.5966558574602522
Test weighted F1: 0.7161602749752337


epoch: 2:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 2
\Avg Training Loss: 0.41741460698043426
Test loss: 0.6600215568624693
Test weighted F1: 0.7313513801011762


epoch: 3:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 3
\Avg Training Loss: 0.2135298192591929
Test loss: 0.96118414556158
Test weighted F1: 0.7267537185053567


epoch: 4:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 4
\Avg Training Loss: 0.1442716796342965
Test loss: 0.941767198772266
Test weighted F1: 0.7469175453163923


epoch: 5:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 5
\Avg Training Loss: 0.07768416110846418
Test loss: 1.1125773119515385
Test weighted F1: 0.7181356700563172


epoch: 6:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 6
\Avg Training Loss: 0.04717387735102575
Test loss: 1.3742864378567399
Test weighted F1: 0.7221975268768352


epoch: 7:   0%|          | 0/164 [00:00<?, ?it/s]


Epoch: 7
\Avg Training Loss: 0.035301660536078555
Test loss: 1.4117814731495133
Test weighted F1: 0.7078351560263844
class: 0
class name: F
accuracy: 440 / 520 
accuracy: 0.8461538461538461

class: 1
class name: T
accuracy: 161 / 215 
accuracy: 0.7488372093023256

class: 2
class name: U
accuracy: 59 / 188 
accuracy: 0.31382978723404253



{0: 0.8461538461538461, 1: 0.7488372093023256, 2: 0.31382978723404253}