In [1]:
import pandas as pd
import numpy as np
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths read in the following
# cells live under this mount point (requires the google.colab runtime).
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# Read the train / validation / test splits from Drive in one pass;
# the unpacking order below follows the order of csv_paths.
csv_paths = (
    '/content/drive/My Drive/LIN371/train_df.csv',
    '/content/drive/My Drive/LIN371/val_df.csv',
    '/content/drive/My Drive/LIN371/test_df.csv',
)
train_df, val_df, test_df = map(pd.read_csv, csv_paths)

# Show (rows, columns) of the training split as the cell output.
train_df.shape

(9662, 5)

In [None]:
# Pull the text ('body'), target ('label') and auxiliary tag ('new_label')
# columns out of each split, in that order.
split_columns = ('body', 'label', 'new_label')
docs_train, y_train, extra_train = (train_df[col] for col in split_columns)
docs_val, y_val, extra_val = (val_df[col] for col in split_columns)
docs_test, y_test, extra_test = (test_df[col] for col in split_columns)

In [None]:
import torch

### --- To do: check if GPU is available
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)
### ------------------------------------

cuda


In [None]:
import random
import numpy as np
import torch

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Value passed unchanged to every seeding function below.
    """
    # Every library keeps its own global generator, so each is seeded in turn.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # Turn cuDNN autotuning off and request deterministic kernels.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

In [None]:
### --- To do: construct a Dataset class
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("vtexas/test_bert_model")

class Dataset(torch.utils.data.Dataset):
  """Text-classification dataset that tokenizes every document up front.

  Tokenization happens once, in `__init__`, using the module-level
  `tokenizer` that is current when the dataset is constructed. Each item is
  an `(encoding, label)` pair; the encoding is whatever the tokenizer returns
  for a single text with `return_tensors='pt'`.
  """

  def __init__(self, docs, ys, max_length=512):
    """Tokenize `docs` and keep `ys` as the labels.

    Args:
      docs: Iterable of raw text strings.
      ys: Labels aligned position-by-position with `docs` (list, array or
        pandas Series).
      max_length: Length every example is padded / truncated to.
    """
    self.labels = ys
    self.texts = [
        tokenizer(
            text, padding='max_length', max_length=max_length, truncation=True, return_tensors='pt'
        ) for text in docs
    ]

  def classes(self):
    """Return the labels exactly as they were passed to the constructor."""
    return self.labels

  def __len__(self):
    """Number of examples (one label per example)."""
    return len(self.labels)

  def get_batch_labels(self, idx):
    """Return the label at *position* `idx` as a NumPy array."""
    labels = self.labels
    # Samplers hand out positions 0..len-1. Plain `series[idx]` is a label
    # lookup on a pandas Series and breaks (or silently returns another row)
    # when the index is not the default RangeIndex, so index positionally.
    value = labels.iloc[idx] if hasattr(labels, 'iloc') else labels[idx]
    return np.array(value)

  def get_batch_texts(self, idx):
    """Return the pre-computed tokenizer encoding at position `idx`."""
    return self.texts[idx]

  def __getitem__(self,idx):
    """Return `(encoding, label)` for position `idx`."""
    batch_texts = self.get_batch_texts(idx)
    batch_ys = self.get_batch_labels(idx)
    return batch_texts, batch_ys
### ------------------------------------

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
from sklearn.metrics import f1_score, classification_report

### --- To do: construct an evaluation function
def evaluate(model, eval_dataloader):
  """Score `model` on every batch of `eval_dataloader`.

  Gradients are disabled for the whole pass. The function does not switch
  the model to eval mode and does not move it; callers are expected to have
  called `model.eval()` and placed the model on the module-level `device`.

  Args:
    model: Callable as `model(input_ids, attention_mask)` returning an
      object with a `.logits` attribute.
    eval_dataloader: Iterable of `(encoding, labels)` batches where
      `encoding` has 'input_ids' and 'attention_mask' entries carrying a
      singleton axis at dim 1 (as produced by batching `Dataset` items).

  Returns:
    A 6-tuple `(total_acc, total_loss, f1_macro, report, eval_labels,
    eval_preds)`:
      total_acc: number of correct predictions (a count, not a ratio).
      total_loss: sum of the per-batch cross-entropy losses.
      f1_macro: macro-averaged F1 over all examples.
      report: text produced by `classification_report`.
      eval_labels / eval_preds: flat Python lists of gold and predicted ids.
  """
  total_acc = 0
  total_loss = 0

  criterion = torch.nn.CrossEntropyLoss()

  eval_labels, eval_preds = [], []
  with torch.no_grad():
    for eval_input, eval_label in eval_dataloader:
      # Drop only the per-example singleton axis (dim 1). A bare squeeze()
      # would also remove the batch axis whenever a batch holds one example,
      # and the mask must be squeezed too so both tensors are (batch, seq).
      input_ids = eval_input['input_ids'].squeeze(1).to(device)
      attention_mask = eval_input['attention_mask'].squeeze(1).to(device)
      eval_label = eval_label.long().to(device)

      logits = model(input_ids, attention_mask).logits

      total_loss += criterion(logits, eval_label).item()

      # Compute the predictions once and reuse them for accuracy and F1.
      batch_preds = logits.argmax(dim=1)
      total_acc += (batch_preds == eval_label).sum().item()

      eval_labels.extend(eval_label.tolist())
      eval_preds.extend(batch_preds.tolist())

  f1_macro = f1_score(y_true=eval_labels, y_pred=eval_preds, average='macro')
  report = classification_report(y_true=eval_labels, y_pred=eval_preds)
  return total_acc, total_loss, f1_macro, report, eval_labels, eval_preds
### ------------------------------------

In [None]:
from tqdm import tqdm
from torch.optim import Adam

### --- To do: construct a training function
def train_bert(
    model,
    train_docs, train_ys,
    val_docs, val_ys,
    learning_rate = 2e-5,
    epochs = 1,
    batch_size=16,
    save_path='/content/drive/My Drive/LIN371/transfer_learning/',
    seed=0):
  """Fine-tune `model` and checkpoint it whenever validation macro-F1 improves.

  Uses the module-level `device`, `tokenizer`, `Dataset` and `evaluate`.
  After every epoch the model is scored on the validation set; on a new best
  macro-F1 the model, tokenizer, optimizer state and scheduler state are
  written under `save_path`.

  Args:
    model: Sequence-classification model whose output exposes `.logits` and
      which provides `save_pretrained`.
    train_docs, train_ys: Training texts and their labels.
    val_docs, val_ys: Validation texts and their labels.
    learning_rate: Initial Adam learning rate (decayed by 0.9 per epoch).
    epochs: Number of passes over the training data.
    batch_size: Batch size for both loaders.
    save_path: Directory that receives the best checkpoint.
    seed: Seed for the generator that shuffles the training batches.

  Returns:
    None. Progress is printed; the best checkpoint is written to disk.
  """
  import os  # local import so the function does not depend on another cell

  print('starting training...')
  print("====================")
  print(f"{'Epoch':^7} | {'val acc':^9} | {'val f1':^9}")
  print('-'*50)

  train, val = Dataset(train_docs, train_ys), Dataset(val_docs, val_ys)

  # The previous `worker_init_fn=np.random.seed(0)` evaluated the call on the
  # spot and passed None, so it never seeded the shuffle. A dedicated seeded
  # generator makes the training order reproducible.
  shuffle_rng = torch.Generator()
  shuffle_rng.manual_seed(seed)
  train_dataloader = torch.utils.data.DataLoader(
      train, batch_size=batch_size, shuffle=True, generator=shuffle_rng)
  # Validation metrics do not depend on example order, so no shuffling.
  val_dataloader = torch.utils.data.DataLoader(val, batch_size=batch_size, shuffle=False)

  # Same device the batches are sent to below.
  model = model.to(device)
  criterion = torch.nn.CrossEntropyLoss().to(device)
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
  scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

  best_f1 = 0.0

  for epoch_num in range(epochs):
    # evaluate() below runs with the model in eval mode, so re-enable
    # training mode at the start of every epoch.
    model.train()
    for train_inputs, train_labels in tqdm(train_dataloader, leave=False):
      train_labels = train_labels.long().to(device)
      # Each encoding carries a singleton axis at dim 1; drop it on both
      # tensors so they are (batch, seq).
      input_ids = train_inputs['input_ids'].squeeze(1).to(device)
      attention_masks = train_inputs['attention_mask'].squeeze(1).to(device)

      optimizer.zero_grad()
      logits = model(input_ids, attention_masks).logits
      batch_loss = criterion(logits, train_labels)
      batch_loss.backward()
      optimizer.step()
    scheduler.step()

    model.eval()
    val_acc, _, val_f1, _, _, _ = evaluate(model, val_dataloader)

    print(f'{epoch_num+1:^7} | {val_acc/len(val_docs):^9.3f} | {val_f1:^9.3f}')

    if val_f1 > best_f1:
      best_f1 = val_f1
      print(f"New best f1 {best_f1} found at epoch {epoch_num+1}. Saving model.")
      model.save_pretrained(save_path)
      tokenizer.save_pretrained(save_path)
      torch.save(optimizer.state_dict(), os.path.join(save_path, "optimizer_state.pt"))
      torch.save(scheduler.state_dict(), os.path.join(save_path, "scheduler_state.pt"))
      print(f"Model and tokenizer saved to {save_path}")




### ------------------------------------

In [None]:
# Seed every RNG before the model is built: the load reports newly
# initialised weights, and the training loader shuffles, so without a seed
# the fine-tuning run is not repeatable.
set_seed(3)

model = AutoModelForSequenceClassification.from_pretrained("vtexas/test_bert_model", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("vtexas/test_bert_model")

train_bert(model, docs_train, y_train, docs_val, y_val, epochs=3)

config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at vtexas/test_bert_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


starting training...
 Epoch  |  val acc  |  val f1  
--------------------------------------------------




   1    |   0.900   |   0.900  
New best f1 0.9003772544646638 found at epoch 1. Saving model.
Model and tokenizer saved to /content/drive/My Drive/LIN371/transfer_learning/




   2    |   0.896   |   0.896  




   3    |   0.891   |   0.891  


In [None]:
### --- To do: Using the LOADED BERT
set_seed(3)

# Restore the tokenizer and the model from the checkpoint directory.
checkpoint_dir = "/content/drive/My Drive/LIN371/transfer_learning/"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
loaded_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)

# Put the model on the available device and switch it to inference mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)
loaded_model.eval()

# Tokenize the test split and score it in fixed order.
test = Dataset(docs_test, y_test)
test_dataloader = torch.utils.data.DataLoader(test, batch_size=16)
(total_acc, total_loss, f1_macro,
 report, eval_labels, eval_preds) = evaluate(loaded_model, test_dataloader)
### ------------------------------------

In [None]:
# One value per line: accuracy ratio, summed loss, macro-F1, per-class report.
print(total_acc / len(docs_test), total_loss, f1_macro, report, sep='\n')

# 0.892982819292072
# 87.23913978599012
# 0.8929673917044711
#               precision    recall  f1-score   support

#            0       0.88      0.90      0.89      2416
#            1       0.90      0.88      0.89      2415

#     accuracy                           0.89      4831
#    macro avg       0.89      0.89      0.89      4831
# weighted avg       0.89      0.89      0.89      4831


0.8915338439246533
79.24822739697993
0.8915048310669491
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      2416
           1       0.88      0.91      0.89      2415

    accuracy                           0.89      4831
   macro avg       0.89      0.89      0.89      4831
weighted avg       0.89      0.89      0.89      4831



In [None]:
# Recompute accuracy directly from the gold / predicted id lists as a cross-check.
matches = sum(1 for gold, pred in zip(eval_labels, eval_preds) if gold == pred)
proportion = matches / len(eval_labels)
print(proportion)

# 0.892982819292072

0.8915338439246533


In [None]:
# Line the test texts and both label columns up with the model predictions.
pred_df = pd.DataFrame(
    {'text': docs_test, 'label': y_test, 'new_label': extra_test}
).assign(prediction=eval_preds)

# Persist the table to Drive, then preview the first rows.
pred_df.to_csv('/content/drive/My Drive/LIN371/predictions_mlm_tuned.csv')
pred_df.head()

Unnamed: 0,text,label,new_label,prediction
0,Those pussy lips need more cleaning with my to...,1,explicit_source_has_explicit_words,1
1,"I have choices for you. Choice seating, at that",1,explicit_source_no_explicit_words,1
2,I want to finish.,1,explicit_source_no_explicit_words,1
3,"Oh it Will, one way or an other 😉",1,explicit_source_no_explicit_words,1
4,"No need to thank me, thank you so much for sha...",1,explicit_source_no_explicit_words,1


In [None]:
# accuracy by new label
# Share of correct predictions among rows tagged 'control_source_no_explicit_words'.
control_ne = pred_df.loc[pred_df['new_label'] == 'control_source_no_explicit_words']
control_ne_hits = control_ne.loc[control_ne['label'] == control_ne['prediction']]
control_ne_acc = len(control_ne_hits) / len(control_ne)
print('Control source no explicit words accuracy:', control_ne_acc)

# bert: Control source no explicit words accuracy: 0.9189439555349699
# masked: Control source no explicit words accuracy: 0.8925428439092172


Control source no explicit words accuracy: 0.8925428439092172


In [None]:
# Share of correct predictions among rows tagged 'control_source_has_explicit_words'.
control_he = pred_df.loc[pred_df['new_label'] == 'control_source_has_explicit_words']
control_he_hits = control_he.loc[control_he['label'] == control_he['prediction']]
control_he_acc = len(control_he_hits) / len(control_he)
print('Control source has explicit words accuracy:', control_he_acc)

# bert: Control source has explicit words accuracy: 0.7859922178988327
# masked: Control source has explicit words accuracy: 0.7276264591439688


Control source has explicit words accuracy: 0.7276264591439688


In [None]:
# Share of correct predictions among rows tagged 'explicit_source_no_explicit_words'.
explicit_ne = pred_df.loc[pred_df['new_label'] == 'explicit_source_no_explicit_words']
explicit_ne_hits = explicit_ne.loc[explicit_ne['label'] == explicit_ne['prediction']]
explicit_ne_acc = len(explicit_ne_hits) / len(explicit_ne)
print('Explicit source no explicit words accuracy:', explicit_ne_acc)

# bert: Explicit source no explicit words accuracy: 0.8408812729498164
# masked: Explicit source no explicit words accuracy: 0.8745410036719706


Explicit source no explicit words accuracy: 0.8745410036719706


In [None]:
# Share of correct predictions among rows tagged 'explicit_source_has_explicit_words'.
explicit_he = pred_df.loc[pred_df['new_label'] == 'explicit_source_has_explicit_words']
explicit_he_hits = explicit_he.loc[explicit_he['label'] == explicit_he['prediction']]
explicit_he_acc = len(explicit_he_hits) / len(explicit_he)
print('Explicit source has explicit words accuracy:', explicit_he_acc)

# bert: Explicit source has explicit words accuracy: 0.9654289372599232
# masked: Explicit source has explicit words accuracy: 0.9782330345710627



Explicit source has explicit words accuracy: 0.9782330345710627
