In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd ./drive/MyDrive/git_hub_repo/NLP_tasks/Project/

/content/drive/MyDrive/git_hub_repo/NLP_tasks/Project


## 0. Set up

In [3]:
! pip install transformers datasets evaluate scikit-learn
! pip install accelerate -U



In [4]:
import torch
import pandas as pd
from transformers import AutoTokenizer
from datasets import load_dataset
from datasets import Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import numpy as np
import json

import re
import string

In [5]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

## 1. Load dataset
I will use the HC3 (Human ChatGPT Comparison Corpus) dataset from the 🤗 Datasets library.

In [6]:
ds = json.load(open('QQA_train.json'))

In [7]:
len(ds)

564

In [8]:
ds[0].keys()

dict_keys(['question', 'Option1', 'Option2', 'answer', 'type', 'question_sci_10E', 'question_char', 'question_sci_10E_char', 'question_mask'])

In [9]:
ds[5]['question']

'The mammoth moved at a speed of 18 km per hour through the watering hole, but moved at a speed of 18 miles per hour on the hard dirt because the hard dirt had? '

In [10]:
ds[5]['Option1']

'less resistance'

In [11]:
ds[5]['answer']

'Option 1'

## 2. Data Preprocessing and Save
The next step is to load a tokenizer to preprocess the text field. A tokenizer converts text to a sequence of tokens and creates numerical representation.

In [52]:
# tokenizer
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# tokenizer test
text = "Hello everyone 12!"
vocab = {v:k for k,v in roberta_tokenizer.vocab.items()}
print([(id, vocab[id]) for id in roberta_tokenizer(text)['input_ids']])
print([(id, vocab[id]) for id in roberta_tokenizer(ds[5]['question'])['input_ids']])

[(0, '<s>'), (31414, 'Hello'), (961, 'Ġeveryone'), (316, 'Ġ12'), (328, '!'), (2, '</s>')]
[(0, '<s>'), (133, 'The'), (28521, 'Ġmammoth'), (1410, 'Ġmoved'), (23, 'Ġat'), (10, 'Ġa'), (2078, 'Ġspeed'), (9, 'Ġof'), (504, 'Ġ18'), (6301, 'Ġkm'), (228, 'Ġper'), (1946, 'Ġhour'), (149, 'Ġthrough'), (5, 'Ġthe'), (30221, 'Ġwatering'), (4683, 'Ġhole'), (6, ','), (53, 'Ġbut'), (1410, 'Ġmoved'), (23, 'Ġat'), (10, 'Ġa'), (2078, 'Ġspeed'), (9, 'Ġof'), (504, 'Ġ18'), (1788, 'Ġmiles'), (228, 'Ġper'), (1946, 'Ġhour'), (15, 'Ġon'), (5, 'Ġthe'), (543, 'Ġhard'), (10667, 'Ġdirt'), (142, 'Ġbecause'), (5, 'Ġthe'), (543, 'Ġhard'), (10667, 'Ġdirt'), (56, 'Ġhad'), (116, '?'), (1437, 'Ġ'), (2, '</s>')]


In [13]:
#Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [14]:
# making dataframe include human answers
question_list = []
option1_list = []
option2_list = []
input_ids_list = []
attention_mask_list = []
label_list = []

for sample in ds:

  # some of the question does not have answer.
  question_list.append(sample['question_char']+'[Option1]:'+sample['Option1']+'[Option2]:'+sample['Option2'])
  option1_list.append(sample['Option1'])
  option2_list.append(sample['Option2'])
  input_ids_list.append(roberta_tokenizer(sample['question_char']+'[Option1]:'+sample['Option1']+'[Option2]:'+sample['Option2'], truncation=True)['input_ids'])
  attention_mask_list.append(roberta_tokenizer(sample['question_char']+'[Option1]:'+sample['Option1']+'[Option2]:'+sample['Option2'], truncation=True)['attention_mask'])

  if sample['answer'] == 'Option 1':

    label_list.append(0)

  elif sample['answer'] == 'Option 2':

    label_list.append(1)

tokenized_df = pd.DataFrame.from_dict({'question':question_list, 'option1':option1_list, 'option2':option2_list, \
                                                      'input_ids':input_ids_list, 'attention_mask':attention_mask_list, \
                                                      'label':label_list})

In [15]:
tokenized_df.iloc[0]['question']

'The ranger and the rustler both were riding horses that galloped at the same speed.  The rustler left at 0 1:0 0 where as the ranger left at 050 0 hours. Who has traveled further?? [Option1]:the ranger[Option2]:the rustler'

In [49]:
# shows clean texts
tokenized_df.iloc[5]['question']

'The mammoth moved at a speed of 1 8 km per hour through the watering hole, but moved at a speed of 1 8 miles per hour on the hard dirt because the hard dirt had? [Option1]:less resistance[Option2]:more resistance'

In [17]:
def add_token_padding(sample):

  if len(sample)<512:

    token_paddings = [0 for i in range(512-len(sample))]
    return sample + token_paddings

  else:
    return sample

def add_mask_padding(sample):

  if len(sample)<512:

    mask_paddings = [1 for i in range(512-len(sample))]
    return sample + mask_paddings

  else:
    return sample

In [18]:
tokenized_df['input_ids'] = tokenized_df['input_ids'].map(lambda x: add_token_padding(x))
tokenized_df['attention_mask'] = tokenized_df['attention_mask'].map(lambda x: add_mask_padding(x))

In [19]:
print(tokenized_df.shape)

(564, 6)


In [20]:
tokenized_df.to_pickle('./tokenized_df.pkl')

## 3. Load preprocessed data

In [21]:
import torch
from transformers import AutoTokenizer
from datasets import Dataset
import time

device = "cuda:0" if torch.cuda.is_available() else "cpu"
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [22]:
import pandas as pd
loaded_tokenized_df = pd.read_pickle('./tokenized_df.pkl')

In [23]:
loaded_tokenized_df = loaded_tokenized_df.drop(columns=['question', 'option1', 'option2'])

In [24]:
# convert dataframe to dataset
tokenized_dataset = Dataset.from_pandas(loaded_tokenized_df)

In [25]:
# tokenized_dataset = tokenized_dataset.remove_columns(["answer"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")

## 4. Split Data set to Train set, Evaluation set, and Test set

In [26]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding, set_seed

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from tqdm.auto import tqdm
from transformers import get_scheduler
from torch.optim import AdamW
import math

import numpy as np

seed = 42

In [27]:
# sample some data for experiments

tokenized_dataset_train_eval = tokenized_dataset.train_test_split(test_size=0.1, seed=seed)

In [28]:
train_set = tokenized_dataset_train_eval['train']
train_set

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 507
})

In [29]:
eval_set = tokenized_dataset_train_eval['test']
eval_set

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 57
})

## 5. Experiments: training and evaluating the model

In [30]:
id2label = {0: "Option1", 1: "Option2"}
label2id = {"Option1": 0, "Option2": 1}
set_seed(seed)


# This automodel class gives us the model with pretrained weights + a sequence classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=2, id2label=id2label, label2id=label2id, hidden_dropout_prob=0.1,
    attention_probs_dropout_prob = 0.1, num_hidden_layers=12, num_attention_heads = 12
    ).to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# hyperparameters
model.config

RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Option1",
    "1": "Option2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Option1": 0,
    "Option2": 1
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.34.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [32]:
# arguments and objects required for trainer
training_args = TrainingArguments(
    num_train_epochs=20,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate = 2e-5
)

data_collator = DataCollatorWithPadding(tokenizer=roberta_tokenizer)

In [33]:
from sklearn.metrics import confusion_matrix

class CustomTrainer(Trainer):


  def _inner_training_loop(
        self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, \
        ignore_keys_for_eval=None):


    number_of_epochs = args.num_train_epochs

    train_loss = []
    train_acc = []
    eval_loss = []
    eval_acc = []

    times_per_epoch = []
    times_per_inference = []



    criterion = torch.nn.CrossEntropyLoss().to(device)
    self.optimizer = AdamW(model.parameters(), lr=args.learning_rate)
    self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 1, gamma = 0.9)

    train_dataloader = self.get_train_dataloader()
    eval_dataloader = self.get_eval_dataloader()


    max_steps = math.ceil(len(train_dataloader) * args.num_train_epochs)



    for epoch in range(number_of_epochs):


      self.model.train()
      self.model.zero_grad()


      train_loss_per_epoch = 0
      train_acc_per_epoch = 0

      with tqdm(train_dataloader, unit="batch") as training_epoch:
        training_epoch.set_description(f"Training Epoch {epoch}")

        starttime_epoch = time.time()
        for step, inputs in enumerate(training_epoch):

            inputs = inputs.to(device)
            labels = inputs['labels']

            self.optimizer.zero_grad()

            start_inference = time.time()
            output = model(**inputs)
            end_inference = time.time()

            times_per_inference.append(end_inference-start_inference)


            loss = criterion(output.logits, labels)
            train_loss_per_epoch+=loss.item()

            loss.backward()

            self.optimizer.step()
            train_acc_per_epoch += (output['logits'].argmax(1) == labels).sum().item()

        endtime_epoch = time.time()

        times_per_epoch.append(endtime_epoch-starttime_epoch)

        self.scheduler.step()

        train_loss_per_epoch /= len(train_dataloader)
        train_acc_per_epoch /= (len(train_dataloader)*batch_size)





        eval_loss_per_epoch = 0
        eval_acc_per_epoch = 0


        with tqdm(eval_dataloader, unit="batch") as eval_epoch:
          eval_epoch.set_description(f"Evaluation Epoch {epoch}")

          for step, inputs in enumerate(eval_epoch):


            inputs = inputs.to(device)
            labels = inputs['labels']



            output = model(**inputs)
            loss = criterion(output.logits, labels)
            eval_loss_per_epoch+=loss.item()

            loss.backward()

            eval_acc_per_epoch += (output['logits'].argmax(1) == labels).sum().item()



        eval_loss_per_epoch /= len(eval_dataloader)
        eval_acc_per_epoch /= (len(eval_dataloader)*batch_size)

        print(f'\tTrain Loss: {train_loss_per_epoch} |  Train Acc: {train_acc_per_epoch*100.0}%')
        print(f'\tEval Loss: {eval_loss_per_epoch} |  eval Acc: {eval_acc_per_epoch*100.0}%')

        train_loss.append(train_loss_per_epoch)
        train_acc.append(train_acc_per_epoch)
        eval_loss.append(eval_loss_per_epoch)
        eval_acc.append(eval_acc_per_epoch)

        model.save_pretrained(f'./model_epoch_{epoch}')


    return train_loss, train_acc, eval_loss, eval_acc, times_per_epoch, times_per_inference



In [34]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=eval_set,
    data_collator = data_collator)

In [35]:
train_loss, train_acc, eval_loss, eval_acc, times_per_epoch, times_per_inference = trainer.train()

  0%|          | 0/43 [00:00<?, ?batch/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/5 [00:00<?, ?batch/s]

	Train Loss: 0.6991451562837113 |  Train Acc: 45.15503875968992%
	Eval Loss: 0.7014956712722779 |  eval Acc: 38.333333333333336%


  0%|          | 0/43 [00:00<?, ?batch/s]

  0%|          | 0/5 [00:00<?, ?batch/s]

	Train Loss: 0.6991144086039344 |  Train Acc: 47.86821705426357%
	Eval Loss: 0.6933867931365967 |  eval Acc: 43.333333333333336%


  0%|          | 0/43 [00:00<?, ?batch/s]

  0%|          | 0/5 [00:00<?, ?batch/s]

	Train Loss: 0.6957328693811283 |  Train Acc: 49.031007751937985%
	Eval Loss: 0.6964026689529419 |  eval Acc: 43.333333333333336%


  0%|          | 0/43 [00:00<?, ?batch/s]

  0%|          | 0/5 [00:00<?, ?batch/s]

	Train Loss: 0.6929864911145942 |  Train Acc: 48.643410852713174%
	Eval Loss: 0.6917713165283204 |  eval Acc: 53.333333333333336%


  0%|          | 0/43 [00:00<?, ?batch/s]

  0%|          | 0/5 [00:00<?, ?batch/s]

	Train Loss: 0.6987102655477302 |  Train Acc: 50.3875968992248%
	Eval Loss: 0.6893224954605103 |  eval Acc: 51.66666666666667%


  0%|          | 0/43 [00:00<?, ?batch/s]

  0%|          | 0/5 [00:00<?, ?batch/s]

	Train Loss: 0.6992393759794013 |  Train Acc: 46.31782945736434%
	Eval Loss: 0.7050711154937744 |  eval Acc: 43.333333333333336%


  0%|          | 0/43 [00:00<?, ?batch/s]

KeyboardInterrupt: ignored

In [None]:
print(f'Average training time per epoch: {round(np.mean(times_per_epoch), 3)} seconds', \
      f'\nAverage inference time: {round(np.mean(times_per_inference), 3)} seconds')


In [None]:
import matplotlib.pyplot as plt

plt.xlabel("runs")
plt.ylabel("normalised measure of loss/accuracy")
x_len=list(range(len(eval_acc)))
plt.axis([0, max(x_len), 0, 1])
plt.title('result of the model')
loss=np.asarray(train_loss)/max(train_loss)
plt.plot(x_len, train_loss, 'r',label="train_loss")
plt.plot(x_len, train_acc, 'b', label="train_accuracy")
plt.plot(x_len, eval_loss, 'y', label="val_loss")
plt.plot(x_len, eval_acc, 'g', label="val_accuracy")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.2)
plt.show

In [None]:
best_num_epoch = eval_acc.index(max(eval_acc))
best_num_epoch

In [None]:
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer

loaded_model = AutoModelForSequenceClassification. \
from_pretrained(f'./model_epoch_{9}')

In [None]:
from datasets import load_metric

metric = load_metric("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
trainer_2 = CustomTrainer(
    model=loaded_model,
    eval_dataset=test_set,
    compute_metrics = compute_metrics)

In [None]:
trainer_2.evaluate(test_set)

In [None]:
test_set_df = pd.DataFrame(test_set)

test_result = trainer_2.predict(test_set)

In [None]:
from scipy.special import logit, expit

def make_predictions(sample):
  return expit(sample).argmax()

def make_confi_scores(sample):
  return expit(sample)


predictions = list(map(make_predictions, test_result[0].tolist()))
confi_scores = list(map(make_confi_scores, test_result[0].tolist()))

In [None]:
test_set_df['predictions'] = predictions

test_set_df['labels'] = test_set_df['labels'].map(lambda x: int(x))

test_set_df['conf_score'] = confi_scores

test_set_df['input_ids'] = test_set_df['input_ids'].map(lambda x: x.tolist())

wrong_cases = test_set_df[test_set_df['predictions']!=test_set_df['labels']].drop(columns = ['attention_mask'])

In [None]:
wrong_cases

In [None]:
wrong_cases.to_csv('wrong_cases.csv')