# Teacher fine-tuning

## Required packages

In [2]:
%%time
%%capture

# Install the packages this notebook needs at runtime (output is captured, only timing is shown).
# NOTE(review): versions are not pinned, so a rerun may pick up different releases.

!pip install transformers
# !pip install datasets   # disabled; the "Using huggingface datasets" cell imports `datasets`, so enable this if it is not preinstalled
# !pip install fairseq    # disabled; only needed for the commented-out fairseq import
!pip install sentencepiece

CPU times: user 57.4 ms, sys: 23.6 ms, total: 80.9 ms
Wall time: 26.2 s


## Initialization

In [3]:
# Import required packages

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.utils import shuffle

from numpy.lib.function_base import average

from tqdm.notebook import tqdm

from collections import Counter

import os
import re
import json
import copy
import collections
import time
import pickle

from transformers import BertConfig, BertTokenizer, BertweetTokenizer, RobertaTokenizer, AlbertTokenizer, DistilBertTokenizer, XLMRobertaTokenizer, XLNetTokenizer, T5Tokenizer
from transformers import BertModel

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AutoTokenizer, XLMRobertaTokenizer
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification, DistilBertForSequenceClassification, RobertaForSequenceClassification, AlbertForSequenceClassification, XLMRobertaForSequenceClassification, XLNetForSequenceClassification, T5Model
from transformers import TrainingArguments
from transformers import Trainer
# from fairseq.models.roberta import XLMRModel

In [4]:
# Register tqdm's `progress_apply` on pandas objects; the local-file loading cell calls it.
tqdm.pandas()

## Train, evaluation, and test sets

### Configurations

In [78]:
# --- Data locations and column names ----------------------------------------
TRAIN_PATH = '../input/testinput-1/train.tsv'
TEST_PATH = '../input/testinput-1/test.tsv'

CONTENT_HEADER = 'sentence'
LABEL_HEADER = 'label'
INDEX_HEADER = 'idx'

# --- Tokenization and batching ----------------------------------------------
MAX_LEN = 256            # max_length handed to the tokenizer
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32

# --- Optimisation -----------------------------------------------------------
EPOCHS = 1
EVERY_EPOCH = 500        # passed as `eval_steps` when building TrainingArguments
LEARNING_RATE = 5e-5

# --- Model selection --------------------------------------------------------
# Hub identifiers of the checkpoints that can be fine-tuned; MODEL_INDEX picks one.
MODELS = ['bert-base-uncased', 'bert-large-uncased',
          'roberta-base', 'roberta-large', 'cardiffnlp/twitter-roberta-base-sentiment',
          'xlm-roberta-large',
          'xlnet-base-cased', 'xlnet-large-cased',
          't5-base', 't5-large',
          'gpt2-medium']

# File-system friendly names, derived from MODELS so the two lists cannot drift apart
# (the hand-written copy had 'tf-large' where 't5-large' was meant).
MODEL_NAMES = [model_id.replace('/', '-') for model_id in MODELS]

MODEL_INDEX = 0

# NOTE(review): there is no separator after '/model', so these resolve to e.g.
# '/modelbert-base-uncased.bin' in the filesystem root -- confirm that is intended.
OUTPUT_PATH = '/model' + MODEL_NAMES[MODEL_INDEX] + '.bin'
MODEL_PATH = '/model' + MODEL_NAMES[MODEL_INDEX] + '.pkl'

# --- Output files -----------------------------------------------------------
EVAL_FILE = 'evaluations.csv'
MODEL_RESULTS_FILE = 'model_results.csv'
PREDICTION_FILE = 'model_predictions.tsv'


### Using huggingface datasets

<mark>Run exactly one of the two data-loading sections: this one ("Using huggingface datasets") or "Using local files" below, not both.</mark>

In [48]:
from datasets import load_dataset

# Fetch the three GLUE/CoLA splits in a fixed order: train, validation, test.
train_dataset, eval_dataset, test_dataset = (
    load_dataset('glue', 'cola', split=split_name)
    for split_name in ('train', 'validation', 'test')
)

# Convert each split to a DataFrame; for the test split keep only the sentence and index columns.
# NOTE(review): `eval` shadows the Python builtin of the same name for the rest of the notebook.
train = pd.DataFrame(train_dataset)
eval = pd.DataFrame(eval_dataset)
test = pd.DataFrame(test_dataset).filter(items=[CONTENT_HEADER, INDEX_HEADER])

### Using local files

In [None]:
# Read the tab-separated training file and coerce every label to int (with a progress bar).
train = pd.read_csv(TRAIN_PATH, sep='\t')
train[LABEL_HEADER] = train[LABEL_HEADER].progress_apply(int)

# Hold out 5000 rows for evaluation, then keep 30000 of the remaining rows for training.
# Both draws use random_state=42 so the split is repeatable.
eval = train.sample(n=5000, random_state=42)
train = train.drop(index=eval.index).sample(n=30000, random_state=42)

train.head()

In [None]:
# Preview the first rows of the evaluation frame.
eval.head()

In [None]:
# Read the tab-separated test file and preview its first rows.
test = pd.read_csv(TEST_PATH,sep='\t')
test.head()

### Split data and labels

In [49]:
# Pull sentences (x_*) and labels (y_*) out of each frame as plain Python lists.
x_train = train[CONTENT_HEADER].tolist()
y_train = train[LABEL_HEADER].tolist()

x_eval = eval[CONTENT_HEADER].tolist()
y_eval = eval[LABEL_HEADER].tolist()

x_test = test[CONTENT_HEADER].tolist()
# The test labels are placeholders (all 1s), one per test sentence -- not ground truth.
y_test = [1 for _ in x_test]

### Showing distributions

In [50]:
# Count how many training examples carry each label (single pass over y_train).
train_val_dict = dict(Counter(y_train))
train_keys = list(train_val_dict.keys())
train_values = list(train_val_dict.values())

# One-row summary table. `.get(..., 0)` reports a class that never occurs as 0
# instead of raising KeyError (e.g. on a small or filtered sample).
train_distribution = pd.DataFrame({
    'class_1': [train_val_dict.get(1, 0)],
    'class_0': [train_val_dict.get(0, 0)],
})

train_distribution

Unnamed: 0,class_1,class_0
0,6023,2528


In [51]:
# Count how many evaluation examples carry each label (single pass over y_eval).
eval_val_dict = dict(Counter(y_eval))
eval_keys = list(eval_val_dict.keys())
eval_values = list(eval_val_dict.values())

# One-row summary table. `.get(..., 0)` reports a class that never occurs as 0
# instead of raising KeyError (e.g. on a small or filtered sample).
eval_distribution = pd.DataFrame({
    'class_1': [eval_val_dict.get(1, 0)],
    'class_0': [eval_val_dict.get(0, 0)],
})

eval_distribution

Unnamed: 0,class_1,class_0
0,721,322


## Model and tokenizer

In [14]:
# Load the selected checkpoint with a 2-class sequence-classification head.
# The Auto class picks the architecture from the checkpoint, mirroring the AutoTokenizer call,
# so changing MODEL_INDEX to a non-BERT entry of MODELS no longer forces BERT weights-loading
# onto an incompatible checkpoint. For 'bert-base-uncased' the resulting model class is unchanged.
# NOTE(review): whether every entry of MODELS has a sequence-classification head depends on the
# installed transformers version -- confirm before switching MODEL_INDEX.
model = AutoModelForSequenceClassification.from_pretrained(MODELS[MODEL_INDEX], num_labels=2)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [15]:
# Load the tokenizer published with the selected checkpoint so token ids match the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(MODELS[MODEL_INDEX])

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

## Creating dataset

In [52]:
class TrainDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with optional labels.

    Despite the name, the notebook uses it for the train, evaluation and test splits.

    Args:
        encodings: mapping from feature name to a per-example sequence; each value
            is indexed by example position.
        labels: one label per example, or ``None`` (the default) for unlabeled data,
            in which case items carry no ``'labels'`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors (plus ``'labels'`` when available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples: the label count, or the length of the first feature if unlabeled."""
        if self.labels is not None:
            return len(self.labels)
        # Without labels, every feature has one entry per example, so any of them gives the size.
        first_feature = next(iter(self.encodings.values()), ())
        return len(first_feature)

## Tokenization

In [53]:
# Tokenize the training sentences, padding them and truncating to at most MAX_LEN tokens.
train_encodings = tokenizer(x_train, padding=True, truncation=True, max_length=MAX_LEN)

In [18]:
# train_encodings.keys()
# train_encodings['input_ids'][10]

In [54]:
# Tokenize the evaluation sentences with the same settings as the training split.
eval_encodings = tokenizer(x_eval, padding=True, truncation=True, max_length=MAX_LEN)

In [55]:
# Tokenize the test sentences with the same settings as the other splits.
# (Uncomment the timing lines below to measure how long tokenization takes.)
# start_time = time.time()

test_encodings = tokenizer(x_test, padding=True, truncation=True, max_length=MAX_LEN)

# end_time = time.time()
# print(end_time - start_time)

In [56]:
# Wrap the training encodings and labels for the Trainer.
# NOTE: this rebinds `train_dataset`, which the Hugging Face loading cell also assigns,
# so re-running that cell's DataFrame conversion afterwards needs a fresh load first.
train_dataset = TrainDataset(train_encodings, y_train)

In [57]:
# Wrap the evaluation encodings and labels (rebinds `eval_dataset` from the Hugging Face loading cell).
eval_dataset = TrainDataset(eval_encodings, y_eval)

In [60]:
# Wrap the test encodings; y_test holds placeholder labels (all 1s), not ground truth.
test_dataset = TrainDataset(test_encodings, y_test)

## Fine tuning

In [24]:
# Hyper-parameters for the Trainer; checkpoints are written under 'output<model name>'.
# NOTE(review): `eval_steps` is documented as applying to the "steps" evaluation strategy,
# and the recorded run evaluates once at the end of the epoch, so with
# evaluation_strategy="epoch" it probably has no effect -- confirm which was intended.
training_args = TrainingArguments(
    output_dir='output' + MODEL_NAMES[MODEL_INDEX],
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    evaluation_strategy="epoch",
    eval_steps=EVERY_EPOCH,
)

In [25]:
# Load the table of evaluation labels, or create it on first use.
# Only a missing or empty file triggers the rebuild; any other read error is surfaced
# instead of being silently swallowed (the previous bare `except:` hid everything,
# including KeyboardInterrupt).
# NOTE(review): an existing EVAL_FILE is reused as-is even if it was written for a
# different evaluation split -- delete it when the split changes.
try:
    eval_df = pd.read_csv(EVAL_FILE)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Seed the table with the ground-truth evaluation labels.
    eval_df = pd.DataFrame(y_eval, columns=['real_val'])
    eval_df.to_csv(EVAL_FILE, index=False)

In [63]:
def compute_metrics(p):
    """Score one evaluation pass; passed to the Trainer as ``compute_metrics``.

    Args:
        p: a pair ``(scores, labels)``. The scores are reduced to class ids by
            taking the argmax over axis 1.

    Returns:
        dict with ``accuracy``, ``f1_score`` (weighted average) and
        ``precision`` (micro average).
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=1)

    # NOTE(review): in the recorded run the micro-averaged precision is identical to the
    # accuracy, so it adds no information -- confirm whether another average was intended.
    return {
        "accuracy": accuracy_score(y_true=gold, y_pred=predicted),
        "f1_score": f1_score(gold, predicted, average='weighted'),
        "precision": precision_score(y_true=gold, y_pred=predicted, average='micro'),
    }

In [64]:
# Bundle the model, hyper-parameters, metric callback and the train/eval datasets into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [65]:
# Fine-tune the model; the returned object's `.metrics['train_loss']` is read when saving results.
training_metrics = trainer.train()

***** Running training *****
  Num examples = 8551
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 535
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score,Precision
1,0.2576,0.507047,0.830297,0.822118,0.830297


Saving model checkpoint to outputbert-base-uncased/checkpoint-500
Configuration saved in outputbert-base-uncased/checkpoint-500/config.json
Model weights saved in outputbert-base-uncased/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




In [66]:
# Score the fine-tuned model on the evaluation dataset; yields the 'eval_*' entries used below.
evaluate_metrics = trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1043
  Batch size = 32


In [67]:
# Show the training summary (step count, loss and runtime statistics).
training_metrics

TrainOutput(global_step=535, training_loss=0.26962775292797625, metrics={'train_runtime': 62.5128, 'train_samples_per_second': 136.788, 'train_steps_per_second': 8.558, 'total_flos': 206530359015660.0, 'train_loss': 0.26962775292797625, 'epoch': 1.0})

## Evaluation

In [68]:
# Show the evaluation metrics computed after training.
evaluate_metrics

{'eval_loss': 0.507047176361084,
 'eval_accuracy': 0.8302972195589645,
 'eval_f1_score': 0.8221181782742489,
 'eval_precision': 0.8302972195589645,
 'eval_runtime': 1.1827,
 'eval_samples_per_second': 881.868,
 'eval_steps_per_second': 27.902,
 'epoch': 1.0}

### Saving results in model_results.csv

In [None]:
# Pull the headline numbers out of the training and evaluation results.
train_loss = training_metrics.metrics['train_loss']
eval_accuracy = evaluate_metrics['eval_accuracy']
eval_f1 = evaluate_metrics['eval_f1_score']
eval_loss = evaluate_metrics['eval_loss']

# One-row summary of this run: losses as-is, accuracy and F1 as percentages,
# all rendered as strings with two decimals, plus the key hyper-parameters.
result_metrics = pd.DataFrame({
    'type': [MODEL_NAMES[MODEL_INDEX]],
    'train_loss': [f"{train_loss:.2f}"],
    'eval_loss': [f"{eval_loss:.2f}"],
    'eval_accuracy': [f"{eval_accuracy * 100:.2f}"],
    'eval_f1': [f"{eval_f1 * 100:.2f}"],
    'embedding_len': [MAX_LEN],
    'learning_rate': [LEARNING_RATE],
    'batch_size': [TRAIN_BATCH_SIZE],
})
result_metrics

In [None]:
# Append this run's summary row to the cumulative results file, creating it on first use.
# `DataFrame.append` was removed in pandas 2.0, so the rows are combined with `pd.concat`;
# the previous bare `except:` would have hidden that failure and then hit it again.
try:
    previous_results = pd.read_csv(MODEL_RESULTS_FILE)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No earlier results on disk: this run becomes the first row.
    model_df = result_metrics.reset_index(drop=True)
else:
    model_df = pd.concat([previous_results, result_metrics], ignore_index=True)

model_df.to_csv(MODEL_RESULTS_FILE, index=False)

In [None]:
# Re-read the cumulative results file from disk and show up to the first 50 rows.
model_df = pd.read_csv(MODEL_RESULTS_FILE)
model_df.head(50)

## Prediction

In [69]:
# Sanity check: `test_dataset` should be the TrainDataset wrapper built above, not the raw Hugging Face split.
test_dataset

<__main__.TrainDataset at 0x7f27bcc4c350>

In [70]:
# Run inference on the test split. Only element [0] of the result (the raw predictions) is used below.
# NOTE(review): the test labels are placeholders, so any label-based metrics in this output are presumably meaningless.
predict_metrics = trainer.predict(test_dataset=test_dataset)

***** Running Prediction *****
  Num examples = 1063
  Batch size = 32


In [71]:
# Turn the raw prediction scores into class ids by taking the argmax over axis 1.
predictions = predict_metrics.predictions
results = np.argmax(predictions, axis=1)

# Tally how often each class was predicted and print the classes, then their counts.
predicted_counts = Counter(results)
keys = predicted_counts.keys()
values = predicted_counts.values()
print(keys, values, sep='\n')

dict_keys([1, 0])
dict_values([861, 202])


In [76]:
# Show the predicted class ids for the test split.
results

array([1, 1, 1, ..., 0, 1, 1])

In [81]:
# Attach the predicted class ids to the test frame, then keep only the index and label columns.
# The column names come from the config constants rather than hard-coded 'idx'/'label',
# so changing INDEX_HEADER or LABEL_HEADER cannot silently drop a column here.
test[LABEL_HEADER] = results
prediction_df = test.filter(items=[INDEX_HEADER, LABEL_HEADER])

In [82]:
# PREDICTION_FILE ends in '.tsv', so write tab-separated values; the default separator
# produced a comma-separated file under a .tsv name.
prediction_df.to_csv(PREDICTION_FILE, sep='\t', index=False)