# Teacher fine-tuning

## Required Packages

In [1]:
%%time
%%capture

# Install required packages

!pip install transformers
!pip install -U sentence-transformers
# !pip install datasets
# !pip install fairseq
!pip install sentencepiece

CPU times: user 78 ms, sys: 26.4 ms, total: 104 ms
Wall time: 33.7 s


## Initialization

In [2]:
# Import required packages

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.utils import shuffle

from numpy.lib.function_base import average

from tqdm.notebook import tqdm

from collections import Counter

import os
import re
import json
import copy
import collections
import time
import pickle

from transformers import BertConfig, BertTokenizer, BertweetTokenizer, RobertaTokenizer, AlbertTokenizer, DistilBertTokenizer, XLMRobertaTokenizer, XLNetTokenizer, T5Tokenizer
from transformers import BertModel, AutoModel

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AutoTokenizer, XLMRobertaTokenizer
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification, DistilBertForSequenceClassification, RobertaForSequenceClassification, AlbertForSequenceClassification, XLMRobertaForSequenceClassification, XLNetForSequenceClassification, T5Model
from transformers import TrainingArguments
from transformers import Trainer
from sentence_transformers import SentenceTransformer
# from fairseq.models.roberta import XLMRModel

from torch.nn import MSELoss

import datasets
from datasets import load_dataset

from scipy.stats import pearsonr
from scipy.stats import spearmanr

In [3]:
# Register tqdm's `progress_apply` on pandas objects; the "Using local files"
# cell relies on it when converting the label column.
tqdm.pandas()

## Train, evaluation, and test sets

### Configurations

In [4]:
# Local dataset locations (only read by the "Using local files" cells).
TRAIN_PATH = '../input/testinput-1/train.tsv'
TEST_PATH = '../input/testinput-1/test.tsv'

# Column names: single-sentence layout (local files) and sentence-pair layout (STS-B).
CONTENT_HEADER = 'sentence'
CONTENT_HEADERS = ['sentence1', 'sentence2']
LABEL_HEADER = 'label'
INDEX_HEADER = 'idx'

# general config
MAX_LEN = 256
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32

EPOCHS = 1
EVERY_EPOCH = 500
LEARNING_RATE = 5e-5

# Hugging Face hub identifiers of the candidate teacher checkpoints.
MODELS = ['bert-base-uncased', 'bert-large-uncased',
          'roberta-base', 'roberta-large', "cardiffnlp/twitter-roberta-base-sentiment",
          'xlm-roberta-large',
          'xlnet-base-cased', 'xlnet-large-cased',
          't5-base', 't5-large',
          'gpt2-medium',
          'sentence-transformers/stsb-roberta-base']
# File-system friendly names, index-aligned with MODELS. Hub ids containing
# '/' are slugged with '-' so they can be embedded in a file name.
MODEL_NAMES = ['bert-base-uncased', 'bert-large-uncased',
          'roberta-base', 'roberta-large', "cardiffnlp-twitter-roberta-base-sentiment",
          'xlm-roberta-large',
          'xlnet-base-cased', 'xlnet-large-cased',
          't5-base', 't5-large',
          'gpt2-medium',
          'sentence-transformers-stsb-roberta-base']

# Index into MODELS / MODEL_NAMES (-1 selects the last entry).
MODEL_INDEX = -1
# NOTE(review): there is no separator between '/model' and the model name, so
# these resolve to e.g. '/model<name>.bin' at the file-system root -- confirm
# whether '/model/<name>.bin' (or a relative path) was intended.
OUTPUT_PATH = '/model' + MODEL_NAMES[MODEL_INDEX] + '.bin'
MODEL_PATH =  '/model' + MODEL_NAMES[MODEL_INDEX] + '.pkl'


# Output artefacts.
EVAL_FILE = 'evaluations.csv'
MODEL_RESULTS_FILE = 'model_results.csv'
PREDICTION_FILE = 'model_predictions.tsv'


### Using huggingface datasets

<mark>Run exactly one of the two data-loading sections: "Using huggingface datasets" (this section) or "Using local files" (below).</mark>

In [5]:
# Fetch the three GLUE STS-B splits, in train / validation / test order.
train_dataset, eval_dataset, test_dataset = (
    load_dataset('glue', 'stsb', split=split_name)
    for split_name in ('train', 'validation', 'test')
)

# Convert each split to a DataFrame and rename the target column to `labels`.
# NOTE(review): `eval` shadows the Python builtin; the name is kept because
# later cells read it.
label_rename = {'label': 'labels'}

train = pd.DataFrame(train_dataset).rename(columns=label_rename)
eval = pd.DataFrame(eval_dataset).rename(columns=label_rename)

# The test frame keeps only the two sentence columns and the index column, so
# the rename below finds no `label` column to rename.
test = (
    pd.DataFrame(test_dataset)
    .filter([CONTENT_HEADERS[0], CONTENT_HEADERS[1], INDEX_HEADER])
    .rename(columns=label_rename)
)

Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/stsb (download: 784.05 KiB, generated: 1.09 MiB, post-processed: Unknown size, total: 1.86 MiB) to /root/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/803k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


In [6]:
# Display the training frame (pandas truncates the middle rows in the rendered output).
train

Unnamed: 0,sentence1,sentence2,labels,idx
0,A plane is taking off.,An air plane is taking off.,5.00,0
1,A man is playing a large flute.,A man is playing a flute.,3.80,1
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.80,2
3,Three men are playing chess.,Two men are playing chess.,2.60,3
4,A man is playing the cello.,A man seated is playing the cello.,4.25,4
...,...,...,...,...
5744,Severe Gales As Storm Clodagh Hits Britain,Merkel pledges NATO solidarity with Latvia,0.00,5744
5745,Dozens of Egyptians hostages taken by Libyan t...,Egyptian boat crash death toll rises as more b...,0.00,5745
5746,President heading to Bahrain,President Xi: China to continue help to fight ...,0.00,5746
5747,"China, India vow to further bilateral ties",China Scrambles to Reassure Jittery Stock Traders,0.00,5747


### Using local files

In [8]:
# Read the local training TSV and cast every label to a Python int
# (with a tqdm progress bar).
train = pd.read_csv(TRAIN_PATH, sep='\t')
train[LABEL_HEADER] = train[LABEL_HEADER].progress_apply(int)

# Hold out a fixed 5000-row sample for evaluation, remove those rows from the
# training frame, then subsample 30000 of the remaining rows for training.
# Both draws use random_state=42.
eval = train.sample(n=5000, random_state=42)
train = train.drop(index=eval.index).sample(n=30000, random_state=42)

train.head()

In [10]:
# Peek at the first rows of the held-out evaluation sample.
eval.head()

Unnamed: 0,sentence,label
66730,with outtakes in which most of the characters ...,0
29890,enigma is well-made,1
45801,is ) so stoked to make an important film about...,0
29352,the closest thing to the experience of space t...,1
19858,lose their luster,0


In [None]:
# Read the local, tab-separated test file and show its first rows.
test = pd.read_csv(
    TEST_PATH,
    sep='\t',
)
test.head()

## Fetching Model

In [9]:
from transformers import (
    DistilBertConfig,
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
)

# Checkpoint that is actually fine-tuned in this notebook.
model_path = 'distilbert-base-uncased'

# Tokenizer matching the checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)

# num_labels=1 gives the classification head a single output. In the recorded
# run the validation loss equals the MSE metric, i.e. the head is trained as a
# regressor on the similarity score.
config = DistilBertConfig.from_pretrained(model_path, num_labels=1)
model = DistilBertForSequenceClassification.from_pretrained(model_path, config=config)

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

In [11]:
# MODELS[MODEL_INDEX]

'sentence-transformers/stsb-roberta-base'

In [12]:
# model = BertForSequenceClassification.from_pretrained(MODELS[MODEL_INDEX], num_labels=2)
# model = AutoModel.from_pretrained(MODELS[MODEL_INDEX])

Downloading:   0%|          | 0.00/672 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

In [13]:
# tokenizer = AutoTokenizer.from_pretrained(MODELS[MODEL_INDEX])

Downloading:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [10]:
def _encode_pairs(batch):
    """Tokenize one batch of sentence pairs.

    `batch` is the column-oriented mapping that `Dataset.map(batched=True)`
    passes in. The two sentence columns are encoded together as a pair, padded
    (`padding=True`) and truncated (`truncation=True`).
    """
    first_sentences = batch[CONTENT_HEADERS[0]]
    second_sentences = batch[CONTENT_HEADERS[1]]
    return tokenizer(first_sentences, second_sentences, padding=True, truncation=True)


# Wrap the pandas frames as Hugging Face datasets.
stsb_train = datasets.Dataset.from_dict(train)
stsb_eval = datasets.Dataset.from_dict(eval)
stsb_test = datasets.Dataset.from_dict(test)

# Tokenize every split in batches of 1000 rows.
enc_train = stsb_train.map(_encode_pairs, batched=True, batch_size=1000)
enc_val = stsb_eval.map(_encode_pairs, batched=True, batch_size=1000)
enc_test = stsb_test.map(_encode_pairs, batched=True, batch_size=1000)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [11]:
# Inspect the tokenized training set: the tokenizer added the `input_ids` and
# `attention_mask` columns next to the original ones.
pd.DataFrame(enc_train)

Unnamed: 0,sentence1,sentence2,labels,idx,input_ids,attention_mask
0,A plane is taking off.,An air plane is taking off.,5.00,0,"[101, 1037, 4946, 2003, 2635, 2125, 1012, 102,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,A man is playing a large flute.,A man is playing a flute.,3.80,1,"[101, 1037, 2158, 2003, 2652, 1037, 2312, 8928...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.80,2,"[101, 1037, 2158, 2003, 9359, 14021, 5596, 209...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,Three men are playing chess.,Two men are playing chess.,2.60,3,"[101, 2093, 2273, 2024, 2652, 7433, 1012, 102,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,A man is playing the cello.,A man seated is playing the cello.,4.25,4,"[101, 1037, 2158, 2003, 2652, 1996, 10145, 101...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...,...
5744,Severe Gales As Storm Clodagh Hits Britain,Merkel pledges NATO solidarity with Latvia,0.00,5744,"[101, 5729, 14554, 2015, 2004, 4040, 18856, 13...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
5745,Dozens of Egyptians hostages taken by Libyan t...,Egyptian boat crash death toll rises as more b...,0.00,5745,"[101, 9877, 1997, 23437, 19323, 2579, 2011, 19...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
5746,President heading to Bahrain,President Xi: China to continue help to fight ...,0.00,5746,"[101, 2343, 5825, 2000, 15195, 102, 2343, 8418...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
5747,"China, India vow to further bilateral ties",China Scrambles to Reassure Jittery Stock Traders,0.00,5747,"[101, 2859, 1010, 2634, 19076, 2000, 2582, 177...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [12]:
# (rows, columns) of each split before tokenization; the test split has one
# column fewer because it carries no label column.
stsb_train.shape, stsb_eval.shape, stsb_test.shape

((5749, 4), (1500, 4), (1379, 3))

## Fine tuning

In [13]:
# Training configuration for the Hugging Face Trainer.
# NOTE(review): EPOCHS, TRAIN_BATCH_SIZE, VALID_BATCH_SIZE and LEARNING_RATE
# from the configuration cell are not used here; the values below are the ones
# that take effect.
# NOTE(review): no `report_to` is given; in the recorded run the Trainer
# enabled Weights & Biases logging and prompted for an API key. Pass
# report_to="none" if that prompt is unwanted.
training_args = TrainingArguments(
    output_dir='./stsb-model',       # checkpoints are written here
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=100,
    weight_decay=0.01,
    logging_strategy='epoch',
    logging_dir='./logs',
    # NOTE(review): per the TrainingArguments docs, logging_steps applies to
    # logging_strategy="steps"; with 'epoch' it appears to have no effect.
    logging_steps=50,
    evaluation_strategy="epoch",     # evaluate and save once per epoch so the
    save_strategy="epoch",           # best checkpoint can be restored below
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so the cell does not fail on CPU-only machines.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True
)

In [14]:
def compute_metrics(pred):
    """Compute regression metrics for one evaluation pass.

    Args:
        pred: object exposing `predictions` and `label_ids` array-likes
            holding one value per example (any singleton dimensions are
            flattened away).

    Returns:
        dict mapping "MSE", "RMSE", "MAE", "Pearson" and "Spearman's Rank"
        to Python floats.
    """
    # Flatten BOTH arrays to 1-D. Squeezing only the predictions would let
    # column-vector labels of shape (N, 1) broadcast against (N,) predictions
    # into an (N, N) error matrix and silently corrupt every metric.
    preds = np.asarray(pred.predictions).reshape(-1)
    labels = np.asarray(pred.label_ids).reshape(-1)

    # Compute the residuals once and reuse them for all error metrics.
    errors = preds - labels
    mse = (errors ** 2).mean()

    return {
        "MSE": mse.item(),
        "RMSE": np.sqrt(mse).item(),
        "MAE": np.abs(errors).mean().item(),
        # Element 0 of each scipy result is the correlation coefficient.
        "Pearson": float(pearsonr(preds, labels)[0]),
        "Spearman's Rank": float(spearmanr(preds, labels)[0]),
    }

In [15]:
# Assemble the Trainer: the model to fine-tune, its training configuration,
# the tokenized train/validation splits, the metric callback and the tokenizer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=enc_train,
    eval_dataset=enc_val,
    compute_metrics=compute_metrics,
)

Using cuda_amp half precision backend


In [16]:
# Run fine-tuning. The returned TrainOutput (global step, training loss and
# runtime metrics) is displayed in a later cell.
training_metrics = trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence2, sentence1, idx. If sentence2, sentence1, idx are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5749
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 540
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,Pearson,Spearman's rank
1,2.5266,0.600605,0.600605,0.774987,0.602643,0.862816,0.858723
2,0.5283,0.529369,0.529369,0.727578,0.558907,0.875842,0.872464
3,0.295,0.537789,0.537789,0.733341,0.555885,0.874228,0.870658


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence2, sentence1, idx. If sentence2, sentence1, idx are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1500
  Batch size = 64
Saving model checkpoint to ./stsb-model/checkpoint-180
Configuration saved in ./stsb-model/checkpoint-180/config.json
Model weights saved in ./stsb-model/checkpoint-180/pytorch_model.bin
tokenizer config file saved in ./stsb-model/checkpoint-180/tokenizer_config.json
Special tokens file saved in ./stsb-model/checkpoint-180/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence2, sentence1, idx. If sentence2, sentence1, idx are not expected by `DistilBertForSequenceClassification.for

In [17]:
# Evaluate on the Trainer's eval_dataset (`enc_val`). In the recorded run the
# reported loss matches the epoch-2 row of the training log, consistent with
# load_best_model_at_end=True.
evaluate_metrics = trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence2, sentence1, idx. If sentence2, sentence1, idx are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1500
  Batch size = 64


In [18]:
# Show the TrainOutput returned by trainer.train().
training_metrics

TrainOutput(global_step=540, training_loss=1.1166456787674515, metrics={'train_runtime': 138.3185, 'train_samples_per_second': 124.69, 'train_steps_per_second': 3.904, 'total_flos': 556850672458002.0, 'train_loss': 1.1166456787674515, 'epoch': 3.0})

## Evaluation

In [19]:
# Show the validation metrics dict returned by trainer.evaluate().
evaluate_metrics

{'eval_loss': 0.5293694734573364,
 'eval_MSE': 0.5293694734573364,
 'eval_RMSE': 0.727577805519104,
 'eval_MAE': 0.5589066743850708,
 'eval_Pearson': 0.875841960904283,
 "eval_Spearman's Rank": 0.8724640918727458,
 'eval_runtime': 2.6436,
 'eval_samples_per_second': 567.412,
 'eval_steps_per_second': 9.079,
 'epoch': 3.0}

## Prediction

In [20]:
# Evaluate the fine-tuned model on every split and tabulate the first five
# metric columns. The test split carries no label column, so its row has no
# metric values.
splits = {"train": enc_train, "val": enc_val, "test": enc_test}
q = [trainer.evaluate(eval_dataset=split_data) for split_data in splits.values()]
pd.DataFrame(q, index=list(splits)).iloc[:, :5]

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence2, sentence1, idx. If sentence2, sentence1, idx are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5749
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence2, sentence1, idx. If sentence2, sentence1, idx are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1500
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence2, sentence1, idx. If sentence2, sentence1, idx are not expected by `Disti

Unnamed: 0,eval_loss,eval_MSE,eval_RMSE,eval_MAE,eval_Pearson
train,0.251669,0.251669,0.501666,0.385974,0.942445
val,0.529369,0.529369,0.727578,0.558907,0.875842
test,,,,,


In [21]:
# Run inference on the tokenized test split. The result is indexable; the next
# cell reads element 0 as the per-example predictions.
predict_metrics = trainer.predict(test_dataset=enc_test)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence2, sentence1, idx. If sentence2, sentence1, idx are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1379
  Batch size = 64


In [29]:
# `predict_metrics` first holds the result of `trainer.predict`, whose element
# 0 is the prediction array with one single-value row per test example.
# Rebind the name to a flat list of those values for the DataFrame column
# built in the next cell.
# The guard makes the cell idempotent: once the name refers to the flattened
# list, re-running the cell leaves it untouched instead of raising a TypeError.
# (Assumes `trainer.predict` returns a tuple subclass, i.e. the documented
# PredictionOutput NamedTuple -- TODO confirm for the installed version.)
if isinstance(predict_metrics, tuple):
    predict_metrics = [row[0] for row in predict_metrics[0]]

In [33]:
# Attach the predictions to the test frame under the configured label column,
# then keep only the index and label columns for the submission frame.
# The column names come from the config constants (INDEX_HEADER, LABEL_HEADER)
# so the selection stays in sync with the assignment above if they change.
test[LABEL_HEADER] = predict_metrics
prediction_df = test.filter([INDEX_HEADER, LABEL_HEADER])
prediction_df

Unnamed: 0,idx,label
0,0,3.001953
1,1,3.996094
2,2,4.351562
3,3,4.417969
4,4,1.543945
...,...,...
1374,1374,0.792480
1375,1375,1.231445
1376,1376,1.829102
1377,1377,0.468018


In [34]:
# PREDICTION_FILE has a .tsv extension, so write tab-separated values;
# `to_csv` defaults to commas, which would not match the file name.
prediction_df.to_csv(PREDICTION_FILE, sep='\t', index=False)