## Objective:
    1. Fine-tune a BERT model using the glance training data

## Loading Imports

In [None]:
!pip3 install -U pip setuptools 
!pip3 install -U sentence-transformers
!pip3 install -U torch torchvision

In [2]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import *
import logging
from datetime import datetime
import csv
from typing import Union, List
import gzip
import os

In [3]:


class InputExample:
    """
    Container for a single example: an identifier, its texts and a label.
    """
    def __init__(self, guid: str, texts: List[str], label: Union[int, float]):
        """
        Builds one InputExample from the given guid, texts and label.
        Leading and trailing whitespace is removed from every text with str.strip().
        :param guid
            identifier of the example
        :param texts
            the texts belonging to the example
        :param label
            the label of the example
        """
        # Clean up all texts first so the attributes are only set once every text was processed.
        stripped_texts = []
        for raw_text in texts:
            stripped_texts.append(raw_text.strip())

        self.guid = guid
        self.texts = stripped_texts
        self.label = label

In [4]:
## Class to read training data for finetuning

class DataReader(object):
    """
    Reads a sentence-pair classification dataset that is stored as three
    parallel gzip-compressed text files inside one folder.
    """
    def __init__(self, dataset_folder):
        """
        :param dataset_folder
            folder containing the 's1.<filename>', 's2.<filename>' and 'labels.<filename>' files
        """
        self.dataset_folder = dataset_folder

    def get_examples(self, filename, max_examples=0):
        """
        Loads three gzip files line by line: 's1.<filename>' with sentence 1,
        's2.<filename>' with sentence 2 and 'labels.<filename>' with the label
        connecting sentence 1 and sentence 2.

        :param filename
            common suffix of the three files
        :param max_examples
            stop after this many examples; 0 (the default) reads everything
        :return
            list of InputExample with guid '<filename>-<line index>'

        The files are zipped together, so reading stops at the end of the
        shortest one. A label that is not in get_labels() raises KeyError.
        """
        def open_part(prefix):
            return gzip.open(os.path.join(self.dataset_folder, prefix + filename),
                             mode="rt", encoding="utf-8")

        examples = []
        # Context managers guarantee that all three handles are closed, also on errors.
        with open_part('s1.') as s1, open_part('s2.') as s2, open_part('labels.') as labels:
            # Iterate the files lazily so that max_examples avoids decompressing the rest.
            for index, (sentence_a, sentence_b, label) in enumerate(zip(s1, s2, labels)):
                guid = "%s-%d" % (filename, index)
                examples.append(InputExample(guid=guid, texts=[sentence_a, sentence_b],
                                             label=self.map_label(label)))

                if 0 < max_examples <= len(examples):
                    break

        return examples

    @staticmethod
    def get_labels():
        """Returns the mapping from label name to integer class id."""
        return {"contradiction": 0, "entailment": 1}

    def get_num_labels(self):
        """Returns the number of distinct labels."""
        return len(self.get_labels())

    def map_label(self, label):
        """
        Maps a raw label string to its class id, ignoring surrounding
        whitespace and letter case. Raises KeyError for unknown labels.
        """
        return self.get_labels()[label.strip().lower()]

In [5]:
## Class to read Semantic Textual Similarity data for evaluation

class STSDataReader:
    """
    Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx)
    """
    def __init__(self, dataset_folder, s1_col_idx=5, s2_col_idx=6, score_col_idx=4, delimiter="\t",
                 quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5):
        """
        :param dataset_folder
            folder that contains the data split files
        :param s1_col_idx, s2_col_idx, score_col_idx
            column indices of the two sentences and of the score
        :param delimiter, quoting
            passed on to csv.reader
        :param normalize_scores
            if True, scores are rescaled from [min_score, max_score] to 0...1
        """
        self.dataset_folder = dataset_folder
        self.score_col_idx = score_col_idx
        self.s1_col_idx = s1_col_idx
        self.s2_col_idx = s2_col_idx
        self.delimiter = delimiter
        self.quoting = quoting
        self.normalize_scores = normalize_scores
        self.min_score = min_score
        self.max_score = max_score

    def get_examples(self, filename, max_examples=0):
        """
        filename specified which data split to use (train.csv, dev.csv, test.csv).

        :param max_examples
            stop after this many rows; 0 (the default) reads everything
        :return
            list of InputExample with guid '<filename><row index>' and the
            (optionally normalized) score as label
        """
        examples = []
        path = os.path.join(self.dataset_folder, filename)
        # newline="" is what the csv module expects; the with-block closes the file in all cases.
        with open(path, encoding="utf-8", newline="") as csv_file:
            data = csv.reader(csv_file, delimiter=self.delimiter, quoting=self.quoting)
            for row_idx, row in enumerate(data):
                score = float(row[self.score_col_idx])
                if self.normalize_scores:  # Normalize to a 0...1 value
                    score = (score - self.min_score) / (self.max_score - self.min_score)

                s1 = row[self.s1_col_idx]
                s2 = row[self.s2_col_idx]
                examples.append(InputExample(guid=filename+str(row_idx), texts=[s1, s2], label=score))

                if max_examples > 0 and len(examples) >= max_examples:
                    break

        return examples

## Finetuning function

In [6]:
def _build_sentence_model(base_model):
    """Creates the SentenceTransformer that is going to be fine-tuned for base_model."""
    if base_model == 'bert-base-uncased':
        print("Finetuning base uncased model")

        # Use BERT for mapping tokens to embeddings
        # NOTE(review): models.BERT belongs to the early sentence-transformers API - confirm the installed version still ships it
        word_embedding_model = models.BERT(base_model)

        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                       pooling_mode_mean_tokens=True,
                                       pooling_mode_cls_token=False,
                                       pooling_mode_max_tokens=False)

        return SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Load a pre-trained sentence transformer model
    model = SentenceTransformer(base_model)
    print("Finetuning base model")
    return model


def finetune_base_bert_model(base_model='bert-base-uncased',batch_size=20,num_epochs=4,evaluation_steps=1000,train_data_path_folder='Bert_Finetune_data/Training/' ,
                             train_data_path_file='train_1.gz',eval_data_path_folder='Bert_Finetune_data/Eval_sts/',eval_data_path_file='sts-dev.csv',
                            model_save_path='Bert_Finetune_data/Bert_fine_tune_results_nli_mean_base/Results/'):
    """
    Fine-tunes a sentence embedding model with a softmax classification loss
    on the sentence-pair training data and evaluates it on STS data.

    :param base_model
        'bert-base-uncased' builds BERT + mean pooling from scratch; any other
        value is passed directly to SentenceTransformer
    :param batch_size
        batch size of both the training and the evaluation data loader
    :param num_epochs
        number of training epochs
    :param evaluation_steps
        passed to model.fit as evaluation_steps
    :param train_data_path_folder, train_data_path_file
        location of the training data read by DataReader
    :param eval_data_path_folder, eval_data_path_file
        location of the evaluation data read by STSDataReader
    :param model_save_path
        passed to model.fit as output_path
    :return
        the in-memory model after training
    """
    model = _build_sentence_model(base_model)

    print("Reading Training data")

    nli_reader = DataReader(train_data_path_folder)

    # Ask the reader for the label count so it can not drift from the label mapping
    train_num_labels = nli_reader.get_num_labels()

    train_data = SentencesDataset(nli_reader.get_examples(train_data_path_file), model=model)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)

    print("Reading Eval data")

    sts_reader = STSDataReader(eval_data_path_folder)

    dev_data = SentencesDataset(examples=sts_reader.get_examples(eval_data_path_file), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    print("Starting Training..")

    # Warm up over the first 10% of all training steps (batches per epoch * epochs)
    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=evaluation_steps,
              warmup_steps=warmup_steps,
              output_path=model_save_path
              )

    print("Finished finetuning of base model")

    return model