In [None]:
# Install the notebook's runtime dependencies quietly; BLEURT is installed
# straight from its GitHub repository.
! pip install --quiet  transformers
! pip install --quiet  sentencepiece
! pip install --quiet  datasets
! pip install --quiet  evaluate
! pip install --quiet git+https://github.com/google-research/bleurt.git

In [None]:
# Print the installed transformers version for reference.
import transformers
print(f'transformers.__version__: {transformers.__version__}')

In [None]:
import torch 
import gc 
import pandas as pd 
import numpy as np
from transformers import AdamW,T5ForConditionalGeneration, T5TokenizerFast as T5Tokenizer , Adafactor
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim import Adam
from transformers.optimization import Adafactor, AdafactorSchedule
from torch.cuda import amp 
import numpy as np
import torch 
import torch.nn as nn 
import sys
from tqdm.notebook import tqdm 
import os 
import warnings
from sklearn.metrics import mean_squared_error
from transformers import get_cosine_schedule_with_warmup , get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split 
# Set the tokenizers parallelism switch to "false" through its environment variable.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Seed the NumPy and PyTorch random number generators with a fixed value.
np.random.seed(42)
torch.manual_seed(42)

In [None]:
class CFG:
    """Shared hyperparameters and the tokenizer used throughout the notebook."""
    # Learning rate handed to the optimizer in run().
    train_lr = 1e-4
    # max_length used when tokenizing the encoder input (answer + context).
    encoder_length = 512
    # max_length used when tokenizing questions; a later cell overwrites it
    # with the longest tokenized training question.
    decoder_length = 120
    # Checkpoint name used for both the tokenizer and the model.
    model='t5-base'
    # Batch sizes of the training and validation DataLoaders.
    train_batch_size = 8
    cv_batch_size = 16
    # Maximum number of training epochs.
    epochs = 5
    # Loaded once, when the class body executes.
    tokenizer= T5Tokenizer.from_pretrained(model)
    # Maximum gradient norm passed to clip_grad_norm_ in train_fn().
    clip=1.0

In [None]:
# Load the source passages and the training rows (question / answer / section
# reference) from Google Drive.
source_df = pd.read_csv('/content/drive/MyDrive/ts_competition/source_texts.csv')
train_df = pd.read_csv('/content/drive/MyDrive/ts_competition/train.csv')

In [None]:
# Length, in characters, of the longest source text.
source_df.text.map(lambda x: len(x)).max()

In [None]:
# Inspect the source rows belonging to a single title.
source_df[source_df['source_title']=='a-fish-story'] # yuletide-specters

In [None]:
# Inspect the answers of the training rows for the 'yuletide-specters' title.
train_df[train_df['source_title']=='yuletide-specters']['answer']

In [None]:
# Inspect one specific section (title + section number) of the source texts.
source_df[(source_df['source_title']=="remarkable-rocket") &  (source_df['cor_section']==5)]

In [None]:
# Build, for every training row, the context text made of the source section(s)
# the row refers to. Rows whose lookup raises are reported and skipped, so
# text_list can end up shorter than train_df.
text_list = []
for idx , data in enumerate(train_df[['source_title','cor_section']].values):
    # data[0] is the source title, data[1] the section reference
    # (assumed to be a string such as "3" or "3,4" — TODO confirm).
    try:
        if len(data[1]) >= 2:
            # Two or more characters: possibly several comma-separated sections,
            # whose texts are concatenated in the order given.
            cor_section_concat=''

            for x in data[1].split(','):
                # .item() raises unless exactly one source row matches.
                cor_section_concat=cor_section_concat+source_df[(source_df['source_title']==data[0]) & (source_df['cor_section']==int(x))].text.item()
            text_list.append(cor_section_concat)
        else:
            # Single-character reference: one section lookup.
            text_list.append(source_df[(source_df['source_title']==data[0]) & (source_df['cor_section']==int(data[1]))].text.item())
    except Exception as e:
        # Best effort: print the error, the row position and the row, then move on
        # without appending anything for this row.
        print(e)
        print(idx)
        print(data)
       

                       

In [None]:
# Look at the question stored at row position 2999.
train_df.iloc[2999]['question']

In [None]:
# Manually insert the text of source row 1150 at position 2999.
# NOTE(review): presumably row 2999 is the one the lookup loop reported and
# skipped — verify. Re-running this cell inserts a second time.
text_list.insert(2999,source_df.iloc[1150]['text'])

In [None]:
# Store the contexts as the 'text' column; text_list must have one entry per row.
train_df.loc[:,'text']=text_list

In [None]:
# Marker placed before and after the answer in the encoder input.
SEP_TOKEN='*'

In [None]:
# Size the decoder to the longest tokenized training question.
question_token_counts = (
    len(CFG.tokenizer.encode(question_text))
    for question_text in train_df['question'].values
)
# default=0 mirrors the initial value used when there are no questions at all.
max_length = max(question_token_counts, default=0)

CFG.decoder_length = max_length
print(f'The Max length of the decoder : {CFG.decoder_length}')

In [None]:
class QADataset():
    """Map-style dataset producing tokenized (answer + context) -> question examples.

    Each item is a dict of 1-D tensors:
      * 'input_ids' / 'attention_mask': the encoder input, built as the sentence
        pair (SEP_TOKEN + answer + SEP_TOKEN, context), padded to
        CFG.encoder_length; only the context is truncated.
      * 'labels': the tokenized question padded to CFG.decoder_length, with
        padding positions replaced by -100.

    Args:
        text: sequence of context strings.
        question: sequence of target questions, aligned with `text`.
        answer: sequence of answers, aligned with `text`.

    Raises:
        ValueError: if the three sequences do not have the same length.
    """
    def __init__(self,text, question , answer):
        if not (len(text) == len(question) == len(answer)):
            raise ValueError(
                'text, question and answer must have the same length, got '
                f'{len(text)}, {len(question)} and {len(answer)}'
            )
        self.text = text
        self.question = question
        self.answer=answer

    def __len__(self):
        # One example per context. The previous `len(text + question + answer)`
        # concatenated every string on each call and returned 3N for lists.
        return len(self.text)

    def __getitem__(self,item):
        text = self.text[item]
        question = self.question[item]
        answer = self.answer[item]

        # Encoder tokens: the answer (wrapped in SEP_TOKEN) followed by its context.
        encoder_tokens = CFG.tokenizer(SEP_TOKEN+answer+SEP_TOKEN,text,max_length = CFG.encoder_length,padding='max_length',truncation='only_second',add_special_tokens=True,return_attention_mask=True,return_tensors='pt')

        input_ids, attention_mask = encoder_tokens.input_ids, encoder_tokens.attention_mask

        # truncation=True keeps every label tensor at exactly CFG.decoder_length,
        # so a question longer than that limit cannot break batch collation.
        decoder_tokens = CFG.tokenizer(question,max_length = CFG.decoder_length ,padding='max_length',truncation=True,add_special_tokens=True,return_attention_mask=True,return_tensors='pt')

        labels = decoder_tokens.input_ids

        # -100 is the label value the loss ignores, so padding is not trained on.
        labels[labels == CFG.tokenizer.pad_token_id] = -100

        return {
            'input_ids':input_ids.flatten(),
            'attention_mask': attention_mask.flatten(),
            'labels':labels.flatten()
        }





In [None]:
class T5QAModel(nn.Module):
    """Thin nn.Module wrapper around a pretrained T5ForConditionalGeneration."""

    def __init__(self):
        super().__init__()
        # The checkpoint name comes from the shared configuration.
        self.model = T5ForConditionalGeneration.from_pretrained(
            CFG.model,
            return_dict=True,
        )

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return its `(loss, logits)` pair."""
        seq2seq_output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return seq2seq_output.loss, seq2seq_output.logits


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Instantiate the wrapper; this loads the pretrained CFG.model checkpoint.
Model=T5QAModel()

In [None]:
# Move the model to the selected device.
Model.to(DEVICE)

In [None]:
# Count only the parameters that require gradients.
total_params = sum(p.numel() for p in Model.parameters() if p.requires_grad)
print(f'The No.of trainable parameters : {total_params}')

In [None]:
# `datasets.load_metric` is deprecated and no longer exists in current `datasets`
# releases (the install cell is unpinned); metrics now live in the `evaluate`
# package, which this notebook already installs and uses later on.
import evaluate

# Global BLEURT metric used by eval_fn() during training.
bleurt = evaluate.load("bleurt", module_type="metric")

In [None]:
def outputstring_cal(predicted_tokens, eos_token_id=1):
    """Decode token ids into text, stopping at the first end-of-sequence id.

    Args:
        predicted_tokens: 1-D tensor of token ids (one sequence) or 2-D tensor
            (one sequence per row).
        eos_token_id: id that terminates a sequence. Defaults to 1, the value
            that was previously hard-coded.

    Returns:
        A string for 1-D input, or a list with one string per row for 2-D input.

    Raises:
        RuntimeError: if the tensor is neither 1-D nor 2-D.
    """
    if predicted_tokens.dim() == 2:
        return [outputstring_cal(row, eos_token_id) for row in predicted_tokens]
    if predicted_tokens.dim() != 1:
        raise RuntimeError(f' The dimesion should be 2 but we received {predicted_tokens.dim()} ')

    kept_ids = []
    for token_id in predicted_tokens.tolist():
        if token_id == eos_token_id:
            break
        # Negative ids (such as the -100 label-ignore value) are not vocabulary entries.
        if token_id >= 0:
            kept_ids.append(token_id)

    if not kept_ids:
        return ''

    # Decode the whole sequence in one call so sub-word pieces are merged back
    # into words. Decoding id-by-id and joining with spaces split words at every
    # piece boundary and left special tokens such as padding in the text.
    return CFG.tokenizer.decode(kept_ids, skip_special_tokens=True)


In [None]:
def train_fn(train_data,optimizer,scheduler=None):
    """Train the global `Model` for one epoch and return the epoch perplexity.

    Args:
        train_data: iterable with a length (a DataLoader) yielding dicts with
            'input_ids', 'attention_mask' and 'labels' tensors.
        optimizer: optimizer updating `Model`'s parameters.
        scheduler: optional learning-rate scheduler, stepped once per batch.

    Returns:
        exp(mean batch loss) over the epoch; 1.0 when `train_data` is empty.
    """
    Model.train()
    total_loss = 0.0

    progress = tqdm(train_data, total=len(train_data))

    for batch in progress:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        optimizer.zero_grad()

        loss, _ = Model(input_ids, attention_mask, labels)
        total_loss += loss.item()

        loss.backward()
        nn.utils.clip_grad_norm_(Model.parameters(), CFG.clip)
        optimizer.step()

        # The scheduler was accepted but never advanced. A linear warmup
        # schedule starts at a learning rate of 0, so without this call the
        # optimizer keeps that initial rate for the whole run.
        if scheduler is not None:
            scheduler.step()

    # Computed once after the loop; max(..., 1) avoids dividing by zero for an
    # empty loader.
    return np.exp(total_loss / max(len(train_data), 1))

In [None]:
 
def eval_fn(val_data):
    """Evaluate the global `Model` on `val_data`.

    Returns a `(perplexity, mean BLEURT score)` pair, where perplexity is
    exp(mean batch loss) and BLEURT compares the decoded arg-max predictions
    with the decoded labels.
    """
    Model.eval()
    running_loss = 0
    hypotheses, actual = [], []

    with torch.no_grad():
        for batch in tqdm(val_data, total=len(val_data)):
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            loss, logits = Model(input_ids, attention_mask, labels)
            running_loss += loss.item()

            # Most likely token at every position of the logits.
            predicted_ids = logits.argmax(dim=-1)

            hypotheses.extend(outputstring_cal(predicted_ids))
            actual.extend(outputstring_cal(labels))

    perplexity = np.exp(running_loss / len(val_data))
    bleurt_metric = bleurt.compute(predictions=hypotheses, references=actual)

    return perplexity, np.mean(bleurt_metric['scores'], axis=0)

In [None]:
from torch.optim import Adam
def run():
    """Fine-tune the global `Model` and keep the checkpoint with the best BLEURT.

    Splits `train_df` 70/30 into train/validation, trains for at most
    CFG.epochs epochs with Adam and a linear warmup schedule, evaluates after
    every epoch, saves the state dict whenever the validation BLEURT improves
    and stops early after `es_patience` epochs without improvement.
    """
    df_train, df_valid = train_test_split(train_df, test_size=0.3, random_state=42)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_data = QADataset(
        text=df_train.text.values,
        question=df_train.question.values,
        answer=df_train.answer.values,
    )
    # Reshuffle every epoch so batches differ between epochs.
    train_data_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=CFG.train_batch_size,
        shuffle=True,
    )

    val_data = QADataset(
        text=df_valid.text.values,
        question=df_valid.question.values,
        answer=df_valid.answer.values,
    )
    validation_data_loader = torch.utils.data.DataLoader(
        val_data,
        batch_size=CFG.cv_batch_size,
    )

    # One scheduler step per batch. len(loader) also counts the final partial
    # batch, which the previous floor division dropped.
    num_train_steps = len(train_data_loader) * CFG.epochs

    param_optimizer = list(Model.model.named_parameters())

    # T5 names its normalisation weights "...layer_norm.weight", which the
    # BERT-style "LayerNorm.*" patterns never matched; the extra pattern keeps
    # those weights out of the weight-decay group.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight", "layer_norm.weight"]

    decay_params = [
        p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
    ]
    no_decay_params = [
        p for n, p in param_optimizer if any(nd in n for nd in no_decay)
    ]
    optimized_paramater = [
        group
        for group in (
            {"params": decay_params, "weight_decay": 0.01},
            {"params": no_decay_params, "weight_decay": 0.0},
        )
        if group["params"]  # skip a group that ends up empty
    ]

    optimizer = Adam(optimized_paramater, lr=CFG.train_lr)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=100,
        num_training_steps=num_train_steps
    )

    best_bleurt = float('-inf')
    es_patience = 3
    patience = 0
    # Name the checkpoint after the model actually trained. The former
    # hard-coded "t5-small-new" did not match CFG.model, nor the
    # "model-t5-base-0.0001.pth" file the evaluation cell loads.
    model_path = f'/content/drive/MyDrive/T5Model/model-{CFG.model}-{CFG.train_lr}.pth'

    for i in range(CFG.epochs):

        print("Epoch {}/{}".format(i+1,CFG.epochs))

        train_prexlity = train_fn(train_data_loader, optimizer, scheduler=scheduler)

        test_prexlity, bleurt_metric = eval_fn(validation_data_loader)

        print(f'Epoch :{i+1} and  train prexlity {train_prexlity:.2f}')

        print(f'Epoch :{i+1} and  test prexlity {test_prexlity:.2f} and bleurt_metric : {bleurt_metric:.2f}' )

        is_best = bleurt_metric > best_bleurt
        if is_best:
            print(f'score improved ({best_bleurt:.4f} -> {bleurt_metric:.4f}). Saving Model!')
            best_bleurt = bleurt_metric
            patience = 0
            torch.save(Model.state_dict(), model_path)
        else:
            patience += 1
            print(f'Early stopping counter: {patience} out of {es_patience}')
            if patience == es_patience:
                print(f'Early stopping! Best score: {best_bleurt:.4f}')
                break

In [None]:
# Start training; the best checkpoint is written by run().
run()

In [None]:
# Drop the trained model object, run the garbage collector and release the
# cached CUDA memory.
import gc 
del Model
gc.collect()
torch.cuda.empty_cache()


# Evaluate Model

In [None]:
import gc

# An earlier cell may already have deleted `Model`; a bare `del Model` then
# raises NameError and stops "Run All". Removing the name only if it is still
# bound makes this cleanup safe to run any number of times.
globals().pop('Model', None)
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Rebuild the wrapper, load the fine-tuned weights (mapped to CPU first), move
# the model to DEVICE and switch it to evaluation mode.
# NOTE(review): make sure this filename matches the path run() saves to.
Model = T5QAModel()
Model.load_state_dict(torch.load(f'/content/drive/MyDrive/T5Model/model-t5-base-0.0001.pth',map_location=torch.device('cpu')))
Model.to(DEVICE)
Model.eval()

In [None]:
@torch.no_grad()
def predictedQA(Model,answer , text):
    """Generate a question for `answer` given the context `text`.

    Args:
        Model: wrapper exposing the underlying generation model as `.model`.
        answer: answer string, wrapped in SEP_TOKEN before encoding.
        text: context passage; the only part truncated to fit CFG.encoder_length.

    Returns:
        The decoded generated question as a single string.
    """
    encoder_tokens = CFG.tokenizer(SEP_TOKEN+answer+SEP_TOKEN,text,max_length = CFG.encoder_length,padding='max_length',truncation='only_second',add_special_tokens=True,return_attention_mask=True,return_tensors='pt')

    input_ids, attention_mask = encoder_tokens.input_ids, encoder_tokens.attention_mask

    output_ids = Model.model.generate(
        input_ids=input_ids.to(DEVICE),
        attention_mask=attention_mask.to(DEVICE),
        num_beams=1,
        max_length=CFG.decoder_length,
        repetition_penalty=2.5,
        length_penalty=1.0,
        use_cache=True
    )

    # A list keeps generation order and duplicates; the previous set
    # comprehension silently dropped both.
    preds = [
        CFG.tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for ids in output_ids
    ]

    return ''.join(preds)

In [None]:
def result(generated , answer, context, original_question):
    """Print a generated question next to its answer, context and, when given,
    the original question, followed by a row of 100 asterisks."""
    question_lines = [('Generated: ', generated)]
    if original_question:
        question_lines.append(('Original : ', original_question))

    for label, value in question_lines:
        print(label, value)

    print()
    for label, value in (('Answer: ', answer), ('Conext: ', context)):
        print(label, value)
    print('*' * 100)

In [None]:
# Sample 10 row positions from the whole frame. The former fixed range 1..5999
# could exceed the number of rows (IndexError in .iloc) and never picked row 0.
random_records=np.random.randint(0,len(train_df),10)

In [None]:
# Show the generated and the original question for every sampled record.
for x in random_records:

    print(f'The record number : {x}')

    # Positional lookup: x is a row position, not an index label.
    sample_question = train_df.iloc[x]

    generated=predictedQA(Model, sample_question['answer'], sample_question['text'])

    result(generated, sample_question['answer'], sample_question['text'], sample_question['question'])

In [None]:
# Generate questions for the first 100 training rows and keep the reference
# questions alongside. Slicing the index replaces the manual counter and the
# `break`, whose trailing print statement could never execute.
predicited=[]
actual =[]
for x in train_df.index[:100]:
    predicited.append(predictedQA(Model, train_df['answer'][x], train_df['text'][x]))
    actual.append(train_df['question'][x])
# Number of rows actually processed (at most 100).
count=len(predicited)

In [None]:
import evaluate
import string
import re

# BLEURT functions
def normalize(text):
    """Lower-case `text`, strip punctuation, drop the articles a/an/the and
    collapse runs of whitespace into single spaces."""
    lowered = text.lower()

    punctuation = set(string.punctuation)
    without_punctuation = "".join(ch for ch in lowered if ch not in punctuation)

    # Articles are replaced by a space so neighbouring words stay separated.
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation)

    return " ".join(without_articles.split())

def grade_score(df, limit=100):
    """Score generated questions against their references with BLEURT.

    Args:
        df: DataFrame with 'reference_question' and 'generated_question' columns.
        limit: maximum number of leading rows to score. Defaults to 100, the
            cap that was previously hard-coded; None scores every row.

    Returns:
        A list with one `bleurt.compute` result per scored row.
    """
    nls = []
    pairs = zip(df['reference_question'], df['generated_question'])
    for curit, (reference, generated) in enumerate(pairs):
        if limit is not None and curit >= limit:
            break
        # The generated question is the prediction and the dataset question the
        # reference; the two were previously passed the other way round.
        nls.append(
            bleurt.compute(
                predictions=[normalize(generated)],
                references=[normalize(reference)],
            )
        )
    return nls

# Rebind the global `bleurt` metric to the 'bleurt-20' configuration; this is
# the object grade_score() calls from here on.
bleurt = evaluate.load('bleurt', 'bleurt-20')

In [None]:
# Pair every generated question with its reference question, one row per pair.
df=pd.DataFrame(list(zip(predicited,actual)),columns=['generated_question','reference_question'])

In [None]:
# Score the pairs one at a time; `output` is a list of per-pair BLEURT results.
output = grade_score(df)

In [None]:
# Average the first (and only) score of every per-pair result.
score= np.mean([x['scores'][0] for x in output],axis=0)
print(f'The Bluert score for the 100 data points : {score:.2f}')