In [7]:

import os
import gc
from tqdm.auto import tqdm
import transformers
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from plotly.subplots import make_subplots #graphing
import plotly.express as px #graphing
import plotly.graph_objects as go #graphing
import plotly.figure_factory as ff #graphing

from text_unidecode import unidecode
from typing import Dict, List, Tuple
from datasets import concatenate_datasets,load_dataset,load_from_disk
from sklearn.metrics import log_loss
from transformers import AutoModel, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_polynomial_decay_schedule_with_warmup,get_cosine_schedule_with_warmup,get_linear_schedule_with_warmup
from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase
from transformers import DataCollatorWithPadding,DataCollatorForTokenClassification

import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
from torch.nn.parameter import Parameter

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

from spellchecker import SpellChecker
from nltk.corpus import stopwords


import time
import warnings
import collections
import spacy
import re
# from termcolor import colored

warnings.filterwarnings("ignore")

In [15]:
"""import nltk
nltk.download('stopwords')"""

"""import en_core_web_sm
nlp = en_core_web_sm.load()"""
tqdm.pandas()

In [2]:
OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [3]:
import torch
import torch.nn as nn
import transformers
from transformers import (
    AutoModel, AutoConfig, 
    AutoTokenizer, logging
)

In [4]:
# ====================================================
# Utils
# ====================================================


class cfg:
    select = 'base'
    #model_name = f'/kaggle/input/deberta-v3-{select}/deberta-v3-{select}'
    model_name = f'./Models/deberta-v3-{select}'

    only_model_name = f'deberta-v3-{select}'
    accum_iter = 16
    fold = 4
    split = 5
    seed = 42
    batch_size = 4
    max_len = 512
    num_epoch = 10
    T_max= 500
    
    scheduler = 'CosineAnnealingLR'
    weight_decay =  1e-6
    min_lr = 1e-6
    freezing = False
    pooling = 'GemText'
    weight_decay = 1e-2
    encoder_lr = 1e-5
    decoder_lr = 1e-5
    eps = 1e-6
    betas = (0.9, 0.999)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]#.detach().to('cpu').numpy()
        y_pred = y_preds[:,i]#.detach().to('cpu').numpy()
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores

def score_loss(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return {
        'mcrmse_score' : mcrmse_score,
        'Content_score' : scores[0],
        'Wording_score' : scores[1]
    }

def get_logger(filename='Training'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def set_seed(seed=42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(cfg.seed)


LOGGER.info(f"=========================== Model name :{cfg.only_model_name} ===========================: ")
LOGGER.info('\n')
LOGGER.info(f"Scheduler: {cfg.scheduler}")
LOGGER.info(f"batch_size: {cfg.batch_size} with gradient Accumukation {cfg.accum_iter} ")
LOGGER.info(f"Pooling name: {cfg.pooling} ")
LOGGER.info(f"Freezing: {cfg.freezing}")
LOGGER.info(f"Max Length: {cfg.max_len}")
LOGGER.info(f"Num Epochs: {cfg.num_epoch}")
LOGGER.info('\n')



Scheduler: CosineAnnealingLR
batch_size: 4 with gradient Accumukation 16 
Pooling name: GemText 
Freezing: False
Max Length: 512
Num Epochs: 10




In [5]:
"""train_prompts = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv')
test_prompts = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')
submission = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv')
train_data = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv')
test_data = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')"""

train_prompts = pd.read_csv('./Data/prompts_train.csv')
test_prompts = pd.read_csv('./Data/prompts_test.csv')
submission = pd.read_csv('./Data/sample_submission.csv')
train_data = pd.read_csv('./Data/summaries_train.csv')
test_data = pd.read_csv('./Data/summaries_test.csv')

print(f"Prompt Train.shape: {train_prompts.shape}")
display(train_prompts.head())
print(f"Summary Train.shape: {train_data.shape}")
display(train_data.head())

Prompt Train.shape: (4, 4)


Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


Summary Train.shape: (7165, 5)


Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757


In [13]:
class Preprocessor:
    def __init__(self, 
                model_name: str,
                ) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
        self.STOP_WORDS = set(stopwords.words('english'))
        
        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = SpellChecker() #Speller(lang='en')
        
    def count_text_length(self, df: pd.DataFrame, col:str) -> pd.Series:
        """ text length """
        tokenizer=self.tokenizer
        return df[col].progress_apply(lambda x: len(tokenizer.encode(x)))

    def word_overlap_count(self, row):
        """ intersection(prompt_text, text) """        
        def check_is_stop_word(word):
            return word in self.STOP_WORDS
        
        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            prompt_words = list(filter(check_is_stop_word, prompt_words))
            summary_words = list(filter(check_is_stop_word, summary_words))
        return len(set(prompt_words).intersection(set(summary_words)))
            
    def ngrams(self, token, n):
        # Use the zip function to help us generate n-grams
        # Concatentate the tokens into ngrams and return
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]

    def ngram_co_occurrence(self, row, n: int):
        # Tokenize the original text and summary into words
        original_tokens = row['prompt_tokens']
        summary_tokens = row['summary_tokens']

        # Generate n-grams for the original text and summary
        original_ngrams = set(self.ngrams(original_tokens, n))
        summary_ngrams = set(self.ngrams(summary_tokens, n))

        # Calculate the number of common n-grams
        common_ngrams = original_ngrams.intersection(summary_ngrams)

        # # Optionally, you can get the frequency of common n-grams for a more nuanced analysis
        # original_ngram_freq = Counter(ngrams(original_words, n))
        # summary_ngram_freq = Counter(ngrams(summary_words, n))
        # common_ngram_freq = {ngram: min(original_ngram_freq[ngram], summary_ngram_freq[ngram]) for ngram in common_ngrams}

        return len(common_ngrams)
    
    def ner_overlap_count(self, row, mode:str):
        model = self.spacy_ner_model
        def clean_ners(ner_list):
            return set([(ner[0].lower(), ner[1]) for ner in ner_list])
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
            summary_ner = set([(token.text, token.label_) for token in summary.ents])
        elif "stanza" in str(model):
            prompt_ner = set([(token.text, token.type) for token in prompt.ents])
            summary_ner = set([(token.text, token.type) for token in summary.ents])
        else:
            raise Exception("Model not supported")

        prompt_ner = clean_ners(prompt_ner)
        summary_ner = clean_ners(summary_ner)
        intersecting_ners = prompt_ner.intersection(summary_ner)
        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))
        
        if mode == "train":
            return ner_dict
        elif mode == "test":
            return {key: ner_dict.get(key) for key in self.ner_keys}
 
    def quotes_count(self, row):
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        if len(quotes_from_summary)>0:
            return [quote in text for quote in quotes_from_summary].count(True)
        else:
            return 0

    def spelling(self, text):
        
        wordlist=text.split()
        amount_miss = len(list(self.speller.unknown(wordlist)))

        return amount_miss
    
    def run(self, 
            prompts: pd.DataFrame,
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:
        
        # before merge preprocess
        prompts["prompt_length"] = prompts["prompt_text"].apply(
            lambda x: len(self.tokenizer.encode(x))
        )
        prompts["prompt_tokens"] = prompts["prompt_text"].apply(
            lambda x: self.tokenizer.convert_ids_to_tokens(
                self.tokenizer.encode(x), 
                skip_special_tokens=True
            )
        )

        summaries["summary_length"] = summaries["text"].apply(
            lambda x: len(self.tokenizer.encode(x))
        )
        summaries["summary_tokens"] = summaries["text"].apply(
            lambda x: self.tokenizer.convert_ids_to_tokens(
                self.tokenizer.encode(x), 
                skip_special_tokens=True
            )

        )
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)

        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")

        # after merge preprocess
        input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']
        
        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence,args=(2,), axis=1 
        )
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )
        
        # Crate dataframe with count of each category NERs overlap for all the summaries
        # Because it spends too much time for this feature, I don't use this time.
#         ners_count_df  = input_df.progress_apply(
#             lambda row: pd.Series(self.ner_overlap_count(row, mode=mode), dtype='float64'), axis=1
#         ).fillna(0)
#         self.ner_keys = ners_count_df.columns
#         ners_count_df['sum'] = ners_count_df.sum(axis=1)
#         ners_count_df.columns = ['NER_' + col for col in ners_count_df.columns]
#         # join ner count dataframe with train dataframe
#         input_df = pd.concat([input_df, ners_count_df], axis=1)
        
        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        
        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    
preprocessor = Preprocessor(model_name=cfg.model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
train_data = preprocessor.run(train_prompts, train_data, mode="train")
test_data = preprocessor.run(test_prompts, test_data, mode="test")

test_data.head()

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,student_id,prompt_id,text,summary_length,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,length_ratio,word_overlap_count,bigram_overlap_count,trigram_overlap_count,quotes_count
0,000000ffffff,abc123,Example text 1,5,0,Summarize...,Example Title 1,Heading\nText...,7,0.714286,0,0,0,0
1,111111eeeeee,def789,Example text 2,5,0,Summarize...,Example Title 2,Heading\nText...,7,0.714286,0,0,0,0
2,222222cccccc,abc123,Example text 3,5,0,Summarize...,Example Title 1,Heading\nText...,7,0.714286,0,0,0,0
3,333333dddddd,def789,Example text 4,5,0,Summarize...,Example Title 2,Heading\nText...,7,0.714286,0,0,0,0


In [20]:
"""fold = StratifiedKFold(n_splits=cfg.fold, shuffle=True, random_state=cfg.seed)
for n, (train_index, val_index) in enumerate(fold.split(train, train['prompt_id'])):
    train.loc[val_index, 'fold'] = n
train['fold'] = train['fold'].astype(int)
fold_sizes = train.groupby('fold').size()
print(fold_sizes)"""

gkf = GroupKFold(n_splits = cfg.fold)

for i, (_, val_index) in enumerate(gkf.split(train_data, groups = train_data['prompt_id'])):
    train_data.loc[val_index, 'fold'] = i
    
train_data.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,splling_err_num,fold,prompt_question,prompt_title,prompt_text,prompt_length,length_ratio,word_overlap_count,bigram_overlap_count,trigram_overlap_count,quotes_count
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,69,5,3.0,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,671,0.102832,0,5,0,0
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,56,2,2.0,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",1137,0.049252,0,22,10,0
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,285,32,1.0,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,651,0.437788,1,56,26,2
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,43,5,1.0,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,651,0.066052,1,10,6,0
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,253,29,3.0,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,671,0.377049,1,27,5,4


In [22]:
max_words_text = train_data["text"].apply(lambda x: len(x.split())).max()
max_words_prompt_question = train_prompts["prompt_question"].apply(lambda x: len(x.split())).max()
max_words_prompt_text = train_prompts["prompt_text"].apply(lambda x: len(x.split())).max()

## max words
max_words_text, max_words_prompt_question, max_words_prompt_text

(647, 27, 966)

In [23]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
cfg.tokenizer = tokenizer
cfg.tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DebertaV2TokenizerFast(name_or_path='./Models/deberta-v3-base', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [24]:
train_df = train_data.merge(train_prompts, on='prompt_id')
train_df.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,splling_err_num,fold,prompt_question_x,prompt_title_x,...,length_ratio,word_overlap_count,bigram_overlap_count,trigram_overlap_count,quotes_count,prompt_question_y,prompt_title_y,prompt_text_y,prompt_length_y,prompt_tokens
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,69,5,3.0,Summarize how the Third Wave developed over su...,The Third Wave,...,0.102832,0,5,0,0,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,671,"[▁Background, ▁The, ▁Third, ▁Wave, ▁experiment..."
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,253,29,3.0,Summarize how the Third Wave developed over su...,The Third Wave,...,0.377049,1,27,5,4,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,671,"[▁Background, ▁The, ▁Third, ▁Wave, ▁experiment..."
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,75,6,3.0,Summarize how the Third Wave developed over su...,The Third Wave,...,0.111773,0,17,8,1,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,671,"[▁Background, ▁The, ▁Third, ▁Wave, ▁experiment..."
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,93,14,3.0,Summarize how the Third Wave developed over su...,The Third Wave,...,0.138599,1,26,10,0,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,671,"[▁Background, ▁The, ▁Third, ▁Wave, ▁experiment..."
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,32,3,3.0,Summarize how the Third Wave developed over su...,The Third Wave,...,0.04769,1,5,0,0,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,671,"[▁Background, ▁The, ▁Third, ▁Wave, ▁experiment..."


In [None]:
class TrainDataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.tokenizer = cfg.tokenizer
        self.max_len = cfg.max_len
        self.pq = df['prompt_question'].values
        self.pt = df['prompt_title'].values
        self.text = df['text'].values
        self.targets = df[['content' , 'wording']].values
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self , index):
        pq   =   self.pq[index]
        text =   self.text[index]
        pt = self.pt[index]
        # full_text = pq +" " + self.tokenizer.sep_token +" "+text
        full_text = pt +" " + self.tokenizer.sep_token +" "+ pq + " " + self.tokenizer.sep_token + " " +text
        # full_text = text
        
        inputs = self.tokenizer.encode_plus(
                        full_text,
                        truncation=True,
                        add_special_tokens=True,
                        max_length=self.max_len,
                        padding='max_length'
                        
                    )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        target = self.targets[index]
        
   
        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            
        } , torch.tensor(target, dtype=torch.float)

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:, :mask_len]
    return inputs


In [None]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

In [None]:
class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e9
        max_embeddings, _ = torch.max(embeddings, dim = 1)
        return max_embeddings

In [None]:
class MeanMax(nn.Module):
    def __init__(self):
        super(MeanMax, self).__init__()
        
        self.mean_pooler = MeanPooling()
        self.max_pooler  = MaxPooling()
        
    def forward(self, last_hidden_state, attention_mask):
        mean_pooler = self.mean_pooler( last_hidden_state ,attention_mask )
        max_pooler =  self.max_pooler( last_hidden_state ,attention_mask )
        out = torch.concat([mean_pooler ,max_pooler ] , 1)
        return out
    

In [None]:
class GeMText(nn.Module):
    def __init__(self, dim = 1, p=3, eps=1e-6):
        super(GeMText, self).__init__()
        self.dim = dim
        self.p = Parameter(torch.ones(1) * p)
        self.eps = eps
        self.feat_mult = 1

    def forward(self, last_hidden_state, attention_mask):
        attention_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.shape)
        x = (last_hidden_state.clamp(min=self.eps) * attention_mask_expanded).pow(self.p).sum(self.dim)
        ret = x / attention_mask_expanded.sum(self.dim).clip(min=self.eps)
        ret = ret.pow(1 / self.p)
        return ret

In [None]:
def get_pooling_layer():
    if cfg.pooling == 'Mean':
        return MeanPooling()
    
    elif cfg.pooling == 'Max':
        return MaxPooling()
    
    elif cfg.pooling == 'MeanMax':
        return MeanMax()
    
    elif cfg.pooling == 'GemText':
        return GeMText()


print(get_pooling_layer())

In [None]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False

In [None]:
def odd_layer_freeze(module):
    for i in range(1,24,2):
        for n,p in module.encoder.layer[i].named_parameters():
            p.requires_grad = False
            
def even_layer_freeze(module):
    for i in range(0,24,2):
        for n,p in module.encoder.layer[i].named_parameters():
            p.requires_grad = False
            
def top_half_layer_freeze(module):
    for i in range(0,13,1):
        for n,p in module.encoder.layer[i].named_parameters():
            p.requires_grad = False

def bottom_half_layer_freeze(module):
    for i in range(13,14,1):
        for n,p in module.encoder.layer[i].named_parameters():
            p.requires_grad = False
            
    

In [None]:

'''
## Check layers which one are freeze 
for n,p in model.named_parameters():
    print(n,p.requires_grad)
'''

In [None]:

#if cfg.freezing:
#    top_half_layer_freeze(model)

In [None]:
class BaselineModel(nn.Module):
    def __init__(self, model_name ):
        super(BaselineModel, self).__init__()
        
        self.model = AutoModel.from_pretrained(cfg.model_name)
        self.config = AutoConfig.from_pretrained(cfg.model_name)
        #self.drop = nn.Dropout(p=0.2)
        self.pooler = get_pooling_layer()

        if cfg.pooling == 'MeanMax':
            self.fc = nn.Linear(2*self.config.hidden_size, 2)
        else:
            self.fc = nn.Linear(self.config.hidden_size, 2)
            
        
        self._init_weights(self.fc)
        
        if cfg.freezing:
            top_half_layer_freeze(self.model)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
           
    def forward(self, ids, mask):
        out = self.model(input_ids=ids,attention_mask=mask,
                         output_hidden_states=False)
        out = self.pooler(out.last_hidden_state, mask)
        #out = self.drop(out)
        outputs = self.fc(out)
        return outputs

In [None]:
def train_run(model ,criterion ,optimizer , dataloader):
    
    model.train()
    
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    running_loss = 0.0
    dataset_size = 0.0 
    
    
    for batch_idx , (data , labels) in bar:
        inputs , target = collate(data) , labels    
        ids  =  inputs['input_ids'].to(cfg.device, dtype = torch.long)
        mask = inputs['attention_mask'].to(cfg.device, dtype = torch.long)
        targets = target.to(cfg.device, dtype = torch.float)
        
        batch_size = ids.size(0)
        outputs = model(ids, mask)
        loss = criterion(outputs, targets)
        
        # normalize loss to account for batch accumulation
        loss = loss / cfg.accum_iter 
        loss.backward()
        
        if ((batch_idx + 1) % cfg.accum_iter == 0) or (batch_idx + 1 == len(dataloader)):
            optimizer.step()
            optimizer.zero_grad()
        
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

    epoch_loss = running_loss/dataset_size
    gc.collect()
    

    
    return epoch_loss


In [None]:
@torch.no_grad()
def valid_run(model , dataloader):
    model.eval()
    
    running_loss = 0.0
    dataset_size = 0.0
    
    predictions = []
    y_labels = []
    
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for batch_idx , (data , labels) in bar:
        inputs , target = collate(data) , labels
        ids  =  inputs['input_ids'].to(cfg.device, dtype = torch.long)
        mask = inputs['attention_mask'].to(cfg.device, dtype = torch.long)
        targets = target.to(cfg.device, dtype = torch.float)
        
        batch_size = ids.size(0)

        outputs = model(ids, mask)
        
        loss = criterion(outputs, targets)
        
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        
        predictions.append(outputs.detach().to('cpu').numpy())
        y_labels.append(labels.detach().to('cpu').numpy())
    
    predictions = np.concatenate(predictions)
    y_labels    = np.concatenate(y_labels)
    epoch_loss = running_loss / dataset_size
    gc.collect()   
    
    return epoch_loss , predictions , y_labels
        
    

In [None]:
def prepare_fold(fold):
    
    dftrain = train_df[train_df['fold']!= fold]
    dfvalid = train_df[train_df['fold']== fold]
    
    train_dataset = TrainDataset(dftrain)
    valid_dataset = TrainDataset(dfvalid)
    
    train_loader = DataLoader(train_dataset , batch_size=cfg.batch_size ,num_workers=2, shuffle=True, pin_memory=True)
    valid_loader = DataLoader(valid_dataset ,batch_size=cfg.batch_size,num_workers=2, shuffle=True, pin_memory=True)
    
    return train_loader , valid_loader
    

In [None]:
train_loader , valid_loader = prepare_fold(0)

In [None]:
def oof_df(n_fold , true , pred):
    
    df_pred = pd.DataFrame(pred ,columns= ['pred_content' , 'pred_wording'] )
    df_real = pd.DataFrame(true ,columns= ['content' , 'wording'] )
    
    df = pd.concat([df_real , df_pred ],1)

    
    return df
    


In [None]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
              'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
              'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
              'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

In [None]:
oof_dfs = []
for n_fold in range(cfg.fold):
    LOGGER.info('\n')
    LOGGER.info(f"========== fold: {n_fold} training ==========")
    train_loader, valid_loader = prepare_fold(fold=n_fold)
    LOGGER.info(f'Number of batches in Train {len(train_loader) } and valid {len(valid_loader)} dataset')
    model  = BaselineModel(cfg.model_name).to(cfg.device)   
    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=cfg.encoder_lr, 
                                                decoder_lr=cfg.decoder_lr,
                                                weight_decay=cfg.weight_decay)

    optimizer = AdamW(optimizer_parameters, lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer,T_max=cfg.T_max, 
                                                   eta_min=cfg.min_lr)    
    
   
    criterion = nn.SmoothL1Loss(reduction='mean')
    
    start = time.time()
    best_epoch_score = np.inf
    for epoch in range(cfg.num_epoch):
        
        train_loss  = train_run(model ,criterion ,optimizer , dataloader=train_loader)
        valid_loss , valid_preds , valid_labels  = valid_run(model , dataloader=valid_loader)
        
        if valid_loss < best_epoch_score:
            
            LOGGER.info(f"Validation Loss Improved ({best_epoch_score} ---> {valid_loss})")
            best_epoch_score = valid_loss
            ### saving weights
            torch.save(model.state_dict(), f"{cfg.only_model_name}_Fold_{n_fold}.pth") 
            
            ## saving oof values
            df_ = oof_df(n_fold , valid_labels , valid_preds)
            
            LOGGER.info(f'Weights and oof values saved for epochs-{epoch} .....')
            
        LOGGER.info(f"Epoch {epoch} Training Loss {np.round(train_loss , 4)} Validation Loss {np.round(valid_loss , 4)}")
    
        
    end = time.time()
    time_elapsed = end - start
    
    LOGGER.info(' Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    
    LOGGER.info(" Best Loss: {:.4f}".format(best_epoch_score))
    
    oof_dfs.append(df_)            
    LOGGER.info(f" oof for fold {n_fold} ---> {score_loss(valid_labels, valid_preds )}")
    del model, train_loader, valid_loader , df_ , valid_preds , valid_labels
    gc.collect()
    LOGGER.info('\n')

In [None]:
oof_df = pd.concat(oof_dfs , ignore_index=True )
oof_df.to_csv('oof_df.csv' , index = False)

In [None]:
y_trues_final = np.array([oof_df["content"].to_list(), oof_df['wording'].to_list()]).T
y_preds_final  = np.array([oof_df["pred_content"].to_list(), oof_df["pred_wording"].to_list()]).T

In [None]:
print(MCRMSE(y_trues_final, y_preds_final))