In [1]:

import os
import gc
from tqdm.auto import tqdm
import transformers
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
import plotly.express as px #graphing
import plotly.graph_objects as go #graphing
from plotly.subplots import make_subplots #graphing
import plotly.figure_factory as ff #graphing
from torch.nn.parameter import Parameter
from transformers import get_polynomial_decay_schedule_with_warmup,get_cosine_schedule_with_warmup,get_linear_schedule_with_warmup
from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase
from transformers import DataCollatorWithPadding,DataCollatorForTokenClassification


from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs
from datasets import concatenate_datasets,load_dataset,load_from_disk

from sklearn.metrics import log_loss

from transformers import AutoModel, AutoTokenizer, AdamW, DataCollatorWithPadding

import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
import time
import warnings
import collections
# from termcolor import colored

from torch.optim import lr_scheduler
warnings.filterwarnings("ignore")

In [2]:
OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
    

In [3]:
import torch
import torch.nn as nn
import transformers
from transformers import (
    AutoModel, AutoConfig, 
    AutoTokenizer, logging
)

In [4]:
# ====================================================
# Utils
# ====================================================


class cfg:
    select = 'base'
    # model_name = f'/kaggle/input/deberta-v3-{select}/deberta-v3-{select}'
    model_name = f'./Models/deberta-v3-{select}'

    only_model_name = f'deberta-v3-{select}'
    accum_iter = 16
    fold = 4
    split = 5
    seed = 42
    batch_size = 2
    max_len = 512
    num_epoch = 1
    T_max= 500
    
    scheduler = 'CosineAnnealingLR'
    weight_decay =  1e-6
    min_lr = 1e-6
    freezing = False
    pooling = 'GemText'
    weight_decay = 1e-2
    encoder_lr = 1e-5
    decoder_lr = 1e-5
    eps = 1e-6
    betas = (0.9, 0.999)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]#.detach().to('cpu').numpy()
        y_pred = y_preds[:,i]#.detach().to('cpu').numpy()
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def score_loss(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return {
        'mcrmse_score' : mcrmse_score,
        'Content_score' : scores[0],
        'Wording_score' : scores[1]
    }

def get_logger(filename='Training'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def set_seed(seed=42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(cfg.seed)


LOGGER.info(f"=========================== Model name :{cfg.only_model_name} ===========================: ")
LOGGER.info('\n')
LOGGER.info(f"Scheduler: {cfg.scheduler}")
LOGGER.info(f"batch_size: {cfg.batch_size} with gradient Accumukation {cfg.accum_iter} ")
LOGGER.info(f"Pooling name: {cfg.pooling} ")
LOGGER.info(f"Freezing: {cfg.freezing}")
LOGGER.info(f"Max Length: {cfg.max_len}")
LOGGER.info(f"Num Epochs: {cfg.num_epoch}")
LOGGER.info('\n')



Scheduler: CosineAnnealingLR
batch_size: 2 with gradient Accumukation 16 
Pooling name: GemText 
Freezing: False
Max Length: 512
Num Epochs: 1




In [5]:
"""train_prompts = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv')
test_prompts = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')
submission = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv')
train_data = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv')
test_data = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')"""

train_prompts = pd.read_csv('./Data/prompts_train.csv')
test_prompts = pd.read_csv('./Data/prompts_test.csv')
submission = pd.read_csv('./Data/sample_submission.csv')
train_data = pd.read_csv('./Data/summaries_train.csv')
test_data = pd.read_csv('./Data/summaries_test.csv')

print(f"Prompt Train.shape: {train_prompts.shape}")
display(train_prompts.head())
print(f"Summary Train.shape: {train_data.shape}")
display(train_data.head())



Prompt Train.shape: (4, 4)


Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


Summary Train.shape: (7165, 5)


Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757


In [6]:
"""fold = StratifiedKFold(n_splits=cfg.fold, shuffle=True, random_state=cfg.seed)
for n, (train_index, val_index) in enumerate(fold.split(train, train['prompt_id'])):
    train.loc[val_index, 'fold'] = n
train['fold'] = train['fold'].astype(int)
fold_sizes = train.groupby('fold').size()
print(fold_sizes)"""

gkf = GroupKFold(n_splits = cfg.fold)

for i, (_, val_index) in enumerate(gkf.split(train_data, groups = train_data['prompt_id'])):
    train_data.loc[val_index, 'fold'] = i
    
train_data.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,fold
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,3.0
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,2.0
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,1.0
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,1.0
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,3.0


In [7]:
for i, (train_index, val_index) in enumerate(gkf.split(train_data, groups = train_data['prompt_id'])):
    print(i)
    print(train_index)
    print(val_index)

0
[   0    1    2 ... 7161 7162 7164]
[   8    9   13 ... 7146 7157 7163]
1
[   0    1    4 ... 7160 7163 7164]
[   2    3    6 ... 7159 7161 7162]
2
[   0    2    3 ... 7161 7162 7163]
[   1    5    7 ... 7155 7160 7164]
3
[   1    2    3 ... 7162 7163 7164]
[   0    4   14 ... 7141 7144 7156]


In [8]:
max_words_text = train_data["text"].apply(lambda x: len(x.split())).max()
max_words_prompt_question = train_prompts["prompt_question"].apply(lambda x: len(x.split())).max()
max_words_prompt_text = train_prompts["prompt_text"].apply(lambda x: len(x.split())).max()

## max words
max_words_text, max_words_prompt_question, max_words_prompt_text

(647, 27, 966)

In [9]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
cfg.tokenizer = tokenizer
cfg.tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DebertaV2TokenizerFast(name_or_path='./Models/deberta-v3-base', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [10]:
train_df = train_data.merge(train_prompts, on='prompt_id')
train_df.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,fold,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,3.0,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,3.0,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,3.0,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,3.0,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,3.0,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...


In [11]:
class TrainContent(Dataset):
    def __init__(self, df):
        self.df = df
        self.tokenizer = cfg.tokenizer
        self.max_len = cfg.max_len
        self.pq = df['prompt_question'].values
        self.pt = df['prompt_title'].values
        self.text = df['text'].values
        self.targets = df['content'].values
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self , index):
        pq   =   self.pq[index]
        text =   self.text[index]
        pt = self.pt[index]
        # full_text = pq +" " + self.tokenizer.sep_token +" "+text
        # full_text = pt +" " + self.tokenizer.sep_token +" "+ pq + " " + self.tokenizer.sep_token + " " +text
        full_text = text
        
        inputs = self.tokenizer.encode_plus(
                        full_text,
                        truncation=True,
                        add_special_tokens=True,
                        max_length=self.max_len,
                        padding='max_length'
                        
                    )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        target = self.target[index]
        
   
        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            
        } , torch.tensor(target, dtype=torch.float)
    
class TrainWord(Dataset):
    def __init__(self, df):
        self.df = df
        self.tokenizer = cfg.tokenizer
        self.max_len = cfg.max_len
        self.pq = df['prompt_question'].values
        self.pt = df['prompt_title'].values
        self.text = df['text'].values
        self.target = df['wording'].values
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self , index):
        pq   =   self.pq[index]
        text =   self.text[index]
        pt = self.pt[index]
        # full_text = pq +" " + self.tokenizer.sep_token +" "+text
        # full_text = pt +" " + self.tokenizer.sep_token +" "+ pq + " " + self.tokenizer.sep_token + " " +text
        full_text = text
        
        inputs = self.tokenizer.encode_plus(
                        full_text,
                        truncation=True,
                        add_special_tokens=True,
                        max_length=self.max_len,
                        padding='max_length'
                        
                    )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        target = self.target[index]
        
   
        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            
        } , torch.tensor(target, dtype=torch.float)

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:, :mask_len]
    return inputs


In [12]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

In [13]:
class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e9
        max_embeddings, _ = torch.max(embeddings, dim = 1)
        return max_embeddings

In [14]:
class MeanMax(nn.Module):
    def __init__(self):
        super(MeanMax, self).__init__()
        
        self.mean_pooler = MeanPooling()
        self.max_pooler  = MaxPooling()
        
    def forward(self, last_hidden_state, attention_mask):
        mean_pooler = self.mean_pooler( last_hidden_state ,attention_mask )
        max_pooler =  self.max_pooler( last_hidden_state ,attention_mask )
        out = torch.concat([mean_pooler ,max_pooler ] , 1)
        return out
    

In [15]:
class GeMText(nn.Module):
    def __init__(self, dim = 1, p=3, eps=1e-6):
        super(GeMText, self).__init__()
        self.dim = dim
        self.p = Parameter(torch.ones(1) * p)
        self.eps = eps
        self.feat_mult = 1

    def forward(self, last_hidden_state, attention_mask):
        attention_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.shape)
        x = (last_hidden_state.clamp(min=self.eps) * attention_mask_expanded).pow(self.p).sum(self.dim)
        ret = x / attention_mask_expanded.sum(self.dim).clip(min=self.eps)
        ret = ret.pow(1 / self.p)
        return ret

In [16]:
def get_pooling_layer():
    if cfg.pooling == 'Mean':
        return MeanPooling()
    
    elif cfg.pooling == 'Max':
        return MaxPooling()
    
    elif cfg.pooling == 'MeanMax':
        return MeanMax()
    
    elif cfg.pooling == 'GemText':
        return GeMText()


print(get_pooling_layer())

GeMText()


In [17]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False

In [18]:
def odd_layer_freeze(module):
    for i in range(1,24,2):
        for n,p in module.encoder.layer[i].named_parameters():
            p.requires_grad = False
            
def even_layer_freeze(module):
    for i in range(0,24,2):
        for n,p in module.encoder.layer[i].named_parameters():
            p.requires_grad = False
            
def top_half_layer_freeze(module):
    for i in range(0,13,1):
        for n,p in module.encoder.layer[i].named_parameters():
            p.requires_grad = False

def bottom_half_layer_freeze(module):
    for i in range(13,14,1):
        for n,p in module.encoder.layer[i].named_parameters():
            p.requires_grad = False
            
    

In [19]:

'''
## Check layers which one are freeze 
for n,p in model.named_parameters():
    print(n,p.requires_grad)
'''

'\n## Check layers which one are freeze \nfor n,p in model.named_parameters():\n    print(n,p.requires_grad)\n'

In [20]:

#if cfg.freezing:
#    top_half_layer_freeze(model)

In [21]:
class BaselineModel(nn.Module):
    def __init__(self, model_name ):
        super(BaselineModel, self).__init__()
        
        self.model = AutoModel.from_pretrained(cfg.model_name)
        self.config = AutoConfig.from_pretrained(cfg.model_name)
        #self.drop = nn.Dropout(p=0.2)
        self.pooler = get_pooling_layer()

        if cfg.pooling == 'MeanMax':
            self.fc = nn.Linear(2*self.config.hidden_size, 1)
        else:
            self.fc = nn.Linear(self.config.hidden_size, 1)
            
        
        self._init_weights(self.fc)
        
        if cfg.freezing:
            top_half_layer_freeze(self.model)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
           
    def forward(self, ids, mask):
        out = self.model(input_ids=ids,attention_mask=mask,
                         output_hidden_states=False)
        out = self.pooler(out.last_hidden_state, mask)
        #out = self.drop(out)
        outputs = self.fc(out)
        return outputs

In [22]:
def train_run(model ,criterion ,optimizer , dataloader):
    
    model.train()
    
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    running_loss = 0.0
    dataset_size = 0.0 
    
    
    for batch_idx , (data , labels) in bar:
        inputs , target = collate(data) , labels    
        ids  =  inputs['input_ids'].to(cfg.device, dtype = torch.long)
        mask = inputs['attention_mask'].to(cfg.device, dtype = torch.long)
        targets = target.to(cfg.device, dtype = torch.float)
        
        batch_size = ids.size(0)
        outputs = model(ids, mask)
        loss = criterion(outputs, targets)
        
        # normalize loss to account for batch accumulation
        loss = loss / cfg.accum_iter 
        loss.backward()
        
        if ((batch_idx + 1) % cfg.accum_iter == 0) or (batch_idx + 1 == len(dataloader)):
            optimizer.step()
            optimizer.zero_grad()
        
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

    epoch_loss = running_loss/dataset_size
    gc.collect()
    

    
    return epoch_loss


In [23]:
@torch.no_grad()
def valid_run(model , dataloader):
    model.eval()
    
    running_loss = 0.0
    dataset_size = 0.0
    
    predictions = []
    y_labels = []
    
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for batch_idx , (data , labels) in bar:
        inputs , target = collate(data) , labels
        ids  =  inputs['input_ids'].to(cfg.device, dtype = torch.long)
        mask = inputs['attention_mask'].to(cfg.device, dtype = torch.long)
        targets = target.to(cfg.device, dtype = torch.float)
        
        batch_size = ids.size(0)

        outputs = model(ids, mask)
        
        loss = criterion(outputs, targets)
        
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        
        predictions.append(outputs.detach().to('cpu').numpy())
        y_labels.append(labels.detach().to('cpu').numpy())
    
    predictions = np.concatenate(predictions)
    y_labels    = np.concatenate(y_labels)
    epoch_loss = running_loss / dataset_size
    gc.collect()   
    
    return epoch_loss , predictions , y_labels
        
    

In [24]:
def prepare_fold(fold):
    
    dftrain = train_df[train_df['fold']!= fold]
    dfvalid = train_df[train_df['fold']== fold]
    
    train_content_dataset = TrainContent(dftrain)
    valid_content_dataset = TrainContent(dfvalid)
    
    train_content_loader = DataLoader(train_content_dataset , batch_size=cfg.batch_size ,num_workers=2, shuffle=True, pin_memory=True)
    valid_content_loader = DataLoader(valid_content_dataset ,batch_size=cfg.batch_size,num_workers=2, shuffle=True, pin_memory=True)
    
    train_word_dataset = TrainWord(dftrain)
    valid_word_dataset = TrainWord(dfvalid)
    
    train_word_loader = DataLoader(train_word_dataset , batch_size=cfg.batch_size ,num_workers=2, shuffle=True, pin_memory=True)
    valid_word_loader = DataLoader(valid_word_dataset ,batch_size=cfg.batch_size,num_workers=2, shuffle=True, pin_memory=True)
    
    return train_word_loader , valid_word_loader, train_content_loader , valid_content_loader
    

In [25]:
def oof_df(option , true , pred):
    
    if option == 'word':
        df_pred = pd.DataFrame(pred ,columns= ['pred_wording'] )
        df_real = pd.DataFrame(true ,columns= ['wording'] )
        
        df = pd.concat([df_real , df_pred ],1)
    
    if option == 'content':
        df_pred = pd.DataFrame(pred ,columns= ['pred_content'] )
        df_real = pd.DataFrame(true ,columns= ['content'] )
        
        df = pd.concat([df_real , df_pred ],1)

    
    return df

In [26]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
              'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
              'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
              'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

In [27]:
oof_dfs_word = []
oof_dfs_content = []

for n_fold in range(cfg.fold):
    LOGGER.info('\n')
    LOGGER.info(f"========== fold: {n_fold} training ==========")
    train_word_loader , valid_word_loader, train_content_loader , valid_content_loader = prepare_fold(fold=n_fold)

    modelword  = BaselineModel(cfg.model_name).to(cfg.device)   
    modelcontent  = BaselineModel(cfg.model_name).to(cfg.device)   

    optimizer_parameters_word = get_optimizer_params(modelword,
                                                encoder_lr=cfg.encoder_lr, 
                                                decoder_lr=cfg.decoder_lr,
                                                weight_decay=cfg.weight_decay)
    
    optimizer_parameters_content = get_optimizer_params(modelcontent,
                                                encoder_lr=cfg.encoder_lr, 
                                                decoder_lr=cfg.decoder_lr,
                                                weight_decay=cfg.weight_decay)

    optimizer_word = AdamW(optimizer_parameters_word, lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas)
    optimizer_content = AdamW(optimizer_parameters_content, lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas)

    scheduler_word = lr_scheduler.CosineAnnealingLR(optimizer_word,T_max=cfg.T_max, eta_min=cfg.min_lr)    
    scheduler_content = lr_scheduler.CosineAnnealingLR(optimizer_content,T_max=cfg.T_max, eta_min=cfg.min_lr)
   
    criterion = nn.SmoothL1Loss(reduction='mean')
    
    start = time.time()
    best_epoch_score = np.inf
    
    for epoch in range(cfg.num_epoch):
        
        train_loss  = train_run(modelword ,criterion ,optimizer_word , dataloader=train_word_loader)
        valid_loss , valid_preds , valid_labels  = valid_run(modelword , dataloader=valid_word_loader)
        
        if valid_loss < best_epoch_score:
            
            LOGGER.info(f"Validation Loss Improved ({best_epoch_score} ---> {valid_loss})")
            best_epoch_score = valid_loss
            ### saving weights
            torch.save(modelword.state_dict(), f"{cfg.only_model_name}_word_Fold_{n_fold}.pth") 
            
            ## saving oof values
            df_word = oof_df('word', valid_labels , valid_preds)
            
            LOGGER.info(f'Weights and oof values saved for epochs-{epoch} .....')
            
        LOGGER.info(f"Epoch {epoch} Training Loss {np.round(train_loss , 4)} Validation Loss {np.round(valid_loss , 4)}")
    
    for epoch in range(cfg.num_epoch):
        
        train_loss  = train_run(modelcontent ,criterion ,optimizer_content , dataloader=train_content_loader)
        valid_loss , valid_preds , valid_labels  = valid_run(modelcontent , dataloader=valid_content_loader)
        
        if valid_loss < best_epoch_score:
            
            LOGGER.info(f"Validation Loss Improved ({best_epoch_score} ---> {valid_loss})")
            best_epoch_score = valid_loss
            ### saving weights
            torch.save(modelcontent.state_dict(), f"{cfg.only_model_name}_content_Fold_{n_fold}.pth") 
            
            ## saving oof values
            df_content = oof_df('content' , valid_labels , valid_preds)
            
            LOGGER.info(f'Weights and oof values saved for epochs-{epoch} .....')
            
        LOGGER.info(f"Epoch {epoch} Training Loss {np.round(train_loss , 4)} Validation Loss {np.round(valid_loss , 4)}")
        
    end = time.time()
    time_elapsed = end - start
    
    LOGGER.info(' Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    
    LOGGER.info(" Best Loss: {:.4f}".format(best_epoch_score))
    
    oof_dfs_word.append(df_word)
    oof_dfs_content.append(df_content)            
            
    LOGGER.info(f" oof for fold {n_fold} ---> {score_loss(valid_labels, valid_preds )}")
    del modelword, train_word_loader, valid_word_loader , df_word , valid_preds , valid_labels
    del modelcontent, train_content_loader, valid_content_loader , df_content

    gc.collect()
    LOGGER.info('\n')



  1%|▏         | 38/2554 [00:02<02:44, 15.34it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 24.00 MiB (GPU 0; 9.77 GiB total capacity; 5.15 GiB already allocated; 54.06 MiB free; 5.26 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
oof_df_word = pd.concat(oof_dfs_word , ignore_index=True )
oof_df_content = pd.concat(oof_dfs_content , ignore_index=True )

#oof_df.to_csv('oof_df.csv' , index = False)

In [None]:
y_trues_final = np.array([oof_df["content"].to_list(), oof_df['wording'].to_list()]).T
y_preds_final  = np.array([oof_df["pred_content"].to_list(), oof_df["pred_wording"].to_list()]).T

In [None]:
print(MCRMSE(y_trues_final, y_preds_final))