In [None]:
import os
import gc
from tqdm.auto import tqdm
import transformers
import numpy as np 
import pandas as pd 
from torch.nn.parameter import Parameter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
import plotly.express as px #graphing
import plotly.graph_objects as go #graphing
from plotly.subplots import make_subplots #graphing
import plotly.figure_factory as ff #graphing

from transformers import get_polynomial_decay_schedule_with_warmup,get_cosine_schedule_with_warmup,get_linear_schedule_with_warmup
from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase
from transformers import DataCollatorWithPadding,DataCollatorForTokenClassification


from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs
from datasets import concatenate_datasets,load_dataset,load_from_disk

from sklearn.metrics import log_loss

from transformers import AutoModel, AutoTokenizer, AdamW, DataCollatorWithPadding

import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
import time
import warnings
import collections
from termcolor import colored

from torch.optim import lr_scheduler
warnings.filterwarnings("ignore")



In [None]:
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Args:
        y_trues: 2-D array-like of ground-truth values, one column per target.
        y_preds: 2-D array-like of predictions with the same shape as ``y_trues``.

    Returns:
        Tuple ``(mcrmse_score, scores)`` where ``scores`` is the list of
        per-column RMSE values (in column order) and ``mcrmse_score`` is
        their mean.

    Raises:
        ValueError: if the inputs are not 2-D or their shapes differ.
    """
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    if y_trues.ndim != 2 or y_trues.shape != y_preds.shape:
        # Checked explicitly because NumPy broadcasting could otherwise hide a mismatch.
        raise ValueError(
            f"expected two 2-D arrays of identical shape, got {y_trues.shape} and {y_preds.shape}"
        )
    # RMSE is computed with NumPy directly: the `squared=False` flag of
    # sklearn's mean_squared_error was deprecated in 1.4 and removed in 1.6.
    scores = np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0)).tolist()
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def score_loss(y_trues, y_preds):
    """Package the MCRMSE result into a labelled dictionary.

    Returns a dict holding the overall score under 'mcrmse_score' and the
    first and second per-column scores under 'Content_score' and
    'Wording_score' respectively.
    """
    overall, per_target = MCRMSE(y_trues, y_preds)
    report = {'mcrmse_score' : overall}
    report['Content_score'] = per_target[0]
    report['Wording_score'] = per_target[1]
    return report

In [None]:
# Load the out-of-fold prediction table from the commonlit-deberta-base dataset.
oof_df = pd.read_csv('/kaggle/input/commonlit-deberta-base/oof_df.csv')
# Alternative OOF table from the hidden-layers-mean dataset (disabled).
#oof_df = pd.read_csv('/kaggle/input/commonlit-deberta-hidden-layers-mean/oof_df.csv')
print(oof_df.shape)
# Cell output: first rows of the table.
oof_df.head()

In [None]:
# Score the out-of-fold predictions against the true content/wording targets.
s = score_loss(
    oof_df[['content' , 'wording']].to_numpy(),
    oof_df[['pred_content' , 'pred_wording']].to_numpy(),
)
s

In [None]:
import torch
import torch.nn as nn
import transformers
from transformers import (
    AutoModel, AutoConfig, 
    AutoTokenizer, logging
)

In [None]:
# ====================================================
# Config
# ====================================================
class cfg:
    """Notebook-wide inference settings, read as class attributes."""

    # Backbone: `select` is substituted into both the weights path and the short name.
    select = 'base'
    model_name = f'/kaggle/input/deberta-v3-{select}/deberta-v3-{select}'
    only_model_name = f'deberta-v3-{select}'

    # Dataset that holds the tokenizer directory used below in the notebook.
    path = '/kaggle/input/commonlit-deberta-base'
    #path = '/kaggle/input/commonlit-deberta-hidden-layers-mean/'
    #path = '/kaggle/input/commonlit-baseline/'

    # Inference settings.
    fold = 4            # number of fold checkpoints iterated at prediction time
    batch_size = 32
    max_len = 1024      # tokenizer truncation / padding length
    pooling = 'GemText' # name dispatched on by get_pooling_layer()
    freezing = False    # when True the model constructor freezes encoder layers

    # Prefer the GPU when one is visible, otherwise run on CPU.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    seed = 42
# Load the tokenizer from the `tokenizer/` folder under cfg.path and attach it to cfg.
cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.path+'/tokenizer/')
# Cell output: the selected device and the backbone path.
cfg.device , cfg.model_name

In [None]:

def get_logger(filename='Inference'):
    """Return this module's logger, writing bare messages to the console and a file.

    Args:
        filename: Path stem of the log file; ``.log`` is appended.

    Returns:
        The ``logging.Logger`` named after ``__name__`` at INFO level, with
        exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter

    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger returns the same object on every call, so handlers left over
    # from a previous call (e.g. re-running the cell) must be removed first;
    # otherwise every message would be emitted once per earlier call.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    formatter = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger

# Notebook-wide logger; with the default argument the file handler writes to "Inference.log".
LOGGER = get_logger()


def set_seed(seed=42):
    '''Seed every random number generator the notebook can draw from.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) and
    switches cuDNN to deterministic mode, for REPRODUCIBILITY.

    Args:
        seed: Integer seed applied to all generators.
    '''
    # Local import: `random` is not among the notebook's top-level imports.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed. The interpreter reads this variable
    # at start-up, so it only affects Python processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Seed the random number generators with the configured seed before any model code runs.
set_seed(cfg.seed)


# Record the key inference settings in the log.
LOGGER.info(f"max_len: {cfg.max_len}")
LOGGER.info(f"batch_size: {cfg.batch_size}")
LOGGER.info(f"Model name: {cfg.only_model_name}")

In [None]:
# Read the competition's test prompts, test summaries and the sample submission.
prompts_test = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')
summary_test = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')
submission = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv')

# Show the shape and the first rows of each table.
print(f"Prompts Test shape: {prompts_test.shape}")
display(prompts_test.head())
print(f"Summary Test shape: {summary_test.shape}")
display(summary_test.head())
print(f"Submission shape: {submission.shape}")
display(submission.head())

In [None]:
# Join each summary with its prompt's columns via the shared `prompt_id` key
# (default inner join: rows without a match on either side are dropped).
test = prompts_test.merge(summary_test, on="prompt_id")
test

In [None]:
# Build "<prompt_question> <sep> <text>" per row using the tokenizer's separator token.
# NOTE(review): TestDataset tokenizes only the `text` column, so this column
# does not appear to be consumed by the inference below — confirm it is still needed.
test['full_text']=test['prompt_question'] +" " + cfg.tokenizer.sep_token +" "+test['text']
test['full_text']

In [None]:
def _freeze_encoder_layers(module, layer_indices):
    """Turn off gradients for every parameter of the given encoder layers."""
    for i in layer_indices:
        for p in module.encoder.layer[i].parameters():
            p.requires_grad = False


def _half_split_index(module):
    """Index of the first layer belonging to the "bottom" half.

    Chosen as ``n // 2 + 1`` so that a 24-layer encoder keeps the split the
    original hard-coded ranges used (layers 0-12 versus 13-23).
    """
    n_layers = len(module.encoder.layer)
    return min(n_layers, n_layers // 2 + 1)


def odd_layer_freeze(module):
    """Freeze encoder layers 1, 3, 5, ... of ``module``.

    The layer count is read from ``module.encoder.layer`` instead of being
    fixed at 24, so encoders of any depth are handled without IndexError.
    """
    _freeze_encoder_layers(module, range(1, len(module.encoder.layer), 2))


def even_layer_freeze(module):
    """Freeze encoder layers 0, 2, 4, ... of ``module`` (any encoder depth)."""
    _freeze_encoder_layers(module, range(0, len(module.encoder.layer), 2))


def top_half_layer_freeze(module):
    """Freeze the first ``n // 2 + 1`` encoder layers (0-12 for 24 layers)."""
    _freeze_encoder_layers(module, range(0, _half_split_index(module)))


def bottom_half_layer_freeze(module):
    """Freeze every encoder layer after the top half (13-23 for 24 layers).

    The previous ``range(13, 14)`` froze layer 13 only.
    """
    _freeze_encoder_layers(
        module, range(_half_split_index(module), len(module.encoder.layer))
    )

In [None]:
class MeanPooling(nn.Module):
    """Attention-mask-weighted mean of the hidden states along dimension 1."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average ``last_hidden_state`` over dimension 1, ignoring masked positions.

        The mask gets a trailing axis and is expanded to the hidden state's
        shape; the denominator is clamped to 1e-9 so an all-zero mask does not
        divide by zero.
        """
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_counts = torch.clamp(weights.sum(1), min=1e-9)
        weighted_total = torch.sum(last_hidden_state * weights, 1)
        return weighted_total / token_counts
    
    
class GeMText(nn.Module):
    """Generalized-mean pooling over dimension ``dim`` with a learnable exponent.

    ``p`` is registered as a parameter (initialised to ``p``), so it is part
    of the module's state dict.
    """

    def __init__(self, dim = 1, p=3, eps=1e-6):
        super().__init__()
        self.dim = dim
        self.p = Parameter(torch.ones(1) * p)
        self.eps = eps
        self.feat_mult = 1

    def forward(self, last_hidden_state, attention_mask):
        """Pool ``last_hidden_state`` with the mask-weighted power mean.

        Hidden values are clamped to at least ``eps`` before masking, raised
        to ``p``, summed over ``dim``, divided by the (eps-clipped) mask sum
        and finally raised to ``1 / p``.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.shape)
        floored = last_hidden_state.clamp(min=self.eps)
        powered_total = (floored * mask).pow(self.p).sum(self.dim)
        token_counts = mask.sum(self.dim).clip(min=self.eps)
        pooled = powered_total / token_counts
        return pooled.pow(1 / self.p)



def get_pooling_layer():
    """Instantiate the pooling module selected by ``cfg.pooling``.

    Returns:
        A new pooling module for one of the names 'Mean', 'Max', 'MeanMax'
        or 'GemText'.

    Raises:
        ValueError: if ``cfg.pooling`` is none of the supported names.
            Previously ``None`` was returned silently and the failure only
            surfaced later, when the model tried to call the pooler.
    """
    if cfg.pooling == 'Mean':
        return MeanPooling()

    elif cfg.pooling == 'Max':
        # NOTE(review): MaxPooling is not defined in the visible part of this
        # notebook; this branch needs it to be defined before it is selected.
        return MaxPooling()

    elif cfg.pooling == 'MeanMax':
        # NOTE(review): MeanMaxPooling is likewise not defined in the visible source.
        return MeanMaxPooling()

    elif cfg.pooling == 'GemText':
        return GeMText()

    raise ValueError(
        f"Unsupported cfg.pooling value {cfg.pooling!r}; "
        "expected one of 'Mean', 'Max', 'MeanMax', 'GemText'"
    )


# Sanity check: print the pooling module selected by cfg.pooling.
print(get_pooling_layer())

In [None]:
class BaselineModel(nn.Module):
    """Transformer backbone + pooling layer + linear head with two outputs.

    Args:
        model_name: Name or path handed to ``AutoModel.from_pretrained`` and
            ``AutoConfig.from_pretrained``.
    """

    def __init__(self, model_name ):
        super().__init__()

        # Honour the argument: the previous code ignored `model_name` and
        # always loaded cfg.model_name, whatever the caller passed in.
        self.model = AutoModel.from_pretrained(model_name)
        self.config = AutoConfig.from_pretrained(model_name)
        #self.drop = nn.Dropout(p=0.2)
        self.pooler = get_pooling_layer()

        # The 'MeanMax' option gets a head twice as wide as the hidden size.
        if cfg.pooling == 'MeanMax':
            self.fc = nn.Linear(2*self.config.hidden_size, 2)
        else:
            self.fc = nn.Linear(self.config.hidden_size, 2)

        self._init_weights(self.fc)

        if cfg.freezing:
            top_half_layer_freeze(self.model)

    def _init_weights(self, module):
        """Initialise ``module`` using the backbone config's ``initializer_range``.

        Linear and Embedding weights are drawn from N(0, initializer_range);
        Linear biases and the embedding padding row are zeroed; LayerNorm is
        reset to weight 1 and bias 0. Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, ids, mask):
        """Return the head's output for token ids ``ids`` and attention mask ``mask``.

        The backbone's last hidden state is pooled with ``self.pooler`` (which
        also receives the mask) and passed through the linear head.
        """
        out = self.model(input_ids=ids,attention_mask=mask,
                         output_hidden_states=False)
        out = self.pooler(out.last_hidden_state, mask)
        #out = self.drop(out)
        outputs = self.fc(out)
        return outputs

In [None]:
'''
class deberta_hs_Mean(nn.Module):
    def __init__(self, model_name ):
        super(deberta_hs_Mean, self).__init__()
        
        self.model_config = AutoConfig.from_pretrained(cfg.model_name)
        self.model_config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": 0,
                "add_pooling_layer": False,
                "num_labels": 2,
                 "attention_probs_dropout_prob":0.0 
            }
        )
        self.model = AutoModel.from_pretrained(cfg.model_name, config=self.model_config)
        
        self.fc = nn.Linear(self.model_config.hidden_size, 2)
        
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            
    def forward(self, ids, mask):
        outputs = self.model(input_ids=ids,attention_mask=mask)
        
        h1 = outputs[1][-1][:,0,:].reshape(-1,1,self.model_config.hidden_size)
        h2 = outputs[1][-2][:,0,:].reshape(-1,1,self.model_config.hidden_size)
        h3 = outputs[1][-3][:,0,:].reshape(-1,1,self.model_config.hidden_size)
        h4 = outputs[1][-4][:,0,:].reshape(-1,1,self.model_config.hidden_size)
        h5 = outputs[1][-5][:,0,:].reshape(-1,1,self.model_config.hidden_size)
        h6 = outputs[1][-6][:,0,:].reshape(-1,1,self.model_config.hidden_size)
        h7 = outputs[1][-7][:,0,:].reshape(-1,1,self.model_config.hidden_size)
        h8 = outputs[1][-8][:,0,:].reshape(-1,1,self.model_config.hidden_size)
        h9 = outputs[1][-9][:,0,:].reshape(-1,1,self.model_config.hidden_size)
        h10 = outputs[1][-10][:,0,:].reshape(-1,1,self.model_config.hidden_size)
        h11 = outputs[1][-11][:,0,:].reshape(-1,1,self.model_config.hidden_size)
        h12 = outputs[1][-12][:,0,:].reshape(-1,1,self.model_config.hidden_size)
        
        all_cat_mean = torch.mean(
        torch.cat([ h1, h2, h3, h4, h5, h6,h7,h8,h9,h10,h11,h12], 1)
        ,1).reshape(-1,1,self.model_config.hidden_size)
        
        seq_out = torch.mean(torch.cat([ outputs.last_hidden_state[:,0,:].reshape(-1,1,self.model_config.hidden_size),
        all_cat_mean] , 1),1)
        
        out = self.fc(seq_out)
        return out
'''

In [None]:
'''
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    
class BaselineModel(nn.Module):
    def __init__(self, model_name):
        super(BaselineModel, self).__init__()
        
        self.model = AutoModel.from_pretrained(model_name)
        self.config = AutoConfig.from_pretrained(model_name)
        #self.drop = nn.Dropout(p=0.2)
        self.pooler = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 2)
        
        self._init_weights(self.fc)        
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
           
    def forward(self, ids, mask):
        out = self.model(input_ids=ids,attention_mask=mask,
                         output_hidden_states=False)
        out = self.pooler(out.last_hidden_state, mask)
        #out = self.drop(out)
        outputs = self.fc(out)
        return outputs

'''

In [None]:
class TestDataset(Dataset):
    """Dataset that tokenizes the ``text`` column of a frame for inference.

    Args:
        df: Frame providing the columns 'prompt_question', 'prompt_title'
            and 'text'.
        tokenizer: Callable tokenizer; defaults to ``cfg.tokenizer``.
        max_len: Truncation/padding length; defaults to ``cfg.max_len``.
    """

    def __init__(self, df, tokenizer=None, max_len=None):
        self.df = df
        # Fall back to the notebook-wide configuration unless overridden, so
        # existing `TestDataset(df)` calls behave as before.
        self.tokenizer = cfg.tokenizer if tokenizer is None else tokenizer
        self.max_len = cfg.max_len if max_len is None else max_len
        self.pq = df['prompt_question'].values
        self.pt = df['prompt_title'].values
        self.text = df['text'].values
        self.title = df['prompt_title'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self , index):
        """Return ``input_ids`` and ``attention_mask`` long tensors for row ``index``."""
        # Only the summary text is encoded. Alternatives tried earlier:
        # full_text = pq+" " + self.tokenizer.sep_token +" "+text
        # full_text = pt +" " + self.tokenizer.sep_token +" "+ pq + " " + self.tokenizer.sep_token + " " +text
        full_text = self.text[index]

        # Calling the tokenizer directly replaces the deprecated `encode_plus`;
        # every row is padded to max_len so the default collation can stack them.
        inputs = self.tokenizer(
            full_text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )

        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long)
        }
    
def collate(inputs):
    """Trim every tensor in the batch to the longest unmasked length.

    The longest row sum of ``inputs["attention_mask"]`` gives the number of
    columns to keep; each entry is sliced in place and the same mapping is
    returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for name in list(inputs.keys()):
        inputs[name] = inputs[name][:, :longest]
    return inputs

In [None]:
@torch.no_grad()
def test_run(model , loader):
    """Run ``model`` over every batch of ``loader`` and stack the outputs.

    The model is switched to eval mode, each batch is trimmed with
    ``collate`` and moved to ``cfg.device``, and the per-batch outputs are
    concatenated into a single NumPy array, which is returned.
    """
    model.eval()

    batch_outputs = []
    progress = tqdm(enumerate(loader), total=len(loader))
    for _, batch in progress:
        trimmed = collate(batch)
        input_ids = trimmed['input_ids'].to(cfg.device, dtype = torch.long)
        attention_mask = trimmed['attention_mask'].to(cfg.device, dtype = torch.long)
        batch_outputs.append(model(input_ids , attention_mask).to('cpu').numpy())

    return np.concatenate(batch_outputs)
    

In [None]:
# Wrap the merged test frame; shuffle=False keeps the batches in the frame's row
# order, which the later positional assignment of predictions to `test` relies on.
test_dataset = TestDataset(test)
test_loader = DataLoader(test_dataset , batch_size=cfg.batch_size ,num_workers=2, shuffle=False, pin_memory=True)

In [None]:
# Predict with every fold checkpoint, then average the per-fold predictions.
final_preds = []
for fold in range(cfg.fold):
    print('******** fold' , fold , '********')

    model  = BaselineModel(cfg.model_name).to(cfg.device)
    # Build the checkpoint path from cfg so it always points at the same
    # dataset (cfg.path) and backbone (cfg.only_model_name) as the tokenizer,
    # instead of a separately hard-coded directory.
    checkpoint_path = os.path.join(cfg.path, f"{cfg.only_model_name}_Fold_{fold}.pth")
    # NOTE(security): torch.load unpickles the file — only load checkpoints from a trusted source.
    model.load_state_dict(torch.load(checkpoint_path, map_location=torch.device('cpu')))
    preds = test_run(model, test_loader)
    final_preds.append(preds)
    # Release the model before the next fold is loaded.
    del model ; gc.collect()
    torch.cuda.empty_cache()
# Element-wise mean over the fold axis.
final_preds_ = np.mean(final_preds, axis=0)
    

In [None]:
# Re-read the sample submission (it was already loaded in an earlier cell) to get a fresh template.
submission = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv')
submission.head()

In [None]:
target_cols=['content','wording']
# Attach the fold-averaged predictions to the test frame by position.
# NOTE(review): assumes the model's two outputs are ordered (content, wording) — confirm against training.
test[target_cols] = final_preds_
# Replace the template's target columns with the predictions, matched on student_id.
submission = submission.drop(columns=target_cols).merge(test[['student_id'] + target_cols], on='student_id', how='left')
display(submission.head())
# Write the submission file with columns student_id, content, wording.
submission[['student_id'] + target_cols].to_csv('submission.csv', index=False)