In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import gc
import torch.nn as nn

from sklearn.model_selection import KFold, GroupKFold
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm


In [2]:
class config:
    learning_rate=1.5e-5
    weight_decay=0.02
    hidden_dropout_prob=0.005
    attention_probs_dropout_prob=0.005
    num_train_epochs=1
    n_splits=4
    batch_size=8
    random_seed=42
    save_steps=100
    max_length=512
    hidden_size = 512

In [3]:
# prompts = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv')
# train_data = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv')

train_data = pd.read_csv('./Data/summaries_train.csv')

In [4]:
train_data.head()

Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757


In [5]:
gkf = GroupKFold(n_splits = config.n_splits)

for i, (_, val_index) in enumerate(gkf.split(train_data, groups = train_data['prompt_id'])):
    train_data.loc[val_index, 'fold'] = i
    
train_data.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,fold
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,3.0
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,2.0
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,1.0
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,1.0
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,3.0


In [6]:
# tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/hugging-face-models-safe-tensors/bert-base-uncased')
# bert_model = AutoModel.from_pretrained('/kaggle/input/hugging-face-models-safe-tensors/bert-base-uncased')

tokenizer = AutoTokenizer.from_pretrained('./Models/bert-base-uncased')
bert_model = AutoModel.from_pretrained('./Models/bert-base-uncased')

In [7]:
class TorchDataConversion(torch.utils.data.Dataset):
    def __init__(self, encodings, label_contents, label_words):
        self.encodings = encodings
        self.label_contents = label_contents
        self.label_words = label_words
        
    def __getitem__(self, idx):
        input_ids = torch.tensor(self.encodings["input_ids"][idx]).int()
        masks = torch.tensor(self.encodings["attention_mask"][idx]).int()
        contents = torch.tensor([self.label_contents[idx]]).float()
        words = torch.tensor([self.label_words[idx]]).float()
        
        return {
            'input_ids': input_ids,
            'attention_mask': masks,
            'contents': contents,
            'words': words
        } 
    
    def __len__(self):
        return len(self.label_contents)
        

In [8]:
class MultiOutputBertModel(nn.Module):
    def __init__(self, bert_model, hidden_size):
        super(MultiOutputBertModel, self).__init__()
        self.bert = bert_model
        self.fc1 = nn.Linear(bert_model.config.hidden_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2_content = nn.Linear(hidden_size, 1)
        self.fc2_wording = nn.Linear(hidden_size, 1)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask = attention_mask)
        pooled_output = outputs.pooler_output
        x = self.fc1(pooled_output)
        x = self.relu(x)
        content_output = self.fc2_content(x)
        wording_output = self.fc2_wording(x)
        return content_output, wording_output

In [9]:
# Cross Val metric

def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

In [10]:
# test dimensions of the mertic calc function
"""test_eval_pred = np.random.rand(2,3,2)
score = compute_mcrmse(test_eval_pred)
score"""

'test_eval_pred = np.random.rand(2,3,2)\nscore = compute_mcrmse(test_eval_pred)\nscore'

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultiOutputBertModel(bert_model, config.hidden_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr = config.learning_rate)
criterion = nn.MSELoss().to(device)

In [12]:
for fold in range(config.n_splits):
    print(f"Fold {fold}")
    # Split train and val dataset
    train_set = train_data[train_data["fold"] != fold]
    val_set = train_data[train_data["fold"] == fold]

    train_encodings = tokenizer.batch_encode_plus(train_set['text'].to_list(), max_length = config.max_length, padding = 'max_length', truncation = True)
    train_dataset = TorchDataConversion(train_encodings, train_set['content'].ravel(), train_set['wording'].ravel())
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = config.batch_size, shuffle = True)

    val_encodings = tokenizer.batch_encode_plus(val_set['text'].to_list(), max_length = config.max_length, padding = 'max_length', truncation = True)
    val_dataset = TorchDataConversion(val_encodings, val_set['content'].ravel(), val_set['wording'].ravel())
    val_loader = torch.utils.data.DataLoader(val_dataset)

    # Train model on train set
    for epoch in range(config.num_train_epochs):
        model.train()
        running_loss = 0.0
        with tqdm(train_loader, unit = "batch", leave = False) as t:
            for i, item in enumerate(t):
                batch_input_ids = item["input_ids"].to(device)
                batch_att_mask = item["attention_mask"].to(device)
                batch_content = item["contents"].to(device)
                batch_wording = item["words"].to(device)
                
                optimizer.zero_grad()
                content_pred, wording_pred = model(batch_input_ids, batch_att_mask)
                
                content_loss = criterion(content_pred.squeeze(), batch_content.squeeze())
                wording_loss = criterion(wording_pred.squeeze(), batch_wording.squeeze())
                
                loss = content_loss + wording_loss
                running_loss += loss.item()
                
                loss.backward()
                optimizer.step()
                
                t.set_postfix({"loss": loss.item()})
    
        average_loss = running_loss/len(train_loader)
        print(f"Epoch [{epoch + 1}/{config.num_train_epochs}] - Average Loss: {average_loss:.4f}")
    
    print(f"Fold {fold} Training complete!")

    # Validate using validate set
    model.eval()
    val_loss = 0.0
    content_pred = []
    wording_pred = []
    content_true = []
    wording_true = []

    with torch.no_grad():
        for i, item in enumerate(val_loader):
            input_ids = item["input_ids"].to(device)
            att_mask = item["attention_mask"].to(device)
            content_true.append(item["contents"].cpu().item())
            wording_true.append(item["words"].cpu().item())

            content_pred_temp, wording_pred_temp = model(input_ids, att_mask)
            content_pred.append(content_pred_temp.cpu().item())
            wording_pred.append(wording_pred_temp.cpu().item())

    eval_pred = [[content_pred, wording_pred],[content_true, wording_true]]
    eval_pred = np.array(eval_pred)
    eval_output = compute_mcrmse(eval_pred)

    print(f"Fold {fold} eval results:")
    print(f"Content_rmse: {eval_output['content_rmse']}")
    print(f"Word_rmse: {eval_output['wording_rmse']}")
    print(f"MCR_rmse: {eval_output['mcrmse']}")



Fold 0


                                                                

Epoch [1/1] - Average Loss: 0.7970
Fold 0 Training complete!
Fold 0 eval results:
Content_rmse: 0.5203979151094575
Word_rmse: 0.08938121875986993
MCR_rmse: 0.5164702001854223
Fold 1


                                                                

Epoch [1/1] - Average Loss: 0.4895
Fold 1 Training complete!
Fold 1 eval results:
Content_rmse: 1.469008323074167
Word_rmse: 0.4668046952272492
MCR_rmse: 0.4746719103736513
Fold 2


                                                                 

Epoch [1/1] - Average Loss: 0.4350
Fold 2 Training complete!
Fold 2 eval results:
Content_rmse: 0.518462826239734
Word_rmse: 0.7259593304867207
MCR_rmse: 0.3490696404476706
Fold 3


                                                                 

Epoch [1/1] - Average Loss: 0.3281
Fold 3 Training complete!
Fold 3 eval results:
Content_rmse: 0.4034974255968382
Word_rmse: 0.7360099567355907
MCR_rmse: 0.33246208620738915


In [13]:
# test_data = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')
test_data = pd.read_csv('./Data/summaries_test.csv')

In [14]:
test_data.head()

Unnamed: 0,student_id,prompt_id,text
0,000000ffffff,abc123,Example text 1
1,111111eeeeee,def789,Example text 2
2,222222cccccc,abc123,Example text 3
3,333333dddddd,def789,Example text 4


In [15]:
class TorchDataConversion_test(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
        
    def __getitem__(self, idx):
        input_ids = torch.tensor(self.encodings["input_ids"][idx]).int()
        masks = torch.tensor(self.encodings["attention_mask"][idx]).int()
        
        return {
            'input_ids': input_ids,
            'attention_mask': masks,
        } 
    
    def __len__(self):
        return len(self.encodings['input_ids'])  

In [16]:
test_encodings = tokenizer.batch_encode_plus(test_data['text'].to_list(), max_length = config.max_length, padding = 'max_length', truncation = True)
test_dataset = TorchDataConversion_test(test_encodings)
test_loader = torch.utils.data.DataLoader(test_dataset)

In [17]:
model.eval()
content_score = []
wording_score = []
with torch.no_grad():
    for i, item in enumerate(test_loader):
        input_ids = item["input_ids"].to(device)
        att_mask = item["attention_mask"].to(device)
        
        content_pred, wording_pred = model(input_ids, att_mask)
        content_score.append(content_pred.cpu().item())
        wording_score.append(wording_pred.cpu().item())

In [18]:
submission_df = pd.DataFrame({
    'student_id': test_data['student_id'],
    'content': content_score,
    'wording': wording_score
})

submission_df.to_csv('submission.csv', index= False)

In [19]:
submission_df.head()

Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.272173,-1.226621
1,111111eeeeee,-1.276924,-1.224973
2,222222cccccc,-1.273461,-1.223927
3,333333dddddd,-1.282219,-1.236035
