In [1]:
import numpy as np 
import pandas as pd
import torch
import os
from sklearn import set_config
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from datasets import Dataset, DatasetDict, load_dataset, concatenate_datasets
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GroupKFold
from transformers import AutoModel, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
print(f'cuda? {torch.cuda.is_available()}')
device = "cuda" if torch.cuda.is_available() else "cpu"



/kaggle/input/deberta-v3-large/deberta-v3-large/spm.model
/kaggle/input/deberta-v3-large/deberta-v3-large/config.json
/kaggle/input/deberta-v3-large/deberta-v3-large/README.md
/kaggle/input/deberta-v3-large/deberta-v3-large/tf_model.h5
/kaggle/input/deberta-v3-large/deberta-v3-large/tokenizer_config.json
/kaggle/input/deberta-v3-large/deberta-v3-large/pytorch_model.bin
/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv
/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv
/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv
/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv
/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv
cuda? True


In [2]:
input_path = "/kaggle/input/commonlit-evaluate-student-summaries/"


output_path = "/kaggle/working/"
df_train_pro_file = pd.read_csv(input_path + 'prompts_train.csv')
df_train_sum_file = pd.read_csv(input_path + 'summaries_train.csv')
df_test_pro_file = pd.read_csv(input_path + 'prompts_test.csv')
df_test_sum_file = pd.read_csv(input_path + 'summaries_test.csv')

In [3]:
cfg_effort = 'big'

#model_path = "../input/debertav3small/"
model_path = "../input/deberta-v3-large/deberta-v3-large"

cfg_folds = 2

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, local_files_only=True)

def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
set_config(transform_output="pandas")

# transform text to add prompt and question
columns = ["text", "content", "wording", "student_id", "prompt_id"] 

def create_input(df):
    s = tokenizer.sep_token
    df['text'] = df["prompt_title"] + s + df["prompt_question"] + s + df["text"] # no "prompt_text"]
    return df

df_train = pd.merge(df_train_pro_file, df_train_sum_file, how='left', on='prompt_id')
df_train = create_input(df_train)
df_train = df_train[columns]

# create folds for cv
group_kfold = GroupKFold(n_splits=4)
groups = df_train["prompt_id"]
df_train['fold'] = ''

for f, (train_index, test_index) in enumerate(group_kfold.split(df_train, None, groups)):
    print(f"Fold {f}: {train_index[:5]}, {test_index[:5]}")
    print(f"groups in Train: index={train_index[:5]}, group={set([groups[i] for i in train_index])}")
    print(f"groups in Test: index={test_index[:5]}, group={set([groups[i] for i in test_index])}")
    df_train.loc[test_index, 'fold'] = f 
    
set([(r,p) for r,p in df_train[["fold", "prompt_id"]].values])

Fold 0: [2057 2058 2059 2060 2061], [0 1 2 3 4]
groups in Train: index=[2057 2058 2059 2060 2061], group={'ebad26', '814d6b', '3b9047'}
groups in Test: index=[0 1 2 3 4], group={'39c16e'}
Fold 1: [0 1 2 3 4], [2057 2058 2059 2060 2061]
groups in Train: index=[0 1 2 3 4], group={'39c16e', 'ebad26', '814d6b'}
groups in Test: index=[2057 2058 2059 2060 2061], group={'3b9047'}
Fold 2: [0 1 2 3 4], [5169 5170 5171 5172 5173]
groups in Train: index=[0 1 2 3 4], group={'39c16e', '814d6b', '3b9047'}
groups in Test: index=[5169 5170 5171 5172 5173], group={'ebad26'}
Fold 3: [0 1 2 3 4], [4066 4067 4068 4069 4070]
groups in Train: index=[0 1 2 3 4], group={'39c16e', 'ebad26', '3b9047'}
groups in Test: index=[4066 4067 4068 4069 4070], group={'814d6b'}


{(0, '39c16e'), (1, '3b9047'), (2, 'ebad26'), (3, '814d6b')}

In [6]:
ds = Dataset.from_pandas(df_train)

df_test = pd.merge(df_test_pro_file, df_test_sum_file, how='left', on='prompt_id')
df_test['content'] = ''
df_test['wording'] = ''
df_test = create_input(df_test)
df_test = df_test[columns]

dataset_big = DatasetDict({'train': ds, 'submission': Dataset.from_pandas(df_test)})
print(dataset_big)
dataset_big['submission']['text'][:1][0][:512]

DatasetDict({
    train: Dataset({
        features: ['text', 'content', 'wording', 'student_id', 'prompt_id', 'fold'],
        num_rows: 7165
    })
    submission: Dataset({
        features: ['text', 'content', 'wording', 'student_id', 'prompt_id'],
        num_rows: 4
    })
})


'Example Title 1[SEP]Summarize...[SEP]Example text 1'

In [7]:
dataset = None

if cfg_effort == 'big':
    dataset = dataset_big
else:
    ds_folds = concatenate_datasets([
        dataset_big['train'].filter(lambda x: x['fold'] == 0).shuffle().select(range(25)), 
        dataset_big['train'].filter(lambda x: x['fold'] == 1).shuffle().select(range(22)),
        dataset_big['train'].filter(lambda x: x['fold'] == 2).shuffle().select(range(24)),
        dataset_big['train'].filter(lambda x: x['fold'] == 3).shuffle().select(range(21))
    ])
    dataset = DatasetDict({'train': ds_folds, 'submission': dataset_big['submission']})

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'content', 'wording', 'student_id', 'prompt_id', 'fold'],
        num_rows: 7165
    })
    submission: Dataset({
        features: ['text', 'content', 'wording', 'student_id', 'prompt_id'],
        num_rows: 4
    })
})

In [8]:
def get_MCRMSE_score(eval_pred):
    # columnwise root mean squared error
    preds, labels = eval_pred
    print(f"Computing MCRMSE score for preds: {len(preds)} and labels {len(labels)}")
         
    by_column = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(by_column)
    return mcrmse

r = (np.array([[0.2, 0.4, 0.2], [1, 0.4, 1]]), np.array([[0.2, 0.2, 0.2], [1, 1, 1]]))
print(get_MCRMSE_score(r))

Computing MCRMSE score for preds: 2 and labels 2
0.14907119849998599


In [9]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    rmse = mean_squared_error(labels, logits, squared=False)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    smape = 1/len(labels) * np.sum(2 * np.abs(logits-labels) / (np.abs(labels) + np.abs(logits))*100)
    
    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape}

In [10]:
class OneLabelRegressor:
    def __init__(self, target, tokenizer, dataset):
        self.target = target
        self.trainer = None
        self.tokenizer = tokenizer
        self.data_collator = DataCollatorWithPadding(tokenizer)
        self.dataset = dataset
        
        #self.model = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-xsmall', num_labels=1, problem_type="regression").to(device) # 1 for regression
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_path, 
            num_labels=1, 
            problem_type="regression",
            hidden_dropout_prob=0.005,
            attention_probs_dropout_prob=0.005
        ).to(device)
        
        
        self.training_args = TrainingArguments(
            output_dir ='/kaggle/working' + self.target,          
            num_train_epochs = 3,     
            per_device_train_batch_size = 3,   # 16 for deberta-V3-small
            per_device_eval_batch_size = 3, 
            weight_decay = 0.021,               
            learning_rate = 1.5e-5,
            save_total_limit = 10,
            logging_strategy = "epoch",
            evaluation_strategy = "epoch",
            save_strategy = "epoch",
            report_to="none",
        ) 
    
    def run_fold(self, fold):
        # select fold
        
        tokens_train = self.dataset['train'].filter(lambda x: x['fold'] != fold).map(tokenize_function, batched=True)
        tokens_test =  self.dataset['train'].filter(lambda x: x['fold'] == fold).map(tokenize_function, batched=True)
        
        if cfg_folds == 2:
            if fold == 0:
                tokens_train = self.dataset['train'].filter(lambda x: x['fold'] < 2).map(tokenize_function, batched=True)
                tokens_test =  self.dataset['train'].filter(lambda x: x['fold'] >= 2).map(tokenize_function, batched=True)
            else:
                tokens_train = self.dataset['train'].filter(lambda x: x['fold'] >= 2).map(tokenize_function, batched=True)
                tokens_test =  self.dataset['train'].filter(lambda x: x['fold'] < 2).map(tokenize_function, batched=True)
        
        tokens_train = tokens_train.with_format(type='torch')
        tokens_test = tokens_test.with_format(type='torch')
        
        self.trainer = Trainer(
            model=self.model, 
            args=self.training_args, 
            train_dataset=tokens_train,
            eval_dataset=tokens_test,
            #tokenizer = self.tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator
        )
        self.trainer.train()
        # predict on train
        eval_preds = self.trainer.predict(tokens_test).predictions.squeeze()
        eval_labels = self.dataset['train'].filter(lambda x: x['fold'] == fold)["labels"]
        
        if cfg_folds == 2:
            if fold == 0:
                eval_labels = self.dataset['train'].filter(lambda x: x['fold'] >= 2)["labels"]
            else:
                eval_labels = self.dataset['train'].filter(lambda x: x['fold'] < 2)["labels"]
              
        return eval_preds, eval_labels, self.trainer.evaluate()
        
        
    def run(self):
        self.dataset = self.dataset.rename_column(self.target, "labels") # because the model expects it
        rmse = 0
        loss = 0
        eval_subs = [0.0 for i in range(len(self.dataset['submission']))]
        #eval_preds = [0.0 for i in range(len(self.dataset['train']))]
        eval_preds = []
        eval_labels = []
        
        for f in range(cfg_folds):
            print()
            print(f"Training fold {f} for target {self.target} now")
            preds, labels, eval_metrics = self.run_fold(f) 
            print(f"Fold {f} trained. Returns {len(preds)} preds with {len(labels)} labels.")
            #for i in range(len(preds)):
            #    eval_preds[i + f * len(preds)] += preds[i]
            eval_preds.extend(preds)
            eval_labels.extend(labels)
            print(f"After fold {f} {len(eval_preds)} preds accumulated and {len(eval_labels)} labels.")
            rmse += eval_metrics['eval_rmse']
            loss += eval_metrics['eval_loss']

        # predict on submission
        tokens_sub =  self.dataset['submission'].map(tokenize_function, batched=True)
        tokens_sub = tokens_sub.with_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask'])
        eval_subs = self.trainer.predict(tokens_sub).predictions.squeeze()
        print(f"Predicted submission for target {self.target} : {eval_subs}")  

        self.dataset = self.dataset.rename_column("labels", self.target)        
        
        return eval_subs, (eval_preds, eval_labels), {'eval_rmse': rmse / 4, 'eval_loss': loss / 4}       

In [11]:
metrics_set = {}
subs_set = {}
preds_set = {}

for t in ['content', 'wording']:
    r = OneLabelRegressor(t, tokenizer, dataset)
    subs_set[t], preds_set[t], metrics_set[t] = r.run()
    #metrics_set.append(pd.DataFrame([[k, v] for k,v in results.items()], columns=['metric', t]))
#eval_metrics = metrics_set[0].merge(metrics_set[1], left_index=True, right_index=True)
#eval_metrics

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at ../input/deberta-v3-large/deberta-v3-large and are newly initialized: ['pooler.dense.bias', 'classifier.weight', 'classifier.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training fold 0 for target content now


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,0.2866,0.318244,0.318244,0.564131,0.427315,0.709016,82.318717
2,0.1593,0.259847,0.259847,0.509752,0.379544,0.762411,76.30654
3,0.0881,0.252056,0.252056,0.502052,0.378089,0.769534,75.003066


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

Fold 0 trained. Returns 3099 preds with 3099 labels.
After fold 0 3099 preds accumulated and 3099 labels.

Training fold 1 for target content now


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,0.2316,0.172285,0.172285,0.415073,0.299391,0.841218,60.651431
2,0.1469,0.113931,0.113931,0.337536,0.25678,0.894999,57.112492
3,0.0694,0.092739,0.092739,0.30453,0.227296,0.91453,52.811024


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

Fold 1 trained. Returns 4066 preds with 4066 labels.
After fold 1 7165 preds accumulated and 7165 labels.


  0%|          | 0/1 [00:00<?, ?ba/s]

Predicted submission for target content : [-1.6813595 -1.6832861 -1.6834652 -1.6884668]


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at ../input/deberta-v3-large/deberta-v3-large and are newly initialized: ['pooler.dense.bias', 'classifier.weight', 'classifier.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training fold 0 for target wording now


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,0.5047,0.597007,0.597007,0.772662,0.596752,0.479419,97.113595
2,0.2839,0.627403,0.627403,0.792088,0.611078,0.452915,98.782631
3,0.1579,0.671262,0.671262,0.819306,0.624622,0.41467,97.545559


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

Fold 0 trained. Returns 3099 preds with 3099 labels.
After fold 0 3099 preds accumulated and 3099 labels.

Training fold 1 for target wording now


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,0.3808,0.228327,0.228327,0.477836,0.361453,0.774625,72.470533
2,0.209,0.172843,0.172843,0.415744,0.306907,0.829392,66.337502
3,0.1012,0.149285,0.149285,0.386374,0.284169,0.852645,62.039643


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

Fold 1 trained. Returns 4066 preds with 4066 labels.
After fold 1 7165 preds accumulated and 7165 labels.


  0%|          | 0/1 [00:00<?, ?ba/s]

Predicted submission for target wording : [-1.5072397 -1.5046117 -1.5083785 -1.5098203]


In [12]:
df_preds = pd.DataFrame()
df_labels = pd.DataFrame()
for t in ["content", "wording"]:
    df_preds[t] = preds_set[t][0]
    df_labels[t] = preds_set[t][1]
eval_pred = (df_preds.to_numpy(), df_labels.to_numpy())
score = get_MCRMSE_score(eval_pred)
print(f'Competition Score Test = {score} with RMSE content = {metrics_set["content"]["eval_rmse"]} and wording = {metrics_set["wording"]["eval_rmse"]}')

Computing MCRMSE score for preds: 7165 and labels 7165
Competition Score Test = 0.507233146834671 with RMSE content = 0.20164553076028824 and wording = 0.30142003297805786


- Competition Score Test = 0.9983877201707859 Train = 0.9334373137383113 with small dataset and text as input
- Competition Score Test = 1.0509906661769226 Train = 0.9430155933906172 with small dataset and text, prompt_title, prompt_question as input
- Competition Score Test = 0.9892124610593578 with same but CV by group
   - Competition Score Test = 0.8441502913515511 with RMSE content = 0.8443778902292252 and wording = 0.8357343822717667 with same
   - Competition Score Test = 1.0648084578659271 with RMSE content = 1.0498037487268448 and wording = 0.9934460520744324 with same
   - high variability!
- Competition Score Test = 0.9185786665608567 with RMSE content = 0.7520671784877777 and wording = 0.9256728887557983 with same but 3 to 5 epochs
- Competition Score Test = 1.2718997770601512 with RMSE content = 0.31187612004578114 and wording = 0.3034804631024599 with 5 epochs big dataset
- Competition Score Test = 0.5024914409907989 with RMSE content = 0.39404456689953804 and wording = 0.5353490635752678 with deberta-v3-large and 3 epochs small dataset

In [13]:
# submission
df_sub = dataset['submission'].to_pandas()
for t in ["content", "wording"]:
    df_sub[t] = subs_set[t]
df_sub = df_sub[['student_id', 'content', 'wording']]
df_sub

Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.68136,-1.50724
1,222222cccccc,-1.683286,-1.504612
2,111111eeeeee,-1.683465,-1.508379
3,333333dddddd,-1.688467,-1.50982


In [14]:
df_sub.to_csv(output_path + 'submission.csv', index=False)