In [1]:
import numpy as np 
import pandas as pd
import torch
import os
from sklearn import set_config
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from transformers import AutoModel, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification

print(f'cuda? {torch.cuda.is_available()}')
device = "cuda" if torch.cuda.is_available() else "cpu"

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



cuda? True
/kaggle/input/debertav3small/spm.model
/kaggle/input/debertav3small/config.json
/kaggle/input/debertav3small/README.md
/kaggle/input/debertav3small/tf_model.h5
/kaggle/input/debertav3small/tokenizer_config.json
/kaggle/input/debertav3small/pytorch_model.bin
/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv
/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv
/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv
/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv
/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv


In [2]:
input_path = "/kaggle/input/commonlit-evaluate-student-summaries/"
model_path = "../input/debertav3small/"
output_path = "/kaggle/working/"
df_train_pro_file = pd.read_csv(input_path + 'prompts_train.csv')
df_train_sum_file = pd.read_csv(input_path + 'summaries_train.csv')
df_test_pro_file = pd.read_csv(input_path + 'prompts_test.csv')
df_test_sum_file = pd.read_csv(input_path + 'summaries_test.csv')

In [3]:
set_config(transform_output="pandas")

df_train = pd.merge(df_train_pro_file, df_train_sum_file, how='left', on='prompt_id')
df_train = df_train[["text", "content", "wording", "student_id", "prompt_id"]]

ds = Dataset.from_pandas(df_train)
dt, dv = ds.train_test_split(test_size=0.1, shuffle=True).values()

df_test = pd.merge(df_test_pro_file, df_test_sum_file, how='left', on='prompt_id')
df_test['content'] = ''
df_test['wording'] = ''
df_test = df_test[["text", "content", "wording", "student_id", "prompt_id"]]

dataset = DatasetDict({'train': dt,'test': dv, 'submission':Dataset.from_pandas(df_test)})
dataset


#scaler = ColumnTransformer([("scaled", StandardScaler(), ['content', 'wording'])], remainder='passthrough')
#df = scaler.fit_transform(df_merged)
#df = df[['remainder__text', 'scaled__content', 'scaled__wording']]
#df = df.rename({'remainder__text': 'text', 'scaled__content': 'content', 'scaled__wording': 'wording'}, axis=1)
#df.sample(1)

DatasetDict({
    train: Dataset({
        features: ['text', 'content', 'wording', 'student_id', 'prompt_id'],
        num_rows: 6448
    })
    test: Dataset({
        features: ['text', 'content', 'wording', 'student_id', 'prompt_id'],
        num_rows: 717
    })
    submission: Dataset({
        features: ['text', 'content', 'wording', 'student_id', 'prompt_id'],
        num_rows: 4
    })
})

In [4]:
def get_MCRMSE_score(eval_pred):
    # columnwise root mean squared error
    preds, labels = eval_pred
         
    by_column = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(by_column)
    return mcrmse

r = (np.array([[0.2, 0.4, 0.2], [1, 0.4, 1]]), np.array([[0.2, 0.2, 0.2], [1, 1, 1]]))
print(get_MCRMSE_score(r))

0.14907119849998599


In [5]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    rmse = mean_squared_error(labels, logits, squared=False)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    smape = 1/len(labels) * np.sum(2 * np.abs(logits-labels) / (np.abs(labels) + np.abs(logits))*100)
    mcrmse = get_MCRMSE_score(eval_pred)
    
    return {"mcrmse": mcrmse, "mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape}

In [6]:
#tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small', use_fast=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, local_files_only=True)

def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)
    
tokenized = dataset.map(tokenize_function, batched=True)
tokenized = tokenized.with_format(type='torch')
data_collator = DataCollatorWithPadding(tokenizer)
tokenized

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'content', 'wording', 'student_id', 'prompt_id', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6448
    })
    test: Dataset({
        features: ['text', 'content', 'wording', 'student_id', 'prompt_id', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 717
    })
    submission: Dataset({
        features: ['text', 'content', 'wording', 'student_id', 'prompt_id', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4
    })
})

In [7]:
class OneLabelRegressor:
    def __init__(self, target, tokens, collator):
        self.target = target
        self.tokenized = tokens
        self.collator = collator
        self.model = None
        self.training_args = None
        self.trainer = None

    def run(self, effort='small'):
              
        #self.model = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-xsmall', num_labels=1, problem_type="regression").to(device) # 1 for regression
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=1, problem_type="regression").to(device)
        
        self.training_args = TrainingArguments(
            output_dir ='/kaggle/working' + self.target,          
            num_train_epochs = 3,     
            per_device_train_batch_size = 16,   
            per_device_eval_batch_size = 16, 
            weight_decay = 0.021,               
            learning_rate = 1.5e-5,
            save_total_limit = 10,
            logging_strategy = "epoch",
            evaluation_strategy = "epoch",
            save_strategy = "epoch",
            report_to="none",
        ) 
        
        small_train_dataset = self.tokenized["train"].shuffle().select(range(100))
        small_test_dataset = self.tokenized["test"].shuffle().select(range(100))
        
        if effort == 'big':
            self.trainer = Trainer(
                model=self.model, 
                args=self.training_args, 
                train_dataset=self.tokenized['train'],
                eval_dataset=self.tokenized['test'],
                compute_metrics=compute_metrics,
                data_collator=self.collator
            )
        else:
            self.trainer = Trainer(
                model=self.model, 
                args=self.training_args, 
                train_dataset=small_train_dataset,
                eval_dataset=small_test_dataset,
                compute_metrics=compute_metrics,
                data_collator=self.collator
            )
        self.trainer.train()
        
        
        

In [8]:
print(tokenized)
def training(target, tokenized, data_collator, effort="small"):
    tokenized = tokenized.rename_column(target, "labels") # because the model expects it
    r = OneLabelRegressor(target, tokenized, data_collator)
    r.run(effort)
    tokenized = tokenized.rename_column('labels', target)
    return r, pd.DataFrame([[k, v] for k,v in r.trainer.evaluate().items()], columns=['metric', target])

runner_c, result_c = training('content', tokenized, data_collator, 'big')

DatasetDict({
    train: Dataset({
        features: ['text', 'content', 'wording', 'student_id', 'prompt_id', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6448
    })
    test: Dataset({
        features: ['text', 'content', 'wording', 'student_id', 'prompt_id', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 717
    })
    submission: Dataset({
        features: ['text', 'content', 'wording', 'student_id', 'prompt_id', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4
    })
})


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at ../input/debertav3small/ and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Mcrmse,Mse,Rmse,Mae,R2,Smape
1,0.3277,0.193111,1.371561,0.193111,0.439444,0.330302,0.826541,67.883488
2,0.1846,0.191994,1.401463,0.191994,0.438171,0.336945,0.827545,68.278733
3,0.1543,0.170097,1.393525,0.170097,0.412429,0.313206,0.847213,65.016262


In [9]:
runner_w, result_w = training('wording', tokenized, data_collator, 'big')

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at ../input/debertav3small/ and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Mcrmse,Mse,Rmse,Mae,R2,Smape
1,0.4699,0.343823,1.354171,0.343823,0.586365,0.459849,0.689349,84.286644
2,0.3085,0.324206,1.381006,0.324206,0.569391,0.445639,0.707074,82.448767
3,0.2552,0.307636,1.361855,0.307636,0.554649,0.435619,0.722045,82.695182


In [10]:
results = result_c.merge(result_w['wording'], left_index=True, right_index=True)
results

Unnamed: 0,metric,content,wording
0,eval_loss,0.170097,0.307636
1,eval_mcrmse,1.393525,1.361855
2,eval_mse,0.170097,0.307636
3,eval_rmse,0.412429,0.554649
4,eval_mae,0.313206,0.435619
5,eval_r2,0.847213,0.722045
6,eval_smape,65.016262,82.695182
7,eval_runtime,10.488,10.3536
8,eval_samples_per_second,68.364,69.251
9,eval_steps_per_second,4.291,4.346


In [11]:
# submission
df_sub = tokenized['submission'].to_pandas()
df_sub['content'] = runner_c.trainer.predict(tokenized['submission']).predictions
df_sub['wording'] = runner_w.trainer.predict(tokenized['submission']).predictions
df_sub = df_sub[['student_id', 'content', 'wording']]
df_sub

Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.573557,-1.31111
1,222222cccccc,-1.602774,-1.307745
2,111111eeeeee,-1.569436,-1.299939
3,333333dddddd,-1.581009,-1.29319


In [12]:
df_sub.to_csv(output_path + 'submission.csv', index=False)

In [13]:
#!cat /kaggle/working/submission.csv

In [14]:
#TODO compute metric truly
#TODO