In [1]:
import numpy as np 
import pandas as pd

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv
/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv
/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv
/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv
/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv


In [2]:
input_path = "/kaggle/input/commonlit-evaluate-student-summaries/"
df_train_pro_file = pd.read_csv(input_path + 'prompts_train.csv')
df_train_sum_file = pd.read_csv(input_path + 'summaries_train.csv')

In [3]:
import torch
print(torch.cuda.is_available())

device = "cuda" if torch.cuda.is_available() else "cpu"

True


# Creating a Dataset

In [4]:
from sklearn import set_config
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

def prepare_data(df_prompts, df_summaries):
    set_config(transform_output="pandas")
    scaler = ColumnTransformer([("scaled", StandardScaler(), ['content', 'wording'])], remainder='passthrough')
    df_summaries = scaler.fit_transform(df_summaries)
    d = {}
    for c in df_summaries.columns:
        if c.startswith('scaled__'):
            d[c] = c[8:]
        elif c.startswith('remainder__'):
            d[c] = c[11:]
    df_summaries = df_summaries.rename(columns = d)
    df_merged = pd.merge(df_prompts, df_summaries, how='left', on='prompt_id')
    
    # TODO need some string cleaning?
    return df_merged



In [5]:
df = prepare_data(df_train_pro_file, df_train_sum_file)
df.sample(3)

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,content,wording,student_id,text
5263,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",1.383675,0.308263,0dc8cd731fe7,The different ways the factories would use and...
6856,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",0.872908,-0.34002,d86654b16974,They used many wild methods of masking the sme...
3506,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,-0.926129,-1.434231,ba5f4e4383f4,The structure of the ancient egyptian system ...


In [6]:
df.describe()

Unnamed: 0,content,wording
count,7165.0,7165.0
mean,7.933485e-18,2.3800450000000002e-17
std,1.00007,1.00007
min,-1.643519,-1.833577
25%,-0.7519837,-0.7815319
50%,-0.0756695,-0.01804831
75%,0.4930662,0.5472181
max,3.751981,4.221879


In [7]:
from datasets import Dataset

df = df[['text', 'content']]
ds = Dataset.from_pandas(df)
ds = ds.train_test_split(test_size=0.1, shuffle=True) # TODO stratify_by_column="prompt_id",  does not work!
#ds = ds.remove_columns(['prompt_id', 'prompt_question', 'prompt_title', 'prompt_text','student_id', 'wording'])
ds = ds.rename_column("content", "labels") # because the model expect a field beginning with label
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 6448
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 717
    })
})

In [8]:
# TODO splits for CV

In [9]:
ds["train"][3]

{'text': 'Three elements of an ideal tragedy would include death or multiple deaths, a single issue plot, and it should induce pity and fear into the audience.',
 'labels': -0.9978115162295333}

# Creating metrics

In [10]:
def get_MCRMSE_score(eval_pred):
    # columnwise root mean squared error:
    preds, labels = eval_pred
         
    by_column = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(by_column)
    return mcrmse

r = (np.array([[0.2, 0.4, 0.2], [1, 0.4, 1]]), np.array([[0.2, 0.2, 0.2], [1, 1, 1]]))
print(get_MCRMSE_score(r))

0.14907119849998599


In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    rmse = mean_squared_error(labels, logits, squared=False)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    smape = 1/len(labels) * np.sum(2 * np.abs(logits-labels) / (np.abs(labels) + np.abs(logits))*100)
    mcrmse = get_MCRMSE_score(eval_pred)
    
    return {"mcrmse": mcrmse, "mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape}

# Creating the tokenizer

In [12]:
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small', use_fast=True)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized = ds.map(tokenize_function, batched=True)
tokenized = tokenized.with_format(type='torch')
tokenized

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6448
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 717
    })
})

In [13]:
# checking the tokenizer
phrase = 'these duties without assistance. The pharaoh appointed a chief minister called a vizier.'
tokens = tokenizer.encode(phrase, return_tensors='pt', truncation=True, padding=True) #, max_length=10
print(tokens)
print(tokenizer.decode(tokens[0]))
tokens_plus = tokenizer.encode_plus(phrase, return_tensors='pt', truncation=True, padding=True)
print(tokens_plus)
print(tokenizer.decode(tokens_plus.input_ids[0]))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


tensor([[    1,   378,  5311,   497,  2472,   260,   279, 72139,  4368,   266,
          2785,  3931,   650,   266, 32336,  5133,   260,     2]])
[CLS] these duties without assistance. The pharaoh appointed a chief minister called a vizier.[SEP]
{'input_ids': tensor([[    1,   378,  5311,   497,  2472,   260,   279, 72139,  4368,   266,
          2785,  3931,   650,   266, 32336,  5133,   260,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
[CLS] these duties without assistance. The pharaoh appointed a chief minister called a vizier.[SEP]


In [14]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer)

# Creating the Model

In [15]:
# preparing a super small dataset to test
small_train_dataset = tokenized["train"].shuffle().select(range(100))
small_test_dataset = tokenized["test"].shuffle().select(range(100))
small_test_dataset

Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})

In [16]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-small', num_labels=1, problem_type="regression").to(device) # 1 for regression

Downloading pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'pooler.dense.weight', 'classifier.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
import torch

tokens = tokens.to(device)
output = model(tokens)
print(output) # no loss!


SequenceClassifierOutput(loss=None, logits=tensor([[0.1474]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [18]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir ='/kaggle/working',          
    num_train_epochs = 3,     
    per_device_train_batch_size = 20,   
    per_device_eval_batch_size = 20, 
    #hidden_dropout_prob= 0.0 # 0.005
    #attention_probs_dropout_prob=0.0 # 0.005
    weight_decay = 0.021,               
    learning_rate = 1.5e-5,
    #logging_dir = './logs',            
    save_total_limit = 10,
    #load_best_model_at_end = True,     
    #metric_for_best_model = 'rmse',    
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    #logging_steps = 100,
    report_to="none",
) 

# TODO review how the batching is done
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_test_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

In [19]:
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Mcrmse,Mse,Rmse,Mae,R2,Smape
1,No log,1.215184,0.88579,1.215184,1.102354,0.875855,0.009206,174.516641
2,No log,1.195469,0.879778,1.195469,1.093375,0.863803,0.02528,176.885879
3,No log,1.188088,0.878557,1.188088,1.089995,0.859935,0.031299,177.389766


TrainOutput(global_step=15, training_loss=1.0552360534667968, metrics={'train_runtime': 27.8268, 'train_samples_per_second': 10.781, 'train_steps_per_second': 0.539, 'total_flos': 39740926464000.0, 'train_loss': 1.0552360534667968, 'epoch': 3.0})

In [20]:
trainer.evaluate()

{'eval_loss': 1.188088059425354,
 'eval_mcrmse': 0.8785567283630371,
 'eval_mse': 1.188088059425354,
 'eval_rmse': 1.0899945497512817,
 'eval_mae': 0.8599350452423096,
 'eval_r2': 0.031298664047446745,
 'eval_smape': 177.389765625,
 'eval_runtime': 1.6915,
 'eval_samples_per_second': 59.118,
 'eval_steps_per_second': 2.956,
 'epoch': 3.0}

# Predicting

In [21]:
phrase = ds['test'][24]['text']
expected = ds['test'][24]['labels']
ds['test'][15]

{'text': 'The andent egyptian systam of government is a system that held everyone in place, the pharohs were at the top and the slaves were at the bottom. The way everyone was treated was different the pharohs were treated like royalty, while the slaves were treates like wild animals. ',
 'labels': -0.07566950483183152}

In [22]:
tokens = tokenizer.encode(phrase, return_tensors='pt', truncation=True, padding=True) 
tokens = tokens.to(device)
output = model(tokens)
print(output)
expected, output.logits

SequenceClassifierOutput(loss=None, logits=tensor([[0.0385]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


(-0.9261285475237545,
 tensor([[0.0385]], device='cuda:0', grad_fn=<AddmmBackward0>))

# Move the model offline

In [23]:
save_path = '/kaggle/working/deberta_v3_small_pretrained'
!mkdir {save_path}
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
!ls {save_path}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
added_tokens.json  special_tokens_map.json  tokenizer_config.json
config.json	   spm.model
pytorch_model.bin  tokenizer.json
