In [1]:
import numpy as np 
import pandas as pd
import torch
import os
from sklearn import set_config
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from datasets import Dataset, DatasetDict, load_dataset, concatenate_datasets
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GroupKFold
from transformers import AutoModel, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
print(f'cuda? {torch.cuda.is_available()}')
device = "cuda" if torch.cuda.is_available() else "cpu"



/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv
/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv
/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv
/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv
/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv
/kaggle/input/deberta-v3-large/deberta-v3-large/spm.model
/kaggle/input/deberta-v3-large/deberta-v3-large/config.json
/kaggle/input/deberta-v3-large/deberta-v3-large/README.md
/kaggle/input/deberta-v3-large/deberta-v3-large/tf_model.h5
/kaggle/input/deberta-v3-large/deberta-v3-large/tokenizer_config.json
/kaggle/input/deberta-v3-large/deberta-v3-large/pytorch_model.bin
/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
cuda? True


In [2]:
!pip install /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl

from spellchecker import SpellChecker

Processing /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


In [3]:
input_path = "/kaggle/input/commonlit-evaluate-student-summaries/"


output_path = "/kaggle/working/"
df_train_pro_file = pd.read_csv(input_path + 'prompts_train.csv')
df_train_sum_file = pd.read_csv(input_path + 'summaries_train.csv')
df_test_pro_file = pd.read_csv(input_path + 'prompts_test.csv')
df_test_sum_file = pd.read_csv(input_path + 'summaries_test.csv')

In [4]:
cfg_effort = 'big'

#model_path = "../input/debertav3small/"
model_path = "../input/deberta-v3-large/deberta-v3-large"

cfg_folds = 2

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, local_files_only=True)

def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
set_config(transform_output="pandas")

# transform text to add prompt and question
columns = ["text", "content", "wording", "student_id", "prompt_id"] 

def create_input(df):
    s = tokenizer.sep_token
    df['text'] = df["prompt_title"] + s + df["prompt_question"] + s + df["text"] # no "prompt_text"]
    return df

df_train = pd.merge(df_train_pro_file, df_train_sum_file, how='left', on='prompt_id')
df_train = create_input(df_train)
df_train = df_train[columns]

# create folds for cv
group_kfold = GroupKFold(n_splits=4)
groups = df_train["prompt_id"]
df_train['fold'] = ''

for f, (train_index, test_index) in enumerate(group_kfold.split(df_train, None, groups)):
    print(f"Fold {f}: {train_index[:5]}, {test_index[:5]}")
    print(f"groups in Train: index={train_index[:5]}, group={set([groups[i] for i in train_index])}")
    print(f"groups in Test: index={test_index[:5]}, group={set([groups[i] for i in test_index])}")
    df_train.loc[test_index, 'fold'] = f 
    
set([(r,p) for r,p in df_train[["fold", "prompt_id"]].values])

Fold 0: [2057 2058 2059 2060 2061], [0 1 2 3 4]
groups in Train: index=[2057 2058 2059 2060 2061], group={'3b9047', '814d6b', 'ebad26'}
groups in Test: index=[0 1 2 3 4], group={'39c16e'}
Fold 1: [0 1 2 3 4], [2057 2058 2059 2060 2061]
groups in Train: index=[0 1 2 3 4], group={'ebad26', '814d6b', '39c16e'}
groups in Test: index=[2057 2058 2059 2060 2061], group={'3b9047'}
Fold 2: [0 1 2 3 4], [5169 5170 5171 5172 5173]
groups in Train: index=[0 1 2 3 4], group={'3b9047', '814d6b', '39c16e'}
groups in Test: index=[5169 5170 5171 5172 5173], group={'ebad26'}
Fold 3: [0 1 2 3 4], [4066 4067 4068 4069 4070]
groups in Train: index=[0 1 2 3 4], group={'ebad26', '3b9047', '39c16e'}
groups in Test: index=[4066 4067 4068 4069 4070], group={'814d6b'}


{(0, '39c16e'), (1, '3b9047'), (2, 'ebad26'), (3, '814d6b')}

In [7]:
ds = Dataset.from_pandas(df_train)

df_test = pd.merge(df_test_pro_file, df_test_sum_file, how='left', on='prompt_id')
df_test['content'] = ''
df_test['wording'] = ''
df_test = create_input(df_test)
df_test = df_test[columns]

dataset_big = DatasetDict({'train': ds, 'submission': Dataset.from_pandas(df_test)})
print(dataset_big)
dataset_big['submission']['text'][:1][0][:512]

DatasetDict({
    train: Dataset({
        features: ['text', 'content', 'wording', 'student_id', 'prompt_id', 'fold'],
        num_rows: 7165
    })
    submission: Dataset({
        features: ['text', 'content', 'wording', 'student_id', 'prompt_id'],
        num_rows: 4
    })
})


'Example Title 1[SEP]Summarize...[SEP]Example text 1'

In [8]:
dataset = None

if cfg_effort == 'big':
    dataset = dataset_big
else:
    ds_folds = concatenate_datasets([
        dataset_big['train'].filter(lambda x: x['fold'] == 0).shuffle().select(range(25)), 
        dataset_big['train'].filter(lambda x: x['fold'] == 1).shuffle().select(range(22)),
        dataset_big['train'].filter(lambda x: x['fold'] == 2).shuffle().select(range(24)),
        dataset_big['train'].filter(lambda x: x['fold'] == 3).shuffle().select(range(21))
    ])
    dataset = DatasetDict({'train': ds_folds, 'submission': dataset_big['submission']})

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'content', 'wording', 'student_id', 'prompt_id', 'fold'],
        num_rows: 7165
    })
    submission: Dataset({
        features: ['text', 'content', 'wording', 'student_id', 'prompt_id'],
        num_rows: 4
    })
})

In [9]:
def get_MCRMSE_score(eval_pred):
    # columnwise root mean squared error
    preds, labels = eval_pred
    print(f"Computing MCRMSE score for preds: {len(preds)} and labels {len(labels)}")
         
    by_column = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(by_column)
    return mcrmse

r = (np.array([[0.2, 0.4, 0.2], [1, 0.4, 1]]), np.array([[0.2, 0.2, 0.2], [1, 1, 1]]))
print(get_MCRMSE_score(r))

Computing MCRMSE score for preds: 2 and labels 2
0.14907119849998599


In [10]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    rmse = mean_squared_error(labels, logits, squared=False)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    smape = 1/len(labels) * np.sum(2 * np.abs(logits-labels) / (np.abs(labels) + np.abs(logits))*100)
    
    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape}

In [11]:
class OneLabelRegressor:
    def __init__(self, target, tokenizer, dataset):
        self.target = target
        self.trainer = None
        self.tokenizer = tokenizer
        self.data_collator = DataCollatorWithPadding(tokenizer)
        self.dataset = dataset
        
        #self.model = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-xsmall', num_labels=1, problem_type="regression").to(device) # 1 for regression
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_path, 
            num_labels=1, 
            problem_type="regression",
            hidden_dropout_prob=0.005,
            attention_probs_dropout_prob=0.005
        ).to(device)
        
        
        self.training_args = TrainingArguments(
            output_dir ='/kaggle/working' + self.target,          
            num_train_epochs = 3,     
            per_device_train_batch_size = 3,   # 16 for deberta-v3-small 3 for deberta-v3-large
            per_device_eval_batch_size = 3, 
            weight_decay = 0.021,               
            learning_rate = 1.5e-5,
            save_total_limit = 10,
            logging_strategy = "epoch",
            evaluation_strategy = "epoch",
            save_strategy = "epoch",
            report_to="none",
        ) 
    
    def run_fold(self, fold):
        # select fold
        
        tokens_train = self.dataset['train'].filter(lambda x: x['fold'] != fold).map(tokenize_function, batched=True)
        tokens_test =  self.dataset['train'].filter(lambda x: x['fold'] == fold).map(tokenize_function, batched=True)
        
        if cfg_folds == 2:
            if fold == 0:
                tokens_train = self.dataset['train'].filter(lambda x: x['fold'] < 2).map(tokenize_function, batched=True)
                tokens_test =  self.dataset['train'].filter(lambda x: x['fold'] >= 2).map(tokenize_function, batched=True)
            else:
                tokens_train = self.dataset['train'].filter(lambda x: x['fold'] >= 2).map(tokenize_function, batched=True)
                tokens_test =  self.dataset['train'].filter(lambda x: x['fold'] < 2).map(tokenize_function, batched=True)
        
        tokens_train = tokens_train.with_format(type='torch')
        tokens_test = tokens_test.with_format(type='torch')
        
        self.trainer = Trainer(
            model=self.model, 
            args=self.training_args, 
            train_dataset=tokens_train,
            eval_dataset=tokens_test,
            #tokenizer = self.tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator
        )
        self.trainer.train()
        # predict on train
        eval_preds = self.trainer.predict(tokens_test).predictions.squeeze()
        eval_labels = self.dataset['train'].filter(lambda x: x['fold'] == fold)["labels"]
        
        if cfg_folds == 2:
            if fold == 0:
                eval_labels = self.dataset['train'].filter(lambda x: x['fold'] >= 2)["labels"]
            else:
                eval_labels = self.dataset['train'].filter(lambda x: x['fold'] < 2)["labels"]
              
        return eval_preds, eval_labels, self.trainer.evaluate()
        
        
    def run(self):
        self.dataset = self.dataset.rename_column(self.target, "labels") # because the model expects it
        rmse = 0
        loss = 0
        eval_subs = [0.0 for i in range(len(self.dataset['submission']))]
        #eval_preds = [0.0 for i in range(len(self.dataset['train']))]
        eval_preds = []
        eval_labels = []
        
        for f in range(cfg_folds):
            print()
            print(f"Training fold {f} for target {self.target} now")
            preds, labels, eval_metrics = self.run_fold(f) 
            print(f"Fold {f} trained. Returns {len(preds)} preds with {len(labels)} labels.")
            #for i in range(len(preds)):
            #    eval_preds[i + f * len(preds)] += preds[i]
            eval_preds.extend(preds)
            eval_labels.extend(labels)
            print(f"After fold {f} {len(eval_preds)} preds accumulated and {len(eval_labels)} labels.")
            rmse += eval_metrics['eval_rmse']
            loss += eval_metrics['eval_loss']

        # predict on submission
        tokens_sub =  self.dataset['submission'].map(tokenize_function, batched=True)
        tokens_sub = tokens_sub.with_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask'])
        eval_subs = self.trainer.predict(tokens_sub).predictions.squeeze()
        print(f"Predicted submission for target {self.target} : {eval_subs}")  

        self.dataset = self.dataset.rename_column("labels", self.target)        
        
        return eval_subs, (eval_preds, eval_labels), {'eval_rmse': rmse / 4, 'eval_loss': loss / 4} 

In [12]:
metrics_set = {}
subs_set = {}
preds_set = {}

for t in ['content', 'wording']:
    r = OneLabelRegressor(t, tokenizer, dataset)
    subs_set[t], preds_set[t], metrics_set[t] = r.run()


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at ../input/deberta-v3-large/deberta-v3-large and are newly initialized: ['classifier.bias', 'pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training fold 0 for target content now


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,0.2727,0.295229,0.295229,0.543349,0.41321,0.73006,80.660949
2,0.1491,0.217649,0.217649,0.466529,0.348201,0.800994,69.73978
3,0.084,0.241025,0.241025,0.490943,0.369014,0.77962,73.502894


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

Fold 0 trained. Returns 3099 preds with 3099 labels.
After fold 0 3099 preds accumulated and 3099 labels.

Training fold 1 for target content now


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,0.2388,0.163664,0.163664,0.404554,0.30339,0.849164,62.115731
2,0.1499,0.098485,0.098485,0.313823,0.24004,0.909234,55.199616
3,0.0649,0.088959,0.088959,0.29826,0.225146,0.918014,53.416899


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

Fold 1 trained. Returns 4066 preds with 4066 labels.
After fold 1 7165 preds accumulated and 7165 labels.


  0%|          | 0/1 [00:00<?, ?ba/s]

Predicted submission for target content : [-1.5693408 -1.5761988 -1.5686285 -1.5731684]


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at ../input/deberta-v3-large/deberta-v3-large and are newly initialized: ['classifier.bias', 'pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training fold 0 for target wording now


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,0.4716,0.473624,0.473624,0.688203,0.537426,0.587008,95.837679
2,0.2602,0.396823,0.396823,0.629939,0.472686,0.653977,86.495644
3,0.1403,0.457965,0.457965,0.676731,0.518882,0.600662,91.997156


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

Fold 0 trained. Returns 3099 preds with 3099 labels.
After fold 0 3099 preds accumulated and 3099 labels.

Training fold 1 for target wording now


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,0.3593,0.314663,0.314663,0.560948,0.427511,0.689406,77.256548
2,0.194,0.162552,0.162552,0.403177,0.29776,0.83955,64.949029
3,0.0895,0.150549,0.150549,0.388007,0.28546,0.851397,62.037083


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

Fold 1 trained. Returns 4066 preds with 4066 labels.
After fold 1 7165 preds accumulated and 7165 labels.


  0%|          | 0/1 [00:00<?, ?ba/s]

Predicted submission for target wording : [-1.5208714 -1.5202525 -1.5170654 -1.5052145]


In [13]:
df_preds = pd.DataFrame()
df_labels = pd.DataFrame()
for t in ["content", "wording"]:
    df_preds[t] = preds_set[t][0]
    df_labels[t] = preds_set[t][1]
eval_pred = (df_preds.to_numpy(), df_labels.to_numpy())
score = get_MCRMSE_score(eval_pred)
print(f'Competition Score Test = {score} with RMSE content = {metrics_set["content"]["eval_rmse"]} and wording = {metrics_set["wording"]["eval_rmse"]}')

Computing MCRMSE score for preds: 7165 and labels 7165
Competition Score Test = 0.462908379595264 with RMSE content = 0.1973007246851921 and wording = 0.2661844566464424


In [14]:
print(dataset['train'].filter(lambda x: x['fold'] >= 2)[0])
print(preds_set['content'][1][0], preds_set['content'][0][0])

  0%|          | 0/8 [00:00<?, ?ba/s]

{'text': 'The Third Wave[SEP]Summarize how the Third Wave developed over such a short period of time and why the experiment was ended.[SEP]The third wave was an experimentto see how people reacted to a new one leader government. It gained popularity as people wanted to try new things. The students follow anything that is said and start turning on eachother to gain higher power. They had to stop the experement as too many people got to radical with it blindly following there leader', 'content': 0.205682506482641, 'wording': 0.380537638762288, 'student_id': '000e8c3c7ddb', 'prompt_id': '814d6b', 'fold': 3}
0.205682506482641 -0.1299326


In [15]:
# now train a regressor on the output with extra columns
df_ds = dataset['train'].to_pandas()
df_stage = pd.concat([df_ds[df_ds['fold'] >= 2], df_ds[df_ds['fold'] < 2]], axis=0)

for t in ["content", "wording"]:
    df_stage['pred_' + t] = preds_set[t][0]
    df_stage['labels_' + t] = preds_set[t][1]

df_stage.head()

Unnamed: 0,text,content,wording,student_id,prompt_id,fold,pred_content,labels_content,pred_wording,labels_wording
4066,The Third Wave[SEP]Summarize how the Third Wav...,0.205683,0.380538,000e8c3c7ddb,814d6b,3,-0.129933,0.205683,0.775685,0.380538
4067,The Third Wave[SEP]Summarize how the Third Wav...,3.272894,3.219757,0070c9e7af47,814d6b,3,1.757223,3.272894,2.510018,3.219757
4068,The Third Wave[SEP]Summarize how the Third Wav...,0.205683,0.380538,0095993991fe,814d6b,3,0.010931,0.205683,0.878031,0.380538
4069,The Third Wave[SEP]Summarize how the Third Wav...,0.567975,0.969062,00c20c6ddd23,814d6b,3,0.305905,0.567975,1.108085,0.969062
4070,The Third Wave[SEP]Summarize how the Third Wav...,-0.910596,-0.081769,00d40ad10dc9,814d6b,3,-1.111217,-0.910596,-0.226701,-0.081769


In [16]:
print(df_stage[abs(df_stage['content'] - df_stage['labels_content']) > 1])
print(df_stage[abs(df_stage['wording'] - df_stage['labels_wording']) > 1])

Empty DataFrame
Columns: [text, content, wording, student_id, prompt_id, fold, pred_content, labels_content, pred_wording, labels_wording]
Index: []
Empty DataFrame
Columns: [text, content, wording, student_id, prompt_id, fold, pred_content, labels_content, pred_wording, labels_wording]
Index: []


In [17]:
!pip install pyspellchecker

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [18]:
df_stage['length-text'] = df_stage['text'].str.len()

spell = SpellChecker()
spell.distance = 2

df_stage['spelling-errors'] = df_stage['text'].map(lambda x : len(spell.unknown([i for i in x.split()])))

In [19]:
df_stage = df_stage.reset_index(drop=True)
df_stage.head()

Unnamed: 0,text,content,wording,student_id,prompt_id,fold,pred_content,labels_content,pred_wording,labels_wording,length-text,spelling-errors
0,The Third Wave[SEP]Summarize how the Third Wav...,0.205683,0.380538,000e8c3c7ddb,814d6b,3,-0.129933,0.205683,0.775685,0.380538,475,7
1,The Third Wave[SEP]Summarize how the Third Wav...,3.272894,3.219757,0070c9e7af47,814d6b,3,1.757223,3.272894,2.510018,3.219757,1354,31
2,The Third Wave[SEP]Summarize how the Third Wav...,0.205683,0.380538,0095993991fe,814d6b,3,0.010931,0.205683,0.878031,0.380538,474,8
3,The Third Wave[SEP]Summarize how the Third Wav...,0.567975,0.969062,00c20c6ddd23,814d6b,3,0.305905,0.567975,1.108085,0.969062,580,16
4,The Third Wave[SEP]Summarize how the Third Wav...,-0.910596,-0.081769,00d40ad10dc9,814d6b,3,-1.111217,-0.910596,-0.226701,-0.081769,274,5


In [20]:
# do the same for submission
df_sub = dataset['submission'].to_pandas()
for t in ["content", "wording"]:
    df_sub['pred_' + t] = subs_set[t]
df_sub['spelling-errors'] = df_sub['text'].map(lambda x : len(spell.unknown([i for i in x.split()])))
df_sub['length-text'] = df_sub['text'].str.len()
df_sub.head()

Unnamed: 0,text,content,wording,student_id,prompt_id,pred_content,pred_wording,spelling-errors,length-text
0,Example Title 1[SEP]Summarize...[SEP]Example t...,,,000000ffffff,abc123,-1.569341,-1.520871,1,51
1,Example Title 1[SEP]Summarize...[SEP]Example t...,,,222222cccccc,abc123,-1.576199,-1.520252,1,51
2,Example Title 2[SEP]Summarize...[SEP]Example t...,,,111111eeeeee,def789,-1.568629,-1.517065,1,51
3,Example Title 2[SEP]Summarize...[SEP]Example t...,,,333333dddddd,def789,-1.573168,-1.505214,1,51


In [21]:
cv = KFold(n_splits=5)
scores = []

def compute_scores(clf, cv_strat, df_x, df_y, col_name):
    for i in ['neg_mean_squared_error', 'neg_mean_absolute_error']:
        scores.append([col_name, i, cross_val_score(clf, df_x, df_y, cv=cv_strat, scoring=i).mean()])
    return scores

In [22]:
# regressor for content
df_content_x = df_stage[['pred_content', 'pred_wording', 'length-text', 'spelling-errors']]
df_content_y = df_stage['content']

clf = XGBRegressor()

In [23]:
#from sklearn.model_selection import GridSearchCV
#from sklearn.model_selection import RandomizedSearchCV

#param_grid = {
#    'n_estimators': [30, 40, 50, 50, 70, 80, 90, 100, 120, 130, 140, 150, 200], 
#    'max_depth': [3, 5, 7, 9, 11, 13, 15, 17], 
#    'learning_rate': [1, 0.5, 0.1, 0.05, 0.01,0.005],
#}
#grid_search = RandomizedSearchCV(
#   estimator = clf,
#   param_distributions = param_grid,
#   scoring = 'neg_mean_squared_error', 
#   n_iter = 1000,
#   n_jobs = 10,
#   cv = 5,
#   verbose = True
#)
#grid_search.fit(df_content_x, df_content_y)
#grid_search.best_params_

# Fitting 5 folds for each of 624 candidates, totalling 3120 fits
# {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1}

In [24]:
parameters = {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}

clf = XGBRegressor(**parameters)

clf.fit(df_content_x, df_content_y)
preds_last_content = pd.DataFrame(clf.predict(df_content_x), columns=['content'])

compute_scores(clf, cv, df_content_x, df_content_y, 'XGB content')

[['XGB content', 'neg_mean_squared_error', -0.15597115000538692],
 ['XGB content', 'neg_mean_absolute_error', -0.2932811574564137]]

In [25]:
# compute submission for content
df_sub_x = df_sub[['pred_content', 'pred_wording', 'length-text', 'spelling-errors']]
preds_sub_content = pd.DataFrame(clf.predict(df_sub_x), columns=['content'])
preds_sub_content

Unnamed: 0,content
0,-1.43805
1,-1.43805
2,-1.43805
3,-1.43805


In [26]:
# regressor for wording
df_wording_x = df_stage[['pred_content', 'pred_wording', 'length-text', 'spelling-errors']]
df_wording_y = df_stage['wording']

clf = XGBRegressor()

In [27]:
#param_grid = {
#    'n_estimators': [30, 40, 50, 50, 70, 80, 90, 100, 120, 130, 140, 150, 200], 
#    'max_depth': [3, 5, 7, 9, 11, 13, 15, 17], 
#    'learning_rate': [1, 0.5, 0.1, 0.05, 0.01,0.005],
#}
#grid_search = RandomizedSearchCV(
#   estimator = clf,
#   param_distributions = param_grid,
#   scoring = 'neg_mean_squared_error', 
#   n_iter = 1000,
#   n_jobs = 10,
#   cv = 5,
#   verbose = True
#)
#grid_search.fit(df_content_x, df_content_y)
#grid_search.best_params_

# Fitting 5 folds for each of 624 candidates, totalling 3120 fits
# {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1}

In [28]:
parameters = {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1}

clf = XGBRegressor(**parameters)

clf.fit(df_wording_x, df_wording_y)

preds_last_wording = pd.DataFrame(clf.predict(df_wording_x), columns=['wording'])

compute_scores(clf, cv, df_wording_x, df_wording_y, 'XGB wording')

[['XGB content', 'neg_mean_squared_error', -0.15597115000538692],
 ['XGB content', 'neg_mean_absolute_error', -0.2932811574564137],
 ['XGB wording', 'neg_mean_squared_error', -0.26459856500539664],
 ['XGB wording', 'neg_mean_absolute_error', -0.37571971579195323]]

In [29]:
preds_last = pd.concat([preds_last_content, preds_last_wording], axis=1)
labels_last = pd.concat([df_content_y, df_wording_y], axis=1)

score_last = get_MCRMSE_score((preds_last.to_numpy(), labels_last.to_numpy()))
print(score_last)
print(f'{score - score_last} gained! before was {score} is now {score_last}')

Computing MCRMSE score for preds: 7165 and labels 7165
0.42190226754550164
0.041006112049762344 gained! before was 0.462908379595264 is now 0.42190226754550164


In [30]:
# compute submission for wording
df_sub_x = df_sub[['pred_content', 'pred_wording', 'length-text', 'spelling-errors']]
preds_sub_wording = pd.DataFrame(clf.predict(df_sub_x), columns=['wording'])
preds_sub_wording

Unnamed: 0,wording
0,-1.415215
1,-1.415215
2,-1.415215
3,-1.415215


In [31]:
# submission
df_sub_output = pd.concat([df_sub['student_id'], preds_sub_content, preds_sub_wording], axis=1)
df_sub_output

Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.43805,-1.415215
1,222222cccccc,-1.43805,-1.415215
2,111111eeeeee,-1.43805,-1.415215
3,333333dddddd,-1.43805,-1.415215


In [32]:
df_sub_output.to_csv(output_path + 'submission.csv', index=False)

In [33]:
#!tail submission.csv