## Library Import

In [2]:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import torch
import seaborn as sns
import warnings
import transformers

from tqdm import tqdm
from transformers import DistilBertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

In [3]:
# Keep Weights & Biases in offline mode so runs are not synced to the cloud.
os.environ["WANDB_MODE"] = "offline"

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

cpu


## Data Import

In [7]:
# Locate the competition data. The notebook is run in two places:
#   * on Kaggle, where the inputs are mounted at the absolute /kaggle/input path;
#   * locally, where a copy of that tree lives relative to the notebook.
# The first candidate directory that exists is used, so nothing has to be
# edited by hand when switching between a submission run and a local run.
COMPETITION = 'commonlit-evaluate-student-summaries'
candidate_paths = [
    f'/kaggle/input/{COMPETITION}',    # For submission
    f'../kaggle/input/{COMPETITION}',  # For testing locally (data one level up)
    f'./kaggle/input/{COMPETITION}',   # For testing locally (data next to the notebook)
]
# If none exists, keep the previous local default so read_csv below fails with
# a FileNotFoundError that names the expected location.
path = next((p for p in candidate_paths if os.path.isdir(p)), candidate_paths[1])

# Actual submission
# test_prompts_df = pd.read_csv(f'{path}/prompts_test.csv')
# test_summaries_df = pd.read_csv(f'{path}/summaries_test.csv')
# X = test_summaries_df.merge(test_prompts_df, on='prompt_id').drop(columns=['prompt_id'])
# X.head()

# Train submission: every summary joined with the prompt it answers.
train_summaries_df = pd.read_csv(f'{path}/summaries_train.csv')
train_prompts_df = pd.read_csv(f'{path}/prompts_train.csv')
merged_df = pd.merge(train_summaries_df, train_prompts_df, on='prompt_id')

# Local testing, get first 10 rows
sample_train = pd.read_csv('../data/v2/train/train_data.csv')
# Read the prompts through `path` so this follows the same data location as
# the training frames above instead of a second hard-coded copy of the path.
sample_prompts = pd.read_csv(f'{path}/prompts_train.csv')
X = sample_train.merge(sample_prompts, on='prompt_id').drop(columns=['prompt_id']).head(10)
X.head()

Unnamed: 0,student_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,8a31b8cc1996,In the social pyramid of ancient Egypt the pha...,-0.077267,0.424365,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
1,4387107feb4d,The ancient Egyptian system of government was ...,1.376083,2.389443,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,3b784d0a5c8f,Nobles were the only ont that could hold gover...,0.467722,-0.085653,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
3,1b2ff4d4edd9,They were many different social classes. The p...,-0.012957,-0.40948,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
4,108049c01946,The ancient Egyptian system of goverment is in...,2.20464,-0.645344,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...


## Split

In [8]:
FEATURE_COLUMNS = ['text', 'prompt_question', 'prompt_title', 'prompt_text']
TARGET_COLUMNS = ['content', 'wording']

# Plain-list copies of the two target columns.
wording = merged_df['wording'].tolist()
content = merged_df['content'].tolist()

# Two-stage split: 70% train first, then the remaining 30% is halved into
# 15% validation and 15% test. Both stages use the same fixed seed.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    merged_df[FEATURE_COLUMNS],
    merged_df[TARGET_COLUMNS],
    test_size=0.3,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout,
    y_holdout,
    test_size=0.5,
    random_state=42,
)

In [11]:
BASE_MODEL = "distilbert-base-uncased"
TOKENIZER_DIR = './tokenizers/distilbert-base-uncased-tokenizer'

# Fetch the tokenizer that belongs to the base checkpoint and write a copy to
# disk; the call's return value (the saved file paths) is the cell output.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.save_pretrained(TOKENIZER_DIR)

('./tokenizers/distilbert-base-uncased-tokenizer/tokenizer_config.json',
 './tokenizers/distilbert-base-uncased-tokenizer/special_tokens_map.json',
 './tokenizers/distilbert-base-uncased-tokenizer/vocab.txt',
 './tokenizers/distilbert-base-uncased-tokenizer/added_tokens.json',
 './tokenizers/distilbert-base-uncased-tokenizer/tokenizer.json')

## Tokenization

In [14]:
BASE_MODEL = "distilbert-base-uncased"
MAX_LENGTH = 512
tokenizer = AutoTokenizer.from_pretrained('./tokenizers/distilbert-base-uncased-tokenizer/')


def _build_split_dataset(features, targets):
    """Combine one split's features and targets into a single `Dataset`.

    Args:
        features: DataFrame with the columns 'text', 'prompt_question',
            'prompt_title' and 'prompt_text'.
        targets: DataFrame with the columns 'content' and 'wording', indexed
            like `features`.

    Returns:
        A `Dataset` with the four text columns plus 'content_score' and
        'wording_score'.
    """
    frame = pd.DataFrame({
        'text': features['text'],
        'prompt_question': features['prompt_question'],
        'prompt_title': features['prompt_title'],
        'prompt_text': features['prompt_text'],
        'content_score': targets['content'],
        'wording_score': targets['wording'],
    })
    # The splits keep their shuffled pandas index; without preserve_index=False
    # that index is stored as an extra `__index_level_0__` column in the dataset.
    return Dataset.from_pandas(frame, preserve_index=False)


train = _build_split_dataset(X_train, y_train)
validation = _build_split_dataset(X_val, y_val)
test = _build_split_dataset(X_test, y_test)

# Both targets start from the same raw splits; each dict is tokenized and
# labelled separately later on.
ds_content = {
    "train": train,
    "validation": validation,
    "test": test,
}

ds_wording = {
    "train": train,
    "validation": validation,
    "test": test,
}


def preprocess_function_content(examples):
    """Tokenize one example and attach its content score as the label.

    The student summary is the first segment. The prompt question, title and
    text, joined by single spaces, form the second segment. The pair is
    truncated / padded to MAX_LENGTH tokens.

    Args:
        examples: a single record providing "text", "prompt_question",
            "prompt_title", "prompt_text" and "content_score".

    Returns:
        The tokenizer output with an extra "label" entry holding the content
        score as a float.
    """
    target = examples["content_score"]
    prompt_context = ' '.join(
        [examples['prompt_question'], examples['prompt_title'], examples['prompt_text']]
    )

    encoded = tokenizer(
        examples["text"],
        text_pair=prompt_context,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

    # Regression target: stored as a real number.
    encoded["label"] = float(target)
    return encoded


def preprocess_function_wording(examples):
    """Tokenize one example and attach its wording score as the label.

    The student summary is the first segment. The prompt question, title and
    text, joined by single spaces, form the second segment. The pair is
    truncated / padded to MAX_LENGTH tokens.

    Args:
        examples: a single record providing "text", "prompt_question",
            "prompt_title", "prompt_text" and "wording_score".

    Returns:
        The tokenizer output with an extra "label" entry holding the wording
        score as a float.
    """
    target = examples["wording_score"]
    prompt_context = ' '.join(
        [examples['prompt_question'], examples['prompt_title'], examples['prompt_text']]
    )

    encoded = tokenizer(
        examples["text"],
        text_pair=prompt_context,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

    # Regression target: stored as a real number.
    encoded["label"] = float(target)
    return encoded

# Raw columns that are dropped once an example has been tokenized and labelled.
RAW_COLUMNS = ["text", "content_score", 'wording_score', 'prompt_question', 'prompt_title', 'prompt_text']

# Tokenize every split for the content target ...
for split_name, split_ds in list(ds_content.items()):
    ds_content[split_name] = split_ds.map(preprocess_function_content, remove_columns=RAW_COLUMNS)

# ... and again for the wording target.
for split_name, split_ds in list(ds_wording.items()):
    ds_wording[split_name] = split_ds.map(preprocess_function_wording, remove_columns=RAW_COLUMNS)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/4012 [00:00<?, ? examples/s]

Map:   0%|          | 0/860 [00:00<?, ? examples/s]

Map:   0%|          | 0/860 [00:00<?, ? examples/s]

Map:   0%|          | 0/4012 [00:00<?, ? examples/s]

Map:   0%|          | 0/860 [00:00<?, ? examples/s]

Map:   0%|          | 0/860 [00:00<?, ? examples/s]

## Training

In [16]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from transformers import Trainer

def compute_metrics_for_regression(eval_pred):
    """Compute regression metrics for one evaluation pass.

    Args:
        eval_pred: a `(predictions, labels)` pair of array-likes with one
            value per example.

    Returns:
        dict with:
            "mse": mean squared error,
            "mae": mean absolute error,
            "r2": coefficient of determination,
            "accuracy": fraction of examples whose absolute error is below 0.5.
    """
    logits, labels = eval_pred
    # Bring BOTH arrays to shape (N, 1). Reshaping only the labels would let a
    # 1-D prediction array broadcast `logits - labels` to (N, N) and silently
    # corrupt the accuracy below.
    logits = np.asarray(logits).reshape(-1, 1)
    labels = np.asarray(labels).reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    squared_errors = ((logits - labels) ** 2).flatten()

    # A prediction counts as correct when |error| < 0.5, i.e. when its
    # squared error is below 0.25.
    accuracy = float((squared_errors < 0.25).mean())

    return {"mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}



class RegressionTrainer(Trainer):
    """Trainer that optimizes mean-squared error on the model's first output column."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the MSE between the model's first logit column and the labels.

        Args:
            model: the model being trained; called as `model(**inputs)`.
            inputs: batch mapping. Its "labels" entry is removed before the
                forward pass, so the model itself never sees the labels.
            return_outputs: when true, return `(loss, outputs)` instead of
                just the loss.
            num_items_in_batch: accepted and ignored. Newer `transformers`
                releases pass this keyword to `compute_loss`; without it in
                the signature the call raises a TypeError.

        Returns:
            The scalar loss, or `(loss, outputs)` when `return_outputs` is set.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # First element of the model output, first column: one value per example.
        logits = outputs[0][:, 0]
        # Cast the labels to the logits' dtype so mse_loss never sees mixed
        # float precisions.
        loss = torch.nn.functional.mse_loss(logits, labels.to(logits.dtype))
        return (loss, outputs) if return_outputs else loss

In [17]:
# Identifier of the base checkpoint the models below are initialized from.
BASE_MODEL = "distilbert-base-uncased"

# Already Trained
# Commented-out directory paths for already fine-tuned models (currently unused).
# CONTENT_BASE_MODEL = "./models/distil-bert/distilbert-base-uncased-predict-content-score/"
# WORDING_BASE_MODEL = "./models/distil-bert/distilbert-base-uncased-predict-wording-with-all-text-pair/"



### Content model training

In [18]:
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader
from transformers import TrainingArguments

# Hyper-parameters for the content-score run.
LEARNING_RATE = 2e-4
BATCH_SIZE = 32
EPOCHS = 5

# Base checkpoint with a single output unit (num_labels=1).
content_model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1)

content_output_dir = f"./models/text_pair/{BASE_MODEL}-predict-content-score"
training_args = TrainingArguments(
    output_dir=content_output_dir,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and checkpoint once per epoch; the checkpoint that scores best
    # on "accuracy" is reloaded when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainer = RegressionTrainer(
    model=content_model,
    args=training_args,
    compute_metrics=compute_metrics_for_regression,
    train_dataset=ds_content["train"],
    eval_dataset=ds_content["validation"],
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/630 [00:00<?, ?it/s]

KeyboardInterrupt: 

### Wording model training

In [19]:
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader
from transformers import TrainingArguments

# Hyper-parameters for the wording-score run.
LEARNING_RATE = 1e-4
BATCH_SIZE = 32
EPOCHS = 4

# Base checkpoint with a single output unit (num_labels=1).
wording_model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1)

wording_output_dir = f"./models/text_pair/{BASE_MODEL}-predict-wording-score"
training_args = TrainingArguments(
    output_dir=wording_output_dir,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and checkpoint once per epoch; the checkpoint that scores best
    # on "accuracy" is reloaded when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainer = RegressionTrainer(
    model=wording_model,
    args=training_args,
    compute_metrics=compute_metrics_for_regression,
    train_dataset=ds_wording["train"],
    eval_dataset=ds_wording["validation"],
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/504 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Submission

In [47]:
# Pre process

def to_device(data, device):
    """Move `data` onto `device`, recursing into containers.

    Args:
        data: an object exposing `.to(device)`, or a list / tuple / dict
            (possibly nested) of such objects.
        device: the target handed to each `.to(...)` call.

    Returns:
        A list for list or tuple input, a dict with the same keys for dict
        input, otherwise the result of `data.to(device)`.
    """
    if isinstance(data, (list, tuple)):
        return [to_device(item, device) for item in data]
    if isinstance(data, dict):
        # Plain dicts have no `.to`; move each value and keep the keys.
        return {key: to_device(value, device) for key, value in data.items()}
    return data.to(device)
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def preprocess_function(examples):
    """Tokenize one example for inference and move the tensors to `device`.

    The student summary is the first segment. The prompt question, title and
    text, joined by single spaces, form the second segment. The pair is
    truncated / padded to 512 tokens and returned as PyTorch tensors.

    Args:
        examples: a single record providing "text", "prompt_question",
            "prompt_title" and "prompt_text".

    Returns:
        The tokenizer output after `to_device(..., device)`.
    """
    prompt_context = ' '.join(
        [examples['prompt_question'], examples['prompt_title'], examples['prompt_text']]
    )
    encoded = tokenizer(
        examples["text"],
        text_pair=prompt_context,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    )
    return to_device(encoded, device)

# Inference only. The inputs are moved to `device` by preprocess_function, so
# the models must live on the same device. eval() turns off dropout so the
# predictions are deterministic.
content_model.to(device).eval()
wording_model.to(device).eval()

content_preds = []
wording_preds = []
student_id = []
# no_grad: no autograd graph is built for the forward passes.
with torch.no_grad():
    for index, row in tqdm(X.iterrows(), total=X.shape[0]):
        inputs = preprocess_function(row[['text', 'prompt_question', 'prompt_title', 'prompt_text']])
        # Content prediction
        content_preds += content_model(**inputs).logits.reshape(-1).tolist()
        # Wording prediction
        wording_preds += wording_model(**inputs).logits.reshape(-1).tolist()
        student_id.append(row['student_id'])

submission_df = pd.DataFrame({'student_id': student_id, 'content': content_preds, 'wording': wording_preds})
submission_df.head()

100%|██████████| 4/4 [00:01<00:00,  2.88it/s]


Unnamed: 0,student_id,content,wording
0,000000ffffff,0.475036,0.475036
1,222222cccccc,0.489419,0.489419
2,111111eeeeee,0.499346,0.499346
3,333333dddddd,0.48995,0.48995


In [48]:
# Write the predictions without the index column, then read the file back to
# display exactly what was saved.
SUBMISSION_FILE = 'submission.csv'
submission_df.to_csv(SUBMISSION_FILE, index=False)
display(pd.read_csv(SUBMISSION_FILE))

Unnamed: 0,student_id,content,wording
0,000000ffffff,0.475036,0.475036
1,222222cccccc,0.489419,0.489419
2,111111eeeeee,0.499346,0.499346
3,333333dddddd,0.48995,0.48995
