## Imports

In [1]:
!pip install accelerate -U

Collecting accelerate
  Obtaining dependency information for accelerate from https://files.pythonhosted.org/packages/13/9e/ee987874058f2d93006961f6ff49e0bcb60ab9c26709ebe06bfa8707a4d8/accelerate-0.24.1-py3-none-any.whl.metadata
  Downloading accelerate-0.24.1-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.23.0
    Uninstalling accelerate-0.23.0:
      Successfully uninstalled accelerate-0.23.0
Successfully installed accelerate-0.24.1


In [2]:
from transformers import DistilBertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import pandas as pd
import torch
import seaborn as sns

## Data import and preprocessing

In [3]:
# Read the student summaries together with their `content` and `wording`
# scores, then preview the first rows.
summaries_df = pd.read_csv(filepath_or_buffer='../data/v2/train/train_data.csv')
summaries_df.head(n=5)

Unnamed: 0,student_id,prompt_id,text,content,wording
0,8a31b8cc1996,3b9047,In the social pyramid of ancient Egypt the pha...,-0.077267,0.424365
1,8c9411cfc953,39c16e,Aristotle claims that an ideal tragedy should ...,0.55907,-0.634924
2,4387107feb4d,3b9047,The ancient Egyptian system of government was ...,1.376083,2.389443
3,d720eb53c270,ebad26,They put pickle in them to mask the smell of r...,0.297031,-0.168734
4,e887883b946c,ebad26,"""whenever meat was so spoiled that it could no...",-0.093814,0.503833


In [4]:
# Read the prompts (question, title and source text) that the summaries
# respond to, then preview the first rows.
prompts_df = pd.read_csv(
    filepath_or_buffer='../kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv'
)
prompts_df.head(n=5)

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


In [5]:
# Attach each summary's prompt (question, title, source text) through the
# shared `prompt_id` key. `validate='many_to_one'` makes pandas raise a
# MergeError if `prompts_df` ever contains a duplicated `prompt_id`; without
# it such a duplicate would silently duplicate summary rows, and the copies
# could then land on both sides of the train/validation/test split.
merged_df = pd.merge(
    summaries_df,
    prompts_df,
    on='prompt_id',
    how='inner',
    validate='many_to_one',
)
merged_df

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,8a31b8cc1996,3b9047,In the social pyramid of ancient Egypt the pha...,-0.077267,0.424365,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
1,4387107feb4d,3b9047,The ancient Egyptian system of government was ...,1.376083,2.389443,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,3b784d0a5c8f,3b9047,Nobles were the only ont that could hold gover...,0.467722,-0.085653,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
3,1b2ff4d4edd9,3b9047,They were many different social classes. The p...,-0.012957,-0.409480,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
4,108049c01946,3b9047,The ancient Egyptian system of goverment is in...,2.204640,-0.645344,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
...,...,...,...,...,...,...,...,...
5727,d8ae24010bf9,814d6b,The Third Wave experiment was a huge success i...,3.005642,3.226292,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
5728,e32a76187192,814d6b,The Third Wave developed over such a short tim...,0.997243,1.880386,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
5729,2c6d06ca1eea,814d6b,The experiment developed over such a short per...,-0.693773,-0.490571,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
5730,47d4807beb66,814d6b,It was easy for the students to follow this be...,-0.093814,0.503833,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...


In [6]:
# Model inputs are the raw summary texts and the regression target is the
# `wording` score; both are pulled out as plain Python lists.
texts = merged_df["text"].tolist()
wording = merged_df["wording"].tolist()

In [7]:
# Two-stage split: hold out 30% of the data, then halve that hold-out into
# validation and test, giving 70% train / 15% validation / 15% test.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    texts, wording, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

## Tokenization, model setup, and training

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader

# Training hyperparameters.
EPOCHS = 20
BATCH_SIZE = 16
LEARNING_RATE = 2e-5

# Token budget per summary.
MAX_LENGTH = 256

# Pretrained checkpoint shared by the tokenizer and the model.
BASE_MODEL = "bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# num_labels=1 gives the head a single output, which this notebook trains as a
# regression prediction of the wording score.
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=1,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from datasets import Dataset

def _as_dataset(split_texts, split_scores):
    """Wrap parallel lists of texts and wording scores in a `Dataset`."""
    frame = pd.DataFrame({'text': split_texts, 'wording_score': split_scores})
    return Dataset.from_pandas(frame)


train = _as_dataset(X_train, y_train)
validation = _as_dataset(X_val, y_val)
test = _as_dataset(X_test, y_test)

# Plain dict keyed by split name so that all splits can be processed in a loop.
ds = dict(train=train, validation=validation, test=test)


def preprocess_function(examples):
    """Tokenize one example and attach its wording score as a float label.

    Args:
        examples: mapping with a "text" entry (passed to the tokenizer) and a
            "wording_score" entry (converted with `float`). The notebook calls
            this through `Dataset.map` without `batched=True`, so despite the
            plural name it is expected to be a single row.

    Returns:
        The tokenizer output, truncated and padded to `MAX_LENGTH`, with an
        added "label" entry holding the score as a Python float.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        # Use the notebook-level constant rather than a second hard-coded 256,
        # so that changing MAX_LENGTH really changes the sequence length.
        max_length=MAX_LENGTH,
    )

    # Store the target as a real number (float), not an integer class id.
    encoded["label"] = float(examples["wording_score"])
    return encoded

# Tokenize every split, dropping the raw columns once they have been turned
# into tokenizer outputs plus a float `label`.
for split, dataset in list(ds.items()):
    ds[split] = dataset.map(
        preprocess_function,
        remove_columns=["text", "wording_score"],
    )


Map:   0%|          | 0/4012 [00:00<?, ? examples/s]

Map:   0%|          | 0/860 [00:00<?, ? examples/s]

Map:   0%|          | 0/860 [00:00<?, ? examples/s]

In [11]:
# Inspect the column schema of the tokenized training split.
ds['train'].features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'label': Value(dtype='float64', id=None)}

In [10]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def compute_metrics_for_regression(eval_pred):
    """Compute regression metrics from a `(logits, labels)` evaluation pair.

    Args:
        eval_pred: pair `(logits, labels)`. Both must support `.reshape` and
            elementwise arithmetic (presumably NumPy arrays as supplied by
            `Trainer` -- confirm against the caller) and must hold the same
            number of values.

    Returns:
        dict with keys "mse", "mae", "r2" and "accuracy", where "accuracy" is
        the fraction of predictions whose absolute error is below 0.5.
    """
    logits, labels = eval_pred

    # Flatten BOTH arrays. Reshaping only the labels to (N, 1) is correct when
    # the logits are (N, 1), but with (N,) logits the subtraction below would
    # broadcast to an (N, N) matrix and corrupt the accuracy.
    predictions = logits.reshape(-1)
    targets = labels.reshape(-1)

    mse = mean_squared_error(targets, predictions)
    mae = mean_absolute_error(targets, predictions)
    r2 = r2_score(targets, predictions)

    squared_errors = (predictions - targets) ** 2

    # A prediction counts as correct when it rounds to the same value as the
    # target, i.e. |error| < 0.5, which is equivalent to squared error < 0.25.
    accuracy = float((squared_errors < 0.25).mean())

    return {"mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}

In [11]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Derived from BASE_MODEL so the checkpoint folder always names the model
    # that is actually being fine-tuned.
    output_dir=f"./models/{BASE_MODEL}-regression",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for `load_best_model_at_end` to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Without a limit, every one of the EPOCHS checkpoints stays on disk. The
    # best checkpoint is still retained when `load_best_model_at_end` is set.
    save_total_limit=2,
    load_best_model_at_end=True,
    # "accuracy" is the key returned by `compute_metrics_for_regression`;
    # higher is better, stated explicitly rather than relying on the default.
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

In [12]:
from transformers import Trainer

class RegressionTrainer(Trainer):
    """`Trainer` subclass that trains with a mean-squared-error loss."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the MSE between the model's first output column and the labels.

        Args:
            model: the model to run; called as `model(**inputs)`.
            inputs: batch mapping. Its "labels" entry is removed (the mapping
                is mutated) before the remaining entries are passed to `model`.
            return_outputs: when true, return `(loss, outputs)` instead of
                just `loss`.
            num_items_in_batch: accepted for compatibility with newer
                transformers releases, whose `Trainer` passes this keyword;
                unused because the loss is mean-reduced over the batch.

        Returns:
            The scalar loss tensor, or `(loss, outputs)` if `return_outputs`.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)

        # The labels were popped, so the model is called without them and
        # `outputs[0]` is taken to be the logits; column 0 is the single
        # regression output.
        predictions = outputs[0][:, 0]

        # Align dtype and shape with the predictions so `mse_loss` neither
        # fails on a float64/float16 mismatch nor broadcasts (B, 1) against (B,).
        targets = labels.to(predictions.dtype).reshape(-1)

        loss = torch.nn.functional.mse_loss(predictions, targets)
        return (loss, outputs) if return_outputs else loss

In [13]:
# Fine-tune on the training split; the validation split is the evaluation set
# used during training.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    compute_metrics=compute_metrics_for_regression,
)
trainer = RegressionTrainer(**trainer_kwargs)

trainer.train()

  0%|          | 0/5020 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Score the held-out test split. It is passed as an argument instead of being
# assigned to `trainer.eval_dataset`, so the trainer keeps the validation split
# as its evaluation set and any later `trainer.train()` call cannot end up
# selecting checkpoints on test data.
trainer.evaluate(eval_dataset=ds["test"])