<a href="https://colab.research.google.com/github/yuriao/DataScienceProjects/blob/main/commonlit_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook, I explore how to use a basic BERT network to score student summaries, based only on the students' responses (the prompt texts are not used).

### References
-

## Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import json
import transformers
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, pipeline, AdamW
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [None]:
import logging
import warnings

# Keep the notebook output readable: ignore every Python warning and drop
# all log records at severity ERROR or below.
warnings.filterwarnings("ignore")
logging.disable(level=logging.ERROR)

## Dataload

In [None]:
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"

# Only the student summaries and the sample submission are read here;
# prompts_train.csv / prompts_test.csv are not loaded.
summaries_train = pd.read_csv(f"{DATA_DIR}summaries_train.csv")
summaries_test = pd.read_csv(f"{DATA_DIR}summaries_test.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}sample_submission.csv")


In [None]:
# Preview the first five training rows (rich display as the cell output).
summaries_train.head(n=5)

Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757


## EDA: Check nans

In [None]:
# Count missing values per column of the training frame.
summaries_train.isnull().sum()

student_id    0
prompt_id     0
text          0
content       0
wording       0
dtype: int64

In [None]:
# Count missing values per column of the test frame.
summaries_test.isnull().sum()

student_id    0
prompt_id     0
text          0
dtype: int64

Neither the training nor the test set contains missing values, so no imputation or row filtering is needed.

## Tokenizer and model

In [None]:
# Directory holding the pretrained BERT checkpoints.
MODEL_DIR = "/kaggle/input/huggingface-bert/"
bert_checkpoint = MODEL_DIR + "bert-large-uncased"

tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
# Two output units: one per target column (content, wording).
# NOTE(review): with num_labels=2 and float targets the classification head
# may fall back to a multi-label (BCE) loss instead of MSE -- confirm the
# model's `problem_type` before trusting the reported training loss.
model = BertForSequenceClassification.from_pretrained(bert_checkpoint, num_labels=2)

## Encode training and test text

In [None]:
# Same tokenizer settings for both splits: PyTorch tensors, padding enabled,
# truncation at 512 tokens.
tokenizer_kwargs = dict(return_tensors="pt", padding=True, truncation=True, max_length=512)

encodings = tokenizer(summaries_train['text'].tolist(), **tokenizer_kwargs)
test_encodings = tokenizer(summaries_test['text'].tolist(), **tokenizer_kwargs)

## Convert scores to tensors

In [None]:
# Regression targets as a (n_samples, 2) tensor: column 0 = content, column 1 = wording.
# `.values` yields float64; cast to float32 so the targets match the dtype of the
# model's logits (a dtype mismatch breaks regression losses such as MSE).
labels = torch.tensor(summaries_train[['content', 'wording']].values, dtype=torch.float32)

## Select device (GPU if available)

In [None]:
# Use the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

## Dataset class

In [None]:
class RegressionDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer output with regression targets.

    Args:
        encodings: mapping from field name to an indexable collection holding
            one entry per sample.
        labels: indexable collection of targets aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of targets.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of every encoding field plus ``"labels"``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = self.labels[idx]
        return sample

## fine-tuning metric

In [None]:
def compute_metrics(eval_pred):
    """Compute the mean column-wise RMSE between predictions and targets.

    The RMSE is computed separately for every target column and the
    per-column values are then averaged. This reproduces what
    ``mean_squared_error(labels, predictions, squared=False)`` returned,
    without depending on the ``squared`` keyword, which newer scikit-learn
    releases deprecate and remove.

    Args:
        eval_pred: pair ``(predictions, labels)`` of array-likes shaped
            ``(n_samples,)`` or ``(n_samples, n_targets)``.

    Returns:
        dict with a single key ``"rmse"`` holding a Python float.

    Raises:
        ValueError: if the two arrays are empty or their shapes disagree.
    """
    predictions, labels = eval_pred
    predictions = np.asarray(predictions, dtype=np.float64)
    labels = np.asarray(labels, dtype=np.float64)

    # Treat 1-D input as a single target column so (n,) and (n, 1) are interchangeable.
    if predictions.ndim == 1:
        predictions = predictions.reshape(-1, 1)
    if labels.ndim == 1:
        labels = labels.reshape(-1, 1)

    if predictions.shape != labels.shape:
        raise ValueError(
            f"predictions {predictions.shape} and labels {labels.shape} have different shapes"
        )
    if labels.shape[0] == 0:
        raise ValueError("cannot compute RMSE on empty input")

    per_target_rmse = np.sqrt(np.mean((labels - predictions) ** 2, axis=0))
    return {"rmse": float(np.mean(per_target_rmse))}

## kFold validation setup

In [None]:
# Five shuffled folds; the fixed random_state keeps fold membership
# identical from run to run.
n_splits = 5
kfold = KFold(n_splits, shuffle=True, random_state=42)

## kFold training

In [None]:
import gc

# K-fold fine-tuning.
#
# * Every fold starts from the pretrained checkpoint. Reusing one model
#   across folds would let later folds validate on samples the model was
#   already trained on, making the validation RMSE meaningless.
# * The head is configured with problem_type="regression" so the two outputs
#   are trained with MSE. Without it, num_labels=2 combined with float targets
#   selects a multi-label BCE loss (visible as negative training losses).
# * After the loop, `model` is the model trained on the LAST fold only.
for fold, (train_idx, val_idx) in enumerate(kfold.split(summaries_train['text'])):
    print(f"Fold {fold + 1}")

    # Drop the previous fold's model/trainer (and its optimizer state) before
    # loading fresh weights, so two copies of BERT-large never coexist.
    model = None
    trainer = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    train_encodings = {key: val[train_idx] for key, val in encodings.items()}
    val_encodings = {key: val[val_idx] for key, val in encodings.items()}
    # MSE needs targets in the same dtype as the float32 logits.
    train_labels = labels[train_idx].float()
    val_labels = labels[val_idx].float()

    # Initialize datasets
    train_dataset = RegressionDataset(train_encodings, train_labels)
    val_dataset = RegressionDataset(val_encodings, val_labels)

    # Fresh pretrained model for this fold; two regression outputs
    # (content, wording).
    model = BertForSequenceClassification.from_pretrained(
        MODEL_DIR + "bert-large-uncased",
        num_labels=2,
        problem_type="regression",
    )

    # Training arguments and Trainer
    # NOTE(review): metric_for_best_model / greater_is_better presumably only
    # take effect together with load_best_model_at_end, which is not enabled
    # here -- confirm whether best-checkpoint selection is wanted.
    training_args = TrainingArguments(
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=10,
        evaluation_strategy="epoch",
        output_dir=f'/kaggle/working/results/fold_{fold + 1}',
        save_steps = 10000,
        overwrite_output_dir=True,
        metric_for_best_model="rmse",
        learning_rate=1.5e-5,
        weight_decay=0.02,
        greater_is_better=False,
        report_to='none' # disable wandb utilization
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()

Fold 1


Epoch,Training Loss,Validation Loss,Rmse
1,-5.1644,-8.216731,16.85169
2,-11.4959,-13.965208,26.238828
3,-17.3669,-18.669228,38.660613
4,-22.695,-22.467704,39.899844
5,-27.5394,-27.686,51.682291
6,-32.2337,-30.322235,55.42936
7,-36.0853,-33.149878,62.483916


## Predict

In [None]:
# Batched inference on the test set.
# Running every test row through BERT-large in a single forward pass can
# exhaust GPU memory once the test set is large, so the inputs are fed in
# fixed-size chunks and the per-batch logits are gathered on the CPU.
PREDICT_BATCH_SIZE = 16

test_ids = test_encodings['input_ids']
test_attention_mask = test_encodings['attention_mask']

model.to(device)
model.eval()  # make sure dropout is disabled for inference

batch_predictions = []
with torch.no_grad():
    for start in range(0, test_ids.shape[0], PREDICT_BATCH_SIZE):
        stop = start + PREDICT_BATCH_SIZE
        batch_logits = model(
            test_ids[start:stop].to(device),
            token_type_ids=None,
            attention_mask=test_attention_mask[start:stop].to(device),
        ).logits
        batch_predictions.append(batch_logits.cpu())

# (n_test, 2) tensor of logits on the CPU, in the original row order.
predictions = torch.cat(batch_predictions, dim=0)

In [None]:
# Show the shape and only the first rows instead of dumping the whole tensor,
# which would flood the output on a large test set.
print(predictions.shape)
print(predictions[:5])

## Create Submission file

In [None]:
# Column 0 of the model output is written to "content" and column 1 to
# "wording", matching the column order used to build the training labels.
pred_array = predictions.cpu().numpy()
summaries_test["content"] = pred_array[:, 0]
summaries_test["wording"] = pred_array[:, 1]

submission_columns = ["student_id", "content", "wording"]
summaries_test[submission_columns].to_csv("submission.csv", index=False)