In [None]:
# Install the third-party packages imported below (bs4 for BeautifulSoup, transformers for DistilBERT).
!pip install bs4
!pip install transformers

In [None]:
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    DistilBertConfig,
    DistilBertModel,
    DistilBertPreTrainedModel,
    TrainingArguments,
    Trainer,
)
from transformers.modeling_outputs import SequenceClassifierOutput

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import unicodedata
import re

import seaborn as sns
import matplotlib as mpl
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import random
import copy
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.preprocessing import LabelBinarizer

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
def set_seed(seed = 0):
    """Seed every random number generator the notebook relies on.

    Seeds NumPy's global generator, Python's ``random`` module and PyTorch
    (CPU and the current CUDA device), switches cuDNN to deterministic mode
    with benchmarking disabled, and stores the seed in the ``PYTHONHASHSEED``
    environment variable.

    Args:
        seed: Integer seed applied to all generators.

    Returns:
        A fresh ``np.random.RandomState`` initialised with ``seed``.
    """
    # Global generators.
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # cuDNN: prefer reproducible kernels over auto-tuned ones.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    os.environ['PYTHONHASHSEED'] = str(seed)
    return np.random.RandomState(seed)

# Notebook-wide seed; set_seed seeds the global RNGs and returns a RandomState built from it.
seed=1112
random_state = set_seed(seed)

In [None]:
# Load the training table from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv', low_memory=False)

In [None]:
def remove_html_tags(text):
    """Return the text content of ``text`` with any HTML markup stripped."""
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

def remove_accented_chars(text):
    """Reduce ``text`` to ASCII, folding accented letters to their base letter.

    The text is NFKD-normalised so accents become separate combining marks,
    then every non-ASCII code point (the marks, plus anything with no ASCII
    equivalent) is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def remove_special_characters(text):
    """Strip every character that is not a letter, digit, whitespace or basic punctuation.

    Kept characters: ASCII letters, digits, whitespace and ``. , ! ? / : ; " '``.

    The upper-case range is ``A-Z``. The previous ``A-z`` range also covered the
    ASCII characters that sit between ``Z`` and ``a`` (``[ \\ ] ^ _`` and the
    backtick), so those were silently kept.

    Args:
        text: Input string.

    Returns:
        ``text`` with all disallowed characters removed.
    """
    return re.sub(r'[^a-zA-Z0-9.,!?/:;\"\'\s]', '', text)

def remove_extra_whitespace_tabs(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    # The substitution may leave a single leading/trailing space; strip() removes it.
    collapsed = re.sub(r'^\s*|\s\s*', ' ', text)
    return collapsed.strip()

def preprocess(text):
    """Run the full text-cleaning pipeline on one excerpt.

    Steps, in order: strip HTML, fold accented characters to ASCII, drop
    special characters, then collapse redundant whitespace.
    """
    cleaning_steps = (
        remove_html_tags,
        remove_accented_chars,
        remove_special_characters,
        remove_extra_whitespace_tabs,
    )
    for step in cleaning_steps:
        text = step(text)
    return text

In [None]:
# Clean every excerpt with the preprocessing pipeline.
train['excerpt'] = train['excerpt'].apply(preprocess)

In [None]:
# Create segments between 5 and 15

def create_segments(total_bins, total_len=None):
    """Split ``total_len`` row positions into ``total_bins`` contiguous ranges.

    Args:
        total_bins: Number of ranges to produce.
        total_len: Number of rows to partition. When omitted it falls back to
            ``len(train)`` (the notebook-level frame), which is the only size
            this function used before the parameter was added.

    Returns:
        A list of ``(start, end)`` tuples. Each range is
        ``int(total_len / total_bins)`` rows wide. When more than one range is
        requested, the last one has ``end`` set to ``np.nan``, meaning
        "through the final row", so it absorbs any remainder. With a single
        bin the only range is ``(0, total_len)``.
    """
    if total_len is None:
        total_len = len(train)
    bin_len = int(total_len / total_bins)
    bins = [(0, bin_len)]
    for seg in range(total_bins - 1):
        start = bins[-1][1]
        is_last = seg == total_bins - 2
        bins.append((start, np.nan if is_last else start + bin_len))
    return bins


def create_bins(df, column, num_of_bins=5):
    """Sort ``df`` by ``column`` and label rows with equal-sized, rank-based segments.

    Args:
        df: Input frame; it is not modified.
        column: Name of the column to sort by (ascending).
        num_of_bins: Number of segments to create.

    Returns:
        A new frame sorted by ``column`` with a fresh 0..n-1 index and an extra
        ``"<column>_segment"`` column holding the segment number as a string
        (``"0"`` for the lowest values).
    """
    # Size the segments from the frame actually being binned, not from a global.
    bins = create_segments(num_of_bins, len(df))
    df = df.sort_values(column, ascending=True).reset_index(drop=True)
    column_name = column + "_segment"

    # Build the labels in an object array and attach them once. This avoids
    # creating an integer column and then overwriting it with strings, and the
    # half-open positional slices never overlap at segment boundaries.
    labels = np.empty(len(df), dtype=object)
    for index, (start, end) in enumerate(bins):
        stop = len(df) if pd.isna(end) else end
        labels[start:stop] = str(index)
    df[column_name] = labels
    return df

In [None]:
# Number of standard-error segments; also used below when summarising each group.
bins = 5
# Replace `train` with a copy sorted by standard_error that carries a `standard_error_segment` column.
train = create_bins(copy.deepcopy(train), "standard_error", bins)

In [None]:
# Density of the target, one curve per standard-error segment.
# The frame is passed via `data=` with named columns: on seaborn >= 0.12 the first
# positional argument is `data`, and a positional Series combined with `hue=` is
# rejected as wide-form input. The keyword form works on older releases too.
plt.figure(figsize = (25, 11))
sns.kdeplot(data=train, x="target", hue="standard_error_segment")
plt.title("Target Distribution (segmented)", size=25)
plt.xlabel("Value", size=20)
# A KDE's vertical axis is a density, not a count.
plt.ylabel("Density", size=20);

In [None]:
# One horizontal strip of target values per standard-error segment.
plt.figure(figsize = (25, 11))
sns.stripplot(data=train, x="target", y="standard_error_segment", dodge=True)

In [None]:
# Per-segment summary: how many distinct target values and how many rows each segment holds.
for group in range(bins):
    # Select the segment's targets once and reuse them for both counts.
    group_targets = train[train.standard_error_segment == str(group)]["target"]
    print('-'*150)
    print('Details of group: ', group)
    print('Unique values length: ', len(np.unique(group_targets)))
    print('Group values length: ', len(group_targets))

print('-'*150)
print('Overall Unique values: ', len(np.unique(train.target)))
# This line reports the total row count, so it is labelled accordingly
# (it previously repeated the "Unique values" label).
print('Overall values length: ', len(train.target))

In [None]:
# Show the columns currently in the frame before the label encoding and column drop below.
train.columns

In [None]:
# Encode each segment label as an indicator vector and keep the vectors as the
# training target (one plain Python list per row).
segment_values = train.standard_error_segment.values
lb = LabelBinarizer()
lb.fit(segment_values)
train['coded_target'] = lb.transform(segment_values).tolist()
target = train["coded_target"].tolist()

In [None]:
# Drop identifier, licence and label columns; the labels already live in `target`.
# (`columns=` alone is enough, so the redundant `axis=1` is gone.)
train = train.drop(columns=['id', 'url_legal', 'license', 'target', 'standard_error', 'standard_error_segment', 'coded_target'])
print("Overall size: ", train.shape)

# Stratified 80/20 split into (train+val)/test, then 80/20 again into train/val.
# `random_state=seed` pins both splits so they no longer depend on how much of the
# global NumPy RNG earlier cells consumed, and re-running this cell's splits on the
# same frame reproduces them.
X_train_val, X_test, Y_train_val, Y_test = train_test_split(
    train, target, stratify=target, test_size=0.2, random_state=seed
)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train_val, Y_train_val, stratify=Y_train_val, test_size=0.2, random_state=seed
)

print("Train size: ", X_train.shape, len(Y_train), len(Y_train[0]))
print("Validation size: ", X_val.shape, len(Y_val), len(Y_val[0]))
print("Test size: ", X_test.shape, len(Y_test), len(Y_test[0]))

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"{device} is used")

In [None]:
# Load the fast tokenizer for the same checkpoint name that the model and config cells use.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-cased")

In [None]:
class CLRPDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each excerpt with its label vector.

    Excerpts are tokenized lazily, one at a time, inside ``__getitem__``.
    """

    def __init__(self, df, labels, tokenizer):
        """Store the raw texts, labels and tokenizer.

        Args:
            df: Frame with an ``excerpt`` column holding the texts.
            labels: Sequence of per-sample label vectors, aligned with ``df`` rows.
            tokenizer: Callable used to encode a single excerpt.
        """
        self.tokenizer = tokenizer
        self.labels = labels
        self.excerpt = df['excerpt'].values.tolist()

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded excerpt at ``idx`` plus its labels as a float tensor."""
        encoded = self.tokenizer(
            self.excerpt[idx], padding=True, truncation=True, return_tensors='pt'
        )
        # The tokenizer output is squeezed so each tensor loses its size-1 dimensions.
        sample = {}
        for name, tensor in encoded.items():
            sample[name] = torch.squeeze(tensor)
        sample['labels'] = torch.tensor(list(map(float, self.labels[idx])))
        return sample

In [None]:
# Wrap each split (train, validation, test) in a CLRPDataset sharing the same tokenizer.
train_dataset, eval_dataset, test_dataset = (
    CLRPDataset(frame, labels, tokenizer)
    for frame, labels in ((X_train, Y_train), (X_val, Y_val), (X_test, Y_test))
)

In [None]:
# Inspect one encoded training sample (tokenizer outputs plus the `labels` tensor).
train_dataset[0]

In [None]:
# Sanity check: decode the first sample's token ids back to text to confirm the encoding looks right.
tokenizer.decode(train_dataset[0]["input_ids"])

In [None]:
config = DistilBertConfig.from_pretrained("distilbert-base-cased")
# Size the classification head from the encoded label vectors instead of a
# hard-coded 5, so it stays in sync if the number of segments changes.
config.num_labels = len(target[0])
# Labels are float indicator vectors, so the model's BCE-with-logits branch is selected.
config.problem_type = "multi_label_classification"

In [None]:
class LatestDistilBertForSequenceClassification(DistilBertPreTrainedModel):
    """DistilBERT encoder with a sequence-level classification/regression head.

    The head takes the hidden state at sequence position 0 and applies
    ``pre_classifier`` -> ReLU -> dropout -> ``classifier`` to produce
    ``config.num_labels`` logits. The loss is chosen from ``config.problem_type``.
    """

    def __init__(self, config):
        """Build the encoder and head from ``config`` and initialise the weights."""
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.distilbert = DistilBertModel(config)
        self.pre_classifier = nn.Linear(config.dim, config.dim)
        self.classifier = nn.Linear(config.dim, config.num_labels)
        self.dropout = nn.Dropout(config.seq_classif_dropout)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Run the encoder and head, optionally computing a loss.

        All arguments except ``labels`` are forwarded unchanged to the DistilBERT encoder.

        labels (optional tensor):
            Targets for the loss. When ``config.problem_type`` is unset it is inferred on
            the first call with labels and stored on the config:

            - ``num_labels == 1`` -> ``"regression"`` (``MSELoss``)
            - ``num_labels > 1`` and integer labels (``torch.long`` / ``torch.int``) ->
              ``"single_label_classification"`` (``CrossEntropyLoss``)
            - otherwise -> ``"multi_label_classification"`` (``BCEWithLogitsLoss``)

        Returns:
            A ``SequenceClassifierOutput`` (loss, logits, hidden_states, attentions) when
            ``return_dict`` is true; otherwise a tuple of the logits followed by the
            remaining encoder outputs, prefixed with the loss when labels were given.
        """
        # Fall back to the config default when the caller does not specify return_dict.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        distilbert_output = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
        # Use the hidden state at sequence position 0 as the sequence representation.
        pooled_output = hidden_state[:, 0]  # (bs, dim)
        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
        pooled_output = nn.ReLU()(pooled_output)  # (bs, dim)
        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        logits = self.classifier(pooled_output)  # (bs, num_labels)

        loss = None
        if labels is not None:
            # Infer the problem type once, from the head size and the label dtype.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                # Logits and labels are passed to the loss as-is (no reshaping).
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # Tuple form: (loss?, logits, *remaining encoder outputs).
            output = (logits,) + distilbert_output[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=distilbert_output.hidden_states,
            attentions=distilbert_output.attentions,
        )

In [None]:
# Instantiate the custom model from the "distilbert-base-cased" checkpoint with the config built above,
# then move it to the selected device.
model = LatestDistilBertForSequenceClassification.from_pretrained("distilbert-base-cased", config=config)
model.to(device)

In [None]:
def compute_metrics(eval_pred):
    """Compute the micro-averaged F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)`` where ``logits`` has one column per
            class and ``labels`` holds the binarized label vectors.

    Returns:
        ``{'f1_score': <micro F1>}``.
    """
    logits, labels = eval_pred
    # Map the winning column back to its class through the binarizer. Columns
    # follow `lb.classes_`, which are strings sorted lexicographically, so the
    # column index only equals int(label) while there are at most 10 classes.
    predicted_classes = lb.classes_[np.argmax(logits, axis=-1)]
    true_classes = lb.inverse_transform(labels)
    return {'f1_score': f1_score(y_true=true_classes, y_pred=predicted_classes, average='micro')}

In [None]:
batch_size = 16
# Log the training loss roughly once per epoch. The interval is clamped to at
# least 1 so a dataset smaller than one batch cannot produce logging_steps == 0.
logging_steps = max(1, len(train_dataset) // batch_size)

# NOTE(review): `!pip install transformers` is unpinned; newer releases rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',                  # output directory
    evaluation_strategy = "epoch",           # evaluate model after every epoch
    num_train_epochs=10,                     # total number of training epochs
    learning_rate=2e-5,                      # starting learning rate
    per_device_train_batch_size=batch_size,  # batch size per device during training
    per_device_eval_batch_size=batch_size,   # batch size for evaluation
    warmup_steps=500,                        # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                       # strength of weight decay
    logging_dir='./logs',                    # directory for storing logs
    logging_steps=logging_steps,
    report_to="none",
    seed=seed,                               # reuse the notebook-wide seed rather than the library default
)

In [None]:
# Assemble the Trainer from the model, training arguments, train/validation datasets and metric function.
# NOTE(review): samples come out of CLRPDataset unpadded per item; presumably the tokenizer passed
# here is what lets the Trainer pad each batch — confirm against the installed transformers version.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    compute_metrics = compute_metrics,
    tokenizer = tokenizer
)

In [None]:
# Sanity check: run one evaluation pass on the validation set before training starts.
trainer.evaluate()

In [None]:
# Fine-tune the model using the arguments configured in `training_args`.
trainer.train()

In [None]:
# Save the model to ./results (the same directory used as the Trainer's output_dir).
model.save_pretrained('./results')