In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.8.2-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 33.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.4 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 45.0 MB/s 
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.0.12 sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.8.2


In [None]:
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import KFold

In [None]:
# Cross-validation and optimisation settings.
NUM_FOLDS = 5
NUM_EPOCHS = 3
BATCH_SIZE = 16

# Every excerpt is padded or truncated to this many tokens by the tokenizer.
MAX_LEN = 248

# (rmse_threshold, eval_period) pairs scanned in order by train(): the first
# pair whose threshold the latest validation RMSE reaches (>=) decides how many
# training steps pass before the next validation run.
EVAL_SCHEDULE = [
    (0.50, 16),
    (0.49, 8),
    (0.48, 4),
    (0.47, 2),
    (-1.0, 1),
]

# Checkpoint name used for the tokenizer, the config and the encoder weights.
MODEL_NAME = "roberta-base"

# Run on the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
def set_random_seed(random_seed):
  """Seeds every random number generator the notebook relies on.

  Covers Python's `random`, NumPy, the PYTHONHASHSEED environment variable and
  PyTorch (CPU, current CUDA device and all CUDA devices), then asks cuDNN to
  use deterministic algorithms.

  Args:
    random_seed: Integer seed applied to all generators.
  """
  os.environ["PYTHONHASHSEED"] = str(random_seed)

  seeders = (
      random.seed,
      np.random.seed,
      torch.manual_seed,
      torch.cuda.manual_seed,
      torch.cuda.manual_seed_all,
  )
  for seed_fn in seeders:
    seed_fn(random_seed)

  torch.backends.cudnn.deterministic = True

In [None]:
# Mount Google Drive at /content/gdrive so files stored there can be read.
# NOTE(review): `google.colab` is Colab-specific -- this cell presumably cannot
# run on other runtimes; confirm before reusing the notebook elsewhere.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
# Load the train and test CSV files from the mounted Drive folder.
train_df = pd.read_csv("/content/gdrive/My Drive/CommonLit Readability Prize/train.csv")
test_df = pd.read_csv("/content/gdrive/My Drive/CommonLit Readability Prize/test.csv")

In [None]:
# Rows whose target and standard_error are both exactly zero are treated as
# incomplete entries: drop them in place and renumber the index from 0.
incomplete_mask = (train_df.target == 0) & (train_df.standard_error == 0)
train_df.drop(train_df.index[incomplete_mask], inplace=True)
train_df.reset_index(drop=True, inplace=True)

In [None]:
# Display (rows, columns) of the training frame as a quick sanity check.
train_df.shape


(2833, 6)

In [None]:
# Pretrained tokenizer for MODEL_NAME. LitDataset reads this module-level
# `tokenizer` when it encodes the excerpts.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




### DATASET

In [None]:
class LitDataset(Dataset):
  """Tokenized excerpts and, unless inference-only, their regression targets.

  All excerpts are encoded once at construction time with the module-level
  `tokenizer`, padded/truncated to MAX_LEN tokens.

  Args:
    df: DataFrame with an `excerpt` column, plus a `target` column when
      `inference_only` is False.
    inference_only: When True no targets are read and items are
      (input_ids, attention_mask); otherwise items are
      (input_ids, attention_mask, target).
  """

  def __init__(self, df, inference_only=False):
    super().__init__()

    self.df = df
    self.inference_only = inference_only

    # Newlines inside an excerpt are replaced by spaces before tokenizing.
    self.text = [excerpt.replace("\n", " ") for excerpt in df.excerpt.tolist()]

    if not inference_only:
      self.target = torch.tensor(df.target.values, dtype=torch.float32)

    self.encoded = tokenizer.batch_encode_plus(
        self.text,
        padding='max_length',
        max_length=MAX_LEN,
        truncation=True,
        return_attention_mask=True,
    )

  def __len__(self):
    """Returns the number of rows in the wrapped DataFrame."""
    return len(self.df)

  def __getitem__(self, index):
    """Returns the tensors for row |index| (with the target unless inference-only)."""
    token_ids = torch.tensor(self.encoded['input_ids'][index])
    mask = torch.tensor(self.encoded['attention_mask'][index])

    if self.inference_only:
      return (token_ids, mask)

    return (token_ids, mask, self.target[index])


### MODEL

In [None]:
class LitModel(nn.Module):
  """Pretrained encoder with an attention-pooling head and a linear regressor.

  The encoder named by MODEL_NAME produces one hidden state per token. A small
  attention network scores every token position, the scores are softmax-
  normalised over the sequence, and the weighted sum of hidden states is
  mapped to a single prediction.

  Submodules are registered in the order roberta, attention, regressor;
  create_optimizer slices `named_parameters()` by position, so keep that order.
  """

  def __init__(self):
    super().__init__()

    config = AutoConfig.from_pretrained(MODEL_NAME)
    config.update({"output_hidden_states":True,
                   "hidden_dropout_prob":0.0,
                   "layer_norm_eps": 1e-7})

    self.roberta = AutoModel.from_pretrained(MODEL_NAME, config=config)

    # Width of the encoder's hidden states (768 for roberta-base). Read from
    # the config so the heads keep matching if MODEL_NAME changes.
    hidden_size = config.hidden_size

    self.attention = nn.Sequential(
        nn.Linear(hidden_size, 512),
        nn.Tanh(),
        nn.Linear(512, 1),
        nn.Softmax(dim=1)
    )

    self.regressor = nn.Sequential(
        nn.Linear(hidden_size, 1)
    )

  def forward(self, input_ids, attention_mask):
    """Returns one prediction per sequence, shape BATCH_SIZE x 1.

    Args:
      input_ids: Token ids, BATCH_SIZE x MAX_LEN.
      attention_mask: Mask forwarded to the encoder, same shape as input_ids.
    """
    roberta_output = self.roberta(input_ids=input_ids,
                                  attention_mask=attention_mask)
    # hidden_states holds the embedding output followed by one entry per
    # encoder layer; the last entry is the final layer's output.
    last_layer_hidden_states = roberta_output.hidden_states[-1]

    # Condense the per-token hidden states into a single context vector: the
    # attention network assigns a weight to every position (softmax over the
    # sequence dimension) and the context vector is the weighted sum.
    # NOTE(review): attention_mask is only passed to the encoder; the pooling
    # weights below are computed over all positions, padding included.
    # weights.shape is BATCH_SIZE x MAX_LEN x 1
    weights = self.attention(last_layer_hidden_states)

    # last_layer_hidden_states.shape is BATCH_SIZE x MAX_LEN x hidden_size
    # context_vector.shape is BATCH_SIZE x hidden_size
    context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)

    # Reduce the context vector to the prediction score.
    return self.regressor(context_vector)

In [None]:
def eval_mse(model, data_loader):
  """Computes the mean squared error of |model| over |data_loader|.

  The model is switched to eval mode and gradients are disabled. Squared
  errors are summed per batch and divided by the dataset size at the end, so
  the result does not depend on how the data is split into batches.

  Args:
    model: Callable taking (input_ids, attention_mask) and returning predictions.
    data_loader: Yields (input_ids, attention_mask, target) batches and exposes
      a `dataset` attribute whose length is the number of samples.

  Returns:
    The mean squared error as a Python float.
  """
  model.eval()
  criterion = nn.MSELoss(reduction="sum")
  squared_error_total = 0

  with torch.no_grad():
    for input_ids, attention_mask, target in data_loader:
      pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))
      squared_error_total += criterion(pred.flatten(), target.to(DEVICE)).item()

  return squared_error_total / len(data_loader.dataset)


In [None]:
def predict(model, data_loader):
  """Runs |model| over |data_loader| and collects its predictions.

  The model is switched to eval mode and gradients are disabled. Predictions
  are written into the result in the order the loader yields them.

  Args:
    model: Callable taking (input_ids, attention_mask) and returning predictions.
    data_loader: Yields (input_ids, attention_mask) batches and exposes a
      `dataset` attribute whose length is the number of samples.

  Returns:
    An np.array with one prediction per sample.
  """
  model.eval()

  result = np.zeros(len(data_loader.dataset))
  offset = 0

  with torch.no_grad():
    for input_ids, attention_mask in data_loader:
      pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))

      # Copy this batch's predictions into the next free slice of the result.
      batch_len = pred.shape[0]
      result[offset : offset + batch_len] = pred.flatten().to("cpu")
      offset += batch_len

  return result


In [None]:
def train(model, model_path, train_loader, valid_loader, 
          optimizer, scheduler=None, num_epochs=NUM_EPOCHS):
  """Trains |model| and checkpoints the weights with the best validation RMSE.

  Validation runs inside the epoch loop, not just at epoch boundaries. How
  often it runs is taken from EVAL_SCHEDULE and re-derived after every
  evaluation from the latest validation RMSE. Whenever the RMSE improves, the
  model's state_dict is saved to |model_path|.

  Args:
    model: The model to train; called as model(input_ids, attention_mask).
    model_path: File path the best state_dict is written to.
    train_loader: Yields (input_ids, attention_mask, target) training batches.
    valid_loader: Loader passed to eval_mse for validation.
    optimizer: Optimizer stepped once per training batch.
    scheduler: Optional learning-rate scheduler, stepped once per batch.
    num_epochs: Number of passes over |train_loader|.

  Returns:
    The best validation RMSE observed, or None if no evaluation was triggered.
  """
  best_valid_rmse = None
  best_epoch = 0
  step = 0
  last_eval_step = 0
  eval_period = EVAL_SCHEDULE[0][1]
  criterion = nn.MSELoss(reduction="mean")

  start = time.time()

  for epoch in range(num_epochs):
    for batch_num, (input_ids, attention_mask, target) in enumerate(train_loader):
      input_ids = input_ids.to(DEVICE)
      attention_mask = attention_mask.to(DEVICE)
      target = target.to(DEVICE)

      optimizer.zero_grad()

      # eval_mse() leaves the model in eval mode, so switch back every step.
      model.train()

      pred = model(input_ids, attention_mask)

      mse = criterion(pred.flatten(), target)

      mse.backward()

      optimizer.step()
      if scheduler:
        scheduler.step()

      if step >= last_eval_step + eval_period:
        # Evaluate the model on valid_loader.
        elapsed_seconds = time.time()-start
        num_steps = step - last_eval_step
        print(f"\n{num_steps} steps took {elapsed_seconds:0.3} seconds")
        last_eval_step = step

        valid_rmse = math.sqrt(eval_mse(model, valid_loader))

        print(f"Epoch: {epoch} batch_num: {batch_num}",
              f"valid_rmse: {valid_rmse:0.4}")

        # The first schedule entry whose threshold is reached sets the period
        # until the next evaluation.
        for rmse, period in EVAL_SCHEDULE:
          if valid_rmse >= rmse:
            eval_period = period
            break

        # Compare against None explicitly: a best RMSE of exactly 0.0 is falsy
        # and must not be mistaken for "no best yet".
        if best_valid_rmse is None or valid_rmse < best_valid_rmse:
          best_valid_rmse = valid_rmse
          best_epoch = epoch
          torch.save(model.state_dict(), model_path)
          print(f"New best_valid_rmse: {best_valid_rmse:0.4}")

        else:
          print(f"Still best_valid_rmse: {best_valid_rmse:0.4}",
                f"(from epoch {best_epoch})")

        # Restart the timer so validation time is not counted as training time.
        start = time.time()

      step += 1

  return best_valid_rmse

In [None]:
def create_optimizer(model):
  """Builds an AdamW optimizer with per-parameter learning rates for |model|.

  Parameters are selected by their position in `model.named_parameters()`:
    * positions 0-196 are treated as encoder parameters, each in its own group;
    * positions 199-202 form the attention-head group;
    * positions 203 onwards form the regressor group.
  Positions 197-198 are not handed to the optimizer at all.
  NOTE(review): these offsets presumably match roberta-base (197-198 looking
  like its pooler) -- confirm before using a different MODEL_NAME.

  Encoder parameters get lr 2e-5 below position 69, 5e-5 from 69 and 1e-4 from
  133, with weight decay 0.01 except for parameters whose name contains "bias"
  (0.0). The two head groups set neither value and so use AdamW's defaults.

  Args:
    model: Module whose parameters are optimised.

  Returns:
    The configured AdamW optimizer.
  """
  all_named = list(model.named_parameters())

  encoder_named = all_named[:197]
  attention_named = all_named[199:203]
  regressor_named = all_named[203:]

  # The head groups come first, followed by one group per encoder parameter.
  param_groups = [
      {"params": [tensor for _, tensor in attention_named]},
      {"params": [tensor for _, tensor in regressor_named]},
  ]

  for position, (param_name, tensor) in enumerate(encoder_named):
    if position >= 133:
      lr = 1e-4
    elif position >= 69:
      lr = 5e-5
    else:
      lr = 2e-5

    decay = 0.01 if "bias" not in param_name else 0.0

    param_groups.append({"params": tensor,
                         "weight_decay": decay,
                         "lr": lr})

  return AdamW(param_groups)

In [None]:
# K-fold cross-validation: train one model per fold, save its best checkpoint
# to model_<fold>.pth and record the best validation RMSE of each fold.
SEED = 100
list_valid_rmse =[]

kfold = KFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)

for fold, (train_indices, valid_indices) in enumerate(kfold.split(train_df)):
  print(f"\n Fold {fold + 1}/{NUM_FOLDS}")
  model_path = f"model_{fold +1}.pth"

  set_random_seed(SEED + fold)

  # KFold.split yields positional indices, so select rows by position; this
  # stays correct even if train_df's index is not a fresh 0..n-1 range.
  train_dataset = LitDataset(train_df.iloc[train_indices])
  valid_dataset = LitDataset(train_df.iloc[valid_indices])

  train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                drop_last=True,shuffle=True,num_workers=2)
  valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE,
                            drop_last=False, shuffle=False,num_workers=2)
  # Re-seed right before the model is built so its initialisation depends only
  # on the fold's seed.
  set_random_seed(SEED + fold)

  model = LitModel().to(DEVICE)

  optimizer = create_optimizer(model)
  scheduler = get_cosine_schedule_with_warmup(
      optimizer, num_training_steps=NUM_EPOCHS * len(train_loader),
      num_warmup_steps=50)
  
  list_valid_rmse.append(train(model, model_path, train_loader,
                               valid_loader, optimizer,scheduler=scheduler))
  
  del model

  # Running summary, printed after every completed fold.
  print("\n Performance Estimates:")
  print(list_valid_rmse)
  print("Mean:", np.array(list_valid_rmse).mean())
  


 Fold 1/5


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



16 steps took 11.6 seconds
Epoch: 0 batch_num: 16 valid_rmse: 0.8931
New best_valid_rmse: 0.8931

16 steps took 11.0 seconds
Epoch: 0 batch_num: 32 valid_rmse: 0.6875
New best_valid_rmse: 0.6875

16 steps took 11.2 seconds
Epoch: 0 batch_num: 48 valid_rmse: 0.635
New best_valid_rmse: 0.635

16 steps took 11.5 seconds
Epoch: 0 batch_num: 64 valid_rmse: 0.6986
Still best_valid_rmse: 0.635 (from epoch 0)

16 steps took 11.7 seconds
Epoch: 0 batch_num: 80 valid_rmse: 0.612
New best_valid_rmse: 0.612

16 steps took 11.8 seconds
Epoch: 0 batch_num: 96 valid_rmse: 0.663
Still best_valid_rmse: 0.612 (from epoch 0)

16 steps took 12.2 seconds
Epoch: 0 batch_num: 112 valid_rmse: 0.614
Still best_valid_rmse: 0.612 (from epoch 0)

16 steps took 12.6 seconds
Epoch: 0 batch_num: 128 valid_rmse: 0.5372
New best_valid_rmse: 0.5372

16 steps took 12.8 seconds
Epoch: 1 batch_num: 3 valid_rmse: 0.5368
New best_valid_rmse: 0.5368

16 steps took 12.5 seconds
Epoch: 1 batch_num: 19 valid_rmse: 0.5299
New b

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-16-f4d82e9f528c>", line 29, in <module>
    valid_loader, optimizer,scheduler=scheduler))
  File "<ipython-input-14-b61bd683b664>", line 27, in train
    mse.backward()
  File "/usr/local/lib/python3.7/dist-packages/torch/_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/usr/local/lib/python3.7/dist-packages/torch/autograd/__init__.py", line 149, in backward
    allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 1823, in showtraceback
    stb = value._render_tracebac

KeyboardInterrupt: ignored

### INFERENCE

In [None]:
# Tokenize the test excerpts. With inference_only=True the dataset yields
# (input_ids, attention_mask) pairs and reads no target column.
test_dataset = LitDataset(test_df, inference_only=True)

In [None]:
# One row of predictions per trained fold model, one column per test excerpt.
# NOTE(review): the number of models comes from list_valid_rmse, which the
# training cell fills in -- that cell must have run in this kernel first.
all_predictions = np.zeros((len(list_valid_rmse), len(test_df)))

test_dataset = LitDataset(test_df, inference_only=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         drop_last=False, shuffle=False, num_workers=2)

for index in range(len(list_valid_rmse)):
  model_path = f"model_{index + 1}.pth"
  print(f" \n Using  {model_path}")

  model = LitModel()
  # map_location remaps the stored tensors onto DEVICE, so a checkpoint saved
  # on a GPU can still be loaded on a CPU-only runtime.
  model.load_state_dict(torch.load(model_path, map_location=DEVICE))
  model.to(DEVICE)

  all_predictions[index] = predict(model, test_loader)

  del model

 
 Using  model_1.pth


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


 
 Using  model_2.pth


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


 
 Using  model_3.pth


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


 
 Using  model_4.pth


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


 
 Using  model_5.pth


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Ensemble the fold models by averaging their predictions per test excerpt,
# then attach the result to test_df as the `target` column.
predictions = np.mean(all_predictions, axis=0)
test_df["target"] = predictions
test_df.head()


Unnamed: 0,id,url_legal,license,excerpt,target
0,c0f722661,,,My hope lay in Jack's promise that he would ke...,-0.449816
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...,-0.504958
2,0df072751,,,It was a bright and cheerful scene that greete...,-0.438827
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...,-2.628084
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...,-1.886272


In [None]:
# Keep only the id and predicted target columns and write them, without the
# index, to submission.csv.
submission_columns = ["id", "target"]
submission_df = test_df[submission_columns]
submission_df.to_csv("submission.csv", index=False)