In [1]:
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import (
    RobertaTokenizer,
    RobertaModel,
    RobertaConfig
)
from tqdm import tqdm

In [2]:
# Locations of the competition CSVs (train, test and the submission template).
TRAIN_CSV = "../input/commonlitreadabilityprize/train.csv"
TEST_CSV = "../input/commonlitreadabilityprize/test.csv"
SAMPLE_SUBMISSION_CSV = "../input/commonlitreadabilityprize/sample_submission.csv"

# Read each file into its own frame; `sub` is later filled with predictions.
df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)
sub = pd.read_csv(SAMPLE_SUBMISSION_CSV)

# Bare last expression -> rich display of the training frame.
df_train

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2829,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2830,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2831,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2832,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


In [3]:
class PaddedDataset(Dataset):
    """Dataset that tokenizes excerpts and right-pads them to a fixed length.

    Each item is a dict of tensors:

    * ``input_ids``      -- token ids, length ``max_len``
    * ``attention_mask`` -- 1 for tokens produced by the tokenizer, 0 for padding
    * ``target``         -- float regression target (omitted when ``is_test``)
    """

    # Pad id used when the tokenizer does not expose ``pad_token_id``;
    # 1 is the value this class historically hard-coded.
    DEFAULT_PAD_TOKEN_ID = 1

    @staticmethod
    def clean_text(text):
        """Lower-case ``text`` and replace every run of characters other than
        ASCII letters, digits, ``.`` and ``,`` with a single space."""
        return re.sub('[^A-Za-z0-9.,]+', ' ', str(text).lower())

    def __init__(self, df, tokenizer, max_len, is_test=False, clean=False):
        """
        Args:
            df: frame with an ``excerpt`` column and, unless ``is_test``,
                a ``target`` column.
            tokenizer: callable invoked as
                ``tokenizer(text, max_length=..., truncation=True)`` that
                returns a mapping with ``input_ids`` and ``attention_mask``
                lists.
            max_len: fixed length every sample is truncated / padded to.
            is_test: when true, no ``target`` is read or returned.
            clean: when true, excerpts go through ``clean_text`` before
                tokenization. Defaults to ``False`` because this class has
                always tokenized the raw excerpt (the cleaned string used to
                be computed and then discarded).
                NOTE(review): confirm which form the checkpoints were trained
                on before enabling.
        """
        self.text = df.excerpt.tolist()
        if not is_test:
            self.target = df.target.tolist()
        self.is_test = is_test
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.clean = clean

        # Prefer the tokenizer's own pad id over a magic number.
        pad_id = getattr(tokenizer, "pad_token_id", None)
        self.pad_token_id = self.DEFAULT_PAD_TOKEN_ID if pad_id is None else pad_id

    def __len__(self):
        """Number of excerpts in the dataset."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize excerpt ``item`` and return it padded to ``max_len``."""
        text = self.text[item]
        if self.clean:
            text = self.clean_text(text)

        inputs = self.tokenizer(text,
                                max_length=self.max_len,
                                truncation=True)

        input_ids = inputs["input_ids"]
        # The tokenizer is asked to truncate to max_len, so this is expected
        # to be >= 0; a negative value would simply add no padding.
        padding_length = self.max_len - len(input_ids)
        input_ids = input_ids + [self.pad_token_id] * padding_length
        attention_mask = inputs["attention_mask"] + [0] * padding_length

        sample = {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
        }
        if not self.is_test:
            sample["target"] = torch.tensor(self.target[item], dtype=torch.float)
        return sample

In [4]:
class CommonLitModel(nn.Module):
    """RoBERTa encoder with learned attention pooling and a linear regression head.

    The encoder is built from a config file only (no pretrained weights are
    loaded here), so callers are expected to fill the parameters afterwards,
    e.g. with ``load_state_dict``.
    """

    def __init__(self):
        super().__init__()
        config = RobertaConfig.from_pretrained("../input/commonlit-finetune/data/config.json")
        self.roberta = RobertaModel(config=config)

        # Size the head from the encoder config rather than a hard-coded 1024
        # so the two can never disagree. Attribute names and registration
        # order are unchanged, which keeps existing state dicts loadable.
        hidden_size = config.hidden_size
        self.W = nn.Linear(hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, ids, attention):
        """Return one regression value per sequence.

        Args:
            ids: token ids passed to the encoder as ``input_ids``.
            attention: mask passed to the encoder as ``attention_mask``.

        Returns:
            Output of the final linear layer (last dimension of size 1).
        """
        encoder_out = self.roberta(
            input_ids=ids,
            attention_mask=attention)
        # First element of the encoder output: per-token hidden states.
        hidden_states = encoder_out[0]

        # Attention pooling: score each position, normalise the scores along
        # dim 1, then take the weighted sum of hidden states along that dim.
        # NOTE(review): the scores are not masked with `attention`, so padded
        # positions also receive weight. Left as is because changing it would
        # alter the predictions of already-trained checkpoints.
        energy = torch.tanh(self.W(hidden_states))
        score = self.V(energy)
        attention_weights = torch.softmax(score, dim=1)
        pooled = torch.sum(attention_weights * hidden_states, dim=1)

        pooled = self.dropout(pooled)
        return self.fc(pooled)

In [5]:
# Inference settings.
MAX_LEN = 256      # length every tokenized excerpt is truncated / padded to
BATCH_SIZE = 8     # excerpts per forward pass
# Use the GPU when one is present; otherwise fall back to CPU so the notebook
# still runs end to end instead of failing when tensors are moved to "cuda".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
tokenizer = RobertaTokenizer.from_pretrained("../input/commonlit-finetune/data")

test_dataset = PaddedDataset(df_test, tokenizer, max_len=MAX_LEN, is_test=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# One checkpoint per fold lives under ../input/commonlit-finetune/model{fold}/.
N_FOLDS = 5

# outputss[fold] holds that fold's prediction for every test row.
outputss = []

for fold in range(N_FOLDS):
    print("fold:", fold)
    model = CommonLitModel()
    # Load the checkpoint onto the CPU first. Without map_location the tensors
    # are restored to the device they were saved from, which can hold a second
    # copy of the weights on the GPU (or fail when that device is missing).
    # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
    state_dict = torch.load(
        f"../input/commonlit-finetune/model{fold}/model.bin",
        map_location="cpu",
    )
    model.load_state_dict(state_dict)
    model.to(DEVICE)

    model.eval()  # disable dropout for deterministic predictions

    outputs = []

    with torch.no_grad():
        for data in tqdm(test_dataloader):
            input_ids = data["input_ids"].to(DEVICE, dtype=torch.long)
            attention_mask = data["attention_mask"].to(DEVICE, dtype=torch.long)

            output = model(input_ids, attention_mask)
            # Flatten the batch of predictions into plain Python floats.
            outputs.extend(output.view(-1).cpu().tolist())

    outputss.append(outputs)

# Ensemble: average the per-fold predictions row by row.
sub['target'] = np.mean(outputss, axis=0)
sub.to_csv('submission.csv', index=False)

fold: 0


100%|██████████| 1/1 [00:01<00:00,  1.08s/it]


fold: 1


100%|██████████| 1/1 [00:00<00:00,  5.07it/s]


fold: 2


100%|██████████| 1/1 [00:00<00:00,  5.39it/s]


fold: 3


100%|██████████| 1/1 [00:00<00:00,  4.78it/s]


fold: 4


100%|██████████| 1/1 [00:00<00:00,  5.31it/s]
