In [18]:
import argparse
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold
import shutil
import time
import gc
import random
import math
import torch
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import TrainingArguments, Trainer, DataCollatorForWholeWordMask
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel
from torch import nn
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, DataCollatorWithPadding
# Disable the Weights & Biases logging integration used by transformers.
os.environ["WANDB_DISABLED"] = "true"

In [19]:
# Essays to score, read from the competition's test.csv; the text is taken
# from the 'full_text' column by TestDataset.
df = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')
# Alternative input file, left disabled:
#df = pd.read_csv('../input/add-train/add_train.csv')

In [20]:
class CFG:
    """Static settings shared by the model classes, the dataset and the loader."""

    # --- model ---------------------------------------------------------------
    # Directory holding the backbone config/tokenizer files.
    # Other backbone names noted by the author:
    #   nghuyong/ernie-2.0-large-en, studio-ousia/luke-large
    model_path = '../input/d/jonathanchan/deberta-v3-large/deberta-v3-large'
    # Token length every essay is padded/truncated to; also written into the
    # backbone config as max_position_embeddings.
    max_input_length = 1024

    # --- runtime -------------------------------------------------------------
    device = 'cuda'
    num_workers = 2
    batch_size = 4

    # --- misc ----------------------------------------------------------------
    seed = 1006
    print_freq = 100
    
class Custom_Bert_Simple(nn.Module):
    """Sequence-classification backbone with six outputs.

    The network is built from the config found at ``CFG.model_path`` via
    ``from_config``; weights are expected to come from a checkpoint loaded
    afterwards with ``load_state_dict``.
    """

    def __init__(self):
        super().__init__()

        cfg = AutoConfig.from_pretrained(CFG.model_path)
        # Six regression targets, one output unit each.
        cfg.num_labels = 6
        cfg.max_position_embeddings = CFG.max_input_length
        # Dropout is switched off everywhere in the backbone.
        cfg.attention_probs_dropout_prob = 0
        cfg.hidden_dropout_prob = 0
        self.backbone = AutoModelForSequenceClassification.from_config(config=cfg)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the backbone's first output, or ``(mse_loss, output)`` when ``labels`` is given."""
        scores = self.backbone(input_ids=input_ids, attention_mask=attention_mask)[0]

        if labels is not None:
            # Mean-reduced squared error, same as nn.MSELoss() with defaults.
            return (nn.functional.mse_loss(scores, labels), scores)

        return scores
        
class Custom_Bert_Mean_with_GRU(nn.Module):
    """Backbone -> bidirectional GRU -> mean over the sequence axis -> linear head.

    The head emits six values per input. Five dropout modules are kept for a
    multi-sample-dropout layout, but every rate is 0 here, so each of them is
    an identity and the five head outputs that get averaged are equal.
    """

    def __init__(self):
        super().__init__()

        cfg = AutoConfig.from_pretrained(CFG.model_path)
        cfg.output_hidden_states = True
        cfg.max_position_embeddings = CFG.max_input_length
        # Dropout is switched off inside the backbone.
        cfg.attention_probs_dropout_prob = 0
        cfg.hidden_dropout_prob = 0
        self.backbone = AutoModel.from_config(config=cfg)
        width = cfg.hidden_size

        # Multi-dropout slots dropout1 .. dropout5 (all with p = 0).
        for slot in range(1, 6):
            setattr(self, f'dropout{slot}', nn.Dropout(0))

        # Single-layer bidirectional GRU; two directions of width // 2 give
        # the output size that the head below is built for.
        self.rnn = nn.GRU(
            input_size=width,
            hidden_size=width // 2,
            num_layers=1,
            batch_first=True,  # inputs are (batch, seq, feature)
            bidirectional=True,
            dropout=0.,
        )

        # Linear head producing the six outputs.
        self.head = nn.Linear(width, 6)

        # Re-initialise the head with the config's initializer_range.
        self._init_weights(self.head, cfg)

    def _init_weights(self, module, config):
        """Initialise ``module`` in place according to its type.

        Linear: normal(0, initializer_range) weights, zero bias.
        Embedding: normal(0, initializer_range) weights, zeroed padding row.
        LayerNorm: zero bias, unit weight.
        Any other module type is left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the averaged head output; ``labels`` is accepted but not used."""
        hidden = self.backbone(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        sequence, _ = self.rnn(hidden)

        # NOTE(review): the mean runs over every position of dim 1, padded
        # ones included -- attention_mask is not applied to the pooling.
        pooled = torch.mean(sequence, dim=1)

        dropouts = (self.dropout1, self.dropout2, self.dropout3, self.dropout4, self.dropout5)
        per_dropout = [self.head(drop(pooled)) for drop in dropouts]

        # Sum in the order 1..5, then divide by the number of branches.
        total = per_dropout[0]
        for branch in per_dropout[1:]:
            total = total + branch

        return total / 5

        

In [21]:
class TestDataset(Dataset):
    """Dataset that tokenizes the ``full_text`` column of a DataFrame on access.

    Args:
        df: DataFrame with a ``full_text`` column. If it also contains all six
            score columns listed in ``LABEL_COLUMNS`` they are exposed through
            ``classes()``.
        tokenizer: Callable used as
            ``tokenizer(text, padding=..., max_length=..., truncation=...)``
            returning a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Optional padded/truncated length. When omitted,
            ``CFG.max_input_length`` is read at item-access time.
    """

    LABEL_COLUMNS = ("cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions")

    def __init__(self, df, tokenizer, max_length=None):
        self.texts = df['full_text'].values
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Always define `labels` so classes() cannot fail with AttributeError;
        # it stays None for unlabeled (test) frames.
        if all(column in df.columns for column in self.LABEL_COLUMNS):
            self.labels = df[list(self.LABEL_COLUMNS)].values
        else:
            self.labels = None

    def classes(self):
        """Return the label array, or None when the frame carried no score columns."""
        return self.labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` as long tensors for row ``idx``."""
        text = self.texts[idx]
        max_length = CFG.max_input_length if self.max_length is None else self.max_length

        # Use the tokenizer handed to this instance rather than whatever global
        # happens to be named `tokenizer` in the kernel.
        encoded = self.tokenizer(text,
                                 padding='max_length', max_length=max_length, truncation=True)
        return torch.as_tensor(encoded['input_ids'], dtype=torch.long), \
               torch.as_tensor(encoded['attention_mask'], dtype=torch.long)

In [22]:
# Tokenizer files are read from the same directory as the backbone config
# (CFG.model_path).
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [23]:
def valid_fn(valid_loader, model, device):
    """Run ``model`` over every batch of ``valid_loader`` and stack the outputs.

    Args:
        valid_loader: Iterable of batches where ``batch[0]`` is the input-id
            tensor and ``batch[1]`` the attention-mask tensor.
        model: Module called as ``model(input_ids, mask)`` and returning a
            tensor. It is moved to ``device`` and put in eval mode.
        device: Device the model and each batch are moved to.

    Returns:
        numpy array with the per-batch outputs concatenated along axis 0, in
        loader order.

    Raises:
        ValueError: from ``np.concatenate`` when the loader yields no batches.
    """
    model.to(device)
    model.eval()
    preds = []
    # Inference only: a single no-grad scope covers the whole pass.
    with torch.no_grad():
        for batch in valid_loader:
            input_ids = batch[0].to(device)
            mask = batch[1].to(device)
            y_preds = model(input_ids, mask)
            preds.append(y_preds.to('cpu').numpy())
    return np.concatenate(preds)

In [24]:
# Wrap the test frame and batch it without shuffling or dropping the last
# partial batch, so predictions come back one per row of `df`, in row order.
va_dataset = TestDataset(df, tokenizer)
val_loader = DataLoader(
    va_dataset,
    batch_size=2 * CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

In [25]:
# Collects one prediction array per checkpoint; the arrays are averaged in a
# later cell.
res = []


In [26]:
# Start from an empty list every time this cell runs, so a re-run never
# appends to stale predictions or to `res` after it was turned into an array.
res = []
for i in range(5):
    model = Custom_Bert_Mean_with_GRU()
    # Load onto the CPU first: the checkpoint then does not depend on the
    # device it was saved from, and valid_fn moves the model to the target
    # device itself.
    # NOTE: torch.load unpickles the file -- only load trusted checkpoints.
    checkpoint = torch.load(f'../input/fb-sim/microsoft_deberta-v3-large_gru_best{i}.pth',
                            map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    prediction = valid_fn(val_loader, model, CFG.device)
    res.append(prediction)
    # Release this fold's weights before the next one is built so that five
    # large models never have to coexist in memory.
    del model, checkpoint
    gc.collect()
    torch.cuda.empty_cache()

In [27]:
# Element-wise average of the per-checkpoint predictions (axis 0 indexes the
# checkpoints); the trailing expression shows the resulting shape.
res = np.asarray(res).mean(axis=0)
res.shape

(3, 6)

In [28]:
# Limit every prediction to the closed interval [1, 5].
res = np.clip(res, 1, 5)


In [29]:
# Use the sample submission as the template for ids and column layout.
# NOTE(review): assumes its rows are in the same order as `df` -- confirm.
submission = pd.read_csv("../input/feedback-prize-english-language-learning/sample_submission.csv")

# Column i of `res` holds the prediction for the i-th target below.
target_columns = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]

# Bracket assignment always writes a DataFrame column; attribute assignment
# (`submission.cohesion = ...`) would silently create a plain Python attribute
# instead if the column were ever missing from the template.
for column_index, column_name in enumerate(target_columns):
    submission[column_name] = res[:, column_index]

In [30]:
# Display the filled-in submission frame as a visual check.
submission

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.928136,2.749842,3.074777,2.940013,2.561346,2.546719
1,000BAD50D026,2.591225,2.473002,2.687952,2.380739,2.237844,2.654334
2,00367BB2546B,3.481908,3.431592,3.552463,3.583048,3.406031,3.244173


In [31]:
# Write the predictions to submission.csv without the DataFrame index column.
submission.to_csv("submission.csv", index = False)