# Feedback Prize Best score 0.49

In [3]:
device = torch.device('cuda') 

In [4]:
import pandas as pd
df = pd.read_csv('train.csv')

In [5]:
all_descs = df['full_text'].to_list()

In [7]:
df.columns

Index(['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary',
       'phraseology', 'grammar', 'conventions', 'spelling_mistakes',
       'contractions', 'symbols', 'unique_words'],
      dtype='object')

In [8]:
from torch.utils.data import Dataset
import numpy as np
import torch
import torch.nn as nn
import nltk
import re
import contractions
#from sentence_transformers import SentenceTransformer
from nltk.stem import PorterStemmer
from transformers import RobertaTokenizer, RobertaModel
from transformers import AutoModel,AutoTokenizer,DebertaV2ForSequenceClassification,DebertaV2Tokenizer

class DebertaGrammerDataset(Dataset):
    pos_tag_vocab = ['CC',
            'WRB',
            'EX',
            'MD',
            'VBN',
            'VBD',
            'NNS',
            'RBR',
            'VBZ',
            'PRP$',
            'VB',
            'RP',
            'WP',
            'VBP',
            'JJR',
            'VBG',
            'PDT',
            'JJ',
            'JJS',
            'WDT',
            'IN',
            'DT',
            'RB',
            'NN',
            'PRP',
            'TO']
    stemmer = PorterStemmer()
    def __init__(self, data):
        '''
        Dataset object for base model
        :param data:
        '''
        #self.tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-small")
        data['full_text'] = data["full_text"].replace(re.compile(r'[\n\r\t]'), '', regex=True)
        
        self.inputs_ids,self.token_types_ids,self.attention_mask, self.pos_tag = self.clean_text(data['full_text'])
        self.cohesion = np.array(data['cohesion'])
        self.syntax = np.array(data['syntax'])
        self.vocab = np.array((data['vocabulary']))
        self.phraseology = np.array(data['phraseology'])
        self.grammer = np.array(data['grammar'])
        self.conventions = np.array(data['conventions'])
        
        self.spelling_mistakes = np.array(data['spelling_mistakes'])
        self.contractions = np.array(data['contractions'])
        self.symbols = np.array(data['symbols'])
        self.unique_words = np.array(data['unique_words'])
        
        
        
        
        

    def __len__(self):
        return len(self.syntax)
    
    def feature_transformation(self, vals):
        self.mapping = {1:0,1.5:1,2:2,2.5:3,3:4,3.5:5,4:6,4.5:7,5:8}
        newl = []
        for v in vals:
            newl.append(self.mapping[v])
        return newl
    

    def count_pos_tag(self,text):
        tag_dict = {}
        pos = nltk.pos_tag(text)
        tag_types = [item[1] for item in pos]
        return tag_types
    
    def tokenize(self,text,padding=1024):
        tokens = self.tokenizer(text,padding=True, truncation=True)
        if len(tokens['input_ids']) < padding:
            chars_to_add = int(padding-len(tokens['input_ids']))
            for i in range(chars_to_add):
                tokens['input_ids'].append(0)
        else:
            tokens['input_ids'] = tokens['input_ids'][:padding]
        
#         if len(tokens['token_type_ids']) < padding:
#             chars_to_add = int(padding-len(tokens['token_type_ids']))
#             for i in range(chars_to_add):
#                 tokens['token_type_ids'].append(0)
#         else:
#             tokens['token_type_ids'] = tokens['token_type_ids'][:padding]
            
            
        if len(tokens['attention_mask']) < padding:
            chars_to_add = int(padding-len(tokens['attention_mask']))
            for i in range(chars_to_add):
                tokens['attention_mask'].append(0)
        else:
            tokens['attention_mask'] = tokens['attention_mask'][:padding]
        return tokens
    
    def sentence_embds(self,text):
        embeddings = self.model.encode(text)
        return embeddings
    
    def clean_sentence(self,text):
        text = ''.join([i for i in text if not i.isdigit()])
        text = re.sub(r'(!|.)1+', '', text) 
        text = contractions.fix(text)
        text = self.stemmer.stem(text)
        return text
        
    def clean_text(self,all_text):
        inputs_ids = []
        token_type_ids = []
        attention_mask = []
        sentence_embds = []
        pos_tag = []
        #all_text = all_text.split("\n")
        for text in all_text:
            text = self.clean_sentence(text)
            text = text.strip()
            #tokens = self.tokenize(text)
            inputs_ids.append(text)
            token_type_ids.append(0)
            attention_mask.append(0)
            pos_sentence = self.count_pos_tag(text.split())
            pos_tag.append(pos_sentence)
            
            
        return np.array(inputs_ids),np.array(token_type_ids),np.array(attention_mask), np.array(pos_tag)
    def __getitem__(self, idx):
        return self.inputs_ids[idx],self.token_types_ids[idx],self.attention_mask[idx],self.cohesion[idx],self.syntax[idx],self.vocab[idx],self.phraseology[idx],self.grammer[idx],self.conventions[idx],self.pos_tag[idx],self.spelling_mistakes[idx],self.contractions[idx],self.symbols[idx],self.unique_words[idx]

In [9]:
class RMSELoss(nn.Module):
    def __init__(self, eps=1e-6):
        super().__init__()
        self.mse = nn.MSELoss()
        self.eps = eps

    def forward(self, yhat, y):
        loss = torch.sqrt(self.mse(yhat, y) + self.eps)
        return loss


class MCRMSELoss(nn.Module):
    def __init__(self, num_scored=6):
        super().__init__()
        self.rmse = RMSELoss()
        self.num_scored = num_scored

    def forward(self, yhat, y):
        score = 0
        for i in range(self.num_scored):
            score += self.rmse(yhat[:, i], y[:,i]) / self.num_scored

        return score

In [10]:
class RobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        classifier_dropout = 0.1
        self.dense = [nn.Linear(config.hidden_size, 128), nn.Tanh(), nn.Dropout(classifier_dropout), nn.Linear(128, 32),nn.Tanh(), nn.Dropout(classifier_dropout),
                      nn.Linear(32, config.num_labels)]
        self.dense = nn.Sequential(*self.dense)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
#         x = torch.tanh(x)
#         x = self.dropout(x)
#         x = self.out_proj(x)
        return x

In [11]:
from transformers import RobertaPreTrainedModel,DebertaV2Model
from typing import List, Optional, Tuple, Union
from transformers.modeling_outputs import SequenceClassifierOutput

class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    
    
class NewModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.deberta = DebertaV2Model.from_pretrained("microsoft/deberta-v3-base") #RobertaModel(config, add_pooling_layer=False)
        #self.classifier = RobertaClassificationHead(config)
        self.average_pooling = MeanPooling()
        
        self.num_neurons = 768
        self.fc_hidden = 64
        self.hidden_size = 32
        dropout = 0.1
        self.labels = ['cohesion','syntax','vocabulary','phraseology','grammar','conventions']
        self.featurs_nn = []
        self.cohesion_nn =  nn.Sequential(*[nn.Linear(self.num_neurons, self.fc_hidden), nn.ReLU(inplace=True), nn.Linear(self.fc_hidden, self.hidden_size), nn.ReLU(inplace=True),
                  nn.Linear(self.hidden_size, self.hidden_size), nn.ReLU(inplace=True), nn.Linear(self.hidden_size, 1)])
        
        self.syntax_nn =  nn.Sequential(*[nn.Linear(self.num_neurons, self.fc_hidden), nn.ReLU(inplace=True), nn.Linear(self.fc_hidden, self.hidden_size), nn.ReLU(inplace=True),
                  nn.Linear(self.hidden_size, 1)])
        
        
        self.vocabulary_nn =  nn.Sequential(*[nn.Linear(self.num_neurons, self.fc_hidden), nn.ReLU(inplace=True), nn.Linear(self.fc_hidden, self.hidden_size), nn.ReLU(inplace=True),
                  nn.Linear(self.hidden_size, 1)])
        
        self.phraseology_nn =  nn.Sequential(*[nn.Linear(self.num_neurons, self.fc_hidden), nn.ReLU(inplace=True), nn.Linear(self.fc_hidden, self.hidden_size), nn.ReLU(inplace=True),
                  nn.Linear(self.hidden_size, 1)])
        
        self.grammar_nn =  nn.Sequential(*[nn.Linear(self.num_neurons, self.fc_hidden), nn.ReLU(inplace=True), nn.Linear(self.fc_hidden, self.hidden_size), nn.ReLU(inplace=True),
                  nn.Linear(self.hidden_size, 1)])
        
        self.conventions_nn =  nn.Sequential(*[nn.Linear(self.num_neurons, self.fc_hidden), nn.ReLU(inplace=True), nn.Linear(self.fc_hidden, self.hidden_size), nn.ReLU(inplace=True),
                  nn.Linear(self.hidden_size, 1)])
        
        self.features_nn = [self.cohesion_nn, self.syntax_nn, self.vocabulary_nn, self.phraseology_nn, self.grammar_nn,self.conventions_nn]
        
        
        
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        spelling_mistakes = None,
        contractions = None,
        symbols = None,
        unique = None
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs = self.deberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,output_hidden_states=True)

        
        pooling = self.average_pooling(outputs.last_hidden_state,attention_mask)

        
        output_dict = {}
        for f_nn, feature in zip(self.features_nn,self.labels):
            newh = pooling.clone()
            #newh = torch.cat((newh,spelling_mistakes,contractions,symbols,unique),dim=1)
            out = f_nn(newh)
            output_dict[feature] = out
            
        logits = torch.stack((output_dict['cohesion'],output_dict['syntax'],output_dict['vocabulary'],output_dict['phraseology'],output_dict['grammar'],output_dict['conventions']),dim=1)
        #logits = self.classifier(sequence_output)
        loss = None
        if labels is not None:
            loss_fct = nn.SmoothL1Loss()
            logits = torch.squeeze(logits,-1)
            
            loss = loss_fct(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [12]:
def collate_batch(batch):
    inputs_ids, token_types_ids,attention_mask, cohesion, syntax,vocab,phraseology,grammer,conventions,pos_tag = [],[],[],[],[],[],[],[],[],[]
    spelling_mistakes,contractions,symbols,unique_words = [],[],[],[]
    for data in batch:
        inputs_ids.append(data[0])
        token_types_ids.append(data[1])
        attention_mask.append(data[2])
        cohesion.append(data[3])
        syntax.append(data[4])
        vocab.append(data[5])
        phraseology.append(data[6])
        grammer.append(data[7])
        conventions.append(data[8])
        pos_tag.append(data[9])
        spelling_mistakes.append(data[10])
        contractions.append(data[11])
        symbols.append(data[12])
        unique_words.append(data[13])
        
    # self.self.spelling_mistakes[idx],self.self.contractions[idx],self.symbols[idx],self.unique_words[idx]
    cohesion = torch.tensor(cohesion, dtype=torch.int64)
    syntax = torch.tensor(syntax, dtype=torch.int64)
    vocab = torch.tensor(vocab, dtype=torch.int64)
    phraseology = torch.tensor(phraseology, dtype=torch.int64)
    grammer = torch.tensor(grammer, dtype=torch.int64)
    conventions = torch.tensor(conventions, dtype=torch.int64)
    pos_tag_vals = pos_tag #torch.tensor(pos_tag, dtype=torch.int64)
    #inputs_ids = torch.tensor(inputs_ids, dtype=torch.int64)
    token_types_ids = torch.tensor(token_types_ids, dtype=torch.int64)
    attention_mask = torch.tensor(attention_mask, dtype=torch.int64)
    spelling_mistakes = torch.tensor(spelling_mistakes, dtype=torch.int64)
    contractions = torch.tensor(contractions, dtype=torch.int64)
    symbols = torch.tensor(symbols, dtype=torch.int64)
    unique_words = torch.tensor(unique_words, dtype=torch.int64)
    #sentence_embds =  torch.tensor(sentence_embds, dtype=torch.int64)
    return inputs_ids,token_types_ids.to(device),attention_mask.to(device), cohesion.to(device), syntax.to(device),vocab.to(device), phraseology.to(device),grammer.to(device), conventions.to(device),pos_tag_vals,spelling_mistakes.to(device),contractions.to(device),symbols.to(device),unique_words.to(device)

In [13]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [13]:
train=df.sample(frac=0.95,random_state=444) #random state is a seed value
test=df.drop(train.index)
train_dataset = DebertaGrammerDataset(train)
valid_dataset = DebertaGrammerDataset(test)



In [14]:
from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset,batch_size=8, shuffle=True,collate_fn=collate_batch,pin_memory=False)
valid_loader = DataLoader(valid_dataset,batch_size=8, shuffle=True,collate_fn=collate_batch,pin_memory=False)


In [16]:
from transformers import AutoModel,AutoTokenizer,DebertaV2ForSequenceClassification,DebertaV2Tokenizer
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base")
model = NewModel()
model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical 

NewModel(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0): DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
              (dropout): St

In [17]:
import numpy as np
max_epochs = 100
save_path = 'test/deberta_v3.pth'
lr = 4.5e-5
optimizer = torch.optim.Adam(model.parameters(),lr=lr)
best_valid_loss = 999
for epoch in range(max_epochs):
    all_losses = [] 
    print("Epoch {}".format(epoch))
    for batch_idx, batch_data in enumerate(train_loader):
        input_idx,input_tokens,input_attention,cohesion,syntax,vocab,phraseology,grammer,conventions,pos_tag,spelling,contractions,symbols,unique = batch_data
        input_tensor = tokenizer(input_idx,return_tensors="pt",padding=True, truncation=True, max_length=1024)
        input_tensor.to(device)
        gt_format = torch.stack((cohesion,syntax,vocab,phraseology,grammer,conventions),dim=1)
        gt_format = gt_format.to(torch.float32)
        gt_format.to(device)
        input_tensor['labels'] = gt_format

        
        
        output = model(**input_tensor)
        logits = output.logits
        loss = output.loss
        
        all_losses.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print("Train loss ", np.mean(all_losses))
    lr *= 0.9 
    valid_loss = []
    with torch.no_grad():
        for batch_idx, batch_data in enumerate(valid_loader):
            input_idx,input_tokens,input_attention,cohesion,syntax,vocab,phraseology,grammer,conventions,pos_tag,spelling,contractions,symbols,unique = batch_data
            input_tensor = tokenizer(input_idx,return_tensors="pt",padding=True, truncation=True,max_length=1024)
            input_tensor.to(device)
            gt_format = torch.stack((cohesion,syntax,vocab,phraseology,grammer,conventions),dim=1)
            gt_format = gt_format.to(torch.float32)
            gt_format.to(device)
            input_tensor['labels'] = gt_format

            output = model(**input_tensor)
            logits = output.logits
            loss = output.loss
            valid_loss.append(loss.item())
              
    avg_valid_loss =np.mean(valid_loss)
    if epoch > 1: 
        if avg_valid_loss < best_valid_loss:
            torch.save(model.state_dict(), save_path)
            best_valid_loss = avg_valid_loss
            print("Best model saved, ", avg_valid_loss)
    print("Validation loss: ", avg_valid_loss)
            
    

Epoch 0


  scale, dtype=query_layer.dtype
  score += c2p_att / torch.tensor(scale, dtype=c2p_att.dtype)
  score += p2c_att / torch.tensor(scale, dtype=p2c_att.dtype)


Train loss  0.44948378754779694
Validation loss:  0.1645028129220009
Epoch 1
Train loss  0.14573077861374864
Validation loss:  0.14986733272671698
Epoch 2
Train loss  0.12120268951791029
Validation loss:  0.15482176393270491
Epoch 3
Train loss  0.10183356985605012
Best model saved,  0.1541392095386982
Validation loss:  0.1541392095386982
Epoch 4
Train loss  0.09001581081344436
Validation loss:  0.15493054166436196
Epoch 5
Train loss  0.08317777843525012
Best model saved,  0.15068233981728554
Validation loss:  0.15068233981728554
Epoch 6
Train loss  0.0761432715303575
Best model saved,  0.15060335770249367
Validation loss:  0.15060335770249367
Epoch 7
Train loss  0.06896869397411744
Validation loss:  0.15297766104340554
Epoch 8
Train loss  0.06380152289445201
Validation loss:  0.16083695888519287
Epoch 9
Train loss  0.05563814801474412
Validation loss:  0.15992644503712655
Epoch 10
Train loss  0.04799817546348398
Validation loss:  0.1672554597258568
Epoch 11
Train loss  0.03978809505448

KeyboardInterrupt: 

In [13]:
from torch.utils.data import Dataset
import numpy as np
import torch
import torch.nn as nn
import nltk
import re
import contractions
#from sentence_transformers import SentenceTransformer
from nltk.stem import PorterStemmer
from transformers import RobertaTokenizer, RobertaModel
from transformers import AutoModel,AutoTokenizer,DebertaV2ForSequenceClassification,DebertaV2Tokenizer

class DebertaGrammerTestDataset(Dataset):
    pos_tag_vocab = ['CC',
            'WRB',
            'EX',
            'MD',
            'VBN',
            'VBD',
            'NNS',
            'RBR',
            'VBZ',
            'PRP$',
            'VB',
            'RP',
            'WP',
            'VBP',
            'JJR',
            'VBG',
            'PDT',
            'JJ',
            'JJS',
            'WDT',
            'IN',
            'DT',
            'RB',
            'NN',
            'PRP',
            'TO']
    stemmer = PorterStemmer()
    def __init__(self, data):
        '''
        Dataset object for base model
        :param data:
        '''
        self.tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-small")
        data['full_text'] = data["full_text"].replace(re.compile(r'[\n\r\t]'), '', regex=True)
        
        self.inputs_ids,self.token_types_ids,self.attention_mask, self.pos_tag = self.clean_text(data['full_text'])
        self.text_ids = data['text_id']
        

    def __len__(self):
        return len(self.text_ids)
    

    def count_pos_tag(self,text):
        tag_dict = {}
        pos = nltk.pos_tag(text)
        tag_types = [item[1] for item in pos]
        return tag_types
    
    def tokenize(self,text,padding=1024):
        tokens = self.tokenizer(text,padding=True, truncation=True)
        if len(tokens['input_ids']) < padding:
            chars_to_add = int(padding-len(tokens['input_ids']))
            for i in range(chars_to_add):
                tokens['input_ids'].append(0)
        else:
            tokens['input_ids'] = tokens['input_ids'][:padding]
        
#         if len(tokens['token_type_ids']) < padding:
#             chars_to_add = int(padding-len(tokens['token_type_ids']))
#             for i in range(chars_to_add):
#                 tokens['token_type_ids'].append(0)
#         else:
#             tokens['token_type_ids'] = tokens['token_type_ids'][:padding]
            
            
        if len(tokens['attention_mask']) < padding:
            chars_to_add = int(padding-len(tokens['attention_mask']))
            for i in range(chars_to_add):
                tokens['attention_mask'].append(0)
        else:
            tokens['attention_mask'] = tokens['attention_mask'][:padding]
        return tokens
    
    def sentence_embds(self,text):
        embeddings = self.model.encode(text)
        return embeddings
    
    def clean_sentence(self,text):
        text = ''.join([i for i in text if not i.isdigit()])
        text = re.sub(r'(!|.)1+', '', text) 
        text = contractions.fix(text)
        text = self.stemmer.stem(text)
        return text
        
    def clean_text(self,all_text):
        inputs_ids = []
        token_type_ids = []
        attention_mask = []
        sentence_embds = []
        pos_tag = []
        #all_text = all_text.split("\n")
        for text in all_text:
            text = self.clean_sentence(text)
            text = text.strip()
            #tokens = self.tokenize(text)
            inputs_ids.append(text)
            token_type_ids.append(0)
            attention_mask.append(0)
            pos_sentence = self.count_pos_tag(text.split())
            pos_tag.append(pos_sentence)
            
            
        return np.array(inputs_ids),np.array(token_type_ids),np.array(attention_mask), np.array(pos_tag)
    def __getitem__(self, idx):
        return self.inputs_ids[idx],self.text_ids[idx]

In [14]:
#  self.inputs_ids[idx],self.token_types_ids[idx],self.attention_mask[idx],self.pos_tag[idx],self.text_ids[idx]
def collate_test_batch(batch):
    text_list,text_length,pos_tag,text_id = [], [],[],[]
    for data in batch:
        #processed_text = text_pipeline(data[0],1024)
        text_list.append(data[0])
        text_id.append(data[1])
        
        
        
    #text_length = torch.tensor(text_length, dtype=torch.int64)
    #text_list = torch.tensor(text_list, dtype=torch.int64)
    #pos_tag = torch.tensor(pos_tag, dtype=torch.int64)
    return text_list,text_id

In [16]:
from torch.utils.data import DataLoader

testdf = pd.read_csv('test.csv')
test_dataset = DebertaGrammerTestDataset(testdf)
test_loader = DataLoader(test_dataset,batch_size=1, shuffle=False,collate_fn=collate_test_batch,pin_memory=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [26]:
from transformers import AutoModel,AutoTokenizer,DebertaV2ForSequenceClassification,DebertaV2Tokenizer
load_path = 'deberta_v3_last'
tokenizer = DebertaV2Tokenizer.from_pretrained("deberta_tokenizer")
# loading model 
newmodel = NewModel()
newmodel.load_state_dict(torch.load(load_path))

In [28]:
cohs = []
syntax = []
vocab = [] 
phraseology = []
grammer = []
conventions = []
text_id = []
for batch_idx, batch_data in enumerate(test_loader):
        input_idx,text_ids = batch_data
        input_tensor = tokenizer(input_idx,return_tensors="pt",padding=True, truncation=True, max_length=1024)
        
        
        output = model(**input_tensor)
        logits = output.logits
        
        logits = logits.tolist()
        for i in range(len(logits)):
            cohs.append(logits[i][0])
            syntax.append(logits[i][1])
            vocab.append(logits[i][2])
            phraseology.append(logits[i][3])
            grammer.append(logits[i][4])
            conventions.append(logits[i][5])
            text_id.append(text_ids[i])

        

test_pred = pd.DataFrame()
test_pred['text_id'] = [item for item in text_id]
test_pred['cohesion'] =[item for item in cohs]
test_pred['syntax'] = [item for item in syntax]
test_pred['vocabulary'] = [item for item in vocab]
test_pred['phraseology'] = [item for item in phraseology]
test_pred['grammar'] = [item for item in grammer]
test_pred['conventions'] =[item for item in conventions]

  scale, dtype=query_layer.dtype
  score += c2p_att / torch.tensor(scale, dtype=c2p_att.dtype)
  score += p2c_att / torch.tensor(scale, dtype=p2c_att.dtype)


In [29]:
test_pred

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.940789,2.728086,3.186163,2.850531,2.766949,2.464682
1,000BAD50D026,2.313389,2.660667,2.692108,1.981151,1.955681,2.404436
2,00367BB2546B,3.132265,2.958166,3.372356,3.140762,3.108786,3.276099
