In [0]:
!pip install transformers



In [0]:
import numpy as np
import pandas as pd
import transformers
from transformers import BertModel, AdamW, BertConfig, BertTokenizer, Model2Model, PreTrainedEncoderDecoder, BertPreTrainedModel
from torch.utils.data import Dataset
import torch
import re
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
config_path = '/content/drive/My Drive/Rubert/rubert_cased_L-12_H-768_A-12_pt/bert_config.json'
checkpoint_path = '/content/drive/My Drive/Rubert/rubert_cased_L-12_H-768_A-12_pt/pytorch_model.bin'
vocab_path = '/content/drive/My Drive/Rubert/rubert_cased_L-12_H-768_A-12_pt/vocab.txt'

In [0]:
label_dict = {'negative':0, 'neutral': 1, 'positive':2}

In [0]:
class SentimentDataset(Dataset):
    def __init__(self, path, label_dict, suftrain=True):
        self.df = pd.read_json(path)
        X_train, X_test = train_test_split(self.df, test_size=0.2, random_state=42)
        if suftrain == True:
          self.df = X_train.reset_index(drop=True)
        else:
          self.df = X_test.reset_index(drop=True)
        self.tokenizer = BertTokenizer.from_pretrained(vocab_path, do_lower_case=True)
        self.label_dict = label_dict
        

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        sentence = self.df.loc[index, 'text']
        label = label_dict[self.df.loc[index, 'sentiment']]
        tokens = self.tokenizer.tokenize(sentence)
        
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids_tensor = torch.tensor(tokens_ids)
       
        return tokens_ids_tensor,  label

In [0]:
train_dataset = SentimentDataset('/content/drive/My Drive/sentiment/train.json', label_dict)
test_dataset = SentimentDataset('/content/drive/My Drive/sentiment/train.json', label_dict, suftrain=False)

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated
Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


In [0]:
def pad_sequences(x, max_len):
    if len(x) <max_len:
        padded = list(x.numpy()) + [0]*(max_len - len(x))
    else:
        padded = list(x.numpy())[:(max_len-1)] + [102]
    return padded    


class Collator(object):
    def __init__(self, percentile=100):
        self.percentile = percentile
        
    def __call__(self, batch):

        
        #inp, targ, inp3, lens, lens_targ
        
        inp, targ = zip(*batch)
        
        lens = [len(x) for x in inp] 
        max_len = int(np.percentile(lens, self.percentile))
        if max_len > 400:
          max_len=400
        inp = torch.from_numpy(np.array([pad_sequences(sentence, max_len) for sentence in inp]))
        
        targ = torch.tensor(targ)
        
        return inp, targ

In [0]:
#settings
batch_size = 12
epochs = 3
learning_rate = 0.00002

In [0]:
collate = Collator(percentile=100)

In [0]:
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=collate)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,batch_size=batch_size, shuffle=False, drop_last=True, collate_fn=collate)

In [0]:
class ModelRusNewsSentiment(nn.Module):
    def __init__(self):
        super(ModelRusNewsSentiment, self).__init__()
        
        self.Rubert = BertModel.from_pretrained(checkpoint_path, config=config_path) 
        self.Dropout = nn.Dropout(0.1)
        self.FC = nn.Linear(768, 3)
        

        
    def forward(self,seq):
        attn_mask = (seq != 0).long()
        out, _ = self.Rubert(seq, attention_mask = attn_mask)
        cls = out[:, 0]
        logits = self.FC(cls)
        return logits

In [0]:
model = ModelRusNewsSentiment()

In [0]:
loss = nn.CrossEntropyLoss(reduction = 'sum')

In [0]:
def evaluate(model, loss, test_loader):
    losses, accuraces = 0.0, 0.0
    model.eval()
    
    quantity = 0


    
    with torch.no_grad():
      for (seq, label) in test_loader:
          
          quantity += len(seq)
          seq, label = seq.cuda(), label.cuda()
          val_logits = model (seq)
          val_loss = loss(val_logits, label)
          losses += val_loss
          pred = torch.argmax(val_logits, dim=1)
          accuraces += sum((pred == label)*1)
         
    return losses/quantity, accuraces/quantity
            

In [0]:
def train(model, train_loader, test_loader,  epochs, loss):
    model.cuda()
    model.train()
 
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.3)

    for epoch in range(epochs):
    
        total_loss = 0.0
        accuracy = 0.0
        quantity = 0.0
        for  (seq,  label) in train_loader:
            optimizer.zero_grad()
       
            quantity += len(seq)
            seq, label = seq.cuda(), label.cuda()          

            logits = model(seq)

            train_loss = loss(logits, label)
            train_loss.backward()
            total_loss = total_loss + train_loss
            
            pred = torch.argmax(logits, dim=1)
            accuracy += sum((pred == label)*1)
            

            nn.utils.clip_grad_norm_(model.parameters(),0.5)

            optimizer.step()


        val_loss, val_acc = evaluate(model, loss, test_loader)
        scheduler.step()
        print(f'Epoch {epoch}, Train_loss: {total_loss/quantity},Train_accuracy: {accuracy/quantity}')
        print(f'Epoch {epoch}, Val_loss: {val_loss},Val_accuracy: {val_acc}')

    return model

In [0]:
model = train(model, train_loader, test_loader,  epochs, loss)

Epoch 0, Train_loss: 0.8048166632652283,Train_accuracy: 0.6127272844314575
Epoch 0, Val_loss: 0.7122477293014526,Val_accuracy: 0.6745741963386536
Epoch 1, Train_loss: 0.5657979846000671,Train_accuracy: 0.7478787899017334
Epoch 1, Val_loss: 0.6435496807098389,Val_accuracy: 0.7165449857711792
Epoch 2, Train_loss: 0.41886213421821594,Train_accuracy: 0.8298484683036804
Epoch 2, Val_loss: 0.6916881799697876,Val_accuracy: 0.7080292105674744
