In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel

from tqdm.auto import tqdm

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Download the dataset archive from Google Drive by file id
# (the recorded output shows it is saved as /content/data.zip).
!gdown 1_MejkKWmCjS6tHJPgh2Mk_vXHpwRbyrW

Downloading...
From: https://drive.google.com/uc?id=1_MejkKWmCjS6tHJPgh2Mk_vXHpwRbyrW
To: /content/data.zip
  0% 0.00/12.4k [00:00<?, ?B/s]100% 12.4k/12.4k [00:00<00:00, 25.7MB/s]


In [None]:
# Unpack the downloaded archive into the working directory; the recorded output
# lists column_meaning.csv, sample_submission.csv and train_data.csv.
!unzip data

Archive:  data.zip
  inflating: column_meaning.csv      
  inflating: sample_submission.csv   
  inflating: train_data.csv          


In [2]:
# Load the labelled feedback table and show its (rows, columns) shape.
# NOTE(review): the archive unpacked earlier lists train_data.csv, not
# your_file.csv -- confirm this is the intended file.
csv_path = '/content/your_file.csv'
data = pd.read_csv(csv_path)
data.shape

(154, 9)

In [3]:
# Prompt prefixes placed before each of the five answer columns.
# NOTE(review): two prefixes contain typos ('улушить', 'Чтобы хотелось'); they are
# kept byte-for-byte because they are part of the text fed to the model.
quest_name = ['Название вебинара: ', ' Понравилось: ' , ' Сложные моменты: ', ' Что можно улушить: ', ' Чтобы хотелось узнать: ']
question_cols = ['question_1', 'question_2', 'question_3', 'question_4', 'question_5']

# data_dict: one concatenated text per row (despite the name, it is a list).
# target:    (n_rows, 3) array holding is_relevant, is_positive, object.
data_dict = []
target = np.zeros((len(data), 3))  # sized from the frame instead of a hard-coded 154

# enumerate() gives a positional index, so the loop also works when the
# DataFrame index is not 0..n-1 (e.g. after filtering or shuffling).
# NOTE(review): a missing (NaN) answer would raise TypeError on concatenation,
# exactly as before -- confirm the question columns contain no nulls.
for i, (_, row) in enumerate(tqdm(data.iterrows(), total=len(data))):
    s = ''.join(prefix + row[col] for prefix, col in zip(quest_name, question_cols))
    data_dict.append(s)
    target[i] = (row['is_relevant'], row['is_positive'], row['object'])

0it [00:00, ?it/s]

In [18]:
# Ratio of relevant (label 1) to non-relevant (label 0) rows, derived from the
# data rather than from hard-coded counts.
n_relevant = target[:, 0].sum()
n_not_relevant = len(target) - n_relevant
n_relevant / n_not_relevant

10.846153846153847

In [4]:
class BertClassification(nn.Module):
    """Frozen encoder with three independent linear classification heads.

    The heads read the encoder's ``last_hidden_state`` at the first token
    position and produce logits for:

    * ``inform`` -- 2 classes
    * ``tonal``  -- 2 classes
    * ``obj``    -- 3 classes

    Args:
        model: encoder module called as
            ``model(input_ids=..., attention_mask=...)`` whose result is
            indexable by ``'last_hidden_state'``.
        hidden_size: width of the encoder output. When ``None`` (default) it
            is read from ``model.config.hidden_size``; if the encoder has no
            such attribute, the previous hard-coded value 312 is used.
    """

    def __init__(self, model, hidden_size=None):
        super().__init__()

        self.bert = model
        # The encoder is used as a fixed feature extractor: only the heads train.
        for param in self.bert.parameters():
            param.requires_grad = False

        if hidden_size is None:
            config = getattr(model, 'config', None)
            hidden_size = getattr(config, 'hidden_size', 312)

        self.inform = nn.Linear(hidden_size, 2)
        self.tonal = nn.Linear(hidden_size, 2)
        self.obj = nn.Linear(hidden_size, 3)

    def forward(self, x):
        """Return ``(inf, ton, obj)`` logits for a batch.

        Args:
            x: mapping with ``'input_ids'`` and ``'attention_mask'`` entries.
        """
        encoded = self.bert(input_ids=x['input_ids'], attention_mask=x['attention_mask'])
        # Representation of the first token of every sequence.
        out = encoded['last_hidden_state'][:, 0]
        inf = self.inform(out)
        ton = self.tonal(out)
        obj = self.obj(out)
        return inf, ton, obj


# tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
# model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")

# bert = BertClassification(model).to(device)

In [5]:
class Sentece_Transforms(Dataset):
    """Pre-tokenized text dataset with three integer label columns.

    All sentences are tokenized once in the constructor (padded and
    truncated), and both the encodings and the labels are moved to ``device``
    up front.

    Args:
        sentences: texts passed to ``tokenizer`` in a single call.
        target: 2-D array; columns 0, 1 and 2 become the info, tonality and
            object labels respectively.
        tokenizer: callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: device the tensors are moved to.
    """

    def __init__(self, sentences, target, tokenizer, device):
        encoded = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
        self.inp = encoded['input_ids'].to(device)
        self.attention_mask = encoded['attention_mask'].to(device)

        # One long-typed label tensor per target column.
        self.target_info, self.target_ton, self.target_obj = (
            torch.LongTensor(target[:, column]).to(device) for column in range(3)
        )

    def __len__(self):
        """Number of encoded sentences."""
        return len(self.inp)

    def __getitem__(self, idx):
        """Return ``(features, (info, tonality, object))`` for item ``idx``."""
        features = {
            'input_ids': self.inp[idx],
            'attention_mask': self.attention_mask[idx],
        }
        labels = (self.target_info[idx], self.target_ton[idx], self.target_obj[idx])
        return (features, labels)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing, stratified on the relevance label
# (target column 0). A fixed random_state makes the split -- and therefore
# every metric reported later -- reproducible across kernel restarts.
sentence_train, sentence_test, target_train, target_test = train_test_split(
    data_dict,
    target,
    shuffle=True,
    test_size=0.1,
    stratify=target[:, 0],
    random_state=42,
)

In [7]:
BATCH_SIZE = 128
MODEL_NAME = "cointegrated/rubert-tiny2"

# Pretrained tokenizer and encoder, both taken from the same checkpoint.
# An alternative tried earlier: BertTokenizer / BertModel with
# 'bert-base-multilingual-cased'.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Wrap the encoder with the classification heads and move it to the device.
bert = BertClassification(model).to(device)
# To resume from saved weights instead of training from scratch:
# bert.load_state_dict(torch.load('/content/bert_model_weights.pth', map_location='cpu'))

# Tokenize both splits once.
dataset = Sentece_Transforms(sentence_train, target_train, tokenizer, device)
dataset_test = Sentece_Transforms(sentence_test, target_test, tokenizer, device)

# Training batches are reshuffled every epoch; test order stays fixed.
train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

In [8]:
# Per-class weights for the relevance head: class 0 counts 10x, class 1 counts 1x.
# NOTE(review): presumably chosen from the ~10.8:1 class ratio computed
# earlier in the notebook -- confirm.
weights = torch.tensor([10., 1], device=device)

# One cross-entropy criterion per head; only the relevance head is weighted.
loss_fn1 = CrossEntropyLoss(weight=weights)
loss_fn2 = CrossEntropyLoss()
loss_fn3 = CrossEntropyLoss()

optimizer = Adam(params=bert.parameters(), lr=0.0001)

In [None]:
epoches = 300
LOG_EVERY = 20  # metrics are printed on every LOG_EVERY-th epoch

from sklearn.metrics import f1_score


def _f1_scores(true_batches, pred_batches):
    """Return ``(obj_f, ton_f, inf_f)`` computed over all collected batches.

    Both arguments are lists with one ``(inf, ton, obj)`` tuple of 1-D CPU
    tensors per batch. The object head uses micro-averaged F1; the two binary
    heads use the default binary F1.
    """
    inf_true, ton_true, obj_true = (torch.cat(parts).numpy() for parts in zip(*true_batches))
    inf_hat, ton_hat, obj_hat = (torch.cat(parts).numpy() for parts in zip(*pred_batches))
    return (
        f1_score(obj_true, obj_hat, average='micro'),
        f1_score(ton_true, ton_hat),
        f1_score(inf_true, inf_hat),
    )


# Per-epoch mean losses (mean over batches) for each head and their sum.
ton_list, inf_list, sum_list, obj_list = [], [], [], []
ton_list_test, inf_list_test, sum_list_test, obj_list_test = [], [], [], []

for epoch in tqdm(range(epoches)):
    log_now = epoch % LOG_EVERY == 0
    ton_ls_test = inf_ls_test = sum_ls_test = obj_ls_test = 0
    inf_ls = ton_ls = sum_ls = obj_ls = 0
    # Labels / predictions of every batch, so F1 covers the whole split and
    # not only the final batch.
    train_true, train_pred = [], []
    test_true, test_pred = [], []

    # Evaluation below switches to eval mode; switch back before training,
    # otherwise every epoch after the first would train in eval mode.
    bert.train()
    for batch in train_loader:
        optimizer.zero_grad()

        x, (inf_y, ton_y, obj_y) = batch
        inf, ton, obj = bert(x)

        loss1 = loss_fn1(inf, inf_y)
        inf_ls += loss1.item()

        loss2 = loss_fn2(ton, ton_y)
        ton_ls += loss2.item()

        loss3 = loss_fn3(obj, obj_y)
        obj_ls += loss3.item()

        # The three heads are trained jointly on the unweighted sum.
        loss = loss1 + loss2 + loss3

        sum_ls += loss.item()
        loss.backward()
        optimizer.step()

        if log_now:
            # Running predictions: collected while the weights are still
            # being updated within the epoch.
            train_true.append((inf_y.cpu(), ton_y.cpu(), obj_y.cpu()))
            train_pred.append(tuple(head.argmax(dim=1).cpu() for head in (inf, ton, obj)))

    ton_list.append(ton_ls / len(train_loader))
    inf_list.append(inf_ls / len(train_loader))
    obj_list.append(obj_ls / len(train_loader))
    sum_list.append(sum_ls / len(train_loader))

    if log_now:
        obj_f, ton_f, inf_f = _f1_scores(train_true, train_pred)
        print(f'Train: {obj_f}, {ton_f}, {inf_f}')

    bert.eval()

    with torch.no_grad():
        for batch in test_loader:

            x, (inf_y, ton_y, obj_y) = batch

            inf, ton, obj = bert(x)

            loss1 = loss_fn1(inf, inf_y)
            inf_ls_test += loss1.item()

            loss2 = loss_fn2(ton, ton_y)
            ton_ls_test += loss2.item()

            loss3 = loss_fn3(obj, obj_y)
            obj_ls_test += loss3.item()

            loss = loss1 + loss2 + loss3
            # .item() keeps the history as plain floats rather than tensors.
            sum_ls_test += loss.item()

            if log_now:
                test_true.append((inf_y.cpu(), ton_y.cpu(), obj_y.cpu()))
                test_pred.append(tuple(head.argmax(dim=1).cpu() for head in (inf, ton, obj)))

        ton_list_test.append(ton_ls_test / len(test_loader))
        inf_list_test.append(inf_ls_test / len(test_loader))
        obj_list_test.append(obj_ls_test / len(test_loader))
        sum_list_test.append(sum_ls_test / len(test_loader))

    if log_now:
        print(f'Train: Ton Loss={ton_list[-1]}, Inf Loss={inf_list[-1]}, Total Loss={sum_list[-1]} || Test: Ton Loss={ton_list_test[-1]}, Inf Loss={inf_list_test[-1]}, Total Loss={sum_list_test[-1]}')
        obj_f, ton_f, inf_f = _f1_scores(test_true, test_pred)
        print(f'Test: {obj_f}, {ton_f}, {inf_f}')

In [11]:
# Persist the full state dict of the wrapper module to the working directory.
weights_path = 'bert2_model.pth'
torch.save(bert.state_dict(), weights_path)

In [None]:
# Score the held-out split explicitly over every test batch, instead of
# reusing whatever logits/labels the training loop happened to leave behind
# from its last batch.
bert.eval()
eval_true, eval_pred = [], []
with torch.no_grad():
    for x, labels in test_loader:
        logits = bert(x)  # (inf, ton, obj) logits, same order as the labels
        eval_true.append([y.cpu() for y in labels])
        eval_pred.append([head.argmax(dim=1).cpu() for head in logits])

inf_true, ton_true, obj_true = (torch.cat(parts) for parts in zip(*eval_true))
inf_pred, ton_pred, obj_pred = (torch.cat(parts) for parts in zip(*eval_pred))

# Micro-averaged F1 for the 3-class object head, binary F1 for the other two.
obj_f = f1_score(obj_true.numpy(), obj_pred.numpy(), average='micro')
ton_f = f1_score(ton_true.numpy(), ton_pred.numpy())
inf_f = f1_score(inf_true.numpy(), inf_pred.numpy())

In [None]:
# Display the F1 scores in the order (object, tonality, relevance).
(obj_f, ton_f, inf_f)

(0.75, 0.9285714285714286, 1.0)