In [1]:
import google.colab

DEVICE = 'cuda'
num_workers = 2
BATCH_SIZE = 6
ROOT = '/content/'
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
!cp /content/drive/MyDrive/kursk_2022/kursk.zip /content/
!unzip -q /content/kursk.zip

!python -m pip install --upgrade pip
!pip install -U transformers beautifulsoup4

print('DEVICE = ', DEVICE, 'BATCH_SIZE =', BATCH_SIZE)
!nvidia-smi

Mounted at /content/drive
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-22.2-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 5.1 MB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.11.1-py3-none-any.whl (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.2/128.2 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-h

## Загружаем неоходимые библиотеки

In [2]:
import os
import copy
import math
import random
from datetime import datetime

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel

DEVICE     = 'cuda'
MODEL_NAME = 'sberbank-ai/sbert_large_mt_nlu_ru'
SEED       = 1
MAX_LEN    = 384
VERSION    = '2207-cat3-rocauc'

os.environ["TOKENIZERS_PARALLELISM"] = "false"

scaler = torch.cuda.amp.GradScaler()

## Подгружаем предобученный токенайзер и языкову модель

In [3]:
tokenizer  = AutoTokenizer.from_pretrained(MODEL_NAME)
bert_model = AutoModel.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/331 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/752 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.70M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.59G [00:00<?, ?B/s]

## Зафиксируем генератор случайных чисел

In [4]:
def seed_everything(seed=1234):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark     = False

seed_everything(SEED)

## Считываем тренировочный датасет

In [5]:
df = pd.read_csv(ROOT + "train_dataset_train.csv")
df['text'] = df['Текст Сообщения'].apply(lambda x: BeautifulSoup(x).text.replace('\xa0', ' '))
df = df[['id', 'text', 'Тематика', 'Категория']]

df.head(3)



Unnamed: 0,id,text,Тематика,Категория
0,2246,Помогите начальник Льговского рэс не реагирует...,"Нарушения, связанные с содержанием электросети...",3
1,380,По фасаду дома по адресу ул. Урицкого 22 прохо...,Аварийные деревья,3
2,2240,Агресивные собаки. На радуге там стая из подро...,Безнадзорные животные,1


In [6]:
NUM_CLASSES = df['Категория'].nunique()
df['Категория'].value_counts()

3     954
0     478
16    149
8     139
4     108
10     48
7      27
1      25
11     19
5      12
13     11
6      10
15      7
9       5
14      4
2       3
12      1
Name: Категория, dtype: int64

## Отбросим 12 класс, т.к. там только один элемент

In [7]:
df.drop(index=df[df['Категория'] == 12].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


## Загружаем дополнительно размеченные категории

In [8]:
categories = pd.read_csv(ROOT + 'subcategories_2.csv')
categories.head(10)

Unnamed: 0,Категория,Подкатегория по сути проблемы,Подкатегория по области действия,Тема
0,Дорожное покрытие,Ямы,Городские округа и сельские поселения,Нарушение дорожного покрытия (ямы) на дорогах ...
1,Дорожное покрытие,Отсутствие твёрдого дорожного покрытия,Городские округа и сельские поселения,Отсутствие твёрдого дорожного покрытия на доро...
2,Дорожное покрытие,Ямы,Многоквартирные дома,Ямы и выбоины на внутридворовых проездах и тро...
3,Дорожное покрытие,Ямы,Общее,Ямы и выбоины на тротуарах
4,Дорожное покрытие,Ямы,Дороги регионального назначения,Нарушение дорожного покрытия (ямы) на дорогах ...
5,Дорожное покрытие,Ямы,ИЖС,Нарушение дорожного покрытия (ямы) на дорогах ...
6,Дорожное покрытие,Необходимо строительство,Общее,Необходимо строительство тротуара
7,Дорожное покрытие,Искуственные неровности,Городские округа и сельские поселения,Искусственные неровности на дорогах в границах...
8,Дорожное покрытие,Отсутствие твёрдого дорожного покрытия,ИЖС,Отсутствие твёрдого дорожного покрытия на доро...
9,Дорожное покрытие,Необходимо строительство,ИЖС,Необходимо строительство тротуара в микрорайон...


In [9]:
cat_ids = {k:v + 1 for v, k in enumerate(categories['Подкатегория по сути проблемы'].unique())}
print(f'total {len(cat_ids)} themes')
themes = {v:cat_ids[k] for k, v in categories[['Подкатегория по сути проблемы', 'Тема']].itertuples(index=False)}

df['themes'] = df['Тематика'].map(themes.get).fillna(0).astype(int)

total 73 themes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


## Закодируем категориальные признаки

In [10]:
df = df[['id', 'text', 'Категория', 'themes']].copy()

enc = OneHotEncoder(sparse=False)
enc.fit(np.arange(df['Категория'].nunique() + 1).reshape(-1, 1))

df.head()

Unnamed: 0,id,text,Категория,themes
0,2246,Помогите начальник Льговского рэс не реагирует...,3,32
1,380,По фасаду дома по адресу ул. Урицкого 22 прохо...,3,26
2,2240,Агресивные собаки. На радуге там стая из подро...,1,27
3,596,На пересечении улиц Сосновская и Береговая з...,3,53
4,1797,Здравствуйте! Рядом с домом 1 «А» по улице Све...,3,26


## Опишем классы датасета и нашей модели. Основная идея: мы будем учить модель определять и верную категорию и предразмеченную тему.

In [11]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe, tokenizer, mode="train", max_length=None):
        self.dataframe = dataframe
        if mode != "test":
            self.targets = dataframe['Категория'].values
            self.themes  = dataframe['themes'].values
        texts = list(dataframe['text'].apply(lambda o: str(o)).values)
        self.encodings = tokenizer(texts, 
                                   padding=True, 
                                   truncation=True, 
                                   max_length=max_length)
        self.mode = mode
        
        
    def __getitem__(self, idx):
        # putting each tensor in front of the corresponding key from the tokenizer
        # HuggingFace tokenizers give you whatever you need to feed to the corresponding model
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        # when testing, there are no targets so we won't do the following
        if self.mode != "test":
            item['labels'] = torch.tensor(self.targets[idx]).long()
            item['themes'] = torch.tensor(self.themes[idx]).long()
        return item
    
    def __len__(self):
        return len(self.dataframe)


class ArcMarginProduct(nn.Module):
    r"""Implement of large margin arc distance: :
        Args:
            in_features: size of each input sample
            out_features: size of each output sample
            s: norm of input feature
            m: margin
            cos(theta + m)
        """
    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight)).float()
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)

        if label is None:
            return phi
        # --------------------------- convert label to one-hot ---------------------------
        # one_hot = torch.zeros(cosine.size(), requires_grad=True, device='cuda')
        one_hot = torch.zeros(cosine.size(), device=DEVICE)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # -------------torch.where(out_i = {x_i if condition_i else y_i) -------------
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)  # you can use torch.where if your torch.__version__ is 0.4
        output *= self.s
        # print(output)

        return output


class Model(nn.Module):
    def __init__(self, 
                 bert_model, 
                 num_classes=NUM_CLASSES, 
                 themes=76,
                 last_hidden_size=1024):
        
        super().__init__()
        self.bert_model = bert_model
        self.arc_margin = ArcMarginProduct(last_hidden_size, 
                                           num_classes, 
                                           s=30.0, 
                                           m=0.50, 
                                           easy_margin=False)
        self.arc_margin2 = ArcMarginProduct(last_hidden_size, 
                                           themes, 
                                           s=30.0, 
                                           m=0.50, 
                                           easy_margin=False)
    
    def get_bert_features(self, batch):
        output = self.bert_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        last_hidden_state = output.last_hidden_state # shape: (batch_size, seq_length, bert_hidden_dim)
        CLS_token_state = last_hidden_state[:, 0, :] # obtaining CLS token state which is the first token.
        return CLS_token_state
    
    def forward(self, batch):
        CLS_hidden_state = self.get_bert_features(batch)
        output = self.arc_margin(CLS_hidden_state, batch.get('labels', None))
        themes = self.arc_margin2(CLS_hidden_state, batch.get('themes', None))
        return output, themes

## Функция для обучения одну эпоху

In [12]:
def one_epoch(model, 
              criterion,
              criteriont,
              loader,
              optimizer=None, 
              mode="train"):
    
    preds_ = []
    trues_ = []
    loss_  = []
    batch_ = []
    for batch in loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        with torch.cuda.amp.autocast(enabled=True):
            preds, predst = model(batch)
            loss = criterion(preds, batch['labels'])
            losst = criteriont(predst, batch['themes'])
        
        loss = loss * 0.8 + losst * 0.2
        if mode == "train":
            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        loss_.append(loss.item() * len(batch))
        batch_.append(len(batch))
        preds_.extend(preds.detach().argmax(dim=1).cpu().numpy().tolist())
        trues_.extend(batch['labels'].cpu().numpy().flatten().tolist())

    unique_true = set(trues_)
    for x in [x for x in range(17) if x not in unique_true]:
        trues_.append(x)
        preds_.append(x)

    y_pred = np.asarray(preds_).reshape(-1, 1).tolist()
    y_pred = enc.transform(y_pred)
    rocauc = roc_auc_score(trues_, y_pred, multi_class='ovo', average='weighted')

    return np.sum(loss_) / np.sum(batch_), rocauc

## Начинаем обучение 50 эпох

In [13]:
seed_everything(SEED)

train_df, valid_df = train_test_split(df, 
                                  test_size=0.10, 
                                  shuffle=True, 
                                  random_state=SEED, 
                                  stratify=df['Категория'])

train_dataset = TextDataset(train_df, tokenizer, max_length=MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=num_workers, shuffle=True, drop_last=True)

valid_dataset = TextDataset(valid_df, tokenizer, max_length=MAX_LEN)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, num_workers=num_workers, shuffle=False, drop_last=False)

torch.cuda.empty_cache()

model = Model(bert_model).to(DEVICE)
criterion = nn.CrossEntropyLoss()
criteriont = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.8, patience=2)

best_auc = 0.0
best_model_weights = copy.deepcopy(model.state_dict())

for epoch in range(50):
    print(f"Start epoch {epoch + 1} at {datetime.now().strftime('%H:%M:%S')}")
    current_lr = optimizer.param_groups[0]['lr']

    model.train()
    train_loss, train_auc = one_epoch(model, criterion, criteriont, train_loader, optimizer=optimizer, mode="train")                     
    print(f"  train_loss={train_loss:7.4f}, roc auc={train_auc:7.4f}, lr={current_lr:.7f}")

    model.eval()
    with torch.no_grad():
        valid_loss, valid_auc = one_epoch(model, criterion, criteriont, valid_loader, 
                                          optimizer=None, mode="valid")
    print(f"  valid_loss={valid_loss:7.4f}, roc auc={valid_auc:7.4f}")

    if valid_auc > best_auc:
        best_auc = valid_auc
        best_model_weights = copy.deepcopy(model.state_dict())
        torch.save(model.state_dict(), f"{MODEL_NAME.replace('/', '-')}_{VERSION}_{SEED}.pth")
        torch.save(model.state_dict(), f"/content/drive/MyDrive/kursk_2022/{MODEL_NAME.replace('/', '-')}_{VERSION}_{SEED}.pth")
        print("Saved best model!")

    lr_scheduler.step(valid_auc)
    opt_lr = optimizer.param_groups[0]['lr']
    if current_lr != opt_lr:
        print("Loading best model weights!")
        model.load_state_dict(torch.load(f"{MODEL_NAME.replace('/', '-')}_{VERSION}_{SEED}.pth",  map_location=DEVICE))

    torch.cuda.empty_cache()

print("done")

Start epoch 1 at 06:20:41
  train_loss=12.1379, roc auc= 0.5157, lr=0.0000100
  valid_loss= 9.2617, roc auc= 0.7172
Saved best model!
Start epoch 2 at 06:24:24
  train_loss= 7.5070, roc auc= 0.6827, lr=0.0000100
  valid_loss= 8.0567, roc auc= 0.7727
Saved best model!
Start epoch 3 at 06:27:48
  train_loss= 5.8632, roc auc= 0.7558, lr=0.0000100
  valid_loss= 7.6039, roc auc= 0.8165
Saved best model!
Start epoch 4 at 06:31:15
  train_loss= 5.0384, roc auc= 0.7947, lr=0.0000100
  valid_loss= 7.5704, roc auc= 0.8023
Start epoch 5 at 06:34:25
  train_loss= 4.3045, roc auc= 0.8217, lr=0.0000100
  valid_loss= 7.1038, roc auc= 0.8227
Saved best model!
Start epoch 6 at 06:37:51
  train_loss= 3.8636, roc auc= 0.8402, lr=0.0000100
  valid_loss= 6.9706, roc auc= 0.8193
Start epoch 7 at 06:41:01
  train_loss= 3.6123, roc auc= 0.8509, lr=0.0000100
  valid_loss= 7.1227, roc auc= 0.8163
Start epoch 8 at 06:44:10
  train_loss= 3.3521, roc auc= 0.8505, lr=0.0000100
  valid_loss= 6.3576, roc auc= 0.8318


## Загружаем тестовый датасет и делаем прогноз требуемой категории. В качестве источника данных используется только &laquo;Текст Сообщения&raquo;.

In [14]:
model = Model(bert_model)
model.load_state_dict(torch.load(f"{MODEL_NAME.replace('/', '-')}_{VERSION}_{SEED}.pth", map_location='cpu'))
model.eval()
model.to(DEVICE)

df  = pd.read_csv(ROOT + 'test_dataset_test.csv')
df['text'] = df['Текст Сообщения'].apply(lambda x: BeautifulSoup(x).text.replace('\xa0', ' '))

test_dataset = TextDataset(df, tokenizer, mode="test", max_length=MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=num_workers, shuffle=False)

test_preds = []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        preds = model(batch)[0]
        test_preds.extend(preds.argmax(dim=1).cpu().numpy().flatten().tolist())

df['Категория'] = test_preds

df = df[['id', 'Категория']].copy()

df.to_csv(f"/content/drive/MyDrive/kursk_2022/{MODEL_NAME.replace('/', '-')}_{VERSION}_{SEED}.csv_", index=False)

