In [1]:
import gc
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from copy import deepcopy

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import AutoModel, AutoTokenizer

import nlpaug.augmenter.word as naw

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import os
import torch
import random
import numpy as np

def seed_everything(seed):
    global SEED
    SEED = seed
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(0)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# def train(model, train_dataset, val_dataset=None, 
#           epochs=16, batch_size=(64, None), criterion=nn.MSELoss(), 
#           lr=(1e-3, 1e-6), warmup_epochs=0.0, weight_decay=0.01, grad_accum_steps=1, clip_grad_norm=None, 
#           metrics=None, device=None):
#     torch.cuda.empty_cache()
#     gc.collect()

#     if not device:
#         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     model = model.to(device)

#     train_dataloader = DataLoader(train_dataset, batch_size=batch_size[0], shuffle=True)
#     if val_dataset:
#         val_dataloader = DataLoader(val_dataset, batch_size=batch_size[1], shuffle=False)
#     if warmup_epochs:
#         if len(lr) != 3: raise ValueError('If warmup is set, lr should contain 3 values.')
#         optimizer = optim.AdamW([p for p in model.parameters() if p.requires_grad], lr=lr[1], weight_decay=weight_decay)
#         warmup_iters = int(len(train_dataloader) * warmup_epochs)
#         warmup = optim.lr_scheduler.LinearLR(optimizer, start_factor=lr[0] / lr[1], total_iters=warmup_iters)
#         cosine = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(train_dataloader) * epochs - warmup_iters, lr[2])
#         scheduler = optim.lr_scheduler.SequentialLR(optimizer, schedulers=[warmup, cosine], milestones=[warmup_iters])
#         print(f'Warmup set to {warmup_iters} batches ({(warmup_iters / grad_accum_steps):.2f} steps).')
#     else:
#         optimizer = optim.AdamW([p for p in model.parameters() if p.requires_grad], lr=lr[0], weight_decay=weight_decay)
#         scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(train_dataloader) * epochs, lr[1])
#     if val_dataset:
#         best_loss = np.inf
#         best_model = deepcopy(model.cpu()).to(device)
#         model = model.to(device)
#     for epoch in range(epochs):
#         print(f'Epoch {epoch + 1}/{epochs}')
#         train_losses = []
#         pb = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
#         if metrics:
#             trues = []
#             preds = []
#         model.train()
#         optimizer.zero_grad()
#         for step_idx, (x, y) in pb:
#             for i in range(len(x)):
#                 x[i] = x[i].to(device)
#             y = y.to(device)
#             pred = model._forward(*x)
#             train_loss = criterion(pred, y)
#             (train_loss / grad_accum_steps).backward()
#             grads = [p.grad.data.abs().mean().item() for p in model.parameters() if p.grad is not None]
#             mean_grad_norm = np.mean(grads) / ((step_idx % grad_accum_steps) + 1) if len(grads) > 0 else np.nan
#             current_lr = f'{optimizer.param_groups[0]["lr"]:.3e}'.replace("e+0", "e+").replace("e-0", "e-")
#             if ((step_idx + 1) % grad_accum_steps == 0) or ((step_idx + 1) == len(train_dataloader)):
#                 if clip_grad_norm:
#                     torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_grad_norm)
#                 optimizer.step()
#                 optimizer.zero_grad()
#             scheduler.step()
#             train_losses.append(train_loss.item())
#             if metrics:
#                 trues += y.tolist()
#                 with torch.no_grad():
#                     preds += model(*x).tolist()
#             pb.set_description(f'Train      | loss={np.mean(train_losses):.4f} | current_lr={current_lr} | step_mean_grad_norm={mean_grad_norm:.6f}')
#         if metrics:
#             metrics_info = ''
#             for metric_name, metric in metrics.items():
#                 metrics_info += f'{metric_name}={metric(trues, preds):.4f}; '
#             print('Metrics:', metrics_info[:-2])
#         if val_dataset:
#             model.eval()
#             with torch.no_grad():
#                 val_losses = []
#                 if metrics:
#                     trues = []
#                     preds = []
#                 pb = tqdm(val_dataloader)
#                 for (x, y) in pb:
#                     for i in range(len(x)):
#                         x[i] = x[i].to(device)
#                     y = y.to(device)
#                     pred = model._forward(*x)
#                     val_loss = criterion(pred, y)
#                     val_losses.append(val_loss.item())
#                     if metrics:
#                         trues += y.tolist()
#                         pred = model(*x)
#                         preds += pred.tolist()
#                     pb.set_description(f'Validation | loss={np.mean(val_losses):.4f}')
#                 if metrics:
#                     metrics_info = ''
#                     for metric_name, metric in metrics.items():
#                         metrics_info += f'{metric_name}={metric(trues, preds):.4f}; '
#                     print('Metrics:', metrics_info[:-2])
#                 if np.mean(val_losses) < best_loss:
#                     best_loss = np.mean(val_losses)
#                     best_model = deepcopy(model.cpu()).to(device)
#                     model = model.to(device)
#         print()
    
#     torch.cuda.empty_cache()
#     gc.collect()

#     if val_dataset:
#         return best_model

In [5]:
def train(model, train_dataset, val_dataset=None, 
          epochs=16, batch_size=(64, None), train_sampler=None, criterion=nn.MSELoss(), 
          lr=(1e-3, 1e-6), warmup_epochs=0.0, weight_decay=0.01, grad_accum_steps=1, clip_grad_norm=None, 
          metrics=None, device=None):
    torch.cuda.empty_cache()
    gc.collect()

    if not device:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    if train_sampler is not None:
        train_dataloader = DataLoader(train_dataset, batch_size=batch_size[0], sampler=train_sampler)
    else:
        train_dataloader = DataLoader(train_dataset, batch_size=batch_size[0], shuffle=True)
    if val_dataset:
        val_dataloader = DataLoader(val_dataset, batch_size=batch_size[1], shuffle=False)
    if warmup_epochs:
        if len(lr) != 3: raise ValueError('If warmup is set, lr should contain 3 values.')
        optimizer = optim.AdamW([p for p in model.parameters() if p.requires_grad], lr=lr[1], weight_decay=weight_decay)
        warmup_iters = int(len(train_dataloader) * warmup_epochs)
        warmup = optim.lr_scheduler.LinearLR(optimizer, start_factor=lr[0] / lr[1], total_iters=warmup_iters)
        cosine = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(train_dataloader) * epochs - warmup_iters, lr[2])
        scheduler = optim.lr_scheduler.SequentialLR(optimizer, schedulers=[warmup, cosine], milestones=[warmup_iters])
        print(f'Warmup set to {warmup_iters} batches ({(warmup_iters / grad_accum_steps):.2f} steps).')
    else:
        optimizer = optim.AdamW([p for p in model.parameters() if p.requires_grad], lr=lr[0], weight_decay=weight_decay)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(train_dataloader) * epochs, lr[1])
    if val_dataset:
        best_loss = np.inf
        best_model = deepcopy(model.cpu()).to(device)
        model = model.to(device)
    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        train_losses = []
        pb = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
        if metrics:
            trues = []
            preds = []
        model.train()
        optimizer.zero_grad()
        for step_idx, (x, y) in pb:
            for i in range(len(x)):
                x[i] = x[i].to(device)
            y = y.to(device)
            pred = model._forward(*x)
            train_loss = criterion(pred, y)
            (train_loss / grad_accum_steps).backward()
            grads = [p.grad.data.abs().mean().item() for p in model.parameters() if p.grad is not None]
            mean_grad_norm = np.mean(grads) / ((step_idx % grad_accum_steps) + 1) if len(grads) > 0 else np.nan
            current_lr = f'{optimizer.param_groups[0]["lr"]:.3e}'.replace("e+0", "e+").replace("e-0", "e-")
            if ((step_idx + 1) % grad_accum_steps == 0) or ((step_idx + 1) == len(train_dataloader)):
                if clip_grad_norm:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_grad_norm)
                optimizer.step()
                optimizer.zero_grad()
            scheduler.step()
            train_losses.append(train_loss.item())
            if metrics:
                trues += y.tolist()
                with torch.no_grad():
                    preds += model(*x).tolist()
            pb.set_description(f'Train      | loss={np.mean(train_losses):.4f} | current_lr={current_lr} | step_mean_grad_norm={mean_grad_norm:.6f}')
        if metrics:
            metrics_info = ''
            for metric_name, metric in metrics.items():
                metrics_info += f'{metric_name}={metric(trues, preds):.4f}; '
            print('Metrics:', metrics_info[:-2])
        if val_dataset:
            model.eval()
            with torch.no_grad():
                val_losses = []
                if metrics:
                    trues = []
                    preds = []
                pb = tqdm(val_dataloader)
                for (x, y) in pb:
                    for i in range(len(x)):
                        x[i] = x[i].to(device)
                    y = y.to(device)
                    pred = model._forward(*x)
                    val_loss = criterion(pred, y)
                    val_losses.append(val_loss.item())
                    if metrics:
                        trues += y.tolist()
                        pred = model(*x)
                        preds += pred.tolist()
                    pb.set_description(f'Validation | loss={np.mean(val_losses):.4f}')
                if metrics:
                    metrics_info = ''
                    for metric_name, metric in metrics.items():
                        metrics_info += f'{metric_name}={metric(trues, preds):.4f}; '
                    print('Metrics:', metrics_info[:-2])
                if np.mean(val_losses) < best_loss:
                    best_loss = np.mean(val_losses)
                    best_model = deepcopy(model.cpu()).to(device)
                    model = model.to(device)
        print()
    
    torch.cuda.empty_cache()
    gc.collect()

    if val_dataset:
        return best_model

In [6]:
import re
def list_replace(search, replacement, text):
    """
    Replaces all symbols of text which are present
    in the search string with the replacement string.
    """
    search = [el for el in search if el in text]
    for c in search:
        text = text.replace(c, replacement)
    return text

def clean_text(text):

    text = list_replace \
        ('\u00AB\u00BB\u2039\u203A\u201E\u201A\u201C\u201F\u2018\u201B\u201D\u2019', '\u0022', text)

    text = list_replace \
        ('\u2012\u2013\u2014\u2015\u203E\u0305\u00AF', '\u2003\u002D\u002D\u2003', text)

    text = list_replace('\u2010\u2011', '\u002D', text)

    text = list_replace \
            (
            '\u2000\u2001\u2002\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u2060\u3000',
            '\u2002', text)

    text = re.sub('\u2003\u2003', '\u2003', text)
    text = re.sub('\t\t', '\t', text)

    text = list_replace \
            (
            '\u02CC\u0307\u0323\u2022\u2023\u2043\u204C\u204D\u2219\u25E6\u00B7\u00D7\u22C5\u2219\u2062',
            '.', text)

    text = list_replace('\u2217', '\u002A', text)

    text = list_replace('…', '...', text)

    text = list_replace('\u00C4', 'A', text)
    text = list_replace('\u00E4', 'a', text)
    text = list_replace('\u00CB', 'E', text)
    text = list_replace('\u00EB', 'e', text)
    text = list_replace('\u1E26', 'H', text)
    text = list_replace('\u1E27', 'h', text)
    text = list_replace('\u00CF', 'I', text)
    text = list_replace('\u00EF', 'i', text)
    text = list_replace('\u00D6', 'O', text)
    text = list_replace('\u00F6', 'o', text)
    text = list_replace('\u00DC', 'U', text)
    text = list_replace('\u00FC', 'u', text)
    text = list_replace('\u0178', 'Y', text)
    text = list_replace('\u00FF', 'y', text)
    text = list_replace('\u00DF', 's', text)
    text = list_replace('\u1E9E', 'S', text)
    # Removing punctuation
    text = list_replace(',.[]{}()=+-−*&^%$#@!~;:§/\|\?"\n', ' ', text)
    # Replacing all numbers with masks
    text = list_replace('0123456789', 'x', text)

    currencies = list \
            (
            '\u20BD\u0024\u00A3\u20A4\u20AC\u20AA\u2133\u20BE\u00A2\u058F\u0BF9\u20BC\u20A1\u20A0\u20B4\u20A7\u20B0\u20BF\u20A3\u060B\u0E3F\u20A9\u20B4\u20B2\u0192\u20AB\u00A5\u20AD\u20A1\u20BA\u20A6\u20B1\uFDFC\u17DB\u20B9\u20A8\u20B5\u09F3\u20B8\u20AE\u0192'
        )

    alphabet = list \
            (
            '\t\r абвгдеёзжийклмнопрстуфхцчшщьыъэюяАБВГДЕЁЗЖИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ')

    allowed = set(currencies + alphabet)

    cleaned_text = [sym for sym in text if sym in allowed]
    cleaned_text = ''.join(cleaned_text)

    return cleaned_text

In [7]:
train_df = pd.read_csv('train.tsv', sep=',', encoding="utf-8")
train_df['tweet'] = train_df['tweet'].apply(lambda x: ' '.join(clean_text(x).lower().split()))
train_df

Unnamed: 0,id,tweet,class
0,760402871867367424,настало время для ингаляторов дружок сальбутам...,0
1,1035908416869462016,xx на прошлой зимней олимпиаде большинство лыж...,1
2,1089839736427032577,не соглашусь с заменой зок на метопролол в так...,0
3,779671488748224513,dixmx мезим смекта если отравление то лоперамид,0
4,738309299756240897,уберите микроволновки и имодиум действуют соул...,0
...,...,...,...
9510,669973915456925697,поставка лекарственных препаратов мнн формотер...,0
9511,1126889334735626240,единственная радость моей жизни прозак,0
9512,1054522033151848449,anatoliisharii x героин класс a x кокаин класс...,0
9513,1112821163774918656,xx лет девочки разноцветные витаминки xx лет д...,0


In [8]:
train_df['class'].value_counts(normalize=True).sort_index()

class
0    0.912559
1    0.087441
Name: proportion, dtype: float64

In [9]:
class_weights = ((1 / train_df['class'].value_counts(normalize=True).sort_index()) ** (1 / 8)).to_numpy()
class_weights = class_weights * len(class_weights) / np.sum(class_weights)
class_weights

array([0.85446024, 1.14553976])

In [10]:
infreqs = (1 / train_df['class'].value_counts(normalize=True).sort_index()).to_numpy()
infreqs

array([ 1.09581942, 11.43629808])

In [11]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_length=128, 
                 augmenter=None, aug_prob=0.2):
        self.texts = list(texts)
        self.targets = list(targets) if targets is not None else None
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augmenter = augmenter
        self.aug_prob = aug_prob

    def __len__(self):
        return len(self.texts)

    def _apply_augmentation(self, text):
        try:
            augmented = self.augmenter.augment(text)
            return augmented[0] if isinstance(augmented, list) else augmented
        except Exception as e:
            print(f"Error in augmentation: {e}")
        return text

    def __getitem__(self, idx):
        text = self.texts[idx]
        if self.augmenter and random.random() < self.aug_prob:
            text = self._apply_augmentation(text)

        encoding = self.tokenizer(
            text, 
            padding="max_length",
            truncation=True, 
            max_length=self.max_length,
            return_tensors="pt"
        )

        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        target = torch.tensor(self.targets[idx], dtype=torch.long) if self.targets else torch.nan

        return [[input_ids, attention_mask], target]

In [12]:
class BERTClassifier(nn.Module):
    def __init__(self, backbone_model):
        super().__init__()
        self.bert = AutoModel.from_pretrained(backbone_model)
        self.dropout = nn.Dropout(0.4)
        self.regressor = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        x = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = x.pooler_output
        x = self.dropout(x)
        x = self.regressor(x)
        return x

    def _forward(self, input_ids, attention_mask):
        return self.forward(input_ids=input_ids, attention_mask=attention_mask)

In [13]:
def acc(y_true, y_pred):
    y_pred = np.array(y_pred)
    y_pred = np.argmax(y_pred, axis=-1)
    return accuracy_score(y_true, y_pred)

def f1(y_true, y_pred):
    y_pred = np.array(y_pred)
    y_pred = np.argmax(y_pred, axis=-1)
    return f1_score(y_true, y_pred, average='macro')

In [14]:
backbone_model = 'ai-forever/ruBert-base'

tokenizer = AutoTokenizer.from_pretrained(backbone_model)
model = BERTClassifier(backbone_model).to(device)

In [15]:
df_texts = train_df[['tweet', 'class']].copy().dropna()

train_texts, val_texts, train_targets, val_targets = \
    train_test_split(df_texts['tweet'], df_texts['class'].values,
                     test_size=0.1, shuffle=True, random_state=SEED)

augmenter = naw.ContextualWordEmbsAug(
    model_path='ai-forever/ruBert-base',
    action='substitute',
    aug_p=0.1,
    device='cuda',
)

train_dataset = TextClassificationDataset(train_texts, train_targets, tokenizer, augmenter=augmenter, aug_prob=0.1)
val_dataset = TextClassificationDataset(val_texts, val_targets, tokenizer)

samples_weights = pd.Series(train_dataset.targets).apply(lambda x: infreqs[x]).to_numpy()

train_sampler = WeightedRandomSampler(
    weights=samples_weights,
    num_samples=len(train_dataset),
    replacement=True
)

In [16]:
train_dataset.texts[0]

'надо и юле предложить сдать анализы в евролаб у неё там давние связи тамифлю в украине продвигали а если серьезно то это самый реалистический сценарий она очень подозрительно смирилась со своим третьим местом как говорится без суда и следствия наверняка подстава'

In [17]:
train_dataset._apply_augmentation(train_dataset.texts[0])

'надо и ему предложить сдать анализы в евролаб у нее там давние связи тамифлю в украине но а если серьезно то это самыи реалистическии бы она как подозрительно смирилась со своим физическим состоянием как говорится без суда и следствия наверняка подстава'

In [18]:
class WeightedLabelSmoothingCrossEntropy(nn.Module):
    def __init__(self, smoothing=0.01, class_weights=None):
        super().__init__()
        self.smoothing = smoothing
        self.class_weights = torch.tensor(class_weights, dtype=torch.float) if class_weights is not None else None

    def forward(self, x, target):
        log_probs = F.log_softmax(x, dim=-1)
        num_classes = x.size(-1)

        # NLL Loss
        nll_loss = -log_probs.gather(dim=-1, index=target.unsqueeze(-1)).squeeze(-1)

        # Smooth Loss (original label smoothing)
        smooth_target = torch.full_like(log_probs, self.smoothing / (num_classes - 1))
        smooth_target.scatter_(-1, target.unsqueeze(-1), 1 - self.smoothing)
        smooth_loss = - (log_probs * smooth_target).sum(dim=-1)

        # Combine losses
        loss = (1 - self.smoothing) * nll_loss + self.smoothing * smooth_loss

        # Apply class weights
        if self.class_weights is not None:
            device = x.device
            weights = self.class_weights.to(device)[target]
            loss *= weights

        # Normalize
        if self.class_weights is not None:
            return loss.sum() / weights.sum()
        else:
            return loss.mean()

In [19]:
model = train(
    model=model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    epochs=8,
    batch_size=(16, 16),
    train_sampler=train_sampler,
    criterion=WeightedLabelSmoothingCrossEntropy(smoothing=0.01, class_weights=class_weights),
    lr=(1e-8, 1e-5, 1e-6),
    warmup_epochs=0.25,
    weight_decay=1e-2,
    grad_accum_steps=4,
    clip_grad_norm=1.0,
    metrics={'acc': acc, 'f1-macro': f1},
    device=device
)

torch.save(model.state_dict(), 'checkpoints.pth')
model.bert.save_pretrained('checkpoints')
with open('checkpoints.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

Warmup set to 134 batches (33.50 steps).
Epoch 1/8


Train      | loss=0.4793 | current_lr=9.795e-6 | step_mean_grad_norm=0.000321: 100%|██████████| 536/536 [01:27<00:00,  6.15it/s]


Metrics: acc=0.7440; f1-macro=0.7408


Validation | loss=0.5132: 100%|██████████| 60/60 [00:03<00:00, 17.04it/s]


Metrics: acc=0.7931; f1-macro=0.6253

Epoch 2/8


Train      | loss=0.2310 | current_lr=8.917e-6 | step_mean_grad_norm=0.000220: 100%|██████████| 536/536 [01:28<00:00,  6.09it/s]


Metrics: acc=0.9125; f1-macro=0.9124


Validation | loss=0.4934: 100%|██████████| 60/60 [00:03<00:00, 16.91it/s]


Metrics: acc=0.8330; f1-macro=0.6558

Epoch 3/8


Train      | loss=0.1489 | current_lr=7.485e-6 | step_mean_grad_norm=0.000446: 100%|██████████| 536/536 [01:27<00:00,  6.10it/s]


Metrics: acc=0.9507; f1-macro=0.9507


Validation | loss=0.3550: 100%|██████████| 60/60 [00:03<00:00, 16.69it/s]


Metrics: acc=0.8897; f1-macro=0.6857

Epoch 4/8


Train      | loss=0.1016 | current_lr=5.731e-6 | step_mean_grad_norm=0.000303: 100%|██████████| 536/536 [01:26<00:00,  6.18it/s]


Metrics: acc=0.9686; f1-macro=0.9686


Validation | loss=0.3779: 100%|██████████| 60/60 [00:03<00:00, 16.95it/s]


Metrics: acc=0.9055; f1-macro=0.7030

Epoch 5/8


Train      | loss=0.0861 | current_lr=3.940e-6 | step_mean_grad_norm=0.000210: 100%|██████████| 536/536 [01:27<00:00,  6.15it/s]


Metrics: acc=0.9717; f1-macro=0.9717


Validation | loss=0.3991: 100%|██████████| 60/60 [00:03<00:00, 16.97it/s]


Metrics: acc=0.9002; f1-macro=0.6980

Epoch 6/8


Train      | loss=0.0753 | current_lr=2.402e-6 | step_mean_grad_norm=0.001748: 100%|██████████| 536/536 [01:27<00:00,  6.11it/s]


Metrics: acc=0.9777; f1-macro=0.9777


Validation | loss=0.4125: 100%|██████████| 60/60 [00:03<00:00, 16.97it/s]


Metrics: acc=0.9034; f1-macro=0.6931

Epoch 7/8


Train      | loss=0.0630 | current_lr=1.366e-6 | step_mean_grad_norm=0.000486: 100%|██████████| 536/536 [01:26<00:00,  6.19it/s]


Metrics: acc=0.9815; f1-macro=0.9815


Validation | loss=0.4366: 100%|██████████| 60/60 [00:03<00:00, 16.82it/s]


Metrics: acc=0.9023; f1-macro=0.7012

Epoch 8/8


Train      | loss=0.0583 | current_lr=1.000e-6 | step_mean_grad_norm=0.000052: 100%|██████████| 536/536 [01:26<00:00,  6.19it/s]


Metrics: acc=0.9841; f1-macro=0.9841


Validation | loss=0.4315: 100%|██████████| 60/60 [00:03<00:00, 16.81it/s]


Metrics: acc=0.9107; f1-macro=0.7149



In [20]:
test_df = pd.read_csv('test.tsv', sep=',', encoding="utf-8")
test_df['tweet'] = test_df['tweet'].apply(lambda x: ' '.join(clean_text(x).lower().split()))
test_df

Unnamed: 0,id,tweet
0,1200838666136018946,о возможно терапии баклофеном алкоголизма с эф...
1,1202167857615912961,homkanizli начало развиваться привыкание к сал...
2,1202511585744498690,soloxxxxxxxxxx используем беродуал в комплексе...
3,1202519545501667334,bunedemon у меня тоже были ады ламотриджин но ...
4,1200503613023555584,мне кажется мне уже разряд по художественной н...
...,...,...
1499,1135162030636355585,аноны срочно требуется помощь пожалуйста кто е...
1500,1137990280483868673,дети и подростки с сдвг которым были назначены...
1501,1050945039189401605,однажды поднимаясь на гору я думала что ну уж ...
1502,808645125115117568,мое новогоднее настроение застряло между турбу...


In [21]:
test_dataset = TextClassificationDataset(test_df['tweet'], None, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [22]:
preds = []
model.eval()
with torch.no_grad():
    for (x, y) in tqdm(test_dataloader, desc='Processing...'):
        for i in range(len(x)):
            x[i] = x[i].to(device)
        ans = model(*x)
        preds += ans.tolist()
preds = np.array(preds)
preds = np.argmax(preds, axis=-1)
preds

Processing...: 100%|██████████| 188/188 [00:03<00:00, 61.95it/s]


array([0, 1, 0, ..., 0, 0, 0])

In [23]:
test_df['class'] = preds
test_df

Unnamed: 0,id,tweet,class
0,1200838666136018946,о возможно терапии баклофеном алкоголизма с эф...,0
1,1202167857615912961,homkanizli начало развиваться привыкание к сал...,1
2,1202511585744498690,soloxxxxxxxxxx используем беродуал в комплексе...,0
3,1202519545501667334,bunedemon у меня тоже были ады ламотриджин но ...,0
4,1200503613023555584,мне кажется мне уже разряд по художественной н...,0
...,...,...,...
1499,1135162030636355585,аноны срочно требуется помощь пожалуйста кто е...,1
1500,1137990280483868673,дети и подростки с сдвг которым были назначены...,1
1501,1050945039189401605,однажды поднимаясь на гору я думала что ну уж ...,0
1502,808645125115117568,мое новогоднее настроение застряло между турбу...,0


In [24]:
test_df[['id', 'class']].to_csv('submission.csv', index=False)