In [1]:
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os
from torch.utils.data import DataLoader
from tqdm import trange
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the labelled corpus. 'Unnamed: 0' is read as the index and then turned
# back into a regular column by reset_index(); the column named 'index' is dropped.
df = (
    pd.read_csv('./Corrupted Data/total_data_for_corrupt_classif_noBR.csv', index_col=['Unnamed: 0'])
    .reset_index()
    .drop(columns=['index'])
)

In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Text,Label
0,"О человеки все цветов! ― Сказал, зевая, Саваоф...",0
1,"В Афинее осторожно Свиток разверня, Весь проч...",0
2,"Серо-черной, не очень суровой зимою в низкорос...",0
3,"Всё сохраню, всё пронесу, ― И вечность, что о...",0
4,"Приятности твои на мысли вображая, В пустынях...",0


In [6]:
# Hold out 15% of the rows for validation; the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(df, test_size=0.15, random_state=22)

In [7]:
# Reshuffle both splits. Without a seed, sample(frac=1) produces a different row
# order on every run, which in turn changes the batch order seen during training.
# Seeding it (same value as the train/validation split) keeps runs reproducible.
train_df = train_df.sample(frac=1, random_state=22)
val_df = val_df.sample(frac=1, random_state=22)

In [8]:
# Report the number of rows in the training and validation splits.
print(len(train_df), len(val_df))

178075 31425


In [9]:
# Number of training rows whose Label equals 1 (class-balance check).
int((train_df['Label'] == 1).sum())

88830

In [10]:
# Convert the pandas splits to Hugging Face datasets.
# The frames carry a shuffled, non-default index; preserve_index=False stops it
# from being stored as an extra '__index_level_0__' column in every example.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

In [11]:
# Bundle both splits so that later .map() calls are applied to train and validation alike.
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

In [12]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")


def tokenize_function(example):
    """Tokenize the ``Text`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 512 tokens, so all
    encoded rows have the same length.
    """
    texts = example["Text"]
    encoded = tokenizer(texts, truncation=True, max_length=512, padding="max_length")
    return encoded


# batched=True hands the function a batch of examples per call rather than one example.
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

Downloading tokenizer_config.json: 100%|██████████| 341/341 [00:00<00:00, 64.0kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading config.json: 100%|██████████| 632/632 [00:00<00:00, 90.0kB/s]
Downloading vocab.txt: 100%|██████████| 241k/241k [00:00<00:00, 1.94MB/s]
Downloading tokenizer.json: 100%|██████████| 468k/468k [00:00<00:00, 5.57MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 12.2kB/s]
                                                                     

In [13]:
# Show the text of the first training example.
# Index the row first and then the field: ['Text'][0] would load the entire
# 'Text' column into memory just to read a single string.
tokenized_datasets['train'][0]['Text']

' Оглянись ― и увидишь наверно: в переулке такси тарахтят, за церковной оградой деревья над ребенком больным шелестят, '

In [14]:
def preprocess_labels(example):
    """Cast the ``Label`` field of one example to a plain ``int``.

    The example is modified in place and the same object is returned.
    """
    label_as_int = int(example['Label'])
    example['Label'] = label_as_int
    return example

# Apply the label cast to every example of both splits (one example per call).
tokenized_datasets = tokenized_datasets.map(preprocess_labels)

                                                                      

In [15]:
def convert_to_tensors(dataset):
    """Turn the model inputs of a tokenized split into tensors.

    Reads the ``input_ids``, ``attention_mask`` and ``Label`` columns of
    ``dataset`` and returns them, in that order, as a tuple of three
    ``torch.Tensor`` objects.
    """
    column_names = ('input_ids', 'attention_mask', 'Label')
    input_ids, attention_mask, labels = (
        torch.tensor(dataset[column]) for column in column_names
    )
    return input_ids, attention_mask, labels

# Materialise both splits as in-memory tensors. The training loop slices
# these tensors directly instead of going through a DataLoader.
train_input_ids, train_attention_mask, train_labels = convert_to_tensors(tokenized_datasets['train'])
val_input_ids, val_attention_mask, val_labels = convert_to_tensors(tokenized_datasets['validation'])

In [16]:
# Load the pretrained rubert-tiny checkpoint with a 2-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained("cointegrated/rubert-tiny", num_labels=2)

Downloading model.safetensors: 100%|██████████| 47.7M/47.7M [00:06<00:00, 7.72MB/s]
Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassif

In [17]:
# Use the GPU when one is available; fall back to the CPU so the notebook
# does not fail in model.to() on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Define the optimizer.
# torch.optim.AdamW replaces the AdamW class shipped with transformers, which is
# deprecated in favour of the PyTorch implementation. eps and weight_decay are
# set explicitly to the defaults of the transformers class (1e-6 and 0.0),
# because the PyTorch defaults differ (1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)



In [18]:
# Directory that receives one checkpoint sub-folder per epoch; created if it does not exist yet.
output_dir = "./results"
os.makedirs(output_dir, exist_ok=True)

In [19]:
def compute_metrics(preds, labels):
    """Score class predictions against the reference labels.

    ``preds`` holds one row of per-class scores per example; the predicted
    class is the argmax along axis 1. Returns the tuple
    ``(accuracy, precision, recall, f1)``, where precision, recall and F1
    use ``average='weighted'``.
    """
    predicted_classes = np.argmax(preds, axis=1)
    accuracy = accuracy_score(labels, predicted_classes)
    weighted_scores = precision_recall_fscore_support(labels, predicted_classes, average='weighted')
    # The fourth element (support) is not part of the result.
    return accuracy, weighted_scores[0], weighted_scores[1], weighted_scores[2]

In [20]:
# Move the model to the configured device (instead of a second hard-coded
# 'cuda' literal) and display its architecture.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29564, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, element

In [None]:
# Trained for several epochs and checked each on an out-of-sample (OOS) set; settled on epoch 3. That checkpoint is the one used from here on.

In [22]:
num_epochs = 10
batch_size = 4

for epoch in trange(num_epochs):
    # Training pass.
    # NOTE: batches are contiguous slices of the pre-built tensors, so they are
    # visited in the same order every epoch (no per-epoch reshuffling).
    model.train()
    total_loss = 0.0
    total_preds, total_labels = [], []

    for i in trange(0, len(train_input_ids), batch_size):
        input_ids_batch = train_input_ids[i:i+batch_size].to(device)
        attention_mask_batch = train_attention_mask[i:i+batch_size].to(device)
        labels_batch = train_labels[i:i+batch_size].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch, labels=labels_batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # The loss returned by the model is averaged over the batch, and the last
        # batch may hold fewer than batch_size examples. Weighting by the actual
        # batch length makes the epoch figure an exact per-example mean.
        total_loss += loss.item() * labels_batch.size(0)
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = labels_batch.cpu().numpy()
        total_preds.append(logits)
        total_labels.append(label_ids)

    avg_train_loss = total_loss / len(train_input_ids)
    total_preds = np.concatenate(total_preds, axis=0)
    total_labels = np.concatenate(total_labels, axis=0)
    # Training metrics come from logits collected in train mode while the
    # weights were still being updated during the epoch.
    train_accuracy, train_precision, train_recall, train_f1 = compute_metrics(total_preds, total_labels)

    # Validation pass (eval mode, no gradient tracking).
    model.eval()
    total_val_loss = 0.0
    total_val_preds, total_val_labels = [], []

    with torch.no_grad():
        for i in range(0, len(val_input_ids), batch_size):
            input_ids_batch = val_input_ids[i:i+batch_size].to(device)
            attention_mask_batch = val_attention_mask[i:i+batch_size].to(device)
            labels_batch = val_labels[i:i+batch_size].to(device)

            outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch, labels=labels_batch)
            loss = outputs.loss

            # Same per-example weighting as in the training pass.
            total_val_loss += loss.item() * labels_batch.size(0)
            logits = outputs.logits.detach().cpu().numpy()
            label_ids = labels_batch.cpu().numpy()
            total_val_preds.append(logits)
            total_val_labels.append(label_ids)

    avg_val_loss = total_val_loss / len(val_input_ids)
    total_val_preds = np.concatenate(total_val_preds, axis=0)
    total_val_labels = np.concatenate(total_val_labels, axis=0)
    val_accuracy, val_precision, val_recall, val_f1 = compute_metrics(total_val_preds, total_val_labels)

    # Save the model (and tokenizer) after every epoch.
    model.save_pretrained(f"{output_dir}/model_epoch_{epoch + 1}")
    tokenizer.save_pretrained(f"{output_dir}/model_epoch_{epoch + 1}")

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {avg_train_loss:.4f}, Accuracy: {train_accuracy:.4f}, Precision: {train_precision:.4f}, Recall: {train_recall:.4f}, F1: {train_f1:.4f}")
    print(f"Val Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}")

100%|██████████| 44519/44519 [37:37<00:00, 19.72it/s]
 10%|█         | 1/10 [39:29<5:55:26, 2369.58s/it]

Epoch 1/10
Train Loss: 0.4961, Accuracy: 0.7460, Precision: 0.7621, Recall: 0.7460, F1: 0.7419
Val Loss: 0.4753, Accuracy: 0.7592, Precision: 0.7921, Recall: 0.7592, F1: 0.7520


100%|██████████| 44519/44519 [37:16<00:00, 19.90it/s]
 20%|██        | 2/10 [1:18:34<5:13:59, 2354.99s/it]

Epoch 2/10
Train Loss: 0.4584, Accuracy: 0.7730, Precision: 0.7856, Recall: 0.7730, F1: 0.7704
Val Loss: 0.4729, Accuracy: 0.7676, Precision: 0.7925, Recall: 0.7676, F1: 0.7624


100%|██████████| 44519/44519 [37:53<00:00, 19.58it/s]
 30%|███       | 3/10 [1:58:19<4:36:21, 2368.76s/it]

Epoch 3/10
Train Loss: 0.4281, Accuracy: 0.7945, Precision: 0.8044, Recall: 0.7945, F1: 0.7927
Val Loss: 0.4789, Accuracy: 0.7687, Precision: 0.7960, Recall: 0.7687, F1: 0.7631


100%|██████████| 44519/44519 [38:10<00:00, 19.43it/s]
 40%|████      | 4/10 [2:38:22<3:58:13, 2382.22s/it]

Epoch 4/10
Train Loss: 0.4008, Accuracy: 0.8124, Precision: 0.8198, Recall: 0.8124, F1: 0.8113
Val Loss: 0.4956, Accuracy: 0.7685, Precision: 0.7836, Recall: 0.7685, F1: 0.7653


 57%|█████▋    | 25344/44519 [24:10<18:17, 17.48it/s]
 40%|████      | 4/10 [3:02:32<4:33:48, 2738.12s/it]


KeyboardInterrupt: 

In [23]:
num_epochs = 10
batch_size = 4

# Continuation of training: the loop starts at epoch index 5, so checkpoints and
# log lines are numbered from epoch 6. `model` and `optimizer` are not
# re-created here; training carries on from whatever state they currently hold.
for epoch in trange(5, num_epochs):
    # Training pass.
    # NOTE: batches are contiguous slices of the pre-built tensors, so they are
    # visited in the same order every epoch (no per-epoch reshuffling).
    model.train()
    total_loss = 0.0
    total_preds, total_labels = [], []

    for i in trange(0, len(train_input_ids), batch_size):
        input_ids_batch = train_input_ids[i:i+batch_size].to(device)
        attention_mask_batch = train_attention_mask[i:i+batch_size].to(device)
        labels_batch = train_labels[i:i+batch_size].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch, labels=labels_batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # The loss returned by the model is averaged over the batch, and the last
        # batch may hold fewer than batch_size examples. Weighting by the actual
        # batch length makes the epoch figure an exact per-example mean.
        total_loss += loss.item() * labels_batch.size(0)
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = labels_batch.cpu().numpy()
        total_preds.append(logits)
        total_labels.append(label_ids)

    avg_train_loss = total_loss / len(train_input_ids)
    total_preds = np.concatenate(total_preds, axis=0)
    total_labels = np.concatenate(total_labels, axis=0)
    # Training metrics come from logits collected in train mode while the
    # weights were still being updated during the epoch.
    train_accuracy, train_precision, train_recall, train_f1 = compute_metrics(total_preds, total_labels)

    # Validation pass (eval mode, no gradient tracking).
    model.eval()
    total_val_loss = 0.0
    total_val_preds, total_val_labels = [], []

    with torch.no_grad():
        for i in range(0, len(val_input_ids), batch_size):
            input_ids_batch = val_input_ids[i:i+batch_size].to(device)
            attention_mask_batch = val_attention_mask[i:i+batch_size].to(device)
            labels_batch = val_labels[i:i+batch_size].to(device)

            outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch, labels=labels_batch)
            loss = outputs.loss

            # Same per-example weighting as in the training pass.
            total_val_loss += loss.item() * labels_batch.size(0)
            logits = outputs.logits.detach().cpu().numpy()
            label_ids = labels_batch.cpu().numpy()
            total_val_preds.append(logits)
            total_val_labels.append(label_ids)

    avg_val_loss = total_val_loss / len(val_input_ids)
    total_val_preds = np.concatenate(total_val_preds, axis=0)
    total_val_labels = np.concatenate(total_val_labels, axis=0)
    val_accuracy, val_precision, val_recall, val_f1 = compute_metrics(total_val_preds, total_val_labels)

    # Save the model (and tokenizer) after every epoch.
    model.save_pretrained(f"{output_dir}/model_epoch_{epoch + 1}")
    tokenizer.save_pretrained(f"{output_dir}/model_epoch_{epoch + 1}")

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {avg_train_loss:.4f}, Accuracy: {train_accuracy:.4f}, Precision: {train_precision:.4f}, Recall: {train_recall:.4f}, F1: {train_f1:.4f}")
    print(f"Val Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}")

100%|██████████| 44519/44519 [45:19<00:00, 16.37it/s]
 20%|██        | 1/5 [47:27<3:09:48, 2847.04s/it]

Epoch 6/10
Train Loss: 0.3566, Accuracy: 0.8381, Precision: 0.8428, Recall: 0.8381, F1: 0.8376
Val Loss: 0.5057, Accuracy: 0.7682, Precision: 0.7912, Recall: 0.7682, F1: 0.7635


100%|██████████| 44519/44519 [38:10<00:00, 19.44it/s]
 40%|████      | 2/5 [1:27:36<2:09:29, 2589.67s/it]

Epoch 7/10
Train Loss: 0.3319, Accuracy: 0.8518, Precision: 0.8553, Recall: 0.8518, F1: 0.8514
Val Loss: 0.5373, Accuracy: 0.7635, Precision: 0.7790, Recall: 0.7635, F1: 0.7601


100%|██████████| 44519/44519 [38:59<00:00, 19.03it/s]
 60%|██████    | 3/5 [2:08:28<1:24:13, 2526.78s/it]

Epoch 8/10
Train Loss: 0.3074, Accuracy: 0.8658, Precision: 0.8682, Recall: 0.8658, F1: 0.8655
Val Loss: 0.5458, Accuracy: 0.7613, Precision: 0.7725, Recall: 0.7613, F1: 0.7587


 84%|████████▎ | 37211/44519 [39:26<07:44, 15.72it/s]
 60%|██████    | 3/5 [2:47:55<1:51:56, 3358.46s/it]


KeyboardInterrupt: 