In [24]:
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os
from torch.utils.data import DataLoader
from tqdm import trange
import numpy as np

In [25]:
df = pd.read_csv('./Corrupted Data/total_data_for_corrupt_classif_noBR.csv', index_col=['Unnamed: 0'])
df = df.reset_index()
df = df.drop(columns=['index'])

In [26]:
df.head()

Unnamed: 0,Text,Label
0,"О человеки все цветов! ― Сказал, зевая, Саваоф...",0
1,"В Афинее осторожно Свиток разверня, Весь проч...",0
2,"Серо-черной, не очень суровой зимою в низкорос...",0
3,"Всё сохраню, всё пронесу, ― И вечность, что о...",0
4,"Приятности твои на мысли вображая, В пустынях...",0


In [27]:
train_df, val_df = train_test_split(df, test_size=0.15, random_state=22)

In [28]:
train_df = train_df.sample(frac=1)
val_df = val_df.sample(frac=1)

In [29]:
print(len(train_df), len(val_df))

178075 31425


In [7]:
len(train_df[train_df['Label']==1])

88830

In [30]:
len(val_df[val_df['Label']==1])

15670

In [31]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [32]:
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

In [33]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")

def tokenize_function(example):
    return tokenizer(example["Text"], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

                                                                     

In [34]:
tokenized_datasets['train']['Text'][0]

'Берлогу он свою сразу сумерки светлей, Из узнал глаза чем залога по. В была долгого, когда бирюза Она не он открыл выкупе.'

In [35]:
def preprocess_labels(example):
    example['Label'] = int(example['Label'])
    return example

tokenized_datasets = tokenized_datasets.map(preprocess_labels)

                                                                      

In [36]:
def convert_to_tensors(dataset):
    input_ids = torch.tensor(dataset['input_ids'])
    attention_mask = torch.tensor(dataset['attention_mask'])
    labels = torch.tensor(dataset['Label'])
    return input_ids, attention_mask, labels

train_input_ids, train_attention_mask, train_labels = convert_to_tensors(tokenized_datasets['train'])
val_input_ids, val_attention_mask, val_labels = convert_to_tensors(tokenized_datasets['validation'])

In [37]:
model = AutoModelForSequenceClassification.from_pretrained("cointegrated/rubert-tiny", num_labels=2)

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny a

In [38]:
device = 'cuda'
model.to(device)

# Определяем оптимизатор
optimizer = AdamW(model.parameters(), lr=2e-5)



In [39]:
output_dir = "./results_gini"
os.makedirs(output_dir, exist_ok=True)

In [40]:
def compute_metrics(preds, labels):
    preds = np.argmax(preds, axis=1)
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    return accuracy, precision, recall, f1

In [41]:
def gini_coefficient(preds, labels):
    assert len(preds) == len(labels)
    all_data = list(zip(preds.flatten(), labels.flatten()))
    all_data.sort(key=lambda x: x[0])
    total_labels = sum(label for _, label in all_data)
    gini_sum = 0.0
    cum_labels = 0.0
    for i, (_, label) in enumerate(all_data):
        cum_labels += label
        gini_sum += cum_labels - (total_labels / len(all_data))
    gini_coefficient = gini_sum / total_labels
    return gini_coefficient / len(all_data)


In [42]:
from sklearn.metrics import precision_recall_curve

def find_optimal_threshold(preds, labels):
    precision, recall, thresholds = precision_recall_curve(labels, preds)
    f1_scores = 2 * recall * precision / (recall + precision)
    optimal_idx = np.argmax(f1_scores)
    optimal_threshold = thresholds[optimal_idx]
    return optimal_threshold


In [43]:
model.to('cuda')

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29564, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, element

In [44]:
# ОБЦИЛА НЕСКОЛЬКО ЭПОХ + ПРОВЕРИЛА НА OOS ВЫБОРКЕ, ОСТАНОВИЛАСЬ НА 3 ЭПОХЕ. В ДАЛЬНЕЙШЕМ ИСПОЛЬЗУЕТСЯ ОНА.

In [45]:
def compute_metrics(preds, labels):
    # Dummy implementation
    accuracy = (preds.argmax(axis=1) == labels).mean()
    precision = recall = f1 = accuracy
    return accuracy, precision, recall, f1

def gini_coefficient(preds, labels):
    assert len(preds) == len(labels)
    all_data = list(zip(preds.flatten(), labels.flatten()))
    all_data.sort(key=lambda x: x[0])
    total_labels = sum(label for _, label in all_data)
    gini_sum = 0.0
    cum_labels = 0.0
    for i, (_, label) in enumerate(all_data):
        cum_labels += label
        gini_sum += cum_labels - (total_labels / len(all_data))
    gini_coefficient = gini_sum / total_labels
    return gini_coefficient / len(all_data)

from sklearn.metrics import precision_recall_curve

def find_optimal_threshold(preds, labels):
    precision, recall, thresholds = precision_recall_curve(labels, preds.flatten())
    f1_scores = 2 * recall * precision / (recall + precision)
    optimal_idx = np.argmax(f1_scores)
    optimal_threshold = thresholds[optimal_idx]
    return optimal_threshold

In [47]:
num_epochs = 4
batch_size = 4

for epoch in trange(num_epochs):
    # Обучение
    model.train()
    total_loss = 0
    total_preds, total_labels = [], []

    for i in trange(0, len(train_input_ids), batch_size):
        input_ids_batch = train_input_ids[i:i+batch_size].to(device)
        attention_mask_batch = train_attention_mask[i:i+batch_size].to(device)
        labels_batch = train_labels[i:i+batch_size].to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch, labels=labels_batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = labels_batch.cpu().numpy()
        total_preds.append(logits)
        total_labels.append(label_ids)
    
    avg_train_loss = total_loss / (len(train_input_ids) / batch_size)
    total_preds = np.concatenate(total_preds, axis=0)
    total_labels = np.concatenate(total_labels, axis=0)
    train_accuracy, train_precision, train_recall, train_f1 = compute_metrics(total_preds, total_labels)

    gini_train = gini_coefficient(total_preds, total_labels)

    # Валидация
    model.eval()
    total_val_loss = 0
    total_val_preds, total_val_labels = [], []

    with torch.no_grad():
        for i in range(0, len(val_input_ids), batch_size):
            input_ids_batch = val_input_ids[i:i+batch_size].to(device)
            attention_mask_batch = val_attention_mask[i:i+batch_size].to(device)
            labels_batch = val_labels[i:i+batch_size].to(device)
            
            outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch, labels=labels_batch)
            loss = outputs.loss
            
            total_val_loss += loss.item()
            logits = outputs.logits.detach().cpu().numpy()
            label_ids = labels_batch.cpu().numpy()
            total_val_preds.append(logits)
            total_val_labels.append(label_ids)
    
    avg_val_loss = total_val_loss / (len(val_input_ids) / batch_size)
    total_val_preds = np.concatenate(total_val_preds, axis=0)
    total_val_labels = np.concatenate(total_val_labels, axis=0)
    val_accuracy, val_precision, val_recall, val_f1 = compute_metrics(total_val_preds, total_val_labels)

    # Calculate Gini coefficient
    gini_val = gini_coefficient(total_val_preds, total_val_labels)

    # Find optimal threshold
    optimal_threshold = find_optimal_threshold(total_val_preds[:, 1], total_val_labels)

    # Сохранение модели после каждой эпохи
    model.save_pretrained(f"{output_dir}/model_epoch_{epoch + 1}")
    tokenizer.save_pretrained(f"{output_dir}/model_epoch_{epoch + 1}")

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {avg_train_loss:.4f}, Accuracy: {train_accuracy:.4f}")
    print(f"Val Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.4f}")
    print(f"Train Gini: {gini_train:.4f}, Val Gini: {gini_val:.4f}")

  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/44519 [00:00<?, ?it/s]

100%|██████████| 44519/44519 [38:09<00:00, 19.45it/s]
 25%|██▌       | 1/4 [40:04<2:00:14, 2404.77s/it]

Epoch 1/4
Train Loss: 0.4698, Accuracy: 0.7661
Val Loss: 0.4676, Accuracy: 0.7709
Train Gini: 0.4995, Val Gini: 0.4979


100%|██████████| 44519/44519 [40:49<00:00, 18.17it/s]
 50%|█████     | 2/4 [1:22:45<1:23:12, 2496.35s/it]

Epoch 2/4
Train Loss: 0.4381, Accuracy: 0.7883
Val Loss: 0.4732, Accuracy: 0.7727
Train Gini: 0.4993, Val Gini: 0.4980


100%|██████████| 44519/44519 [37:36<00:00, 19.73it/s]
 75%|███████▌  | 3/4 [2:02:13<40:37, 2437.71s/it]  

Epoch 3/4
Train Loss: 0.4118, Accuracy: 0.8060
Val Loss: 0.4894, Accuracy: 0.7724
Train Gini: 0.4994, Val Gini: 0.4977


