In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

In [None]:
# Load the dataset, published as a CSV export of a Google Sheets document,
# into a DataFrame and take a first look at it.
csv_data_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vT3Wns_gLiMDfacVgF-x4suNvnAFmgibVkrsizkKqwcVFGSYAVOVnJsoV7gm3jguw/pub?gid=681978523&single=true&output=csv"

# pandas fetches the URL directly; no local copy is written.
data = pd.read_csv(filepath_or_buffer=csv_data_url)

# Plain-text preview of the first five rows.
print(data.head(n=5))

In [None]:
# Count rows whose 'problem_text' value occurs more than once.
# keep=False flags every member of a duplicate group (the first occurrence
# included), so the total is the number of rows involved in any duplication.
duplicates = data.duplicated(
    subset=['problem_text'],
    keep=False,
)

# True counts as 1, so summing the boolean mask gives the number of flagged rows.
num_duplicates = duplicates.sum()

print(f'Количество дубликатов в столбце "problem_text": {num_duplicates}')


In [None]:
# Keep the first occurrence of every 'problem_text' and drop the repeats,
# then renumber the rows 0..n-1 so the index has no gaps.
data_no_dupes = (
    data
    .drop_duplicates(subset=['problem_text'], keep='first')
    .reset_index(drop=True)
)


In [None]:
# Sanity check of the de-duplication: the cleaned frame should report
# zero rows with a repeated 'problem_text'.
duplicates = data_no_dupes.duplicated(
    subset=['problem_text'],
    keep=False,
)
num_duplicates = duplicates.sum()
print(f'Количество дубликатов в столбце "problem_text": {num_duplicates}')

In [None]:
# Bar chart of how many de-duplicated rows fall into each topic.
import matplotlib.pyplot as plt

class_counts = data_no_dupes['topic'].value_counts()

ax = class_counts.plot.bar()
ax.set_title('New Distribution of Classes')
ax.set_xlabel('Topics')
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the topic names so they stay readable
ax.set_ylabel('Count')
plt.show()

In [None]:
# Display the per-topic row counts that the plot in the previous cell was drawn from.
class_counts

In [None]:
# The dataset will be augmented to balance the class distribution, but the
# split into train / validation / test happens first, on the de-duplicated frame.
from sklearn.model_selection import train_test_split

# 60 % of the rows go to training; the remaining 40 % is halved into
# validation and test. A fixed random_state keeps both splits repeatable.
# NOTE(review): the split is not stratified by 'topic', so a rare topic may be
# missing from one of the splits - confirm this is acceptable.
train, test_val = train_test_split(data_no_dupes, test_size=0.4, random_state=42)
val, test = train_test_split(test_val, test_size=0.5, random_state=42)

# Persist each split as a CSV file (without the index column) in the current directory.
split_files = {'train.csv': train, 'val.csv': val, 'test.csv': test}
for csv_name, split_frame in split_files.items():
    split_frame.to_csv(csv_name, index=False)


In [None]:
# List the Kaggle working directory to check which files have been written so far.
!ls -la /kaggle/working

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# We'll augment the train set with https://github.com/makcedward/nlpaug 
# будем выравнивать датасет по классам с помощью https://github.com/makcedward/nlpaug 
!pip install nlpaug numpy matplotlib python-dotenv


In [None]:
# Bar chart of how the topics are represented in the training split.
import matplotlib.pyplot as plt

class_counts = train['topic'].value_counts()

ax = class_counts.plot.bar()
ax.set_title('New Distribution of Classes')
ax.set_xlabel('Topics')
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the topic names so they stay readable
ax.set_ylabel('Count')
plt.show()

In [None]:
# Display the per-topic row counts of the training split computed in the previous cell.
class_counts

In [None]:
import torch

# Hand unused cached GPU memory back to the driver; this call is safe on a
# machine without an initialised CUDA context.
torch.cuda.empty_cache()

# Autograd anomaly detection is a debugging aid that adds checks to every
# backward pass and slows training down. It is process-wide, so leaving it on
# here would also slow the fine-tuning cell further down. Keep it off for
# normal runs and switch it to True only while hunting a NaN/Inf gradient.
torch.autograd.set_detect_anomaly(False)


In [None]:
# Needed on Kaggle when using the BERT tokenizer: switch off the
# tokenizers library's own parallelism via its environment variable.
import os

os.environ.update(TOKENIZERS_PARALLELISM="false")


In [None]:
# Show the GPU status reported by the NVIDIA driver tool.
!nvidia-smi

In [None]:
# Restrict CUDA to device index 0 for everything started from this process.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})

# Marker so the cell produces visible output once it has run.
print('ok')

In [None]:
!pip uninstall tensorflow -y


In [None]:
# Balance the training split: every topic is topped up to the size of the
# largest topic with BERT-based word-substitution variants of its own rows.

import nlpaug.augmenter.word as naw
import pandas as pd
import torch
torch.set_printoptions(profile="full")

torch.set_num_threads(1)  # kept from the original: needed to work around an unspecified error

# Seed for row sampling and the final shuffle (same value as the split's random_state),
# so the set of rows chosen for augmentation is repeatable. The substitutions
# produced by the model itself are not seeded here.
AUGMENT_SEED = 42

# Fall back to the CPU when no GPU is attached so the cell still runs (slowly).
augment_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Contextual augmenter: replaces words with substitutes proposed by a BERT model.
# NOTE(review): 'bert-base-uncased' is an English model, while the classifier
# trained later in this notebook is 'DeepPavlov/rubert-base-cased'. If
# 'problem_text' is Russian, confirm that this is the intended augmentation model.
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute", device=augment_device, force_reload=True)


def _augment_text(text):
    """Return one augmented variant of ``text`` as a plain string.

    ``aug.augment`` returns a list of strings in recent nlpaug releases and a
    bare string in older ones. Storing the list itself would put unhashable
    values into the column and later turn the text into "['...']" when cast to
    ``str``. If the augmenter returns an empty result the original text is kept.
    """
    augmented = aug.augment(text)
    if isinstance(augmented, (list, tuple)):
        return augmented[0] if augmented else text
    return augmented


# Size of every topic and the target size (the largest topic).
topic_sizes = train['topic'].value_counts()
max_samples = topic_sizes.max()

# Collect the per-topic frames and concatenate once at the end, instead of
# growing a DataFrame inside the loop. Iterating over value_counts() also skips
# rows without a topic, which cannot be sampled from.
augmented_parts = []
for topic, current_count in topic_sizes.items():
    augment_count = max_samples - current_count
    if augment_count <= 0:
        continue

    # Draw (with replacement) as many source rows as the topic is short of.
    sample = train[train['topic'] == topic].sample(
        n=augment_count, replace=True, random_state=AUGMENT_SEED
    )

    # Replace the text of each drawn row with an augmented variant.
    sample = sample.assign(problem_text=sample['problem_text'].map(_augment_text))
    augmented_parts.append(sample)

# An empty slice of `train` keeps the column dtypes when nothing needed augmenting.
augmented_data = pd.concat(augmented_parts) if augmented_parts else train.iloc[0:0]

# Original rows plus the generated ones.
train_augmented = pd.concat([train, augmented_data])

# Every topic must now have exactly the same number of rows.
assert train_augmented['topic'].value_counts().nunique() <= 1, "classes are not balanced"

# Shuffle so the generated rows are not grouped at the end, then renumber.
train_augmented = train_augmented.sample(frac=1, random_state=AUGMENT_SEED).reset_index(drop=True)

# Save the augmented training set as a new CSV file.
train_augmented.to_csv('train_augmented.csv', index=False)


In [None]:
# Number of rows in the augmented training set (original training rows plus generated ones).
len(train_augmented)

In [None]:
# List the Kaggle working directory again to check the files written so far.
!ls -la /kaggle/working

In [None]:
# Bar chart of the topic distribution in the augmented training set.
import matplotlib.pyplot as plt

class_counts = train_augmented['topic'].value_counts()

ax = class_counts.plot.bar()
ax.set_title('New Distribution of Classes')
ax.set_xlabel('Topics')
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the topic names so they stay readable
ax.set_ylabel('Count')
plt.show()

In [None]:
# Augmentation can reintroduce repeated texts, so count the rows of the
# augmented training set whose 'problem_text' occurs more than once
# (keep=False flags every member of a duplicate group).
duplicates = train_augmented.duplicated(
    subset=['problem_text'],
    keep=False,
)
num_duplicates = duplicates.sum()
print(f'Количество дубликатов в столбце "problem_text": {num_duplicates}')

In [None]:
# Display the boolean mask from the previous cell: True marks rows of
# train_augmented whose 'problem_text' occurs more than once.
duplicates

In [None]:
# Offer the augmented training set for download from the notebook output.

import os
from IPython.display import FileLink

# Path of the file inside the Kaggle notebook's working directory.
file_path = 'train_augmented.csv'

if not os.path.isfile(file_path):
    # Nothing to link to: report that the file is missing.
    print("Файл не найден")
else:
    # Render a clickable download link.
    download_link = FileLink(file_path, result_html_prefix="Click here to download: ")
    display(download_link)

In [None]:
# Offer the validation split for download from the notebook output.
import os
from IPython.display import FileLink

# Path of the file inside the Kaggle notebook's working directory.
file_path = 'val.csv'

if not os.path.isfile(file_path):
    # Nothing to link to: report that the file is missing.
    print("Файл не найден")
else:
    # Render a clickable download link.
    download_link = FileLink(file_path, result_html_prefix="Click here to download: ")
    display(download_link)

In [None]:
# Offer the test split for download from the notebook output.
import os
from IPython.display import FileLink

# Path of the file inside the Kaggle notebook's working directory.
file_path = 'test.csv'

if not os.path.isfile(file_path):
    # Nothing to link to: report that the file is missing.
    print("Файл не найден")
else:
    # Render a clickable download link.
    download_link = FileLink(file_path, result_html_prefix="Click here to download: ")
    display(download_link)

In [None]:
!pip install transformers


In [None]:
# Inspect the text column of the augmented training set.
train_augmented['problem_text']

In [None]:
# Prepare the text column of all three splits for tokenisation.
#
# This cell used to tokenise the splits as well, but the training cell that
# follows rebuilds the datasets, the tokenizer and `tokenize_function` from
# these DataFrames (with integer-encoded labels). Doing it here only repeated
# the 512-token padding pass over every row and produced a 'labels' column of
# raw topic strings, so that work has been removed. The imports stay because
# the training cell uses `Dataset` and `AutoTokenizer`.
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer


def _to_plain_text(value):
    """Return ``value`` as a plain string.

    A one-element list or tuple (which nlpaug may leave behind for augmented
    rows) is unwrapped first, so the text does not end up as "['...']".
    Anything else is converted with ``str``, as ``astype(str)`` would do.
    """
    if isinstance(value, (list, tuple)) and len(value) == 1:
        return str(value[0])
    return str(value)


# Rebind each frame instead of assigning into it, so the frames returned by
# train_test_split are not modified in place.
train_augmented = train_augmented.assign(
    problem_text=train_augmented['problem_text'].map(_to_plain_text)
)
val = val.assign(problem_text=val['problem_text'].map(_to_plain_text))
test = test.assign(problem_text=test['problem_text'].map(_to_plain_text))


In [None]:
# Fine-tune a RuBERT sequence classifier on the augmented training set,
# pick the best learning rate by macro F1 on the validation split and report
# the validation metrics of the best run.
#
# Changes compared with the previous version of this cell:
#   * `transformers.AdamW` is deprecated and missing from recent transformers
#     releases, so importing it fails there. The Trainer now builds its own
#     AdamW from `TrainingArguments`; this also makes `weight_decay` effective,
#     which a hand-built optimizer bypassed.
#   * A fresh model is loaded (after seeding) for every learning rate, so runs
#     do not continue from each other's weights.
#   * `best_f1` starts below any possible score, so `best_trainer` is always
#     set, even when F1 is 0.
#   * All metrics come from `compute_metrics` with one `zero_division` setting.
#   * `Dataset` and `AutoTokenizer` are imported here, so the cell does not
#     depend on imports made in another cell.
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers.trainer_utils import EvalPrediction
import pandas as pd
from sklearn.preprocessing import LabelEncoder

MODEL_NAME = 'DeepPavlov/rubert-base-cased'
SEED = 42  # same value as the random_state used for the data split

# Encode the topic names as integer class ids. The encoder is fitted on the
# training topics only; `transform` raises ValueError if the validation or
# test split contains a topic that never occurs in training.
label_encoder = LabelEncoder()
train_augmented['labels'] = label_encoder.fit_transform(train_augmented['topic'])
val['labels'] = label_encoder.transform(val['topic'])
test['labels'] = label_encoder.transform(test['topic'])

# Convert the DataFrames to Hugging Face Datasets.
train_dataset = Dataset.from_pandas(train_augmented)
val_dataset = Dataset.from_pandas(val)
test_dataset = Dataset.from_pandas(test)

# Load the tokenizer that matches the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(examples):
    """Tokenise a batch of rows.

    Reads the 'problem_text' field of ``examples`` and returns the tokenizer
    output, truncated and padded to exactly 512 tokens per text.
    """
    return tokenizer(examples['problem_text'], padding="max_length", truncation=True, max_length=512)


# Apply tokenisation to every split.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Expose only the model inputs and the labels, as PyTorch tensors.
model_columns = ["input_ids", "attention_mask", "labels"]
train_dataset.set_format(type='torch', columns=model_columns)
val_dataset.set_format(type='torch', columns=model_columns)
test_dataset.set_format(type='torch', columns=model_columns)

# Number of target classes seen by the label encoder.
num_labels = len(label_encoder.classes_)


def compute_metrics(eval_pred: EvalPrediction):
    """Compute accuracy and macro-averaged precision, recall and F1.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    argmax over the last axis of the logits. A class without any predicted (or
    true) samples contributes 0 to the macro average (``zero_division=0``).

    Returns a dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1).numpy()
    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": precision_score(labels, predictions, average='macro', zero_division=0),
        "recall": recall_score(labels, predictions, average='macro', zero_division=0),
        "f1": f1_score(labels, predictions, average='macro', zero_division=0),
    }


# Try different learning rates and keep the trainer with the best validation F1.
best_f1 = float('-inf')
best_lr = None
best_trainer = None
for lr in [1e-5]:  #  , 2e-5, 3e-5
    print(f"Training with learning rate: {lr}")

    # Seed before loading so the new classification head starts from the same
    # weights on every run, then load a fresh model for this learning rate.
    # All parameters of the loaded model are trainable by default, so no
    # explicit unfreezing is needed.
    set_seed(SEED)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        save_steps=5000,
        save_total_limit=2,
        learning_rate=lr,
        seed=SEED,
    )

    # No `optimizers` argument: the Trainer creates AdamW itself from
    # `learning_rate` and `weight_decay` above.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    evaluation = trainer.evaluate()
    f1 = evaluation.get('eval_f1', 0)

    if f1 > best_f1:
        best_f1 = f1
        best_lr = lr
        best_trainer = trainer

print(f"Best learning rate: {best_lr} with F1 Score: {best_f1}")

# Keep `model` pointing at the best run's model.
model = best_trainer.model

# Report the metrics of the best run on the validation split.
# NOTE(review): `test_dataset` is prepared above but never evaluated; consider
# reporting the final numbers on it once model selection is finished.
predictions = best_trainer.predict(val_dataset)
true_labels = predictions.label_ids
predicted_labels = torch.argmax(torch.tensor(predictions.predictions), dim=-1)

final_metrics = compute_metrics((predictions.predictions, true_labels))
accuracy = final_metrics["accuracy"]
precision = final_metrics["precision"]
recall = final_metrics["recall"]
f1 = final_metrics["f1"]

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


In [None]:
# Marker output: printed once every cell above has been executed.
print('ok')