<a href="https://colab.research.google.com/github/zhus-dika/NLP_home_works/blob/main/toxic_comments_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://www.kaggle.com/competitions/toxic-comments-classification-2023/overview

### 🦚 Read data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
PATH_TO_TRAIN_DATA = '/content/drive/MyDrive/hse/NLP/train_data.csv'
PATH_TO_TEST_DATA = '/content/drive/MyDrive/hse/NLP/test_data.csv'

In [3]:
import pandas as pd
from tqdm.notebook import tqdm

df = pd.read_csv(PATH_TO_TRAIN_DATA)
df_competition = pd.read_csv(PATH_TO_TEST_DATA)
df.head()

Unnamed: 0,comment,toxic
0,Преступление и наказание\n,0.0
1,"И именно эти неработающие весы показывают, что...",0.0
2,"В Японии такие панельки, ебанько.\n",0.0
3,Еще у нас выявляют трещины с помощью белой кра...,0.0
4,"Дочитал до поезда в Норильск , дальше не стал\n",0.0


###🦩 Install need packages

In [4]:
! pip install torch



In [5]:
! pip install -U accelerate
! pip install -U transformers



###🐋 Preprocess data

In [6]:
import torch
import numpy as np
from torch import nn
from torch.nn import functional as F

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader

BASE_MODEL = "bert-base-cased"
LEARNING_RATE = 2e-5
MAX_LENGTH = 256
BATCH_SIZE = 16
EPOCHS = 4
id2label = {k:k for k in range(2)}
label2id = {k:k for k in range(2)}
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

In [9]:
df['label'] = df['toxic'].astype(int)
df['text'] = df['comment']

In [10]:
df = df.drop(['toxic','comment'], axis=1)

In [11]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, random_state=777) # <- для локального тестирования

In [12]:
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [13]:
!pip install datasets



In [14]:
from datasets import Dataset

ds_train = Dataset.from_pandas(df_train)
ds_test = Dataset.from_pandas(df_test)

In [15]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [16]:
tokenized_train_datasets = ds_train.map(tokenize_function, batched=True)
tokenized_test_datasets = ds_test.map(tokenize_function, batched=True)

Map:   0%|          | 0/8106 [00:00<?, ? examples/s]

Map:   0%|          | 0/2703 [00:00<?, ? examples/s]

In [17]:
tokenized_train_datasets['label'][0]

0

### 🐖 Prepare training

In [18]:
!pip install evaluate



In [19]:
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [20]:
# import numpy as np
# from datasets import load_metric

# metric = load_metric("accuracy")

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = torch.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

In [21]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="../models/bert_pretrained",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    weight_decay=0.01,
)

### 🐥 Training

In [22]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_test_datasets,
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.5348,0.385962,0.836848
2,0.3642,0.335997,0.866445
3,0.2709,0.311678,0.880133
4,0.2078,0.350436,0.882353


TrainOutput(global_step=2028, training_loss=0.3424307284270518, metrics={'train_runtime': 1052.9179, 'train_samples_per_second': 30.794, 'train_steps_per_second': 1.926, 'total_flos': 8531112858992640.0, 'train_loss': 0.3424307284270518, 'epoch': 4.0})

### 🦙 Evaluate competition data

In [23]:
encoded = tokenizer(df_competition['comment'].values[228], truncation=True, padding="max_length", max_length=256, return_tensors="pt").to("cuda")
# Get the class
pred_class = torch.argmax(model(**encoded).logits, axis=1)
print(pred_class.cpu().numpy()[0])

1


In [24]:
comp_results = [torch.argmax(model(**tokenizer(item, truncation=True, padding="max_length", max_length=256, return_tensors="pt").to("cuda")).logits, axis=1).cpu().numpy()[0] for item in df_competition['comment'].values]

In [25]:
comp_results[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]

In [26]:
df_res = df_competition.drop(['comment'], axis=1)
df_res = df_res.assign(toxic=comp_results)
df_res.toxic.info()

<class 'pandas.core.series.Series'>
RangeIndex: 3603 entries, 0 to 3602
Series name: toxic
Non-Null Count  Dtype
--------------  -----
3603 non-null   int64
dtypes: int64(1)
memory usage: 28.3 KB


In [28]:
df_res.to_csv('/content/drive/MyDrive/hse/NLP/results_comp_bert.csv', index=False)