# Классификация комантариев для "Викишоп" с BERT

## Общие сведения о проекте

### Описание проекта

* Интернет-магазин «Викишоп» запускает новый сервис. Теперь пользователи могут редактировать и дополнять описания товаров, как в вики-сообществах. То есть клиенты предлагают свои правки и комментируют изменения других. Магазину нужен инструмент, который будет искать токсичные комментарии и отправлять их на модерацию. 
* Нужно обучить модель классифицировать комментарии на позитивные и негативные. Есть набор данных с разметкой о токсичности правок.
* Построить модель со значением метрики качества F1 не меньше 0.75.

### План

* Загрузка и предобработка данных
* Обучение разных моделей
* Выводы

## Импорты библиотек и инициализация констант

In [1]:
import pandas as pd
import torch
import transformers
import numpy as np
from tqdm import notebook
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.dummy import DummyClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from typing import Literal

In [2]:
RANDOM_STATE = 12345

## Загрузка и подготовка данных

In [3]:
try:
    data = pd.read_csv('../../datasets/toxic_comments.csv')
except:
    data = pd.read_csv('https://code.s3.yandex.net/datasets/toxic_comments.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,text,toxic
0,0,Explanation\nWhy the edits made under my usern...,0
1,1,D'aww! He matches this background colour I'm s...,0
2,2,"Hey man, I'm really not trying to edit war. It...",0
3,3,"""\nMore\nI can't make any real suggestions on ...",0
4,4,"You, sir, are my hero. Any chance you remember...",0


In [5]:
data.drop('Unnamed: 0', axis=1, inplace=True)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159292 entries, 0 to 159291
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159292 non-null  object
 1   toxic   159292 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [7]:
data.isna().sum()

text     0
toxic    0
dtype: int64

In [8]:
data.duplicated().sum()

0

Отсеиваем тексты длинее 512 символов, чтобы попасть в максимум для модели.

In [9]:
data = data[data['text'].apply(lambda x: len(x) < 512)]

In [10]:
data['toxic'].value_counts()

toxic
0    112120
1     14156
Name: count, dtype: int64

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 126276 entries, 0 to 159291
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    126276 non-null  object
 1   toxic   126276 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.9+ MB


### Достаем эмбединги

In [12]:
tokenizer = transformers.BertTokenizer(
    vocab_file='../../datasets/ds_bert/vocab.txt')

tokenized = data['text'].apply(
    lambda x: tokenizer.encode(x, add_special_tokens=True))

max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])

attention_mask = np.where(padded != 0, 1, 0)

In [13]:
padded.shape

(126276, 380)

In [14]:
config = transformers.BertConfig.from_json_file(
    '../../datasets/ds_bert/config.json')
model = transformers.BertModel.from_pretrained(
    '../../datasets/ds_bert/pytorch_model.bin', config=config, ignore_mismatched_sizes=True)

In [15]:
batch_size = 100
embeddings = []
for i in notebook.tqdm(range(10)):
        batch = torch.LongTensor(padded[batch_size*i:batch_size*(i+1)])
        attention_mask_batch = torch.LongTensor(attention_mask[batch_size*i:batch_size*(i+1)])
        
        with torch.no_grad():
            batch_embeddings = model(batch, attention_mask=attention_mask_batch)
        
        embeddings.append(batch_embeddings[0][:,0,:].numpy())

  0%|          | 0/10 [00:00<?, ?it/s]

In [16]:
embeddings[0][0]

array([ 2.50975698e-01,  5.05423099e-02, -2.41583213e-03, -2.32461706e-01,
       -3.06949347e-01, -3.72465223e-01,  5.56583583e-01,  5.22573173e-01,
        2.18740329e-01, -4.53634828e-01, -1.21841833e-01,  1.39127582e-01,
       -2.81780183e-01,  3.17809999e-01,  4.88871336e-01,  1.60019457e-01,
       -4.66758549e-01,  5.40292859e-01,  2.01923609e-01,  6.74561486e-02,
       -8.11724737e-03, -3.54630709e-01,  1.30774662e-01, -7.25796446e-02,
        8.74604210e-02, -7.07297474e-02, -1.83526099e-01, -3.89893442e-01,
       -2.17679828e-01,  6.55538589e-03, -2.71841615e-01,  2.25761846e-01,
        8.48649964e-02, -1.45497113e-01,  7.10861683e-01, -3.48648608e-01,
        1.80210590e-01, -7.64386505e-02,  3.92408639e-01,  2.65865684e-01,
       -8.13829750e-02,  1.14875138e-01,  4.10840660e-01,  1.30273402e-01,
        7.13771582e-02, -6.27804734e-03, -3.24370885e+00, -1.47058457e-01,
       -1.93767503e-01, -3.22881341e-01,  4.09788758e-01, -3.40376407e-01,
        2.70480424e-01,  

In [16]:
features = np.concatenate(embeddings)

In [21]:
features.shape

(1000, 768)

## Обучение моделей

In [22]:
# Класс для работы с моделями
class MultiModelLearning:
    def __init__(self, model_name: Literal['linear', 'tree', 'bagging', 'boosting', 'dummy', 'ANN']):
        self.model_name = model_name
        if model_name == 'linear':
            self.model = LogisticRegression(max_iter=768, class_weight="balansed")
        elif model_name == 'tree':
            self.model = DecisionTreeClassifier(random_state=RANDOM_STATE)
        elif model_name == 'bagging':
            self.model = BaggingClassifier(random_state=RANDOM_STATE)
        elif model_name == 'boosting':
            self.model = LGBMClassifier(random_state=RANDOM_STATE)
        elif model_name == 'dummy':
            self.model = DummyClassifier(strategy="mean")
        elif model_name == 'ANN':
            self.model = MLPClassifier(random_state=RANDOM_STATE, solver='adam', activation='tanh', error_score='raise')
        else:
            display("Было введено неверное имя модели")
        self.features = None
        self.target = None
        self.best_model = None
        self.best_score = None
        self.best_params = None
        self.results = None

    # Подбор гиперпараметров для модели
    def select_hyperparameters__(self, param_dist):
        grid_search = GridSearchCV(self.model, param_grid=param_dist, scoring='f1')
        grid_search.fit(self.features, self.target)
        self.best_model = grid_search.best_estimator_
        self.hyperparameters = grid_search.best_params_
        self.best_score = grid_search.best_score_
        self.results = grid_search.cv_results_

    # Функции обучения моделей
    def learn_linear_regression__(self): 
        param_dist = {}
        self.select_hyperparameters__(param_dist)
    
    def learn_tree__(self):
        param_dist = {
            'max_depth': [i for i in range(1,15)]
        }
        self.select_hyperparameters__(param_dist)

    def learn_bagging__(self):
        param_dist = {
            'n_estimators': [i for i in range(1, 8)],
        }
        self.select_hyperparameters__(param_dist)
        
    def learn_boosting__(self):
        param_dist = {
            'n_estimators': [2,5,10,25,50,100],
            'learning_rate': [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001]
        }
        self.select_hyperparameters__(param_dist)

    def learn_dummy__(self):
        self.best_model = self.model.fit(self.features, self.target)
        self.best_score = sum(cross_val_score(self.model, self.features, self.target, scoring='f1'))/5

    def learn_ANN__(self):
        param_dist = {
            'learning_rate_init': [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001],
            'hidden_layer_sizes': [[i for i in range(5, 20)], [[i, i+1] for i in range(5, 20)]]
        }
        self.select_hyperparameters__(param_dist)



    def select_model__(self):
        if self.model_name == 'linear':
            self.learn_linear_regression__()
        elif self.model_name == 'tree':
            self.learn_tree__()
        elif self.model_name == 'bagging':
            self.learn_bagging__()
        elif self.model_name == 'boosting':
            self.learn_boosting__()
        elif self.model_name == 'dummy':
            self.learn_dummy__()
        elif self.model_name == 'ANN':
            self.learn_ANN__()
        else:
            display("Было введено неверное имя модели")

    def fit(self, features, target):
        self.features = features
        self.target = target
        self.select_model__()


    # Предсказание модели
    def predict(self, features):
        model_predictions = self.best_model.predict(features)
        return model_predictions

In [19]:
1/0

ZeroDivisionError: division by zero

In [None]:
features.shape

(20000, 768)

In [23]:
model = LogisticRegression(max_iter=768)
score = sum(cross_val_score(model, features, data['toxic'][:20000], scoring='f1'))/5
score

0.7126577185386342

In [None]:
model_tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
score = sum(cross_val_score(model_tree, features, data['toxic'][:20000], scoring='f1'))/5
score

0.4131907879322075

In [None]:
linear = MultiModelLearning('linear')
linear.fit(features, data['toxic'][:20000])
linear.best_score

0.7126577185386342

In [None]:
tree = MultiModelLearning('tree')
tree.fit(features, data['toxic'][:20000])
tree.best_score

In [None]:
boosting = MultiModelLearning('boosting')
boosting.fit(features, data['toxic'][:20000])
boosting.best_score

[LightGBM] [Info] Number of positive: 1822, number of negative: 14178
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061495 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.113875 -> initscore=-2.051757
[LightGBM] [Info] Start training from score -2.051757
[LightGBM] [Info] Number of positive: 1822, number of negative: 14178
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059103 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.113875 -> initscore=-2.051757
[LightGBM] [Info] Start training from score -2.051757
[LightGB

0.6469968168108502

In [None]:
ann = MultiModelLearning('ANN')
ann.fit(features, data['toxic'][:20000])
ann.best_score

35 fits failed out of a total of 70.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "e:\programs\anaconda\envs\practicum\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\programs\anaconda\envs\practicum\lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "e:\programs\anaconda\envs\practicum\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 751, in fit
    return self._fit(X, y, incremental=False)
  File "e:\programs\anaconda\envs\practicum\lib\site-packages\sklearn\neural_

0.6896117683042396

In [None]:
bagging = MultiModelLearning('bagging')
bagging.fit(features, data['toxic'][:20000])
bagging.best_score

0.5085077662231516