# CatBoost и Bert
Загрузим файл с данными из Яндекс Практикума.

Попробуем CatBoost и BERT в режиме GPU.


In [None]:
%%bash
# Quietly (-q) install the two extra packages this notebook imports below.
# NOTE(review): versions are not pinned, so a re-run may pick up newer releases.
pip -q install transformers
pip -q install catboost

In [None]:
# Mount Google Drive inside the Colab VM; PATH (defined below) points into this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

#import warnings
#warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from joblib import dump, load

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from tqdm import tqdm

from catboost import Pool, CatBoostClassifier

import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

# Directory on the mounted Google Drive where inputs are read from and artefacts are dumped to.
PATH = '/content/drive/My Drive/Colab Notebooks/data/'
# Single seed reused for the data splits and for every model in this notebook.
SEED = 21

In [None]:
# Load the prepared dataset that was saved earlier with joblib.
# NOTE: joblib.load unpickles the file, so only load files you trust.
data = load(os.path.join(PATH, 'toxic_comments_ready'))
# Bare last expression: display (rows, columns).
data.shape

(159571, 11)

In [None]:
# Separate the target column from the feature columns.
y = data['toxic']
X = data.drop(columns=['toxic'])

# Both splits use the same settings: shuffled, 20% held out, fixed seed.
split_kwargs = dict(shuffle=True, test_size=0.2, random_state=SEED)

# First split: hold out the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)
# Second split: carve a validation set out of the training part.
X_train_valid, X_valid, y_train_valid, y_valid = train_test_split(X_train, y_train, **split_kwargs)

# Report the size of every part.
for header, features_part, target_part in (
    ('Train_valid shapes:', X_train_valid, y_train_valid),
    ('Valid shapes:', X_valid, y_valid),
    ('Test shapes:', X_test, y_test),
):
    print(header)
    print(features_part.shape, target_part.shape)

Train_valid shapes:
(102124, 10) (102124,)
Valid shapes:
(25532, 10) (25532,)
Test shapes:
(31915, 10) (31915,)


## CatBoost

In [None]:
# Hand-made meta feature columns (counts and lengths, judging by their names).
meta_features = [
    'word_count',
    'unique_word_count',
    'stop_word_count',
    'url_count',
    'mean_word_length',
    'char_count',
    'punctuation_count',
]

# Columns passed to CatBoost: the meta features plus the lemmatized text column.
ctb_columns = meta_features + ['text_lemma']

# Pool built from the training part of the train/validation split.
learn_pool = Pool(X_train_valid[ctb_columns], y_train_valid, text_features=['text_lemma'])
# Despite its name, this pool wraps the validation split; it is passed as eval_set when fitting.
test_pool = Pool(X_valid[ctb_columns], y_valid, text_features=['text_lemma'])

# GPU training, F1 as the evaluation metric, iteration-based early stopping.
ctb_params = dict(
    iterations=1000,
    random_seed=SEED,
    eval_metric='F1',
    task_type='GPU',
    od_type='Iter',
    early_stopping_rounds=100,
)
ctb = CatBoostClassifier(**ctb_params)

In [None]:
# Fit on the training pool, evaluate on the validation pool (named test_pool),
# and log progress every 100 iterations.
ctb.fit(learn_pool, eval_set=test_pool, verbose=100)

Learning rate set to 0.048056
0:	learn: 0.6840574	test: 0.7211256	best: 0.7211256 (0)	total: 27.7ms	remaining: 27.7s
100:	learn: 0.7273732	test: 0.7441558	best: 0.7445887 (99)	total: 2.32s	remaining: 20.6s
200:	learn: 0.7452300	test: 0.7565124	best: 0.7565124 (192)	total: 4.49s	remaining: 17.8s
300:	learn: 0.7532610	test: 0.7611684	best: 0.7618639 (281)	total: 6.62s	remaining: 15.4s
400:	learn: 0.7595773	test: 0.7644788	best: 0.7651093 (394)	total: 8.61s	remaining: 12.9s
500:	learn: 0.7642178	test: 0.7662671	best: 0.7666952 (495)	total: 10.5s	remaining: 10.5s
600:	learn: 0.7679903	test: 0.7677199	best: 0.7679487 (597)	total: 12.5s	remaining: 8.31s
700:	learn: 0.7714489	test: 0.7693621	best: 0.7701493 (684)	total: 14.5s	remaining: 6.17s
800:	learn: 0.7737378	test: 0.7715260	best: 0.7716905 (799)	total: 16.4s	remaining: 4.07s
900:	learn: 0.7763251	test: 0.7720494	best: 0.7723785 (870)	total: 18.3s	remaining: 2.01s
999:	learn: 0.7784942	test: 0.7720119	best: 0.7726983 (990)	total: 20.1s	r

<catboost.core.CatBoostClassifier at 0x7f332f3f6048>

In [None]:
# Validation features in the same column layout the model was trained on.
valid_features = X_valid[meta_features+['text_lemma']]

# Hard class labels for the F1 score.
y_pred_valid = ctb.predict(valid_features)

# Class probabilities are stored on Drive next to the other artefacts.
valid_proba = ctb.predict_proba(valid_features)
dump(valid_proba, os.path.join(PATH, 'ctb_predict_valid'))

# Bare last expression: display the validation F1.
f1_score(y_valid, y_pred_valid)

0.7726982776950883

Получили скор чуть меньше, чем у Tf-idf + Logit с тюнингом гиперпараметров.

## Bert

In [None]:
# Load the pretrained 'bert-base-uncased' encoder; it is used only to extract embeddings.
model = BertModel.from_pretrained('bert-base-uncased')
# Switch to evaluation mode; the trailing ';' suppresses the printed model repr.
model.eval();

# Tokenizer from the same checkpoint as the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
%%time
#tqdm.pandas()
# Encode every cleaned comment into token ids with special tokens added, limited to 512 tokens.
# NOTE(review): newer transformers releases expect truncation=True next to max_length
# (otherwise a warning is logged) — confirm against the installed version.
tokenized = data['text_clean'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512))

CPU times: user 4min 1s, sys: 250 ms, total: 4min 2s
Wall time: 4min 2s


In [None]:
# Length of the longest token sequence (0 when there are no sequences at all).
max_len = max((len(token_ids) for token_ids in tokenized.values), default=0)

max_len

512

In [None]:
# Right-pad every token id list with zeros up to max_len so all rows have equal length.
padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])
# Bare last expression: display the shape of the padded matrix.
padded.shape

(159571, 512)

In [None]:
# 1 where padded holds a non-zero token id, 0 on the zero padding added above.
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(159571, 512)

In [None]:
batch_size = 32
device = 'cuda'
embeddings = []

# The model only needs to be moved to the GPU once, not on every batch.
model.to(device)

# Iterate over batch start offsets. Slicing past the end just yields a shorter
# final batch, so the remainder rows are handled by the same loop and an empty
# batch is never sent to the model when the row count is a multiple of batch_size.
for start in tqdm(range(0, padded.shape[0], batch_size)):
    stop = start + batch_size
    batch = torch.LongTensor(padded[start:stop]).to(device)
    attention_mask_batch = torch.LongTensor(attention_mask[start:stop]).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        batch_embeddings = model(batch, attention_mask=attention_mask_batch)

    # Keep the vector at position 0 of every sequence from the first model output,
    # moved back to the CPU as a numpy array.
    embeddings.append(batch_embeddings[0][:, 0, :].cpu().numpy())

# Stack all batches in their original row order and store the matrix on Drive.
dump(np.concatenate(embeddings), os.path.join(PATH, 'embeds'))


  0%|          | 0/4986 [00:00<?, ?it/s][A
  0%|          | 1/4986 [00:02<3:03:20,  2.21s/it][A
  0%|          | 2/4986 [00:04<2:55:52,  2.12s/it][A
  0%|          | 3/4986 [00:06<2:51:08,  2.06s/it][A
  0%|          | 4/4986 [00:07<2:47:22,  2.02s/it][A
  0%|          | 5/4986 [00:09<2:44:52,  1.99s/it][A
  0%|          | 6/4986 [00:11<2:43:26,  1.97s/it][A

KeyboardInterrupt: ignored

Прошу не обращать внимания на ошибку.
Время выполнения варьируется от часа до 4-х, и Colab иногда рвёт сессии, из-за чего выполнение останавливается. 
Далее загружаются файлы, сохранённые после успешного создания эмбеддингов.

In [None]:
# Load the embedding matrix that the extraction cell dumped to Drive.
feat = load(os.path.join(PATH, 'embeds'))
# Sanity check: exactly one embedding row per row of data.
assert feat.shape[0] == data.shape[0]

In [None]:
# feat is a plain array whose rows follow the row order of `data`, while the
# split frames carry index *labels*. Translate labels into row positions first,
# so the selection stays correct even if `data` does not have a default 0..n-1 index.
X_train_vec = feat[data.index.get_indexer(X_train.index)]
X_train_valid_vec = feat[data.index.get_indexer(X_train_valid.index)]
X_test_vec = feat[data.index.get_indexer(X_test.index)]
X_valid_vec = feat[data.index.get_indexer(X_valid.index)]

In [None]:
# Logistic regression trained on the BERT embeddings.
logit = LogisticRegression(C=1, random_state=SEED, solver='liblinear')

logit.fit(X_train_valid_vec, y_train_valid)

# Predicted labels for the validation split, saved under PATH like every other
# artefact of this notebook (previously they went to the working directory).
y_pred_valid = logit.predict(X_valid_vec)
dump(y_pred_valid, os.path.join(PATH, 'y_pred_valid_bert'))

f1 = f1_score(y_valid, y_pred_valid)
# NOTE: despite the .txt extension this file is a joblib dump, not plain text.
dump(f1, os.path.join(PATH, 'f1_nlp.txt'))
# Bare last expression: display the validation F1.
f1

0.7213252254141329

Получили скор значительно ниже, чем для других моделей; наверное, я тут что-то делаю не так.

Есть статья, где используют BERT на этих данных; у меня пока не хватает знаний, чтобы её понять:
- [en] https://medium.com/huggingface/multi-label-text-classification-using-bert-the-mighty-transformer-69714fa3fb3d
- [ru] https://neurohive.io/ru/tutorial/bert-klassifikacya-teksta/


## Final CatBoost

Обучим катбуст на всей трейн выборке и выгрузим предсказания для теста

In [None]:
# Same column layout as before: meta features plus the lemmatized text column.
final_columns = meta_features + ['text_lemma']

# This time the pool covers the whole training set (train + validation parts).
train_pool = Pool(X_train[final_columns], y_train, text_features=['text_lemma'])

# Rebinds `ctb` to a fresh classifier; no early-stopping settings are passed here.
final_params = dict(
    iterations=1000,
    random_seed=SEED,
    eval_metric='F1',
    task_type='GPU',
)
ctb = CatBoostClassifier(**final_params)

In [None]:
# Fit the final model on the full training pool (no eval set), logging every 100 iterations.
ctb.fit(train_pool, verbose=100)

In [None]:
# Class probabilities of the final model on the held-out test set.
test_proba = ctb.predict_proba(X_test[meta_features+['text_lemma']])

# Store them on Drive.
dump(test_proba, os.path.join(PATH, 'ctb_test'))

# New Section