In [33]:
import numpy as np
import pandas as pd
import torch
import transformers

from tqdm import notebook
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from sklearn.preprocessing import StandardScaler

from catboost import CatBoostClassifier
from catboost import Pool

In [2]:
columns = ['id', 'tdate', 'tname', 'ttext', 'ttype', 'trep', 'trtw', 'tfav', 'tstcount', 'tfol', 'tfrien', 'listcount']

negative_df = pd.read_csv('./datasets/negative.csv', sep=';', names=columns)
positive_df = pd.read_csv('./datasets/positive.csv', sep=';', names=columns)

In [3]:
tweets_df = pd.concat([negative_df, positive_df])

del negative_df
del positive_df

In [4]:
tweets_df.set_index(['id'], inplace=True)
tweets_df.drop(labels='tname', axis=1, inplace=True)

In [5]:
tweets_df.loc[tweets_df['ttype'] == -1, 'ttype'] = 0

In [6]:
tweets_df = tweets_df.sample(n=2000, random_state=123456)

In [7]:
display(tweets_df.info())
display(tweets_df.head(5))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 409919406907867136 to 417728925910175744
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tdate      2000 non-null   int64 
 1   ttext      2000 non-null   object
 2   ttype      2000 non-null   int64 
 3   trep       2000 non-null   int64 
 4   trtw       2000 non-null   int64 
 5   tfav       2000 non-null   int64 
 6   tstcount   2000 non-null   int64 
 7   tfol       2000 non-null   int64 
 8   tfrien     2000 non-null   int64 
 9   listcount  2000 non-null   int64 
dtypes: int64(9), object(1)
memory usage: 171.9+ KB


None

Unnamed: 0_level_0,tdate,ttext,ttype,trep,trtw,tfav,tstcount,tfol,tfrien,listcount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
409919406907867136,1386567377,@Maria_Brovko Может в сонник заглянуть? К чему...,1,0,0,0,1918,27,94,1
410869105265491968,1386793802,Закончим этот день предновогодним озорством ;)...,1,0,0,0,6020,114,189,1
410798792842039296,1386777039,"Мне Урупон шею свернет :-)\nЛента, я Вас любил...",1,0,0,0,19852,347,175,7
410738861644726272,1386762750,@_skylovesme_ посмотри мои твиты за сегодняшне...,1,0,0,0,28123,287,198,3
408910075903082496,1386326733,"Я так хотела побыстрее попробывать сырный суп,...",0,0,0,0,218,7,6,0


In [8]:
# инициализации токенизатора
tokenizer = transformers.BertTokenizer(vocab_file='./datasets/bert/vocab.txt')

# преобразование твитов в токены
tokenized = tweets_df['ttext'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

max_len = 0

for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

# выравнивание векторов по длине наибольшего вектора и заполнение нулями
padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])

# разбиение на значимые и не значимые токены
attention_mask = np.where(padded != 0, 1, 0)

In [9]:
# инициализации предобученной модели
config = transformers.BertConfig.from_json_file(
    './datasets/bert/bert_config.json')

model = transformers.BertModel.from_pretrained(
    './datasets/bert/pytorch_model.bin', config=config)

In [10]:
# преобразование векторов и маски моделью в признаки 

batch_size = 100
embeddings = []

for i in notebook.tqdm(range(padded.shape[0] // batch_size)):
    batch = torch.LongTensor(padded[batch_size*i:batch_size*(i+1)]) 
    attention_mask_batch = torch.LongTensor(attention_mask[batch_size*i:batch_size*(i+1)])
        
    with torch.no_grad():
        batch_embeddings = model(batch, attention_mask=attention_mask_batch)
        
    embeddings.append(batch_embeddings[0][:,0,:].numpy())

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




In [11]:
features = np.concatenate(embeddings)
target = tweets_df['ttype']

features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=123456)

In [12]:
model = LogisticRegression(solver='lbfgs', n_jobs=-1, random_state=123456)
model.fit(features_train, target_train)

LogisticRegression(n_jobs=-1, random_state=123456)

In [13]:
predicted = model.predict(features_test)

model_f1_score = f1_score(target_test, predicted)
print(f'{type(model).__name__}: F1-score={model_f1_score}')

LogisticRegression: F1-score=0.9638095238095238


In [15]:
train_pool = Pool(features_train, target_train)
test_pool = Pool(features_test, target_test) 

In [17]:
cat_model = CatBoostClassifier(iterations=100000, early_stopping_rounds=1000, thread_count=-1, eval_metric='F1', random_state=123456)
cat_model.fit(train_pool, eval_set=test_pool, verbose=100)

Learning rate set to 0.004275
0:	learn: 0.8669355	test: 0.8043265	best: 0.8043265 (0)	total: 43ms	remaining: 1h 11m 41s
100:	learn: 0.9453441	test: 0.8940956	best: 0.9004651 (47)	total: 3.93s	remaining: 1h 4m 48s
200:	learn: 0.9704985	test: 0.9177570	best: 0.9177570 (198)	total: 7.82s	remaining: 1h 4m 42s
300:	learn: 0.9806320	test: 0.9269663	best: 0.9269663 (295)	total: 11.8s	remaining: 1h 4m 52s
400:	learn: 0.9897751	test: 0.9305816	best: 0.9314554 (377)	total: 15.7s	remaining: 1h 5m 3s
500:	learn: 0.9928205	test: 0.9350894	best: 0.9360902 (490)	total: 19.8s	remaining: 1h 5m 28s
600:	learn: 0.9938398	test: 0.9388523	best: 0.9388523 (584)	total: 23.8s	remaining: 1h 5m 33s
700:	learn: 0.9969168	test: 0.9408451	best: 0.9408451 (684)	total: 27.8s	remaining: 1h 5m 44s
800:	learn: 0.9989701	test: 0.9398496	best: 0.9408451 (684)	total: 31.9s	remaining: 1h 5m 45s
900:	learn: 1.0000000	test: 0.9398496	best: 0.9408451 (684)	total: 35.9s	remaining: 1h 5m 47s
1000:	learn: 1.0000000	test: 0.94161

<catboost.core.CatBoostClassifier at 0x1634ebb20>

In [34]:
scaler_columns = tweets_df.drop(columns=['tdate', 'ttext', 'ttype']).columns.to_list()

scaler = StandardScaler()
tweets_df.loc[:, scaler_columns] = pd.DataFrame(scaler.fit_transform(tweets_df.loc[:, scaler_columns]), index=tweets_df.index, columns=scaler_columns)

In [35]:
features_df = pd.DataFrame(features, index=tweets_df.index)
features_df = features_df.join(tweets_df.drop(labels=['tdate', 'ttext', 'ttype'], axis=1))

In [36]:
features_train, features_test, target_train, target_test = train_test_split(features_df, target, test_size=0.5, random_state=123456)

In [37]:
model = LogisticRegression(solver='lbfgs', n_jobs=-1, random_state=123456)
model.fit(features_train, target_train)

LogisticRegression(n_jobs=-1, random_state=123456)

In [38]:
predicted = model.predict(features_test)

model_f1_score = f1_score(target_test, predicted)
print(f'{type(model).__name__}: F1-score={model_f1_score}')

LogisticRegression: F1-score=0.9629629629629628


In [40]:
train_pool = Pool(features_train, target_train)
test_pool = Pool(features_test, target_test) 

In [41]:
cat_model = CatBoostClassifier(iterations=100000, early_stopping_rounds=1000, thread_count=-1, eval_metric='F1', random_state=123456)
cat_model.fit(train_pool, eval_set=test_pool, verbose=100)

Learning rate set to 0.004275
0:	learn: 0.8835821	test: 0.8349328	best: 0.8349328 (0)	total: 48.3ms	remaining: 1h 20m 28s
100:	learn: 0.9503546	test: 0.9041353	best: 0.9041353 (100)	total: 3.96s	remaining: 1h 5m 15s
200:	learn: 0.9704383	test: 0.9178470	best: 0.9198869 (196)	total: 7.46s	remaining: 1h 1m 45s
300:	learn: 0.9785933	test: 0.9266917	best: 0.9266917 (293)	total: 11s	remaining: 1h 52s
400:	learn: 0.9866940	test: 0.9294450	best: 0.9294450 (380)	total: 14.7s	remaining: 1h 56s
500:	learn: 0.9918033	test: 0.9329556	best: 0.9330820 (477)	total: 19.1s	remaining: 1h 3m 6s
600:	learn: 0.9938398	test: 0.9368520	best: 0.9368520 (593)	total: 23.3s	remaining: 1h 4m 13s
700:	learn: 0.9969168	test: 0.9398496	best: 0.9398496 (652)	total: 27.6s	remaining: 1h 5m 11s
800:	learn: 0.9979424	test: 0.9435028	best: 0.9444967 (756)	total: 31.8s	remaining: 1h 5m 34s
900:	learn: 1.0000000	test: 0.9444967	best: 0.9444967 (756)	total: 35.7s	remaining: 1h 5m 28s
1000:	learn: 1.0000000	test: 0.9436090	be

<catboost.core.CatBoostClassifier at 0x163b57250>

Обработка твитов моделью Bert показала результаты намного лучше, при меньшей выборке, чем TfidfVectorizer. Добавление других признаком так же не привело к увеличению качества классификации. 