In [1]:
from sqlalchemy import create_engine
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import numpy as np, pandas as pd, dill as pickle

In [None]:
# # quasi-requirements:
# !pip install -U scikit-learn, xgboost, pandas, dill, tqdm

# # if use postgres
# ! pip install psycopg2, psycopg2-binary, sqlalchemy

In [2]:
with open('path_to_postgresql_credentials.txt') as f:
    psql = create_engine(f.read())

download annotated data

In [63]:
ru_ann = pd.read_csv('ru_annotation.csv')
ru_ann = ru_ann.loc[ru_ann.is_other == 0].copy()

uk_ann = pd.read_csv('uk_annotation.csv')
uk_ann = uk_ann.loc[uk_ann.is_other == 0].copy()

# to use in postgres query - if you will get texts in other way - don't do that
ru_ann_ids = ', '.join(ru_ann.html_id.astype(str).values)
uk_ann_ids = ', '.join(uk_ann.html_id.astype(str).values)

## tf-idf

Build TF-IDF vectorizer on random sample of 100k texts for use in training.<br>
We do it because there are not so much annotated data, so it is reasonable to use a lot of collected but unlabeled data to train vectorizer.

In [None]:
ru_ids = ', '.join(pd.read_sql("select html_id from htmls where is_other < 0.68 and lang = 'ru';", psql
                              ).sample(100000).html_id.astype(str).values
                  )
ru_texts = pd.read_sql(f'''select html_id, tokenized from htmls where html_id in ({ru_ids})''',
                       psql, chunksize=10000)
del ru_ids

uk_ids = ', '.join(pd.read_sql("select html_id from htmls where is_other < 0.68 and lang = 'uk';", psql
                              ).sample(100000).html_id.astype(str).values
                  )
uk_texts = pd.read_sql(f'''select html_id, tokenized from htmls where html_id in ({uk_ids})''',
                       psql, chunksize=10000)
del uk_ids

In [27]:
uk_tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=70000, max_df=0.97)
ru_tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=70000, max_df=0.97)

In [34]:
def text_gen(dfs):
    for df in tqdm(dfs):
        for text in df.tokenized.values:
            if text: yield text

In [None]:
uk_tfidf.fit(text_gen(uk_texts))
ru_tfidf.fit(text_gen(ru_texts))

In [38]:
# # save it
# with open('uk_tfidf.pkl', 'wb') as ukf, open('ru_tfidf.pkl', 'wb') as ruf:
#     pickle.dump(uk_tfidf, ukf)
#     pickle.dump(ru_tfidf, ruf)

## xgboost

In [4]:
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from sklearn.metrics import *

download vectorizers

In [5]:
with open('uk_tfidf.pkl', 'rb') as ukf, open('ru_tfidf.pkl', 'rb') as ruf:
    uk_tfidf = pickle.load(ukf)
    ru_tfidf = pickle.load(ruf)

Get tokenized texts from DB. There is a script to preprocess and tokenize in `data_collection` folder of this repo

In [64]:
uk_ann = uk_ann.merge(
    pd.read_sql(f'''
    select html_id, lower(tokenized) as tokenized from htmls where html_id in ({uk_ann_ids})
    ''', psql), on='html_id', how='left'
)

ru_ann = ru_ann.merge(
    pd.read_sql(f'''
    select html_id, lower(tokenized) as tokenized from htmls where html_id in ({ru_ann_ids})
    ''', psql), on='html_id', how='left'
)

In [65]:
# add test set to val - just to increase size of validation set

ru_test = pd.read_json('ru_test_set.jl', lines=True
           ).reindex(['html_id', 'emo_check', 'arg_check', 'is_other', 'trn_val'], axis=1)
uk_test = pd.read_json('uk_test_set.jl', lines=True
           ).reindex(['html_id', 'emo_check', 'arg_check', 'is_other', 'trn_val'], axis=1)

ru_test.is_other = 0
uk_test.is_other = 0

ru_test.trn_val = 'val'
uk_test.trn_val = 'val'

uk_test = uk_test.dropna()

ru_test_ids = ', '.join(ru_test.html_id.astype(str).values)
uk_test_ids = ', '.join(uk_test.html_id.astype(str).values)

uk_test = uk_test.merge(
    pd.read_sql(f'''
    select html_id, lower(tokenized) as tokenized
    from htmls
    where html_id in ({uk_test_ids})
    ''', psql), on='html_id', how='left'
).dropna()

ru_test = ru_test.merge(
    pd.read_sql(f'''
    select html_id, lower(tokenized) as tokenized
    from htmls where html_id in ({ru_test_ids})
    ''', psql), on='html_id', how='left'
).dropna()

ru_test.columns = ['html_id', 'is_emo', 'is_arg', 'is_other', 'trn_val', 'tokenized']
uk_test.columns = ['html_id', 'is_emo', 'is_arg', 'is_other', 'trn_val', 'tokenized']

split into train and test sets

In [66]:
uk_trn = uk_ann.loc[(uk_ann.trn_val == 'trn') & uk_ann.tokenized.notnull()].copy().sample(frac=1)
ru_trn = ru_ann.loc[(ru_ann.trn_val == 'trn') & ru_ann.tokenized.notnull()].copy().sample(frac=1)


uk_val = pd.concat([uk_test,
                    uk_ann.loc[(uk_ann.trn_val == 'val') & uk_ann.tokenized.notnull()].copy()]
          ).drop_duplicates('html_id'
          ).sample(frac=1)
uk_val = uk_ann.loc[(uk_ann.trn_val == 'val') & uk_ann.tokenized.notnull()].copy().sample(frac=1)

ru_val = pd.concat([ru_test,
                    ru_ann.loc[(ru_ann.trn_val == 'val') & ru_ann.tokenized.notnull()].copy()]
          ).drop_duplicates('html_id'
          ).sample(frac=1)

### The classifier itself - xgboost

In [8]:
cls_ru_emo = xgb.sklearn.XGBClassifier()
cls_ru_arg = xgb.sklearn.XGBClassifier()
cls_uk_emo = xgb.sklearn.XGBClassifier()
cls_uk_arg = xgb.sklearn.XGBClassifier()

In [9]:
uk_trn_vectorized = uk_tfidf.transform(uk_trn.tokenized)
cls_uk_emo.fit(uk_trn_vectorized, uk_trn.is_emo)
cls_uk_arg.fit(uk_trn_vectorized, uk_trn.is_arg)

ru_trn_vectorized = ru_tfidf.transform(ru_trn.tokenized)
cls_ru_emo.fit(ru_trn_vectorized, ru_trn.is_emo)
cls_ru_arg.fit(ru_trn_vectorized, ru_trn.is_arg)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

test it

In [10]:
pipe_ru_emo = make_pipeline(ru_tfidf, cls_ru_emo)
pipe_ru_arg = make_pipeline(ru_tfidf, cls_ru_arg)
pipe_uk_emo = make_pipeline(uk_tfidf, cls_uk_emo)
pipe_uk_arg = make_pipeline(uk_tfidf, cls_uk_arg)

In [67]:
ru_val['pred_emo'] = pipe_ru_emo.predict_proba(ru_val.tokenized)[:, 1]
uk_val['pred_emo'] = pipe_uk_emo.predict_proba(uk_val.tokenized)[:, 1]
ru_val['pred_arg'] = pipe_ru_arg.predict_proba(ru_val.tokenized)[:, 1]
uk_val['pred_arg'] = pipe_uk_arg.predict_proba(uk_val.tokenized)[:, 1]

In [68]:
def print_metrics(df, which, thr):
    '''
    A simple function to print metrics.
    df - data frame with val data
    which - prefix "emo" or "arg"
    thr - threshold from where we classify value as positive
    '''
    kwargs = dict(y_true=df[f'is_{which}'], y_pred=df[f'pred_{which}'] > thr, labels=[True, False])
    print(classification_report(**kwargs))
    cm = confusion_matrix(**kwargs)
    print(cm / cm.sum(1)[:, None])
    print(cm)

In baseline we select threshold so that there will be at least some classified true positives and false positive value is less than 10% and there are more true positives than false positives

In [69]:
print('Ru emo')
print_metrics(ru_val, 'emo', 0.5)

print('\nUk emo')
print_metrics(uk_val, 'emo', 0.3)

print('\nRu arg')
print_metrics(ru_val, 'arg', 0.1)

print('\nUk arg')
print_metrics(uk_val, 'arg', 0.05)

Ru emo
              precision    recall  f1-score   support

        True       0.47      0.42      0.45       280
       False       0.89      0.91      0.90      1433

   micro avg       0.83      0.83      0.83      1713
   macro avg       0.68      0.67      0.67      1713
weighted avg       0.82      0.83      0.82      1713

[[0.42142857 0.57857143]
 [0.09141661 0.90858339]]
[[ 118  162]
 [ 131 1302]]

Uk emo
              precision    recall  f1-score   support

        True       0.43      0.21      0.28        48
       False       0.79      0.92      0.85       153

   micro avg       0.75      0.75      0.75       201
   macro avg       0.61      0.56      0.56       201
weighted avg       0.70      0.75      0.71       201

[[0.20833333 0.79166667]
 [0.08496732 0.91503268]]
[[ 10  38]
 [ 13 140]]

Ru arg
              precision    recall  f1-score   support

        True       0.50      0.22      0.31       342
       False       0.83      0.94      0.88      1371

   micr