In [1]:
import json
import time
import requests
import warnings
import stanfordnlp

import pandas as pd
import numpy as np

from tqdm import tqdm
from tqdm.contrib.concurrent import process_map
from langdetect import detect, lang_detect_exception
from bs4 import BeautifulSoup
from multiprocessing import cpu_count

In [2]:
warnings.filterwarnings("ignore")

In [3]:
nlp_uk = stanfordnlp.Pipeline(lang='uk')

Use device: gpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/dima/stanfordnlp_resources/uk_iu_models/uk_iu_tokenizer.pt', 'lang': 'uk', 'shorthand': 'uk_iu', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/home/dima/stanfordnlp_resources/uk_iu_models/uk_iu_tagger.pt', 'pretrain_path': '/home/dima/stanfordnlp_resources/uk_iu_models/uk_iu.pretrain.pt', 'lang': 'uk', 'shorthand': 'uk_iu', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/home/dima/stanfordnlp_resources/uk_iu_models/uk_iu_lemmatizer.pt', 'lang': 'uk', 'shorthand': 'uk_iu', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/home/dima/stanfordnlp_resources/uk_iu_models/uk_iu_parser.pt', 'pretrain_path': '/home/dima/stanfordnlp_resources/uk_iu_models/uk_iu.pretrain.pt', 'lang': '

In [4]:
product_url_format = 'https://xl-catalog-api.rozetka.com.ua/v2/goods/get?front-type=xl&category_id={cat_id}&page={page_num}&sort=rank'

In [5]:
comment_url_format = 'https://product-api.rozetka.com.ua/v3/comments/get?front-type=xl&goods={product_id}&page={page_num}&sort=date&limit=10'

## Parse category

In [6]:
tv_cat = 80037
notebook_cat = 80004

In [7]:
def lang_detect(x):
    try:
        return detect(x)
    except lang_detect_exception.LangDetectException:
        return None


def get_data(cat_id, limit=10):

    product_url = product_url_format.format(cat_id=cat_id, page_num=1)
    resp = requests.get(product_url)
    body = json.loads(resp.text)
    pages_num_p = body['data']['total_pages']

    lst = []
    counter = 0
    for i in range(1, pages_num_p + 1):
        product_url = product_url_format.format(cat_id=cat_id, page_num=i)
        time.sleep(0.5)
        resp = requests.get(product_url)
        body = json.loads(resp.text)
        ids = body['data']['ids']

        for id_ in ids:
            comment_url = comment_url_format.format(product_id=id_, page_num=1)
            time.sleep(0.5)
            resp = requests.get(comment_url)
            body = json.loads(resp.text)
            pages_num_c = body['data']['pages']['count']

            for j in range(1, pages_num_c + 1):
                comment_url = comment_url_format.format(product_id=id_, page_num=j)
                time.sleep(0.5)
                resp = requests.get(comment_url)
                body = json.loads(resp.text)
                data = body['data']['comments']

                for item in data:
                    res = {'user': item['usertitle'], 'mark': item['mark'], 'text': item['text'], 
                           'pros': item['dignity'], 'cons': item['shortcomings']}
                    lang = lang_detect(res['text'].strip())
                    if lang == 'uk' and res not in lst:
                        lst.append(res)
                        counter += 1
                    
                    if counter >= limit:
                        return pd.DataFrame(lst)
                    elif counter % 100 == 0:
                        print(counter)
    return pd.DataFrame(lst)

### Parse

In [8]:
%%time
try:
    df_tv = pd.read_csv('reviews_tv.csv')
except FileNotFoundError:
    df_tv = get_data(tv_cat, limit=10000)
    df_tv.to_csv('reviews_tv.csv', index=False)

CPU times: user 19.9 ms, sys: 2.8 ms, total: 22.7 ms
Wall time: 21.6 ms


In [9]:
%%time
try:
    df_laptop = pd.read_csv('reviews_laptop.csv')
except FileNotFoundError:
    df_laptop = get_data(tv_cat, limit=10000)
    df_laptop.to_csv('reviews_laptop.csv', index=False)

CPU times: user 8.16 ms, sys: 0 ns, total: 8.16 ms
Wall time: 7.44 ms


### Preprocess

In [10]:
df = pd.concat([df_tv, df_laptop])

df = df.loc[(df.mark.notna()) & (df.mark > 0)]
df.fillna(" ", inplace=True)
df['target'] = np.where(df.mark < 3, 'neg', np.where(df.mark == 3, 'neu', 'pos'))
df['text'] = df.apply(lambda row: " ".join([row.text, row.pros, row.cons]).strip(), 1)
df['text'] = df.text.map(lambda x: BeautifulSoup(x).get_text())
df.drop(['user', 'mark', 'pros', 'cons'], 1, inplace=True)
df.reset_index(inplace=True)

In [11]:
print(df.shape)
df.head()

(2767, 3)


Unnamed: 0,index,text,target
0,2,В цьому телевізорі все чудово і різнокольорово...,pos
1,7,"Відмінний телевізор. Купив саме те, що хотів. ...",pos
2,8,"Замовив і був задоволений. Простий телевізор, ...",pos
3,9,"Чудовий телевізор, все працює. Чудова ціна. Бе...",pos
4,12,Чудовий телевізор,pos


In [12]:
target_map = {"neg": 0, "neu": 1, "pos": 2}
df.target = df.target.map(target_map)

In [13]:
df.target.value_counts()

2    2338
0     253
1     176
Name: target, dtype: int64

In [14]:
def tokenize(x):
    if len(x.strip()) == 0:
        return ""
    filter_pos = ('PUNCT', 'ADP', 'SYM', 'CCONJ', 'SCONJ', 'PROPN')
    filter_words = ["і", "та", "або", "й", "то", "б", "але"]
    sentences = nlp_uk(x).sentences
    res = []
    for sent in sentences:
        if '?' in list(sent.words)[-1].text:
            continue
        res.append([token.lemma for token in sent.words if token.upos not in filter_pos])
    return " ".join(list(filter(lambda x: x not in filter_words, sum(res, [])))).lower()

In [15]:
df.values

array([[2,
        'В цьому телевізорі все чудово і різнокольорово) чорний колір - чорний, звук бомба. Розмір 43. За такі гроші незнаю чи знайдете щось краще.',
        2],
       [7,
        'Відмінний телевізор. Купив саме те, що хотів. Розетка, як завжди - на висоті!! Дякую за співпрацю)) Відмінний телевізор. Купив саме те, що хотів. Розетка, як завжди - на висоті!!',
        2],
       [8,
        'Замовив і був задоволений. Простий телевізор, без наворотів, легке меню налаштування. Доставили за два дні, оперативно, дякую розетці. Яскраві кольори Маленькі ніжки. Раджу купувати одразу кріплення.',
        2],
       ...,
       [1997,
        'Добрий день. Ноутбук прийшов , все чудово але в простої температура процесора 50-60 градусів. Питання до представника , це норма? Так як під нагрузкою температура досягає 90-100 градусів',
        2],
       [1998,
        'Всі зазначені характеристики відповідають.  Літає, і5 8-го покоління і ССД свою роботу роблять. ВОТ на високих легко, тро

In [16]:
try:
    df = pd.read_csv('data.csv')
    df = df.loc[~df.text.isna()]
except FileNotFoundError:
    lst = []

    for item in tqdm(df.text.values):
        lst.append(tokenize(item))

In [17]:
df.head()

Unnamed: 0,text,target
0,цей телевізор все чудово різнокольорово чорний...,2
1,відмінний телевізор купити саме те хотіти розе...,2
2,замовити бути задоволений простий телевізор на...,2
3,чудовий телевізор все працювати чудовий ціна,2
4,чудовий телевізор,2


### Train model

#### NB baseline

In [18]:
RANDOM_STATE = 0

In [19]:
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

In [20]:
(data, target) = df['text'], df['target']

##### NB, CountVect

In [21]:
X_train, X_test, y_train, y_test = train_test_split(data, target, 
                                                    test_size=0.2, 
                                                    random_state=RANDOM_STATE,
                                                    stratify=target)

In [22]:
c_vect = CountVectorizer()

X_train_vec = c_vect.fit_transform(X_train)
X_test_vec = c_vect.transform(X_test)

In [23]:
%%time

nb_model = GaussianNB()

nb_model.fit(X_train_vec.toarray(), y_train);

CPU times: user 320 ms, sys: 67.1 ms, total: 387 ms
Wall time: 387 ms


GaussianNB(priors=None, var_smoothing=1e-09)

In [24]:
y_pred = nb_model.predict(X_test_vec.toarray())

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.73      0.68        82
           1       0.34      0.86      0.48        56
           2       0.96      0.83      0.89       715

    accuracy                           0.82       853
   macro avg       0.64      0.81      0.68       853
weighted avg       0.89      0.82      0.84       853



##### trying tf-idf instead CountVect, standard params

In [25]:
X_train, X_test, y_train, y_test = train_test_split(data, target, 
                                                    test_size=0.2, 
                                                    random_state=RANDOM_STATE,
                                                    stratify=target)

In [26]:
tfidf_vect = TfidfVectorizer()

X_train_vec = tfidf_vect.fit_transform(X_train)
X_test_vec = tfidf_vect.transform(X_test)

In [27]:
%%time

nb_model = GaussianNB()

nb_model.fit(X_train_vec.toarray(), y_train);

CPU times: user 188 ms, sys: 123 ms, total: 311 ms
Wall time: 309 ms


GaussianNB(priors=None, var_smoothing=1e-09)

In [28]:
y_pred = nb_model.predict(X_test_vec.toarray())

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.61      0.73      0.67        82
           1       0.35      0.86      0.50        56
           2       0.96      0.83      0.89       715

    accuracy                           0.82       853
   macro avg       0.64      0.81      0.68       853
weighted avg       0.88      0.82      0.84       853



### Trying another alrotithms and approaches

In [29]:
X_train, X_test, y_train, y_test = train_test_split(data, target, 
                                                    test_size=0.2, 
                                                    random_state=RANDOM_STATE,
                                                    stratify=target)

In [30]:
class_weights = (1 / y_train.value_counts(normalize=True)).to_dict()

##### svc with lr with tf-idf on weighted classes

In [31]:
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression

In [32]:
def train_eval(clf):
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_test_vec)
    print("f1 macro:", f1_score(y_test, y_pred, average='macro'))
    print(clf)

In [33]:
tf_idf = TfidfVectorizer(min_df=5, max_df=0.75)

X_train_vec = tf_idf.fit_transform(X_train)
X_test_vec = tf_idf.transform(X_test)

In [34]:
# reg_interval = [0.01, 0.1, 0.2, 0.5, 1, 2, 5, 10, 100]

# for i in reg_interval:
#     train_eval(LogisticRegression(C=i, class_weight=class_weights))

In [35]:
model_svc = LinearSVC(C=0.1, class_weight=class_weights)

model_svc.fit(X_train_vec, y_train);

In [36]:
y_pred = model_svc.predict(X_test_vec)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.83      0.86        82
           1       0.83      0.61      0.70        56
           2       0.96      0.99      0.97       715

    accuracy                           0.95       853
   macro avg       0.89      0.81      0.84       853
weighted avg       0.94      0.95      0.94       853



In [37]:
model_lr = LogisticRegression(C=1, class_weight=class_weights)

model_lr.fit(X_train_vec, y_train);

In [38]:
y_pred = model_lr.predict(X_test_vec)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.84      0.82        82
           1       0.69      0.89      0.78        56
           2       0.99      0.96      0.98       715

    accuracy                           0.95       853
   macro avg       0.83      0.90      0.86       853
weighted avg       0.95      0.95      0.95       853



#### trying pipeline

In [39]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import FeatureUnion, Pipeline
from scipy.stats import uniform
from sklearn.metrics import make_scorer
from sklearn.decomposition import TruncatedSVD

In [40]:
tf_idf = FeatureUnion([
    ('TfIdf_Unigram', TfidfVectorizer(min_df=5, max_df=0.75, ngram_range=(1, 1), strip_accents='unicode')),
    ('TfIdf_Bigram',  TfidfVectorizer(min_df=2, max_df=0.75, ngram_range=(2, 2), strip_accents='unicode'))
])

In [41]:
def f1_macro(y_true, y_pred):
    return f1_score(y_true, y_pred, average='macro')

In [42]:
N_COMP = 400

pipeline = Pipeline([
    ("main_union", FeatureUnion([
        ("pipe1", Pipeline([
            ('tf_idf', tf_idf),
        ])),
        ("pipe2", Pipeline([
            ('tf_idf', tf_idf),
            ("SVD", TruncatedSVD(n_components=N_COMP))
        ])),
    ])),
#     ('LinearSVC', LinearSVC(class_weight=class_weights))
    ("LogReg", LogisticRegression(max_iter=1000, class_weight=class_weights))
])

distributions = {
#     "LinearSVC__C": [0.5, 1, 5],
    "LogReg__C": [0.5, 1, 5, 10],
    "LogReg__penalty": ["l2"],
}
clf = RandomizedSearchCV(pipeline,
                         distributions,
                         random_state=0,
                         scoring=make_scorer(f1_macro),
                         n_iter=10,
                         cv=5,
                         verbose=5,
                         n_jobs=-1)
search = clf.fit(X_train, y_train)
print(search.best_params_, search.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  20 | elapsed:   30.8s remaining:  4.6min
[Parallel(n_jobs=-1)]: Done   7 out of  20 | elapsed:   32.3s remaining:  1.0min
[Parallel(n_jobs=-1)]: Done  12 out of  20 | elapsed:   33.0s remaining:   22.0s
[Parallel(n_jobs=-1)]: Done  17 out of  20 | elapsed:   51.8s remaining:    9.1s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   52.5s finished


{'LogReg__penalty': 'l2', 'LogReg__C': 10} 0.8612784203443491


In [43]:
y_pred = search.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.91      0.91        82
           1       0.94      0.89      0.92        56
           2       0.99      0.99      0.99       715

    accuracy                           0.98       853
   macro avg       0.95      0.93      0.94       853
weighted avg       0.98      0.98      0.98       853



#### out-of-fold LGB training

In [44]:
from sklearn.model_selection import StratifiedKFold
from lightgbm import plot_importance, LGBMClassifier

In [45]:
use_sample_weight = True
N_FOLDS = 4
num_threads = cpu_count()

https://lightgbm.readthedocs.io/en/latest/Parameters.html

In [46]:
params = {
    'num_class': 3,
    'num_rounds': 2000,
    'max_depth': -1, # grows unlimited trees (bigger chance of overfitting, but we reduce it by making k-fold bagging)
    'learning_rate': 0.01,
    'num_leaves': 31, # default value, as our trees are deep, we limit the number of leaves to avoid overfitting
    'verbose': 100,
    'early_stopping_rounds': 300,
    'min_data_in_leaf': 30, # as we are growing deep trees and have small data, this parameter should be small
    'lambda_l2': 0.7,  # adding more generalization
    'feature_fraction': 0.2, #  we already have small and wide dataset, so it's better to train trees on lower amount of features
    'metric': 'custom',
    'random_state': RANDOM_STATE
}

classifier = LGBMClassifier(**params)

In [47]:
def lgb_fscore(y_true, y_pred):
    y_pred = y_pred.reshape(len(np.unique(y_true)), -1)
    y_pred = y_pred.argmax(axis=0)
    res = f1_score(y_true, y_pred, average='macro')
    return 'macro_f1', res, True

In [48]:
strategy = StratifiedKFold(n_splits=N_FOLDS, random_state=RANDOM_STATE, shuffle=True)

In [49]:
train = df.loc[X_train.index]

sample_weight = y_train.map(class_weights).values

test = df.loc[X_test.index]

In [50]:
pred_oof = np.zeros(len(train), dtype=np.float32)
pred_test = np.zeros((len(test), params['num_class'], N_FOLDS), dtype=np.float32)
fold_metrics = np.zeros(N_FOLDS)

In [51]:
for i, (tr_ind, val_ind) in enumerate(strategy.split(X=np.ones(len(train)), y=train['target'])):
    print(f'Fold: {i + 1}\n\tTrain len: {len(tr_ind)}\n\tVal len: {len(val_ind)}')
    pipe = Pipeline([
            ('TFIDF', tf_idf),
            ("SVD", TruncatedSVD(n_components=N_COMP))
        ])
    pipe.fit(train.iloc[tr_ind]['text'])
    
    X = pipe.transform(train.iloc[tr_ind]['text'].copy())
    y = train.iloc[tr_ind]['target'].copy()
    X_val = pipe.transform(train.iloc[val_ind]['text'].copy())
    y_val = train.iloc[val_ind]['target'].copy()
    X_test_ = pipe.transform(test['text'])
    
    # fit model
    print('\tFITTING MODEL...')
    classifier.fit(
        X=X,
        y=y,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=params['early_stopping_rounds'],
        verbose=params['verbose'],
        eval_metric=lgb_fscore,
        sample_weight=sample_weight[tr_ind] if use_sample_weight else None,
    )
    # predict OOF val
    print('\tPREDICT OOF...')
    pred_oof[val_ind] = classifier.predict(X_val, num_threads=num_threads)
    # predict test
    print('\tPREDICTING TEST...')
    pred_test[..., i] = classifier.predict_proba(
        X_test_, num_threads=num_threads)
    fold_metrics[i] = f1_macro(y_val, pred_oof[val_ind])
    print(f'\tFold score: {fold_metrics[i]}')

Fold: 1
	Train len: 2558
	Val len: 853
	FITTING MODEL...
Training until validation scores don't improve for 300 rounds
[100]	valid_0's macro_f1: 0.835674
[200]	valid_0's macro_f1: 0.847802
[300]	valid_0's macro_f1: 0.854253
[400]	valid_0's macro_f1: 0.856063
[500]	valid_0's macro_f1: 0.84832
[600]	valid_0's macro_f1: 0.849368
Early stopping, best iteration is:
[381]	valid_0's macro_f1: 0.859194
	PREDICT OOF...
	PREDICTING TEST...
	Fold score: 0.8591937897493453
Fold: 2
	Train len: 2558
	Val len: 853
	FITTING MODEL...
Training until validation scores don't improve for 300 rounds
[100]	valid_0's macro_f1: 0.81558
[200]	valid_0's macro_f1: 0.833498
[300]	valid_0's macro_f1: 0.832767
[400]	valid_0's macro_f1: 0.834282
[500]	valid_0's macro_f1: 0.846571
[600]	valid_0's macro_f1: 0.852794
[700]	valid_0's macro_f1: 0.843056
[800]	valid_0's macro_f1: 0.825933
Early stopping, best iteration is:
[554]	valid_0's macro_f1: 0.852794
	PREDICT OOF...
	PREDICTING TEST...
	Fold score: 0.852794327675185

In [52]:
print(f'Total score: ', f1_macro(train['target'], pred_oof))

Total score:  0.8546972617594372


In [53]:
y_pred_raw = pred_test.mean(axis=-1)
y_pred = y_pred_raw.argmax(axis=1).astype(np.int32)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.84      0.88        82
           1       1.00      0.79      0.88        56
           2       0.97      0.99      0.98       715

    accuracy                           0.97       853
   macro avg       0.96      0.87      0.91       853
weighted avg       0.97      0.97      0.96       853



Actually, in practice oof works better and avoids overfitting. But, I didn't use X_test, just to make it possible easy compare results with previous models.

What can also be done:
* add words tonal features from tone-dict-uk.tsv. and concat as sparse matrix
* replace words with tones by tokens like `<positive>` or `<negative>`
* classical features like `word_num`, `text_len`, `has_pros`, `has_cons`, `mean_words_tone` etc. But I'm not sure if it possilbe due to task limitations, as it should be only BoW
* get BoW features w/o pros and cons concatenation

Currently, pipeline with un and bi-grams extended by SVD features shows the best result, not suffering recall for neu class.

### Model explain with eli5

In [54]:
import eli5

In [55]:
eli5.show_weights(model_lr, vec=tf_idf)

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+3.158,TfIdf_Unigram__добрии,
+2.909,TfIdf_Unigram__відношення,
+2.781,TfIdf_Unigram__зверху,
+2.747,TfIdf_Unigram__повністю,
+2.596,TfIdf_Unigram__15,
+2.560,TfIdf_Unigram__розбиратися,
+2.528,TfIdf_Unigram__надіясь,
+2.516,TfIdf_Unigram__поки,
+2.516,TfIdf_Unigram__розбити,
+2.330,TfIdf_Unigram__чисто,

Weight?,Feature
+3.158,TfIdf_Unigram__добрии
+2.909,TfIdf_Unigram__відношення
+2.781,TfIdf_Unigram__зверху
+2.747,TfIdf_Unigram__повністю
+2.596,TfIdf_Unigram__15
+2.560,TfIdf_Unigram__розбиратися
+2.528,TfIdf_Unigram__надіясь
+2.516,TfIdf_Unigram__поки
+2.516,TfIdf_Unigram__розбити
+2.330,TfIdf_Unigram__чисто

Weight?,Feature
+3.298,TfIdf_Unigram__необхіднии
+3.062,TfIdf_Unigram__знаходитися
+2.859,TfIdf_Unigram__1080р
+2.776,TfIdf_Unigram__смартфон
+2.683,TfIdf_Unigram__порт
+2.659,TfIdf_Unigram__годинник
+2.554,TfIdf_Unigram__приставка
+2.543,TfIdf_Bigram__40 цифровии
+2.491,TfIdf_Unigram__тюнера
+2.441,TfIdf_Unigram__слабии

Weight?,Feature
+4.083,TfIdf_Unigram__кабельнии
+3.225,TfIdf_Bigram__4k більшість
+2.887,TfIdf_Bigram__10 підключення
+2.785,TfIdf_Bigram__50 діагональ
+2.695,TfIdf_Unigram__лінія
+2.398,TfIdf_Unigram__гарантія
+2.161,TfIdf_Unigram__різник
… 1425 more positive …,… 1425 more positive …
… 1048 more negative …,… 1048 more negative …
-2.131,TfIdf_Unigram__зверху
