In [1]:
import json
import time
import requests
import warnings

import pandas as pd
import numpy as np

from langdetect import detect, lang_detect_exception
from bs4 import BeautifulSoup

import stanfordnlp

warnings.filterwarnings("ignore")

from tqdm import tqdm
from tqdm.contrib.concurrent import process_map
from multiprocessing import cpu_count

In [2]:
# Ukrainian StanfordNLP pipeline used later by `tokenize`; the load log below
# shows the tokenize, pos, lemma and depparse processors being initialised.
nlp_uk = stanfordnlp.Pipeline(lang='uk')

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/dima/stanfordnlp_resources/uk_iu_models/uk_iu_tokenizer.pt', 'lang': 'uk', 'shorthand': 'uk_iu', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/home/dima/stanfordnlp_resources/uk_iu_models/uk_iu_tagger.pt', 'pretrain_path': '/home/dima/stanfordnlp_resources/uk_iu_models/uk_iu.pretrain.pt', 'lang': 'uk', 'shorthand': 'uk_iu', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/home/dima/stanfordnlp_resources/uk_iu_models/uk_iu_lemmatizer.pt', 'lang': 'uk', 'shorthand': 'uk_iu', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/home/dima/stanfordnlp_resources/uk_iu_models/uk_iu_parser.pt', 'pretrain_path': '/home/dima/stanfordnlp_resources/uk_iu_models/uk_iu.pretrain.pt', 'lang': '

In [3]:
# Catalog endpoint template; fill in `cat_id` and `page_num` with str.format.
product_url_format = (
    'https://xl-catalog-api.rozetka.com.ua/v2/goods/get'
    '?front-type=xl&category_id={cat_id}&page={page_num}&sort=rank'
)

In [4]:
# Comments endpoint template; fill in `product_id` and `page_num` with str.format.
comment_url_format = (
    'https://product-api.rozetka.com.ua/v3/comments/get'
    '?front-type=xl&goods={product_id}&page={page_num}&sort=date&limit=10'
)

## Parse category

In [5]:
# Category ids passed as `cat_id` to the catalog endpoint.
tv_cat, notebook_cat = 80037, 80004

In [6]:
def lang_detect(x):
    """Return the language code `langdetect.detect` reports for `x`.

    Returns None when detection raises `LangDetectException`.
    """
    try:
        language = detect(x)
    except lang_detect_exception.LangDetectException:
        language = None
    return language


def _get_json(url, delay=0.5, timeout=30):
    """Wait `delay` seconds, GET `url` and return the decoded JSON body.

    Raises `requests.HTTPError` on a non-2xx response instead of failing later
    with an opaque JSON/KeyError, and never hangs longer than `timeout` seconds.
    """
    if delay:
        time.sleep(delay)
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    return resp.json()


def get_data(cat_id, limit=10):
    """Collect up to `limit` unique Ukrainian-language reviews for a category.

    Walks every product page of `cat_id`, then every comment page of every
    product, keeping reviews whose text `lang_detect` classifies as 'uk'.

    Parameters
    ----------
    cat_id : category id substituted into `product_url_format`.
    limit : stop as soon as this many reviews have been collected.

    Returns
    -------
    pandas.DataFrame with columns user, mark, text, pros, cons
    (empty if nothing matched).

    Raises
    ------
    requests.RequestException on network / HTTP errors.
    """
    reviews = []

    # The first page is needed for the page count; reuse it below instead of
    # downloading it a second time.
    product_page = _get_json(
        product_url_format.format(cat_id=cat_id, page_num=1), delay=0)
    product_pages = product_page['data']['total_pages']

    for page_num in range(1, product_pages + 1):
        if page_num > 1:
            product_page = _get_json(
                product_url_format.format(cat_id=cat_id, page_num=page_num))

        for product_id in product_page['data']['ids']:
            comment_page = _get_json(
                comment_url_format.format(product_id=product_id, page_num=1))
            comment_pages = comment_page['data']['pages']['count']

            for comment_page_num in range(1, comment_pages + 1):
                if comment_page_num > 1:
                    comment_page = _get_json(comment_url_format.format(
                        product_id=product_id, page_num=comment_page_num))

                for item in comment_page['data']['comments']:
                    res = {'user': item['usertitle'], 'mark': item['mark'], 'text': item['text'],
                           'pros': item['dignity'], 'cons': item['shortcomings']}
                    # A missing text is treated as empty, which lang_detect rejects
                    text = (res['text'] or '').strip()
                    if lang_detect(text) == 'uk' and res not in reviews:
                        reviews.append(res)

                    if len(reviews) >= limit:
                        return pd.DataFrame(reviews)
    return pd.DataFrame(reviews)

### Parse

In [7]:
%%time
# Reuse the cached TV reviews when the CSV exists; otherwise scrape the TV
# category and write the cache for the next run.
try:
    df_tv = pd.read_csv('reviews_tv.csv')
except FileNotFoundError:
    df_tv = get_data(tv_cat, limit=10000)
    df_tv.to_csv('reviews_tv.csv', index=False)

CPU times: user 22.3 ms, sys: 774 µs, total: 23 ms
Wall time: 22 ms


In [8]:
%%time
try:
    df_laptop = pd.read_csv('reviews_laptop.csv')
except FileNotFoundError:
    df_laptop = get_data(tv_cat, limit=10000)
    df_laptop.to_csv('reviews_laptop.csv', index=False)

CPU times: user 22.5 ms, sys: 855 µs, total: 23.4 ms
Wall time: 22.3 ms


### Preprocess

In [9]:
# Merge both categories and keep only reviews that carry a positive rating.
df = pd.concat([df_tv, df_laptop])
df = df.loc[df.mark.notna() & (df.mark > 0)].copy()  # copy: avoid mutating a slice

# Missing text / pros / cons become a blank so they can be joined as strings.
df = df.fillna(" ")

# Rating below 3 -> 'neg', exactly 3 -> 'neu', above 3 -> 'pos'.
df['target'] = np.where(df.mark < 3, 'neg', np.where(df.mark == 3, 'neu', 'pos'))

# One document per review: body + pros + cons, with HTML markup stripped.
df['text'] = df.apply(lambda row: " ".join([row.text, row.pros, row.cons]).strip(), axis=1)
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
df['text'] = df.text.map(lambda x: BeautifulSoup(x, 'html.parser').get_text())

# `columns=` instead of a positional axis, which pandas 2.x no longer accepts.
df = df.drop(columns=['user', 'mark', 'pros', 'cons'])
# The old per-file row labels are kept as an 'index' column, as before.
df = df.reset_index()

In [10]:
# Sanity check after preprocessing: (rows, columns) and the first few reviews.
print(df.shape)
df.head()

(4414, 3)


Unnamed: 0,index,text,target
0,2,В цьому телевізорі все чудово і різнокольорово...,pos
1,7,"Відмінний телевізор. Купив саме те, що хотів. ...",pos
2,8,"Замовив і був задоволений. Простий телевізор, ...",pos
3,9,"Чудовий телевізор, все працює. Чудова ціна. Бе...",pos
4,12,Чудовий телевізор,pos


In [11]:
# Integer encoding of the sentiment labels.
target_map = {"neg": 0, "neu": 1, "pos": 2}
# Only map while the column still holds string labels: re-running the cell on
# already-encoded values would otherwise turn every target into NaN.
if not df['target'].isin(list(target_map.values())).all():
    df['target'] = df['target'].map(target_map)

In [12]:
# Class balance of the encoded target.
df['target'].value_counts()

2    3708
0     417
1     289
Name: target, dtype: int64

In [13]:
def tokenize(x):
    """Lemmatise `x` with `nlp_uk` and return a lower-cased, space-joined string.

    Sentences whose last word contains '?' are skipped entirely. Tokens tagged
    PUNCT, ADP, SYM, CCONJ, SCONJ or PROPN are dropped, as are the listed
    stop words (compared case-insensitively).
    """
    filter_pos = {'PUNCT', 'ADP', 'SYM', 'CCONJ', 'SCONJ', 'PROPN'}
    filter_words = {"і", "та", "або", "й", "то", "б", "але"}

    lemmas = []
    for sent in nlp_uk(x).sentences:
        words = list(sent.words)
        # Skip empty sentences and questions
        if not words or '?' in words[-1].text:
            continue
        for token in words:
            if token.upos in filter_pos:
                continue
            # Fall back to the surface form if the lemmatiser produced nothing;
            # lower-case BEFORE the stop-word check so capitalised lemmas are
            # filtered too.
            lemma = (token.lemma or token.text).lower()
            if lemma not in filter_words:
                lemmas.append(lemma)
    return " ".join(lemmas)

In [14]:
# Tokenisation is slow, so reuse the cached result when it exists.
try:
    df = pd.read_csv('data.csv')
except FileNotFoundError:
    df['text'] = [tokenize(item) for item in tqdm(df.text.values)]
    # Persist the result - without this the cache branch above can never be hit.
    df.to_csv('data.csv', index=False)

# A review that tokenises to nothing is '' in memory and NaN after a CSV round
# trip; drop both so the cached and freshly computed paths yield the same rows.
df = df.loc[df.text.notna() & (df.text != '')]

### Train model

#### Baseline

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA, TruncatedSVD

from sklearn.metrics import *

In [67]:
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['target'],
    stratify=df['target'],
    test_size=0.2,
    random_state=0,
)

In [68]:
# Baseline vectoriser: ignore terms seen in fewer than 5 documents or in more
# than 75% of them.
tf_idf = TfidfVectorizer(max_df=0.75, min_df=5)

In [69]:
# Learn the vocabulary / idf on the training split only, then project both splits.
X_train_vec = tf_idf.fit_transform(X_train)
X_test_vec = tf_idf.transform(X_test)

In [70]:
def train_eval(clf):
    """Fit `clf` on the vectorised training split and print its hold-out macro-F1.

    Reads the notebook-level `X_train_vec`, `y_train`, `X_test_vec` and `y_test`;
    prints the score followed by the estimator's repr. Returns nothing.
    """
    clf.fit(X_train_vec, y_train)
    macro_f1 = f1_score(y_test, clf.predict(X_test_vec), average='macro')
    print("f1 macro:", macro_f1)
    print(clf)

In [71]:
# reg_interval = [0.01, 0.1, 0.2, 0.5, 1, 2, 5, 10, 100]

# for i in reg_interval:
#     train_eval(LinearSVC(C=i))

In [72]:
# Baseline classifier: linear SVM with C=1, fitted on the TF-IDF features.
model = LinearSVC(C=1)
model = model.fit(X_train_vec, y_train)

In [73]:
# Hold-out evaluation of the baseline model.
y_pred = model.predict(X_test_vec)
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

              precision    recall  f1-score   support

           0       0.90      0.87      0.88        82
           1       0.95      0.68      0.79        56
           2       0.97      0.99      0.98       715

    accuracy                           0.96       853
   macro avg       0.94      0.85      0.88       853
weighted avg       0.96      0.96      0.96       853



#### trying pipeline

In [74]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import FeatureUnion, Pipeline
from scipy.stats import uniform
from sklearn.metrics import make_scorer

In [80]:
# Uni-gram and bi-gram TF-IDF blocks concatenated side by side.
# NOTE(review): strip_accents='unicode' also folds Ukrainian letters that
# decompose into base + combining mark (e.g. 'й', 'ї') - confirm this is intended.
unigram_tfidf = TfidfVectorizer(ngram_range=(1, 1), min_df=5, max_df=0.75, strip_accents='unicode')
bigram_tfidf = TfidfVectorizer(ngram_range=(2, 2), min_df=2, max_df=0.75, strip_accents='unicode')
tf_idf = FeatureUnion([
    ('TfIdf_Unigram', unigram_tfidf),
    ('TfIdf_Bigram', bigram_tfidf),
])

In [81]:
# Inverse class frequency on the training split: rarer classes get larger weights.
class_share = y_train.value_counts(normalize=True)
class_weights = (1 / class_share).to_dict()

In [82]:
def f1_macro(y_true, y_pred):
    """Return the macro-averaged F1 score of `y_pred` against `y_true`."""
    score = f1_score(y_true, y_pred, average='macro')
    return score

In [83]:
# Number of SVD components appended to the sparse TF-IDF features.
N_COMP = 400

# Features = sparse uni+bi-gram TF-IDF, concatenated with a dense SVD projection
# of the same TF-IDF matrix. Seeds are fixed so scores are reproducible.
pipeline = Pipeline([
    ("main_union", FeatureUnion([
        ("pipe1", Pipeline([
            ('tf_idf', tf_idf),
        ])),
        ("pipe2", Pipeline([
            ('tf_idf', tf_idf),
            ("SVD", TruncatedSVD(n_components=N_COMP, random_state=0))
        ])),
    ])),
    ('LinearSVC', LinearSVC(class_weight=class_weights, random_state=0))
#     ("LogReg", LogisticRegression(max_iter=300))
])

distributions = {
    "LinearSVC__C": [0.5, 1, 5],
#     "LogReg__C": [0.5, 1, 5, 10],
#     "LogReg__penalty": ["l2"],
}
clf = RandomizedSearchCV(pipeline,
                         distributions,
                         random_state=0,
                         scoring=make_scorer(f1_macro),
                         # the grid has only this many candidates; asking for
                         # more just triggers a warning
                         n_iter=len(distributions["LinearSVC__C"]),
                         cv=5,
                         verbose=5,
                         n_jobs=-1)
search = clf.fit(X_train, y_train)
print(search.best_params_, search.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:   29.6s remaining:  1.4min
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:   30.4s remaining:   26.6s
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed:   32.5s remaining:    8.1s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   38.8s finished


{'LinearSVC__C': 1} 0.8597509332905282


In [66]:
# Hold-out evaluation of the best pipeline found by the search.
y_pred = search.predict(X_test)
pipeline_report = classification_report(y_test, y_pred)
print(pipeline_report)

              precision    recall  f1-score   support

           0       0.90      0.89      0.90        82
           1       1.00      0.86      0.92        56
           2       0.98      0.99      0.99       715

    accuracy                           0.98       853
   macro avg       0.96      0.91      0.94       853
weighted avg       0.98      0.98      0.98       853



#### out-of-fold LGB training

In [30]:
from sklearn.model_selection import StratifiedKFold
from lightgbm import plot_importance, LGBMClassifier

In [31]:
# Keyword arguments forwarded to LGBMClassifier. `num_class` is also read back
# when sizing `pred_test`, and `verbose` when calling `fit`.
params = dict(
    # objective='multiclass',
    num_class=3,
    num_rounds=1000,
    max_depth=-1,  # 8
    learning_rate=0.01,  # 0.007
    num_leaves=31,  # was 127
    verbose=100,
    early_stopping_rounds=300,
    min_data_in_leaf=20,
    lambda_l2=0.7,
    feature_fraction=0.2,  # 0.8
    metric='custom',
)

# class_weigths = (
#     np.log1p(1/y_train.value_counts(normalize=True)
#     )
# ).to_dict()

classifier = LGBMClassifier(**params)

# train_ind = df_features.train_dev == 'train'
# val_ind = df_features.train_dev == 'dev'
# X_tr = df_features[train_ind].drop(columns=non_features)
# X_val = df_features[val_ind].drop(columns=non_features)
# y_tr = df_features.loc[train_ind, TARGET].astype(np.int32)
# y_val = df_features.loc[val_ind, TARGET].astype(np.int32)

# print(X_tr.shape, y_tr.shape)
# print(X_val.shape, y_val.shape)

In [32]:
def lgb_fscore(y_true, y_pred):
    """Custom LightGBM eval metric: macro-averaged F1 for a multiclass model.

    Parameters
    ----------
    y_true : true class labels, one per sample.
    y_pred : per-class scores, either 2-D (n_samples, n_classes) or a flat
        1-D array grouped by class first, then by sample.

    Returns
    -------
    (metric name, value, is_higher_better) as LightGBM expects.
    """
    y_pred = np.asarray(y_pred)
    if y_pred.ndim == 1:
        # Derive the layout from the number of samples rather than from the
        # classes present in y_true, which is wrong if a fold lacks a class.
        y_pred = y_pred.reshape(-1, len(y_true)).T
    labels = y_pred.argmax(axis=1)
    res = f1_score(y_true, labels, average='macro')
    return 'macro_f1', res, True

In [33]:
# tweak it to see results
use_sample_weight = True
# perform validation strategy
N_FOLDS = 4
strategy = StratifiedKFold(n_splits=N_FOLDS, random_state=0, shuffle=True)

In [34]:
# Rows of `df` behind the earlier hold-out split, as full frames.
train = df.loc[X_train.index]
test = df.loc[X_test.index]

# One weight per training row, looked up from its class.
sample_weight = y_train.map(class_weights).values

In [35]:
# Out-of-fold predicted labels for every training row.
pred_oof = np.zeros(len(train), dtype=np.float32)
# Per-fold class probabilities for the hold-out set: (rows, classes, folds).
test_shape = (len(test), params['num_class'], N_FOLDS)
pred_test = np.zeros(test_shape, dtype=np.float32)

In [36]:
# Validation macro-F1 of each fold, filled in by the training loop.
fold_metrics = np.zeros((N_FOLDS,))

In [37]:
# Number of CPUs reported by multiprocessing; passed as `num_threads` to the
# classifier's predict / predict_proba calls.
num_threads = cpu_count()

In [38]:
# Out-of-fold training: per fold, fit TF-IDF + SVD on the fold's training part
# only, train the classifier, store validation labels and hold-out probabilities.
for i, (tr_ind, val_ind) in enumerate(strategy.split(X=np.ones(len(train)), y=train['target'])):
    print(
        f'Fold: {i + 1}\n\tTrain len: {len(tr_ind)}\n\tVal len: {len(val_ind)}')
    # split tr/val
    fold_train = train.iloc[tr_ind]
    fold_val = train.iloc[val_ind]

    # NOTE(review): the original referenced an undefined name `TFIDF`; presumably
    # it was the uni+bi-gram FeatureUnion `tf_idf` - confirm. It is refitted here
    # on every fold. The SVD is seeded so fold scores are reproducible.
    pipe = Pipeline([
            ('TFIDF', tf_idf),
            ("SVD", TruncatedSVD(n_components=N_COMP, random_state=0))
        ])
    pipe.fit(fold_train['text'])

    X = pipe.transform(fold_train['text'])
    y = fold_train['target'].copy()
    X_val = pipe.transform(fold_val['text'])
    y_val = fold_val['target'].copy()
    # Separate name: do not overwrite the hold-out text Series `X_test`, which
    # earlier cells still need when re-run.
    X_test_fold = pipe.transform(test['text'])

    # fit model
    print('\tFITTING MODEL...')
    classifier.fit(
        X=X,
        y=y,
        eval_set=[(X_val, y_val)],
#        early_stopping_rounds=params['early_stopping_rounds'],
        verbose=params['verbose'],
        eval_metric=lgb_fscore,
        sample_weight=sample_weight[tr_ind] if use_sample_weight else None,
    )
    # predict OOF val
    print('\tPREDICT OOF...')
    pred_oof[val_ind] = classifier.predict(X_val, num_threads=num_threads)
    # predict test
    print('\tPREDICTING TEST...')
    pred_test[..., i] = classifier.predict_proba(
        X_test_fold, num_threads=num_threads)
    fold_metrics[i] = f1_macro(y_val, pred_oof[val_ind])
    print(f'\tFold score: {fold_metrics[i]}')

Fold: 1
	Train len: 2558
	Val len: 853
	FITTING MODEL...
Training until validation scores don't improve for 300 rounds
[100]	valid_0's macro_f1: 0.830107
[200]	valid_0's macro_f1: 0.840658
[300]	valid_0's macro_f1: 0.843016
[400]	valid_0's macro_f1: 0.842428
Early stopping, best iteration is:
[128]	valid_0's macro_f1: 0.850816
	PREDICT OOF...
	PREDICTING TEST...
	Fold score: 0.8508164885439063
Fold: 2
	Train len: 2558
	Val len: 853
	FITTING MODEL...
Training until validation scores don't improve for 300 rounds
[100]	valid_0's macro_f1: 0.796662
[200]	valid_0's macro_f1: 0.80872
[300]	valid_0's macro_f1: 0.811773
[400]	valid_0's macro_f1: 0.829004
[500]	valid_0's macro_f1: 0.840366
[600]	valid_0's macro_f1: 0.8347
[700]	valid_0's macro_f1: 0.828892
Early stopping, best iteration is:
[472]	valid_0's macro_f1: 0.840366
	PREDICT OOF...
	PREDICTING TEST...
	Fold score: 0.8403658228487697
Fold: 3
	Train len: 2558
	Val len: 853
	FITTING MODEL...
Training until validation scores don't improve 

In [41]:
# Macro-F1 of the out-of-fold predictions over the whole training set.
print('Total score: ', f1_macro(train['target'], pred_oof))

Total score:  0.854832206399943


In [42]:
# Average the class probabilities over folds, then take the most likely class.
y_pred_raw = np.mean(pred_test, axis=-1)
y_pred = np.argmax(y_pred_raw, axis=1).astype(np.int32)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.89      0.91        82
           1       1.00      0.79      0.88        56
           2       0.97      0.99      0.98       715

    accuracy                           0.97       853
   macro avg       0.97      0.89      0.92       853
weighted avg       0.97      0.97      0.97       853



In practice, out-of-fold (OOF) training usually works better and helps to avoid overfitting. However, I did not use X_test during training, so that the results stay easy to compare with the previous models.

What can also be done:
* add word-level tone (sentiment) features from `tone-dict-uk.tsv` and concatenate them to the BoW features as a sparse matrix
* replace words with tones by tokens like `<positive>` or `<negative>`
* classical features such as `word_num`, `text_len`, `has_pros`, `has_cons`, `mean_words_tone`, etc. However, I'm not sure this is possible given the task limitations, as the model should use BoW features only
* get BoW features w/o pros and cons concatenation

Currently, the pipeline with uni- and bi-grams extended by SVD features shows the best result, without sacrificing recall for the `neu` class.