In [1]:
import pandas as pd
import numpy as np
import os
import tokenize_uk
from typing import List
import langdetect

# Data

In this task we selected computers and notebooks categories in rozetka.com. So we scribed the review comments data with mentined star for the next sites:

+ https://hard.rozetka.com.ua/ua/computers/c80095/ 
+ https://hard.rozetka.com.ua/ua/computers/c80095/
+ https://hard.rozetka.com.ua/

In [2]:
data_dir = "data"

In [3]:
! ls data

comments_urls.txt  rozetka-hard-comments_all.csv       rozetka-reviews-uk.csv
hard_urls.txt	   rozetka-monitor-comments_all.csv    ukrainian-stopwords.txt
pc_urls.txt	   rozetka-notebooks-comments_all.csv
router_urls.txt    rozetka-pc-comments_all.csv


In [4]:
def read_csv_from_dir(directory):
    frames = []
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            df = pd.read_csv(f"{directory}/{filename}")
            frames.append(df)
            
    result = pd.concat(frames)
    return result

In [5]:
df = pd.read_csv("rozetka-hard-comments-all.csv")

In [6]:
len(df)

69678

In [7]:
df = df.fillna("")

In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,pros,cons,rating
0,0,Народний проц,12 потоков і розгон,Ціна через вірус,5
1,1,"после fx6300, 1600af ракета.",,,5
2,2,"Норм проц, взял к нему в450, работает стабильн...","Охлаждение, производительность",За 80$ их просто нет,5
3,3,"Топ за свої гроші Новий тех процес 12нм , по ф...","Ціна, новий техпроцес",,5
4,4,"За свою цену это просто незаменимый проц, лучш...",Цена Качество Хороший и тихий кулер Производит...,,5


#### Traslate data

In [12]:
import os
from google.cloud import translate_v2 as translate

In [13]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/dbabenko/Downloads/silken-vial-263606-d3530327b6b3.json'

In [14]:
translate_client = translate.Client()

In [15]:
def translate_ru_uk(translate_client, text_ru: str):
    translated = translate_client.translate(text_ru, source_language='ru', target_language='uk')
    text_uk = translated['translatedText']
    return text_uk

In [16]:
translate_ru_uk(translate_client, 'в новом корпусе, но - матрац - старый с...')

'в новому корпусі, але - матрац - старий з ...'

In [17]:
def detect_language(text: str):
    try:
        return langdetect.detect(text)
    except:
        return None

In [18]:
def translate_if_needed(text: str):
    if len(text) == 0:
        return ""
    
    try:
        
        lang = detect_language(text)

        if lang is None:
            return ""

        if lang == 'uk':
            return text

        if lang == 'ru':
            return translate_ru_uk(translate_client, text)
    except:
        return ""
    
    return ""
    


In [19]:
def tranlate_ru_language_and_filter_empty_data(df):
    title_list = df['title'].values
    pros_list = df['pros'].values
    cons_list = df['cons'].values
    rate_list = df['rating'].values
    
    result = {
    'title': [],
    'pros' : [],
    'cons' : [],
    'rating' : []
}
    
    for i in range(0, len(title_list)):
        title = title_list[i].strip()
        pros = pros_list[i].strip()
        cons = cons_list[i].strip()
        
        if len(title) == 0 and len(pros) == 0 and len(cons) == 0:
            continue
            
        title = translate_if_needed(title)
        pros = translate_if_needed(pros)
        cons = translate_if_needed(cons)
        
        if len(title) == 0 and len(pros) == 0 and len(cons) == 0:
            continue

            
        result['title'].append(title)
        result['pros'].append(pros)
        result['cons'].append(cons)
        result['rating'].append(rate_list[i])
        
    return pd.DataFrame(result)

In [20]:
%%time
uk_df = tranlate_ru_language_and_filter_empty_data(df)
uk_df.to_csv("rozetka-hard-comments-uk-all.csv")

CPU times: user 48min 37s, sys: 38.6 s, total: 49min 16s
Wall time: 3h 15min 5s


read data from already translated before 

In [23]:
# uk_df = pd.read_csv("rozetka-hard-comments-uk-all.csv")

In [21]:
uk_df = uk_df.fillna("")

In [26]:
uk_df.head()

Unnamed: 0,title,pros,cons,rating
0,Народний проц,12 потоков і розгон,Ціна через вірус,5
1,"Норм проц, взяв до нього в450, працює стабільн...","Охолодження, продуктивність",За 80 $ їх просто немає,5
2,"Топ за свої гроші Новий тех процес 12нм , по ф...","Ціна, новий техпроцес",,5
3,"За свою ціну це просто незамінний проц, краще ...",Ціна Якість Хороший і тихий кулер Продуктивніс...,,5
4,"За ці гроші конкурентів немає, користуюся вже ...",,,5


In [27]:
len(uk_df)

62409

In [28]:
def concatenate_title_pros_cons(df):
    title_list = df['title'].values
    pros_list = df['pros'].values
    cons_list = df['cons'].values
    y = df['rating'].values
    
    X = []
    for i in range(0, len(title_list)):
        text = f"{title_list[i]} {pros_list[i]} {cons_list[i]}"
        X.append(text)
        
    return X, y

In [29]:
def reduce_star_labels(y):
    reduced_y = []
    for yi in y:
        if yi >= 4:
            reduced_y.append('pos')
        elif yi == 3:
            reduced_y.append('neutral')
        else:
            reduced_y.append('neg')
    return reduced_y

In [30]:
X_all, y_all = concatenate_title_pros_cons(uk_df)

In [31]:
y_all = reduce_star_labels(y_all)

In [32]:
classes = ['pos', 'neutral', 'neg']

In [33]:
X_all[:5]

['Народний проц 12 потоков і розгон Ціна через вірус',
 'Норм проц, взяв до нього в450, працює стабільно ,, холодний, на стоковому вентиляторі нормально охолоджується Охолодження, продуктивність За 80 $ їх просто немає',
 "Топ за свої гроші Новий тех процес 12нм , по факту це 2600 Взяв легко частоти 4000 ( під стрестестом аіди стояв 10 хв на одному рівні )легко йшов бі далі але температура вже була не дуже ( у мене боксовий куллер від 3700х ) Пам'ять 3400 теж легко Ціна, новий техпроцес ",
 'За свою ціну це просто незамінний проц, краще не знайдете в цій ціновій категорії, в іграх на ультра настройках не завантажується навіть на 50% Грівся максимум до 75, але тут швидше спасибі поганому продув в корпусі Ціна Якість Хороший і тихий кулер Продуктивність Низькі температури ',
 'За ці гроші конкурентів немає, користуюся вже півмісяця, все влаштовує!  ']

In [34]:
sent_uk_df = pd.DataFrame({
    'text': X_all,
    'label': y_all
})

In [35]:
sent_uk_df.head()

Unnamed: 0,text,label
0,Народний проц 12 потоков і розгон Ціна через в...,pos
1,"Норм проц, взяв до нього в450, працює стабільн...",pos
2,"Топ за свої гроші Новий тех процес 12нм , по ф...",pos
3,"За свою ціну це просто незамінний проц, краще ...",pos
4,"За ці гроші конкурентів немає, користуюся вже ...",pos


## Data analysis

In [36]:
from plotly.offline import iplot
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

In [37]:
uk_df['rating'].iplot(
    kind='hist',
    bins=5,
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    title='Review rating dictribution')

In [38]:
sent_uk_df['label'].iplot(
    kind='hist',
    bins=3,
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    title='Sentiment label distribution')

As we can see from the diagrams above, the users mostly mentioned positive reviews than negative. <br/>
To avoid overfiting, while training model, we will select approximatelly the same ratio for each labels (it means if neutral contains 42 and negative contains 46 we will select 46 of positive reviews). 

## Clear data

In [39]:
import re

In [40]:
def read_stop_words(file):
    with open(file) as f:
        stop_words = f.read().split('\n')

    return stop_words

In [41]:
uk_stop_words = read_stop_words('data/ukrainian-stopwords.txt')

In [42]:
uk_stop_words[:5]

['a', 'б', 'в', 'г', 'е']

In [43]:
def cleaning(text):
    text = re.sub(r"http\S+", "", text) #remove all urls
    text = re.sub(r'[^\w\s]','',text) #remove all punctuations
    text = re.sub(r'[A-z]+','',text) #remove all english words
    text = re.sub(r'[0-9]+','',text) #remove all numbers
    
    return text.strip().lower()

In [44]:
X_all = [cleaning(x) for x in X_all]

In [45]:
cleaning("Я не знаю https://m.rozetka.com.ua/offer/123242339/?gclid=eaiaiqobchmivumitc3u5wivypaych27dwfheaqyasabegiofpd_bwe")

'я не знаю'

In [46]:
X_all[:5]

['народний проц  потоков і розгон ціна через вірус',
 'норм проц взяв до нього в працює стабільно  холодний на стоковому вентиляторі нормально охолоджується охолодження продуктивність за   їх просто немає',
 'топ за свої гроші новий тех процес нм  по факту це  взяв легко частоти   під стрестестом аіди стояв  хв на одному рівні легко йшов бі далі але температура вже була не дуже  у мене боксовий куллер від х  память  теж легко ціна новий техпроцес',
 'за свою ціну це просто незамінний проц краще не знайдете в цій ціновій категорії в іграх на ультра настройках не завантажується навіть на  грівся максимум до  але тут швидше спасибі поганому продув в корпусі ціна якість хороший і тихий кулер продуктивність низькі температури',
 'за ці гроші конкурентів немає користуюся вже півмісяця все влаштовує']

In [48]:
len(X_all)

62409

## Split data

In [49]:
import random
from sklearn.model_selection import train_test_split

In [50]:
def split_for_avoiding_overfitting(X, y, epsilon_percent = 0.2):
    X_y = list(zip(X, y))
    label_Xy_dict = dict()
    for i in range(0, len(X_y)):
        label = X_y[i][1]
        
        if label not in label_Xy_dict:
            label_Xy_dict[label] = [X_y[i]]
        else:
            label_Xy_dict[label].append(X_y[i])
            
    labels = list(label_Xy_dict.keys())
    labels.sort(key = lambda label: len(label_Xy_dict[label]))
    
    prev_len = None
    for label in labels:
        cur_len = len(label_Xy_dict[label])
        if prev_len is None:
            prev_len = cur_len
            continue
        
        if 1 - prev_len / cur_len <= epsilon_percent:
            prev_len = cur_len
            continue
        
        label_Xy_dict[label] = random.sample(label_Xy_dict[label], int((1 + epsilon_percent) * prev_len))
        prev_len = cur_len
        
    
    new_X_y = []
    for label in label_Xy_dict:
        new_X_y += label_Xy_dict[label]
        
    random.shuffle(new_X_y)
    
    res = list(zip(*new_X_y)) 
    X_new = list(res[0])
    y_new = list(res[1])
    
    return X_new, y_new
    

In [51]:
X, y = split_for_avoiding_overfitting(X_all, y_all, 0.5)

In [52]:
new_sent_uk_df = pd.DataFrame({
    'text': X,
    'label': y
})

In [53]:
new_sent_uk_df['label'].iplot(
    kind='hist',
    bins=3,
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    title='Sentiment label distribution')

As we can see from the diagram above, it has approximately uniform distribution to avoid overfitiing for one specific label (e.g. pos)

In [54]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Tokenize function with lematization

In [55]:
import pymorphy2
import re

In [56]:
morph = pymorphy2.MorphAnalyzer(lang="uk")

In [57]:
def tokenize_with_lemma(text: str) -> List[str]:
    words = tokenize_uk.tokenize_words(text)
    return [morph.parse(word)[0].normal_form for word in words]

# Bag of words

In [58]:
import tokenize_uk
import abc
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [59]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import classification_report

In [60]:
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.linear_model import LogisticRegression
from enum import IntEnum

In [61]:
class BoWMode(IntEnum):
    COUNT_OCCURENCE = 0,
    TF_IDF = 1

In [62]:
class ClassifierType:
    GAUSSIAN_NAIVE_BAYES = 0,
    MULTINOMIAL_NAIVE_BAYES = 1,
    LOGISTIC_REGRESSION = 2

In [63]:
def create_bag_of_words_vectorizer(mode: BoWMode, tokenizer = None, max_features = None, stop_words = None):
    if mode == BoWMode.COUNT_OCCURENCE:
        return CountVectorizer(analyzer = "word", 
                                          tokenizer = tokenizer, 
                                          stop_words = stop_words, 
                                          max_features = max_features,
                                          preprocessor=None)
    
    if mode == BoWMode.TF_IDF:
        return TfidfVectorizer(analyzer = "word", 
                                          tokenizer = tokenizer, 
                                          preprocessor = None, 
                                          stop_words = stop_words, 
                                          max_features = max_features) 
    
    return None

In [64]:
def create_classifier(clf_type: ClassifierType):
    if clf_type == ClassifierType.GAUSSIAN_NAIVE_BAYES:
        return  GaussianNB()
    
    if clf_type == ClassifierType.MULTINOMIAL_NAIVE_BAYES:
        return  MultinomialNB()
    
    if clf_type == ClassifierType.LOGISTIC_REGRESSION:
        return  LogisticRegression(C = 0.1, solver='lbfgs', multi_class='auto',  n_jobs=-1, max_iter=1000)
    
    return None


In [65]:
class DenseTransformer(TransformerMixin):

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.todense()

### Counting word occurrence

#### Counting word occurrence with Naive Bayes

In [66]:
co_m_nb_pipeline = Pipeline([
        ('vect', create_bag_of_words_vectorizer(BoWMode.COUNT_OCCURENCE, tokenizer=tokenize_uk.tokenize_words)),
        ('to_dense', DenseTransformer()), 
        ('clf' , create_classifier(ClassifierType.MULTINOMIAL_NAIVE_BAYES))
    ])

In [67]:
co_m_nb_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize_words at 0x7f9dd94ccc20>,
                                 vocabulary=None)),
                ('to_dense',
                 <__main__.DenseTransformer object at 0x7f9db5ef5990>),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [68]:
y_pred = co_m_nb_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.66      0.75      0.70       870
     neutral       0.50      0.29      0.36       636
         pos       0.78      0.87      0.82      1331

    accuracy                           0.70      2837
   macro avg       0.65      0.64      0.63      2837
weighted avg       0.68      0.70      0.68      2837



#### Counting word occurrence with Logistig Regression

In [69]:
co_log_nb_pipeline = Pipeline([
        ('vect', create_bag_of_words_vectorizer(BoWMode.COUNT_OCCURENCE, tokenizer=tokenize_uk.tokenize_words)),
        ('clf' , create_classifier(ClassifierType.LOGISTIC_REGRESSION))
    ])

In [70]:
co_log_nb_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize_words at 0x7f9dd94ccc20>,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=0.1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=1000,
                      

In [71]:
y_pred = co_log_nb_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.70      0.72      0.71       870
     neutral       0.53      0.33      0.41       636
         pos       0.77      0.89      0.82      1331

    accuracy                           0.71      2837
   macro avg       0.66      0.65      0.65      2837
weighted avg       0.69      0.71      0.69      2837



As se can see from the above confusion matrixes, logistic regression gives us a little bit better result than Naive Bayes (macro av: 0.50 > 0.48)

### TF-IDF

If extremely high frequency may dominate the result and causing model bias. TF-IDF take approach which is believe that high frequency may not able to provide much information gain.

#### TF-IDF with naive bayes

In [72]:
tfidf_g_nb_pipeline = Pipeline([
        ('vect', create_bag_of_words_vectorizer(BoWMode.TF_IDF, tokenizer=tokenize_uk.tokenize_words)),
        ('to_dense', DenseTransformer()), 
        ('clf' , create_classifier(ClassifierType.GAUSSIAN_NAIVE_BAYES))
    ])

In [73]:
tfidf_g_nb_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize_words at 0x7f9dd94ccc20>,
                                 use_idf=True, vocabulary=None)),
                ('to_dense',
                 <__main__.DenseTransformer object at 0x7f9db3b5c310>),
                ('clf', Gaussian

In [74]:
y_pred = tfidf_g_nb_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.47      0.40      0.43       870
     neutral       0.22      0.48      0.31       636
         pos       0.62      0.34      0.44      1331

    accuracy                           0.39      2837
   macro avg       0.44      0.41      0.39      2837
weighted avg       0.49      0.39      0.41      2837



#### TF-IDF with logistic regression

In [75]:
tfidf_log_nb_pipeline = Pipeline([
        ('vect', create_bag_of_words_vectorizer(BoWMode.TF_IDF, tokenizer=tokenize_uk.tokenize_words)),
        ('to_dense', DenseTransformer()), 
        ('clf' , create_classifier(ClassifierType.LOGISTIC_REGRESSION))
    ])

In [76]:
tfidf_log_nb_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(...
                                 use_idf=True, vocabulary=None)),
                ('to_dense',
                 <__main__.DenseTransformer object at 0x7f9db3b5c510>),
                ('clf',
                 LogisticRegression(C=0.1, class_weight=None, dual=False,
                                    f

In [77]:
y_pred = tfidf_log_nb_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.65      0.69      0.67       870
     neutral       0.62      0.06      0.11       636
         pos       0.66      0.92      0.77      1331

    accuracy                           0.66      2837
   macro avg       0.64      0.56      0.52      2837
weighted avg       0.65      0.66      0.59      2837



Here we can also noticed that result using logistic regression is better, but tf-idf gave worse result than simple bag of words (count occurences)

In the experimetns above we testes all words without lemmatization. Let's try to run a few examples with lemmatization and compare the result

### Counting word occurrence with lematization

In [78]:
co_log_nb_pipeline = Pipeline([
        ('vect', create_bag_of_words_vectorizer(BoWMode.COUNT_OCCURENCE, tokenizer=tokenize_with_lemma)),
        ('clf' , create_classifier(ClassifierType.LOGISTIC_REGRESSION))
    ])

In [79]:
co_log_nb_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize_with_lemma at 0x7f9db5ef7170>,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=0.1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=1000,
                 

In [80]:
y_pred = co_log_nb_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.69      0.72      0.71       870
     neutral       0.51      0.33      0.40       636
         pos       0.77      0.88      0.82      1331

    accuracy                           0.71      2837
   macro avg       0.66      0.64      0.64      2837
weighted avg       0.69      0.71      0.69      2837



In this case lemmatization imroved the result, which was expected. 

### TF-IDF with lematization

In [81]:
tfidf_log_nb_pipeline = Pipeline([
        ('vect', create_bag_of_words_vectorizer(BoWMode.TF_IDF, tokenizer=tokenize_with_lemma)),
        ('to_dense', DenseTransformer()), 
        ('clf' , create_classifier(ClassifierType.GAUSSIAN_NAIVE_BAYES))
    ])

In [82]:
tfidf_log_nb_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize_with_lemma at 0x7f9db5ef7170>,
                                 use_idf=True, vocabulary=None)),
                ('to_dense',
                 <__main__.DenseTransformer object at 0x7f9db377c050>),
                ('clf', Gau

In [83]:
y_pred = tfidf_log_nb_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.46      0.30      0.37       870
     neutral       0.22      0.58      0.32       636
         pos       0.57      0.24      0.34      1331

    accuracy                           0.34      2837
   macro avg       0.42      0.37      0.34      2837
weighted avg       0.46      0.34      0.34      2837



In [84]:
tfidf_log_nb_pipeline = Pipeline([
        ('vect', create_bag_of_words_vectorizer(BoWMode.TF_IDF, tokenizer=tokenize_with_lemma)),
        ('to_dense', DenseTransformer()), 
        ('clf' , create_classifier(ClassifierType.LOGISTIC_REGRESSION))
    ])

In [85]:
tfidf_log_nb_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(...
                                 use_idf=True, vocabulary=None)),
                ('to_dense',
                 <__main__.DenseTransformer object at 0x7f9db3b58710>),
                ('clf',
                 LogisticRegression(C=0.1, class_weight=None, dual=False,
                                    f

In [86]:
y_pred = tfidf_log_nb_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.65      0.73      0.69       870
     neutral       0.62      0.09      0.16       636
         pos       0.69      0.92      0.79      1331

    accuracy                           0.67      2837
   macro avg       0.65      0.58      0.55      2837
weighted avg       0.66      0.67      0.62      2837



For TF-IDF lemmatization did not significantly improved the result (macro avg: 0.35 > 0.32)

### Bag of words with limited number of features

In [87]:
len(CountVectorizer(analyzer = "word", tokenizer = tokenize_with_lemma).fit(X_train).get_feature_names())

20170

#### Counting word occurences with logistic regression

In [88]:
co_log_nb_pipeline = Pipeline([
        ('vect', create_bag_of_words_vectorizer(BoWMode.COUNT_OCCURENCE, tokenizer=tokenize_with_lemma, max_features=1000)),
        ('clf' , create_classifier(ClassifierType.LOGISTIC_REGRESSION))
    ])

In [89]:
co_log_nb_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=1000, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize_with_lemma at 0x7f9db5ef7170>,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=0.1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=1000,
                 

In [90]:
y_pred = co_log_nb_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.67      0.71      0.69       870
     neutral       0.52      0.31      0.39       636
         pos       0.76      0.88      0.81      1331

    accuracy                           0.70      2837
   macro avg       0.65      0.63      0.63      2837
weighted avg       0.68      0.70      0.68      2837



As we can see the result with limited max_feetures a little bit imroved (macro avg 0.56 > 0.53), it can be caused, because there were filter some rare words

#### TF-IDF with Naive Bayes

In [91]:
tfidf_nb_pipeline = Pipeline([
        ('vect', create_bag_of_words_vectorizer(BoWMode.TF_IDF, tokenizer=tokenize_uk.tokenize_words, max_features=1000)),
        ('to_dense', DenseTransformer()), 
        ('clf' , create_classifier(ClassifierType.GAUSSIAN_NAIVE_BAYES))
    ])

In [92]:
tfidf_nb_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=1000,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize_words at 0x7f9dd94ccc20>,
                                 use_idf=True, vocabulary=None)),
                ('to_dense',
                 <__main__.DenseTransformer object at 0x7f9db18f5a10>),
                ('clf', Gaussian

In [93]:
y_pred = tfidf_nb_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.61      0.67      0.64       870
     neutral       0.41      0.40      0.41       636
         pos       0.78      0.74      0.76      1331

    accuracy                           0.64      2837
   macro avg       0.60      0.60      0.60      2837
weighted avg       0.64      0.64      0.64      2837



Here we also notices a little bi improvement (macro avg 0.51 > 0.45)

Conclussion: we noticed that the best results was givenby Counting word occurrence ( the simple bag of words) model with logistig regression classifier using lemmatization with limited max_features=1000. The result was f1-score (macro avg) =  0.56. So let's focus on this model fo further improvement

## Bag of words based with filtering stop words

In [94]:
bow_pipeline = Pipeline([
        ('vect', create_bag_of_words_vectorizer(BoWMode.COUNT_OCCURENCE, 
                                                tokenizer=tokenize_with_lemma, 
                                                stop_words=  uk_stop_words,
                                                max_features=1000)),
        ('clf' , create_classifier(ClassifierType.LOGISTIC_REGRESSION))
    ])

In [95]:
bow_pipeline.fit(X_train, y_train)


Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['бувати', 'буда', 'булий', 'бута', 'всей', 'відсоток', 'говорити', 'дар', 'друга', 'дякувати', 'зайняти', 'зараза', 'значити', 'йога', 'казати', 'кожний', 'кома', 'круг', 'ласка', 'мата', 'мен', 'мир', 'мільйон', 'небути', 'нікола', 'нікуда', 'перти', 'початок', 'рана', 'раніший', 'рок', 'ріка', 'самий', 'свій', 'сей', 'соб', 'справити', 'терти', 'тисяча', 'тога', 'том', 'увесь', 'частіший', 'числення', 'чома', 'їсти'] not in stop_words.



Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=1000, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=['a', 'б', 'в', 'г', 'е', 'ж', 'з',
                                             'м', 'т', 'у', 'я', 'є', 'і', 'аж',
                                             'ви', 'де', 'до', 'за', 'зі', 'ми'...
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize_with_lemma at 0x7f9db5ef7170>,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=0.1, class_weight=None

In [96]:
y_pred = bow_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.67      0.70      0.68       870
     neutral       0.51      0.30      0.38       636
         pos       0.76      0.89      0.82      1331

    accuracy                           0.70      2837
   macro avg       0.65      0.63      0.63      2837
weighted avg       0.68      0.70      0.68      2837



# Bag of words based on bi-gram

In [97]:
def tokenize_with_lemma_and_stop_words(text: str):
    lemma_result = tokenize_with_lemma(text)
    result = []
    for word in lemma_result:
        if word in uk_stop_words:
            continue
        
        if len(word) < 2:
            continue
        result.append(word)
    
    return result

In [98]:
def create_count_vectorizer_on_bi_gram():
    count_vectorizer = CountVectorizer(analyzer = "word", 
                                       tokenizer = tokenize_with_lemma_and_stop_words, 
                                          max_features = 1000,
                                          ngram_range=(2, 2))
    return count_vectorizer

In [99]:
bigram_bow_pipeline = Pipeline([
        ('vect',create_count_vectorizer_on_bi_gram()),
        ('clf' , create_classifier(ClassifierType.LOGISTIC_REGRESSION))
    ])

In [100]:
bigram_bow_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=1000, min_df=1,
                                 ngram_range=(2, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize_with_lemma_and_stop_words at 0x7f9db13b3d40>,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=0.1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=1000,
  

In [101]:
y_pred = bigram_bow_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.61      0.41      0.49       870
     neutral       0.41      0.05      0.09       636
         pos       0.57      0.93      0.71      1331

    accuracy                           0.57      2837
   macro avg       0.53      0.46      0.43      2837
weighted avg       0.55      0.57      0.50      2837



As we can see that result with bi-grams is worse, so we do not consider bi-gram in our bag of words model for futher improvement. 

# Bag of words with multiple features

In current dataset we have 3 seperate feutures for user review comment: title, pros and cons. In the previous experiments, we concatenate these features into one. So, let's try to use separtion for this features in bag of words model. 

### Prepare data 

In [102]:
uk_df

Unnamed: 0,title,pros,cons,rating
0,Народний проц,12 потоков і розгон,Ціна через вірус,5
1,"Норм проц, взяв до нього в450, працює стабільн...","Охолодження, продуктивність",За 80 $ їх просто немає,5
2,"Топ за свої гроші Новий тех процес 12нм , по ф...","Ціна, новий техпроцес",,5
3,"За свою ціну це просто незамінний проц, краще ...",Ціна Якість Хороший і тихий кулер Продуктивніс...,,5
4,"За ці гроші конкурентів немає, користуюся вже ...",,,5
...,...,...,...,...
62404,"Включилася відразу, нарікань немає. Брав виклю...",,,5
62405,"Вчора о 9 ранку зробив замовлення, сьогодні в ...",,,5
62406,Потрібно повернути товар. Чи не стала в нагоді...,,,5
62407,Як перехідник на dvi / hdmi зійде. Для ігор не...,,,2


In [103]:
X_3_all = uk_df[['title', 'pros', 'cons']].to_numpy()
y_all = reduce_star_labels(uk_df['rating'].values)

In [104]:
for i in range(0, len(X_3_all)):
    X_3_all[i][0] = cleaning(X_3_all[i][0])
    X_3_all[i][1] = cleaning(X_3_all[i][1])
    X_3_all[i][2] = cleaning(X_3_all[i][2])

In [105]:
X_3, y_3 = split_for_avoiding_overfitting(X_3_all, y_all, 0.5)

In [106]:
len(X_3)

14183

In [107]:
X_train, X_test, y_train, y_test = train_test_split(X_3, y_3, test_size=0.2, random_state=42)

In [108]:
class ThreeFeatureBagOfWords:
    def __init__(self):
        self.count_vectorizer_title = CountVectorizer(analyzer = "word", 
                                       tokenizer = tokenize_with_lemma, 
                                          max_features = 1000,
                                         stop_words=uk_stop_words)
        
        self.count_vectorizer_pros = CountVectorizer(analyzer = "word", 
                               tokenizer = tokenize_with_lemma, 
                                  max_features = 500,
                                 stop_words=uk_stop_words)
        
        self.count_vectorizer_cons = CountVectorizer(analyzer = "word", 
                               tokenizer = tokenize_with_lemma, 
                                  max_features = 500,
                                 stop_words=uk_stop_words)

        
    def fit(self, raw_documents, y):
        title_list = [raw_document[0] for raw_document in raw_documents]
        self.count_vectorizer_title.fit(title_list)
        
        pros_list = [raw_document[1] for raw_document in raw_documents]
        self.count_vectorizer_pros.fit(pros_list)
        
        cons_list = [raw_document[2] for raw_document in raw_documents]
        self.count_vectorizer_cons.fit(cons_list)
        
        return self
    
    
    def transform(self, raw_documents):
        result = []
                
        for raw_document in raw_documents:
            title_vector = self.count_vectorizer_title.transform([raw_document[0]]).toarray().tolist()[0]
            pros_vector = self.count_vectorizer_pros.transform([raw_document[1]]).toarray().tolist()[0]
            cons_vector = self.count_vectorizer_cons.transform([raw_document[2]]).toarray().tolist()[0]
            
#             print('###############################################')
#             print('title_vector', title_vector)
#             print('###############################################')
#             print('pros_vector', pros_vector)
#             print('###############################################')
#             print('cons_vector', cons_vector)
#             print('###############################################')

            item  = title_vector + pros_vector + cons_vector
#             print(item)
            
            result.append(item)
            
        return np.array(result)


In [109]:
pipeline =  Pipeline([
        ('vect', ThreeFeatureBagOfWords()),
        ('clf' , LogisticRegression(C = 0.01, solver='newton-cg', multi_class='auto',  n_jobs=-1, max_iter=10000))
    ]) 

In [110]:
pipeline.fit(X_train, y_train)


Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['бувати', 'буда', 'булий', 'бута', 'всей', 'відсоток', 'говорити', 'дар', 'друга', 'дякувати', 'зайняти', 'зараза', 'значити', 'йога', 'казати', 'кожний', 'кома', 'круг', 'ласка', 'мата', 'мен', 'мир', 'мільйон', 'небути', 'нікола', 'нікуда', 'перти', 'початок', 'рана', 'раніший', 'рок', 'ріка', 'самий', 'свій', 'сей', 'соб', 'справити', 'терти', 'тисяча', 'тога', 'том', 'увесь', 'частіший', 'числення', 'чома', 'їсти'] not in stop_words.


Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['бувати', 'буда', 'булий', 'бута', 'всей', 'відсоток', 'говорити', 'дар', 'друга', 'дякувати', 'зайняти', 'зараза', 'значити', 'йога', 'казати', 'кожний', 'кома', 'круг', 'ласка', 'мата', 'мен', 'мир', 'мільйон', 'небути', 'нікола', 'нікуда', 'перти', 'початок', 'рана', 'раніший', 'рок', 'ріка', 'самий', 'свій', 'сей', 'соб', 'справити', 'терт

Pipeline(memory=None,
         steps=[('vect',
                 <__main__.ThreeFeatureBagOfWords object at 0x7f9db16a3390>),
                ('clf',
                 LogisticRegression(C=0.01, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=10000,
                                    multi_class='auto', n_jobs=-1, penalty='l2',
                                    random_state=None, solver='newton-cg',
                                    tol=0.0001, verbose=0, warm_start=False))],
         verbose=False)

In [111]:
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.69      0.65      0.67       874
     neutral       0.52      0.14      0.22       661
         pos       0.66      0.93      0.77      1302

    accuracy                           0.66      2837
   macro avg       0.62      0.57      0.55      2837
weighted avg       0.64      0.66      0.61      2837



In [112]:
y_pred_all = pipeline.predict(X_3_all)

In [113]:
print(classification_report(y_all, y_pred_all))

              precision    recall  f1-score   support

         neg       0.50      0.67      0.57      4389
     neutral       0.29      0.20      0.24      3211
         pos       0.95      0.94      0.94     54809

    accuracy                           0.88     62409
   macro avg       0.58      0.60      0.58     62409
weighted avg       0.88      0.88      0.88     62409

