In [1]:
import json
import os
import pprint
from math import log
from collections import Counter
from random import shuffle
from langdetect import detect
import tokenize_uk
import pymorphy2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

In [2]:
morph = pymorphy2.MorphAnalyzer(lang='uk')

In [55]:
with open('./rozetka_all.json') as f:
    reviews = json.load(f)

In [56]:
all_reviews = [r for r in reviews if r \
               and (r['text'].strip() and r['rating'] \
                or r.get('pros', '').strip() or r.get('cons', '').strip())]

In [None]:
ukr_reviews = []
for review in all_reviews:
    text = review['text']
    pros = review.get('pros')
    cons = review.get('cons')

    try:
        t_lang = detect(text)
        p_lang = detect(text) if pros else t_lang
        c_lang = detect(text) if cons else t_lang
        if t_lang == 'uk' and p_lang == 'uk' and c_lang == 'uk':
            ukr_reviews.append(review)
    except Exception:
        pass

In [None]:
# needed once
# with open('./rozetka_uk.json', 'w', encoding='utf-8') as f:
#     f.write(json.dumps(ukr_reviews, ensure_ascii=False))

In [3]:
# DEBUG
with open('./rozetka_uk.json') as f:
    ukr_reviews = json.load(f)

In [61]:
class NaiveBayesClassifier:
    def __init__(self, text_processor):
        self.text_processor = text_processor

    def get_features(self):
        feature_words = []

        for review in self.X_data:
            processed = self.text_processor(review['text'])

            for word in processed:
                if word not in feature_words:
                    feature_words.append(word)

        features = {x: i for i, x in enumerate(feature_words)}
        features.update({'UNK': len(features)})

        self.features = features

        return features

    def get_senses(self):
        pos = []
        neg = []
        neut = []

        for review in self.X_data:
            text = self.text_processor(review['text'])
            sens = review['sens']

            if sens == 'neut':
                for t in text:

                    word = t
                    neut.append(word)
            elif sens == 'pos':
                for t in text:
                    word = t
                    pos.append(word)
            else:
                for t in text:
                    word = t
                    neg.append(word)
        return pos, neg, neut

    def get_feature_counts_by_class(self):
        pos, neg, neut = self.get_senses()
        features = self.get_features()

        count = {'pos': [], 'neg': [], 'neut': []}
        cnt_pos = Counter(pos)
        cnt_neg = Counter(neg)
        cnt_neut = Counter(neut)

        for w in features.keys():
            pos_c = cnt_pos[w]
            neg_c = cnt_neg[w]
            neut_c = cnt_neut[w]
            count['pos'].append(pos_c)
            count['neg'].append(neg_c)
            count['neut'].append(neut_c)
        return count

    def get_feature_weights_by_class(self):
        res = {}
        feat_counts = self.get_feature_counts_by_class()

        for k, v in feat_counts.items():
            res[k] = [log(x/len(v)) if x else log(0.1/len(v)) for x in v]
        return res

    def calculate_bias_by_class(self):
        pos = 0
        neg = 0
        neut = 0
        all_count = len(self.X_data)

        for review in self.X_data:
            sens = review['sens']

            if sens == 'neut':
                neut += 1
            elif sens == 'pos':
                pos += 1
            else:
                neg += 1

        return {'pos': round(log(pos/all_count), 4),
                'neg': round(log(neg/all_count), 4),
                'neut': round(log(neut/all_count), 4)
                }

    def predict_class(self, text, weights, bias):
        text_words = self.text_processor(text)
        features = self.features

        p_pos = bias['pos'] + sum(weights['pos'][features.get(
            word, features['UNK'])] for word in text_words)
        p_neg = bias['neg'] + sum(weights['neg'][features.get(
            word, features['UNK'])] for word in text_words)
        p_neut = bias['neut'] + sum(weights['neut'][features.get(
            word, features['UNK'])] for word in text_words)

        p_dict = {'pos': p_pos, 'neg': p_neg, 'neut': p_neut}

        return max(p_dict, key=p_dict.get)

    def fit(self, X_data):
        self.X_data = X_data
        self.bias = self.calculate_bias_by_class()
        self.weights = self.get_feature_weights_by_class()

    def predict(self, y_data):
        res = []

        for review in y_data:
            text = review['text']
            pros = review.get('pros', '')
            cons = review.get('cons', '')
            all_text = text + pros + cons
            res.append(self.predict_class(all_text, self.weights, self.bias))
        return res


def get_corpus(reviews):
    res = []
    for review in reviews:
        rating = review['rating']
        text = review['text']
        pros = review.get('pros')
        cons = review.get('cons')

        rev_text = {'text': text}
        rev_pros = {'text': pros, 'sens': 'pos'} if pros else None
        rev_cons = {'text': cons, 'sens': 'neg'} if cons else None

        if rating:
            if rating == 5:
                rev_text['sens'] = 'pos'
            elif rating >= 3:
                rev_text['sens'] = 'neut'
            else:
                rev_text['sens'] = 'neg'
            res.append(rev_text)

        if rev_pros:
            res.append(rev_pros)

        if rev_cons:
            res.append(rev_cons)

    pos = [x for x in res if x['sens'] == 'pos']
    neg = [x for x in res if x['sens'] == 'neg']
    neut = [x for x in res if x['sens'] == 'neut']
    min_len = min(len(pos), len(neg), len(neut))
    pos_f = pos[:min_len]
    neg_f = neg[:min_len]
    neut_f = neut[:min_len]

    res_normalized = pos_f + neg_f + neut_f
    print('Lenght of each cohort:', min_len)
    shuffle(res_normalized)
    return res_normalized


def divide_data(data):
    data_len = int(len(data) * 0.7)
    return data[:data_len], data[data_len:]


""" text processors START """


def tokenize_text(text):
    return tokenize_uk.tokenize_uk.tokenize_words(text)


def lowerize_text(text):
    return [word.lower() for word in tokenize_text(text)]


def _lemmatize(text):
    res = []
    for word in tokenize_text(text):
        m_word = morph.parse(word)[0]
        res.append((m_word.normal_form, m_word))
    return res


def lemmatize_text(text):
    return [x for x, _ in _lemmatize(text)]


def filterize_text(text):
    res = []
    lemmatized = _lemmatize(text)

    symbols = ['-', '+', ':', '<', '>', '&']
    invalid_pos = ['CONJ', 'INTJ', 'PREP', 'NPRO']
    invalid_non_oc_pos = ['NUMB,intg', 'NUMB,real', 'ROMN', 'PNCT', 'LATN']

    for word, m_word in lemmatized:
        if len(word) and str(m_word.tag) not in invalid_non_oc_pos and \
                m_word.tag.POS not in invalid_pos and \
                word not in symbols:
            res.append(word)

    return res


def negatiaze_text(text):
    res = []
    words = filterize_text(text)

    for i, word in enumerate(words):
        p = morph.parse(word)[0]
        if (p.tag.POS == 'ADJF' or p.tag.POS == 'VERB' or p.tag.POS == 'INFN') \
                and words[i-1] == 'не':
            res.append(f'не_{word}')
        else:
            res.append(word)

    return res


def filterize_text_q(text):
    res = []
    if text.endswith('?'):
        return []
    return negatiaze_text(text)


def ngrammaze_text(text, additional_preproc=None):
    if additional_preproc:
        words = ' '.join(additional_preproc(text))
    else:
        words = text
    return [words[i:i + 3] if i > 0 else '^' + words[i:i + 3] for i in range(0, len(words), 1)]


""" text processors END """


def get_X(reviews):
    return [x['text'] for x in reviews]


def get_y(reviews):
    return [x['sens'] for x in reviews]


def get_classification_report(preprocess_fn, train_data, test_data, y_target):
    cls = NaiveBayesClassifier(preprocess_fn)
    cls.fit(train_data)
    test_predict = cls.predict(test_data)
    print(classification_report(y_target, test_predict))


def get_cross_validation_report(preprocess_fn, X_train, y_train, X_test, y_target):
    vect = CountVectorizer(tokenizer=preprocess_fn)
    cls_1 = Pipeline([('vect', vect), ('cls', MultinomialNB())])
    cls_1.fit(X_train, y_train)
    cls_1.predict
    scoring = {'accuracy': make_scorer(accuracy_score),
               'precision': make_scorer(precision_score, average='macro'),
               'recall': make_scorer(recall_score, average='macro'),
               'f1_macro': make_scorer(f1_score, average='macro'),
               'f1_weighted': make_scorer(f1_score, average='weighted')}
    res = cross_validate(cls_1, X_test, y_target, return_train_score=True)
    pp = pprint.PrettyPrinter(indent=4, compact=True)
    pp.pprint(res)

In [5]:
corpus = get_corpus(ukr_reviews)

Length of each cohort: 1080


In [6]:
train_data, test_data = divide_data(corpus)

In [7]:
X_train = get_X(train_data)
X_test = get_X(test_data)
y_train = get_y(train_data)
y_target = get_y(test_data)

In [8]:
get_classification_report(tokenize_text)

              precision    recall  f1-score   support

         neg       0.91      0.34      0.50       326
        neut       0.41      0.98      0.57       315
         pos       0.81      0.22      0.35       331

    accuracy                           0.51       972
   macro avg       0.71      0.51      0.47       972
weighted avg       0.71      0.51      0.47       972



In [24]:
get_cross_validation_report(tokenize_text)

{   'fit_time': array([0.03127098, 0.02997279, 0.03269792, 0.03125978, 0.03050375]),
    'score_time': array([0.00904202, 0.00991797, 0.00869703, 0.01004219, 0.01065016]),
    'test_accuracy': array([0.62564103, 0.68717949, 0.56185567, 0.58247423, 0.61340206]),
    'test_f1_macro': array([0.62737363, 0.67610681, 0.5542328 , 0.58058608, 0.61049818]),
    'test_f1_weighted': array([0.62737601, 0.67640747, 0.55340779, 0.57989691, 0.61025254]),
    'test_precision': array([0.70160727, 0.74371629, 0.59736145, 0.63728938, 0.72633053]),
    'test_recall': array([0.62890813, 0.6895899 , 0.56510157, 0.58553669, 0.61697192])}


In [13]:
get_classification_report(lowerize_text)

              precision    recall  f1-score   support

         neg       0.92      0.36      0.52       326
        neut       0.40      0.97      0.56       315
         pos       0.75      0.18      0.28       331

    accuracy                           0.49       972
   macro avg       0.69      0.50      0.46       972
weighted avg       0.69      0.49      0.45       972



In [25]:
get_cross_validation_report(lowerize_text)

{   'fit_time': array([0.03264093, 0.03215694, 0.03589129, 0.03449416, 0.03251028]),
    'score_time': array([0.00977206, 0.01093817, 0.00927997, 0.01086593, 0.01020694]),
    'test_accuracy': array([0.65641026, 0.66666667, 0.57731959, 0.60309278, 0.59278351]),
    'test_f1_macro': array([0.65861142, 0.6521153 , 0.57005177, 0.60366023, 0.58953623]),
    'test_f1_weighted': array([0.65873464, 0.65236794, 0.56937127, 0.60289281, 0.58921748]),
    'test_precision': array([0.7202701 , 0.74380872, 0.62855841, 0.63705853, 0.68900585]),
    'test_recall': array([0.65945166, 0.66923016, 0.58081178, 0.60564621, 0.5959707 ])}


In [30]:
get_classification_report(lemmatize_text)

              precision    recall  f1-score   support

         neg       0.94      0.36      0.52       326
        neut       0.39      0.99      0.56       315
         pos       0.81      0.14      0.24       331

    accuracy                           0.49       972
   macro avg       0.71      0.50      0.44       972
weighted avg       0.72      0.49      0.44       972



In [31]:
get_cross_validation_report(lemmatize_text)

{   'fit_time': array([1.47227502, 1.50447416, 1.57949591, 1.44217491, 1.43997693]),
    'score_time': array([0.35878015, 0.41877699, 0.29701304, 0.407794  , 0.40071225]),
    'test_accuracy': array([0.69230769, 0.65641026, 0.61340206, 0.59793814, 0.64948454]),
    'test_f1_macro': array([0.69847591, 0.64840349, 0.60594441, 0.6030658 , 0.64863941]),
    'test_f1_weighted': array([0.69922169, 0.64838907, 0.60515513, 0.60295841, 0.64883207]),
    'test_precision': array([0.74565286, 0.73095238, 0.65353813, 0.64513187, 0.71585567]),
    'test_recall': array([0.69432419, 0.65910739, 0.61687942, 0.5998742 , 0.65103785])}


In [32]:
get_classification_report(filterize_text)

              precision    recall  f1-score   support

         neg       0.94      0.37      0.53       326
        neut       0.41      0.97      0.57       315
         pos       0.83      0.24      0.38       331

    accuracy                           0.52       972
   macro avg       0.72      0.53      0.49       972
weighted avg       0.73      0.52      0.49       972



In [33]:
get_cross_validation_report(filterize_text)

{   'fit_time': array([1.57367492, 1.50916982, 1.59950614, 1.48378277, 1.44924498]),
    'score_time': array([0.35865712, 0.42318201, 0.30774689, 0.38333893, 0.40724301]),
    'test_accuracy': array([0.6974359 , 0.68717949, 0.58762887, 0.62886598, 0.65979381]),
    'test_f1_macro': array([0.70146877, 0.67970203, 0.58510331, 0.63496668, 0.66175363]),
    'test_f1_weighted': array([0.70220932, 0.67911327, 0.58468232, 0.63551453, 0.66208518]),
    'test_precision': array([0.72576119, 0.72463261, 0.60462448, 0.65869299, 0.69563724]),
    'test_recall': array([0.6986532 , 0.68878562, 0.58962519, 0.62944833, 0.66032486])}


In [34]:
get_classification_report(negatiaze_text)

              precision    recall  f1-score   support

         neg       0.93      0.43      0.58       326
        neut       0.42      0.97      0.59       315
         pos       0.82      0.24      0.37       331

    accuracy                           0.54       972
   macro avg       0.72      0.55      0.51       972
weighted avg       0.73      0.54      0.51       972



In [35]:
get_cross_validation_report(negatiaze_text)

{   'fit_time': array([2.62964416, 2.51945972, 2.69767189, 2.54449725, 2.60344887]),
    'score_time': array([0.5982008 , 0.74097037, 0.51896906, 0.68202972, 0.71885109]),
    'test_accuracy': array([0.7025641 , 0.69230769, 0.63402062, 0.62886598, 0.67010309]),
    'test_f1_macro': array([0.70715409, 0.68512428, 0.63190089, 0.63364405, 0.67314222]),
    'test_f1_weighted': array([0.70791464, 0.68448238, 0.63140784, 0.63407864, 0.67367853]),
    'test_precision': array([0.73195539, 0.7181338 , 0.65098145, 0.65975469, 0.70936149]),
    'test_recall': array([0.7037037 , 0.69391383, 0.63618974, 0.62977393, 0.67050357])}


In [36]:
get_classification_report(filterize_text_q)

              precision    recall  f1-score   support

         neg       0.92      0.44      0.60       326
        neut       0.43      0.97      0.59       315
         pos       0.83      0.24      0.38       331

    accuracy                           0.55       972
   macro avg       0.73      0.55      0.52       972
weighted avg       0.73      0.55      0.52       972



In [37]:
get_cross_validation_report(filterize_text_q)

{   'fit_time': array([2.64487386, 2.43449402, 2.63043094, 2.52366281, 2.50279093]),
    'score_time': array([0.58888984, 0.72793293, 0.52039123, 0.6685791 , 0.69508004]),
    'test_accuracy': array([0.70769231, 0.67692308, 0.62371134, 0.63917526, 0.65979381]),
    'test_f1_macro': array([0.71225899, 0.67184372, 0.62254615, 0.64317665, 0.66276134]),
    'test_f1_weighted': array([0.71317409, 0.67140836, 0.62207051, 0.64362264, 0.66335936]),
    'test_precision': array([0.73216447, 0.70529334, 0.63796959, 0.66586662, 0.70138017]),
    'test_recall': array([0.70827321, 0.67772493, 0.62560773, 0.63987494, 0.66008436])}


In [42]:
get_classification_report(lambda x: ngrammaze_text(x, lowerize_text))

              precision    recall  f1-score   support

         neg       0.89      0.38      0.53       326
        neut       0.40      1.00      0.57       315
         pos       0.82      0.12      0.21       331

    accuracy                           0.49       972
   macro avg       0.70      0.50      0.44       972
weighted avg       0.71      0.49      0.44       972



In [43]:
get_cross_validation_report(lambda x: ngrammaze_text(x, lowerize_text))

{   'fit_time': array([0.08786774, 0.08607125, 0.09123993, 0.08525681, 0.08237219]),
    'score_time': array([0.02169704, 0.0260098 , 0.02031374, 0.0247829 , 0.02431417]),
    'test_accuracy': array([0.67179487, 0.66153846, 0.67525773, 0.64948454, 0.65979381]),
    'test_f1_macro': array([0.67307976, 0.65487451, 0.67322107, 0.65391523, 0.65991654]),
    'test_f1_weighted': array([0.67331337, 0.65412287, 0.67262047, 0.65382265, 0.65994816]),
    'test_precision': array([0.69669988, 0.67592593, 0.68595952, 0.67169162, 0.6933808 ]),
    'test_recall': array([0.67364117, 0.66421616, 0.67715248, 0.65060495, 0.66096866])}


In [44]:
get_classification_report(lambda x: ngrammaze_text(x, lemmatize_text))

              precision    recall  f1-score   support

         neg       0.89      0.37      0.52       326
        neut       0.39      1.00      0.56       315
         pos       0.86      0.09      0.17       331

    accuracy                           0.48       972
   macro avg       0.72      0.49      0.42       972
weighted avg       0.72      0.48      0.41       972



In [45]:
get_cross_validation_report(lambda x: ngrammaze_text(x, lemmatize_text))

{   'fit_time': array([1.56223011, 1.4749372 , 1.60342288, 1.51111603, 1.48056793]),
    'score_time': array([0.3701551 , 0.43318677, 0.31703329, 0.39748788, 0.42143583]),
    'test_accuracy': array([0.68717949, 0.6974359 , 0.68556701, 0.61340206, 0.67525773]),
    'test_f1_macro': array([0.69005721, 0.69372439, 0.68320226, 0.61746566, 0.6772963 ]),
    'test_f1_weighted': array([0.69055473, 0.6935486 , 0.68283533, 0.61727617, 0.67749621]),
    'test_precision': array([0.71137351, 0.71355813, 0.70406273, 0.64838136, 0.70696794]),
    'test_recall': array([0.68855219, 0.69918539, 0.68773449, 0.61510342, 0.67635328])}


In [46]:
get_classification_report(lambda x: ngrammaze_text(x, filterize_text))

              precision    recall  f1-score   support

         neg       0.95      0.30      0.46       326
        neut       0.37      1.00      0.54       315
         pos       0.86      0.06      0.11       331

    accuracy                           0.44       972
   macro avg       0.73      0.45      0.37       972
weighted avg       0.73      0.44      0.37       972



In [47]:
get_cross_validation_report(lambda x: ngrammaze_text(x, filterize_text))

{   'fit_time': array([1.59539604, 1.47582912, 1.58996987, 1.50018907, 1.51606607]),
    'score_time': array([0.35130596, 0.44019294, 0.31457806, 0.40962577, 0.43612194]),
    'test_accuracy': array([0.64615385, 0.68717949, 0.6443299 , 0.6185567 , 0.67525773]),
    'test_f1_macro': array([0.64652442, 0.68515896, 0.64483218, 0.62213076, 0.67654041]),
    'test_f1_weighted': array([0.64681345, 0.68513503, 0.64452926, 0.62226185, 0.67688957]),
    'test_precision': array([0.65070644, 0.70793871, 0.6542624 , 0.63674524, 0.70091324]),
    'test_recall': array([0.64670515, 0.6882875 , 0.64540645, 0.61934732, 0.67515078])}


In [48]:
get_classification_report(lambda x: ngrammaze_text(x, filterize_text_q))

              precision    recall  f1-score   support

         neg       0.95      0.30      0.45       326
        neut       0.37      1.00      0.54       315
         pos       0.87      0.06      0.11       331

    accuracy                           0.44       972
   macro avg       0.73      0.45      0.37       972
weighted avg       0.74      0.44      0.37       972



In [49]:
get_cross_validation_report(lambda x: ngrammaze_text(x, filterize_text_q))

{   'fit_time': array([2.63989401, 2.51032233, 2.7345171 , 2.55634475, 2.54128098]),
    'score_time': array([0.59502196, 0.72793388, 0.54447985, 0.69707012, 0.73798299]),
    'test_accuracy': array([0.66666667, 0.67692308, 0.66494845, 0.62886598, 0.67525773]),
    'test_f1_macro': array([0.66874698, 0.67664768, 0.66498165, 0.63200937, 0.67717524]),
    'test_f1_weighted': array([0.66946274, 0.67670002, 0.66453685, 0.63231631, 0.67764922]),
    'test_precision': array([0.6748158 , 0.697263  , 0.67251884, 0.64078624, 0.70504148]),
    'test_recall': array([0.66666667, 0.67754269, 0.66624487, 0.62912273, 0.67515078])}


In [50]:
get_classification_report(lambda x: ngrammaze_text(x, negatiaze_text))

              precision    recall  f1-score   support

         neg       0.95      0.30      0.45       326
        neut       0.37      1.00      0.54       315
         pos       0.86      0.06      0.11       331

    accuracy                           0.44       972
   macro avg       0.73      0.45      0.37       972
weighted avg       0.73      0.44      0.36       972



In [51]:
get_cross_validation_report(lambda x: ngrammaze_text(x, negatiaze_text))

{   'fit_time': array([2.72987318, 2.55923009, 2.76168394, 2.63325906, 2.57849503]),
    'score_time': array([0.62254691, 0.73774409, 0.52852488, 0.724365  , 0.71830297]),
    'test_accuracy': array([0.66153846, 0.68717949, 0.65979381, 0.62371134, 0.67010309]),
    'test_f1_macro': array([0.66237509, 0.68549109, 0.66021975, 0.62683532, 0.6724804 ]),
    'test_f1_weighted': array([0.66282558, 0.68541464, 0.65989767, 0.62700816, 0.67285759]),
    'test_precision': array([0.66878283, 0.71094218, 0.66818983, 0.63701997, 0.70065551]),
    'test_recall': array([0.66209716, 0.6882875 , 0.66095386, 0.62431272, 0.67010027])}


In [52]:
get_classification_report(ngrammaze_text)

              precision    recall  f1-score   support

         neg       0.89      0.41      0.57       326
        neut       0.41      1.00      0.58       315
         pos       0.80      0.15      0.25       331

    accuracy                           0.51       972
   macro avg       0.70      0.52      0.47       972
weighted avg       0.71      0.51      0.46       972



In [53]:
get_cross_validation_report(ngrammaze_text)

{   'fit_time': array([0.06954122, 0.06615901, 0.07011938, 0.06650329, 0.06421614]),
    'score_time': array([0.01671195, 0.02056694, 0.01598072, 0.01860404, 0.01903796]),
    'test_accuracy': array([0.66153846, 0.64615385, 0.67525773, 0.6443299 , 0.62886598]),
    'test_f1_macro': array([0.66294789, 0.63373402, 0.67370659, 0.64511197, 0.62674173]),
    'test_f1_weighted': array([0.66330093, 0.63274974, 0.67322756, 0.64468967, 0.62647224]),
    'test_precision': array([0.68305041, 0.66315525, 0.69547157, 0.67935587, 0.67596331]),
    'test_recall': array([0.66305916, 0.64962611, 0.67747068, 0.64676435, 0.63115403])}
