In [1]:
import requests
import json

def products_url(cat_id, page=1):
    """Build the catalog API URL listing the goods of a category.

    cat_id -- value substituted into the ``category_id`` query parameter.
    page   -- value substituted into the ``page`` query parameter (default 1).
    """
    template = 'https://xl-catalog-api.rozetka.com.ua/v2/goods/get?front-type=xl&category_id={}&page={}&sort=rank'
    return template.format(cat_id, page)

def comments_url(product_id, page=1):
    """Build the product API URL for one page of a product's comments.

    product_id -- value substituted into the ``goods`` query parameter.
    page       -- value substituted into the ``page`` query parameter (default 1).
    """
    template = 'https://product-api.rozetka.com.ua/v3/comments/get?front-type=xl&goods={}&page={}&sort=date&limit=10'
    return template.format(product_id, page)

def parse_comments(d):
    """Yield one simplified dict per comment found in a decoded API response.

    ``d`` must contain ``d['data']['comments']``, an iterable of comment
    dicts. Each yielded dict has the keys ``mark``, ``text``, ``pros`` and
    ``cons``, copied from the comment's ``mark``, ``text``, ``dignity`` and
    ``shortcomings`` entries respectively.
    """
    # (output key, key in the API comment)
    field_map = (
        ('mark', 'mark'),
        ('text', 'text'),
        ('pros', 'dignity'),
        ('cons', 'shortcomings'),
    )
    for raw_comment in d['data']['comments']:
        yield {out_key: raw_comment[api_key] for out_key, api_key in field_map}

def parse_product_comments(product_id):
    """Yield every comment of a product, walking all comment pages.

    The first page is fetched to learn the page count
    (``body['data']['pages']['count']``); pages 2..count are then fetched
    lazily as the generator is consumed. Each page is converted with
    ``parse_comments``.

    Raises ``requests.HTTPError`` on a non-2xx response and a requests
    timeout exception if the server does not answer within 30 seconds.
    """
    def fetch_page(page):
        # Without a timeout a stalled connection would block the crawl forever.
        resp = requests.get(comments_url(product_id, page), timeout=30)
        # Fail loudly on HTTP errors instead of trying to parse an error body.
        resp.raise_for_status()
        return json.loads(resp.text)

    body = fetch_page(1)
    pages_num = body['data']['pages']['count']

    yield from parse_comments(body)

    for page in range(2, pages_num + 1):
        yield from parse_comments(fetch_page(page))

def parse_products(d):
    """Yield the comments of every product listed in a catalog response.

    ``d`` must contain ``d['data']['ids']``, an iterable of product ids; the
    comments of each product are produced by ``parse_product_comments`` in
    listing order.
    """
    product_ids = d['data']['ids']
    for product_id in product_ids:
        comments_of_product = parse_product_comments(product_id)
        yield from comments_of_product

def parse_category(cat_id):
    """Yield every comment of every product in a category.

    The first catalog page is fetched to learn the page count
    (``body['data']['total_pages']``); pages 2..total are then fetched lazily
    as the generator is consumed. Each page is expanded with
    ``parse_products``.

    Raises ``requests.HTTPError`` on a non-2xx response and a requests
    timeout exception if the server does not answer within 30 seconds.
    """
    def fetch_page(page):
        # Without a timeout a stalled connection would block the crawl forever.
        resp = requests.get(products_url(cat_id, page), timeout=30)
        # Fail loudly on HTTP errors instead of trying to parse an error body.
        resp.raise_for_status()
        return json.loads(resp.text)

    body = fetch_page(1)
    pages_num = body['data']['total_pages']

    yield from parse_products(body)

    for page in range(2, pages_num + 1):
        yield from parse_products(fetch_page(page))

In [2]:
# Category ids, passed as ``category_id`` to the catalog API via parse_category.
tvs_cat = 80037
smart_boxes_cat = 80015
tv_remotes_cat = 80070
tuners_cat = 165692
home_theatres_cat = 84535
av_receivers_cat = 283322

# Smoke test: pull just the first comment of the TV category.
next(parse_category(tvs_cat))

{'mark': None, 'text': 'Як повернути товар?', 'pros': '', 'cons': ''}

In [3]:
import time
import itertools

def drain_gracefully(gen, delay=0.2):
    """Consume ``gen`` into a list, pausing after each item is pulled.

    gen   -- any iterable; it is fully consumed.
    delay -- seconds to sleep after each item is obtained (default 0.2, the
             value that used to be hard-coded). The pause throttles whatever
             work the iterable does lazily while producing items.

    Returns the list of items in iteration order.
    """
    drained = []
    for item in gen:
        time.sleep(delay)
        drained.append(item)
    return drained

# parse_category returns lazy generators: no request is made until they are consumed.
tvs_comments = parse_category(tvs_cat)
sb_comments = parse_category(smart_boxes_cat)
tv_remotes_comments = parse_category(tv_remotes_cat)
tuners_comments = parse_category(tuners_cat) 
home_theatres_comments = parse_category(home_theatres_cat)
av_receivers_comments = parse_category(av_receivers_cat)

# One lazy stream over all six categories, in the order listed.
all_comments = itertools.chain(tvs_comments, sb_comments, tv_remotes_comments, tuners_comments, home_theatres_comments, av_receivers_comments)

# Draining runs the whole crawl with a 0.2 s pause per comment, so this
# takes a long time -- go to sleep now.
all_comments = drain_gracefully(all_comments)
len(all_comments)

63146

In [4]:
# Dump the scraped comments to disk as JSON.
with open('all-comments.json', 'w') as f:
    json.dump(all_comments, f)

In [5]:
import re
def sanitize_comments(comments):
    """Strip ``<br>`` tags from the text, pros and cons of each comment, in place.

    comments -- iterable of dicts with the keys ``text``, ``pros`` and
                ``cons``; falsy values (None, '') are left untouched.

    Returns None; the dicts are mutated.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    br_tag = re.compile(r'<br\s*/?>')

    # Iterate the argument, not the notebook-global ``all_comments``.
    for comment in comments:
        for field in ('text', 'pros', 'cons'):
            if comment[field]:
                # NOTE(review): the tag is removed without inserting a space,
                # so words separated only by <br> get joined -- kept as is.
                comment[field] = br_tag.sub('', comment[field])

# Strip <br> tags from the scraped comments (mutates the dicts in place).
sanitize_comments(all_comments)

In [6]:
import cld2
def is_ukrainian(text):
    """Return True when cld2 reliably detects ``text`` as Ukrainian.

    Empty or None text is never Ukrainian and is answered without calling
    the detector; this keeps callers that have already nulled a field safe
    to run again.
    """
    if not text:
        return False
    is_reliable, _, details = cld2.detect(text)
    # details[0] is the top-ranked detection; its first element is the language name.
    return is_reliable and details[0][0] == 'UKRAINIAN'

# Sanity check: the first sentence is Ukrainian, the second is not.
print(is_ukrainian('я люблю котів'))
print(is_ukrainian('я люблю котов'))

True
False


In [7]:
def nullify_non_ukrainian(comments):
    """Set to None every text/pros/cons field that is not detected as Ukrainian.

    The comment dicts are mutated in place; nothing is returned.
    """
    checked_fields = ('text', 'pros', 'cons')
    for comment in comments:
        for field in checked_fields:
            if is_ukrainian(comment[field]):
                continue
            comment[field] = None

# Drop (set to None) every field that is not reliably Ukrainian; mutates in place.
nullify_non_ukrainian(all_comments)

In [8]:
import stanza
# Ukrainian stanza pipeline; the processors it loads are listed in the log below.
nlp = stanza.Pipeline(lang='uk')

2020-04-10 00:43:27 INFO: Loading these models for language: uk (Ukrainian):
| Processor | Package |
-----------------------
| tokenize  | iu      |
| mwt       | iu      |
| pos       | iu      |
| lemma     | iu      |
| depparse  | iu      |

2020-04-10 00:43:27 INFO: Use device: cpu
2020-04-10 00:43:27 INFO: Loading: tokenize
2020-04-10 00:43:27 INFO: Loading: mwt
2020-04-10 00:43:27 INFO: Loading: pos
2020-04-10 00:43:28 INFO: Loading: lemma
2020-04-10 00:43:28 INFO: Loading: depparse
2020-04-10 00:43:29 INFO: Done loading processors!


In [9]:
def enrich_comments(comments):
    """Attach a parsed ``nlp`` document to each comment, in place.

    For each of ``text``, ``pros`` and ``cons`` a sibling key
    (``text_model``, ``pros_model``, ``cons_model``) is added. It holds
    ``nlp(value)`` when the value is truthy; otherwise both the model key and
    the field itself are set to None (so '' is normalised to None).
    """
    # (raw field, key that receives the parsed document)
    field_pairs = (
        ('text', 'text_model'),
        ('pros', 'pros_model'),
        ('cons', 'cons_model'),
    )
    for comment in comments:
        for raw_key, model_key in field_pairs:
            if comment[raw_key]:
                comment[model_key] = nlp(comment[raw_key])
                continue
            comment[model_key] = None
            comment[raw_key] = None

# Parse every remaining field with the stanza pipeline (adds the *_model keys in place).
enrich_comments(all_comments)

In [10]:
# Word patterns that mark a pros/cons answer as a non-answer ("none", "not
# found yet", ...). They are applied with ``match``, i.e. anchored at the
# start of a word. Raw strings avoid invalid-escape warnings for ``\w``.
cons_stop_regexps = [
    re.compile(r'відсутні'),
    re.compile(r'ніяких'),
    re.compile(r'\w*вияв\w+'),  # e.g. "невиявлені"
    re.compile(r'нема\w?'),  # e.g. "нема", "немає"
    re.compile(r'\w*знай[шд]\w+'),  # e.g. "незнайшов", "незнайдені"
    re.compile(r'\w*бачи[вл]\w*'),  # e.g. "непобачив", "небачив"
    re.compile(r'\w*поміти[вл]\w*'),
    re.compile(r'недолік\w*'),
    re.compile(r'невідом[іо]')
]

def is_relevant_pros_cons(model):
    """Decide whether a parsed pros/cons field carries a real opinion.

    model -- parsed document exposing ``sentences``; each sentence exposes
             ``words`` whose items have ``text`` and ``upos``.

    Returns False as soon as any word matches one of ``cons_stop_regexps``,
    or when "має"/"знаю" is directly preceded by "не". Otherwise returns
    True only if at least one adjective or adverb was seen.
    """
    has_adj_adv = False
    for sent in model.sentences:
        # Use the position in the sentence rather than ``w.id`` so the check
        # does not depend on the id being a string.
        for idx, w in enumerate(sent.words):
            if w.upos in ('ADJ', 'ADV'):
                has_adj_adv = True
            if any(r.match(w.text) for r in cons_stop_regexps):
                return False
            # Compare the previous word's *text*; comparing the word object
            # itself to a string is always False, so this rule never fired.
            if (w.text in ('має', 'знаю')
                    and idx > 0
                    and sent.words[idx - 1].text.lower() == 'не'):
                return False
    return has_adj_adv
    
# Sanity checks: the first eleven inputs are non-answers or contain no
# adjective/adverb (expected False); the last two are real opinions
# (expected True).
print(is_relevant_pros_cons(nlp('-')))
print(is_relevant_pros_cons(nlp('невиявлені.')))
print(is_relevant_pros_cons(nlp('не виявлені.')))
print(is_relevant_pros_cons(nlp('поки що не виявлені.')))
print(is_relevant_pros_cons(nlp('Поки що не знайшов.')))
print(is_relevant_pros_cons(nlp('Поки не виявлено, стоїть поки на декілька годин відвгрівається після доставки.')))
print(is_relevant_pros_cons(nlp('немає')))
print(is_relevant_pros_cons(nlp('не має')))
print(is_relevant_pros_cons(nlp('Поки не побачив')))
print(is_relevant_pros_cons(nlp('пульт, ціна')))
print(is_relevant_pros_cons(nlp('недоліки не виявлені.')))
print(is_relevant_pros_cons(nlp('погано працює.')))
print(is_relevant_pros_cons(nlp('поганий пульт.')))

False
False
False
False
False
False
False
False
False
False
False
True
True


In [11]:
def has_question(doc):
    """Return True if any word of the parsed document has the lemma '?'."""
    return any(
        word.lemma == '?'
        for sentence in doc.sentences
        for word in sentence.words
    )

# Sanity check: a sentence ending in a question mark is detected.
has_question(nlp('який виробник?'))

True

In [55]:
def mark_to_sentiment(mark):
    """Map a numeric star rating to a sentiment label.

    4 and above -> 'positive', exactly 3 -> 'neutral', 2 and below ->
    'negative'. Any other value (e.g. 2.5) yields None.
    """
    if mark >= 4:
        return 'positive'
    if mark == 3:
        return 'neutral'
    if mark <= 2:
        return 'negative'
    return None

def prepare_dataset(comments):
    """Turn enriched comments into parallel lists of documents and labels.

    Per comment, up to three samples are produced, in this order:
      * the free text, labelled via ``mark_to_sentiment(mark)``;
      * the pros field, labelled 'positive';
      * the cons field, labelled 'negative'.

    A raw string is used at most once across the whole dataset (``seen`` is
    shared by all three fields), and anything containing a question is
    dropped. Returns the tuple ``(X, y)``.
    """
    seen = set()
    X, y = [], []

    # (raw field, parsed-document field, label) for the pros/cons samples
    side_fields = (
        ('pros', 'pros_model', 'positive'),
        ('cons', 'cons_model', 'negative'),
    )

    for c in comments:
        # Obvious junk: identical non-empty pros and cons.
        if c['pros'] and c['cons'] and c['pros'] == c['cons']:
            continue

        # Free text is kept when it is parsed, not a duplicate, rated, not
        # a 3-star comment and not a question. 3-star comments are in most
        # cases clearly positive or clearly negative rather than neutral, so
        # they are dropped in order not to confuse a classifier.
        keep_text = (
            c['text_model']
            and c['text'] not in seen
            and c['mark']
            and not (c['mark'] == 3)
            and not has_question(c['text_model'])
        )
        if keep_text:
            seen.add(c['text'])
            X.append(c['text_model'])
            y.append(mark_to_sentiment(c['mark']))

        # Pros / cons are kept when parsed, not duplicates, relevant and
        # not questions.
        for raw_key, model_key, label in side_fields:
            keep_side = (
                c[model_key]
                and c[raw_key] not in seen
                and is_relevant_pros_cons(c[model_key])
                and not has_question(c[model_key])
            )
            if keep_side:
                seen.add(c[raw_key])
                X.append(c[model_key])
                y.append(label)

    return (X, y)

In [13]:
# Build the dataset: X holds parsed documents, y the sentiment labels.
X, y = prepare_dataset(all_comments)
len(X)

7449

In [14]:
from sklearn.model_selection import train_test_split
import collections

# Fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Class balance of each split.
print(collections.Counter(y_train))
print(collections.Counter(y_test))

Counter({'positive': 4272, 'negative': 1314})
Counter({'positive': 1408, 'negative': 455})


In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

def make_classifier(params, vect=None):
    """Build a vectorizer + MultinomialNB pipeline.

    params -- dict of pipeline parameters (``vect__...`` / ``nb__...``);
              they override the defaults ``nb__alpha=0.001`` and
              ``vect__lowercase=False``. The dict is not modified.
    vect   -- vectorizer for the ``vect`` step; a fresh ``CountVectorizer``
              is created when omitted.

    Returns the configured, unfitted ``Pipeline``.
    """
    # Identity test: ``== None`` would invoke a custom ``__eq__`` if the
    # vectorizer defined one.
    if vect is None:
        vect = CountVectorizer()
    classifier = Pipeline([('vect', vect),
                           ('nb', MultinomialNB())])
    final_params = {'nb__alpha': 0.001,
                    'vect__lowercase': False}
    final_params.update(params)
    classifier.set_params(**final_params)

    return classifier

In [16]:
def tokenize(doc):
    """Return the lower-cased surface form of every word in the parsed document."""
    tokens = []
    for sentence in doc.sentences:
        for word in sentence.words:
            tokens.append(word.text.lower())
    return tokens

# Sanity check: punctuation is kept as a separate token.
tokenize(nlp('Я люблю котів.'))

['я', 'люблю', 'котів', '.']

In [17]:
from sklearn.metrics import classification_report

# Baseline
# Baseline: bag of lower-cased word forms, evaluated on the held-out test split.
tok_classifier = make_classifier({'vect__tokenizer': tokenize})
tok_classifier.fit(X_train, y_train)
print(classification_report(y_test, tok_classifier.predict(X_test)))

              precision    recall  f1-score   support

    negative       0.75      0.40      0.52       455
    positive       0.83      0.96      0.89      1408

    accuracy                           0.82      1863
   macro avg       0.79      0.68      0.70      1863
weighted avg       0.81      0.82      0.80      1863



In [18]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, recall_score, f1_score, precision_score

# Per-class scorers for cross-validation: recall, precision and f1, each
# restricted to a single label. Keys are '<metric>_pos' / '<metric>_neg'.
cv_scoring = {
    '{}_{}'.format(metric_name, suffix): make_scorer(metric_fn, average = None, labels = [label])
    for metric_name, metric_fn in (('recall', recall_score),
                                   ('precision', precision_score),
                                   ('f1', f1_score))
    for suffix, label in (('pos', 'positive'), ('neg', 'negative'))
}

def cross_validation_report(clf, X=X_train, y=y_train):
    """Cross-validate ``clf`` and print per-class precision, recall and f1.

    clf  -- estimator passed to ``cross_validate``.
    X, y -- samples and labels to cross-validate on; they default to the
            training split as it existed when this function was defined.

    Each metric is printed as ``mean (+/- 2 * std)`` over the folds, using
    the scorers in ``cv_scoring``. Returns None.
    """
    # Use the arguments: the globals X_train / y_train used to be passed
    # here, which silently ignored whatever the caller supplied.
    results = cross_validate(clf, X, y, scoring=cv_scoring)

    def calc(arr):
        mean = arr.mean()
        dev = arr.std() * 2

        return "%0.2f (+/- %0.2f)" % (mean, dev)

    for position, (title, suffix) in enumerate((('positive', 'pos'), ('negative', 'neg'))):
        if position:
            # Blank line between the two class sections.
            print('')
        print(title)
        print("\tprecision:\t{}".format(calc(results['test_precision_' + suffix])))
        print("\trecall:\t\t{}".format(calc(results['test_recall_' + suffix])))
        print("\tf1:\t\t{}".format(calc(results['test_f1_' + suffix])))

In [19]:
def lemmatize(doc):
    """Return the lemma of every word in the parsed document, in order."""
    lemmas = []
    for sentence in doc.sentences:
        lemmas.extend(word.lemma for word in sentence.words)
    return lemmas

# Sanity check: inflected forms are reduced to their lemmas.
lemmatize(nlp('Я люблю котів!'))

['я', 'любити', 'кіт', '!']

In [20]:
# Same pipeline as the baseline, but with lemmas as features.
lem_classifier = make_classifier({'vect__tokenizer': lemmatize})
cross_validation_report(lem_classifier)

positive
	precision:	0.85 (+/- 0.01)
	recall:		0.95 (+/- 0.01)
	f1:		0.90 (+/- 0.01)

negative
	precision:	0.75 (+/- 0.04)
	recall:		0.45 (+/- 0.04)
	f1:		0.56 (+/- 0.04)


In [21]:
# Lemmas of negation words that get merged with the word that follows them.
negatives_to_glue = ['без', 'не', 'ні', 'ані']

def lemmatize_and_glue_negatives(doc):
    """Lemmatize the document, merging each negation with the next word.

    A word whose lemma is in ``negatives_to_glue`` is joined to the lemma of
    the following word as ``'<negation>_<next lemma>'``, unless it is the
    last word of its sentence or the following word is punctuation
    (``upos == 'PUNCT'``). The word that was merged is not emitted again.

    Returns the flat list of resulting tokens.
    """
    glued = []

    for sentence in doc.sentences:
        words = sentence.words
        last_index = len(words) - 1
        consumed_by_previous = False

        for position, word in enumerate(words):
            if consumed_by_previous:
                # Already emitted as the second half of a glued token.
                consumed_by_previous = False
                continue

            can_glue = (
                word.lemma in negatives_to_glue
                and position != last_index
                and words[position + 1].upos != 'PUNCT'
            )
            if can_glue:
                glued.append(word.lemma + '_' + words[position + 1].lemma)
                consumed_by_previous = True
            else:
                glued.append(word.lemma)

    return glued

# Sanity check: "не" and "без" are merged with the lemma of the word after them.
lemmatize_and_glue_negatives(nlp('не рекомендую цього кота. без нього життя було набагато краще.'))

['не_рекомендувати',
 'цей',
 'кіт',
 '.',
 'без_він',
 'життя',
 'бути',
 'набагато',
 'краще',
 '.']

In [22]:
# Lemma features with negations glued to the following word.
gluing_classifier = make_classifier({'vect__tokenizer': lemmatize_and_glue_negatives})
cross_validation_report(gluing_classifier)

positive
	precision:	0.86 (+/- 0.00)
	recall:		0.96 (+/- 0.01)
	f1:		0.91 (+/- 0.00)

negative
	precision:	0.79 (+/- 0.03)
	recall:		0.49 (+/- 0.01)
	f1:		0.61 (+/- 0.01)


In [41]:
class CountVectorizerOov:
    """CountVectorizer wrapper that counts unseen tokens in one extra feature.

    Documents are tokenized here with ``self.tokenizer``; the wrapped
    ``CountVectorizer`` only receives token lists (its own tokenizer is the
    identity). After fitting, the token ``'<<<OOV>>>'`` is appended to the
    vocabulary, and ``transform`` maps every token that is not in the
    vocabulary to it.
    """

    def __init__(self, **kwargs):
        # Set before set_params so a construction without ``tokenizer``
        # leaves a well-defined attribute.
        self.tokenizer = None
        self.vectorizer = CountVectorizer(token_pattern=None)
        self.set_params(**kwargs)

    def set_params(self, **kwargs):
        """Store ``tokenizer`` locally and forward the rest to the wrapped vectorizer.

        The tokenizer is only replaced when ``tokenizer`` is passed
        explicitly, so a later call that sets other parameters no longer
        resets it to None. Returns ``self``.
        """
        if 'tokenizer' in kwargs:
            self.tokenizer = kwargs['tokenizer']
        # The wrapped vectorizer always receives pre-tokenized input.
        kwargs.update({'tokenizer': lambda x: x})
        self.vectorizer.set_params(**kwargs)
        return self

    def _fit_tokenized(self, X):
        """Tokenize ``X``, fit the vocabulary, add the OOV entry; return the token lists."""
        X_tokenized = [self.tokenizer(x) for x in X]
        self.vectorizer.fit(X_tokenized)
        vocabulary = self.vectorizer.vocabulary_
        # setdefault: if the marker already occurs in the data, keep its
        # index instead of pointing it past the end of the feature space.
        vocabulary.setdefault('<<<OOV>>>', len(vocabulary))
        return X_tokenized

    def fit(self, X, y=None):
        """Learn the vocabulary from ``X`` and return ``self``."""
        self._fit_tokenized(X)
        return self

    def fit_transform(self, X, y=None):
        """Learn the vocabulary from ``X`` and return its count matrix."""
        X_tokenized = self._fit_tokenized(X)
        return self.vectorizer.transform(X_tokenized)

    def get_feature_names(self):
        """Return the feature names of the wrapped vectorizer as a list."""
        if hasattr(self.vectorizer, 'get_feature_names'):
            return self.vectorizer.get_feature_names()
        # Releases without ``get_feature_names`` provide
        # ``get_feature_names_out`` instead.
        return list(self.vectorizer.get_feature_names_out())

    def transform(self, X):
        """Return the count matrix of ``X``; unknown tokens are counted as '<<<OOV>>>'."""
        # Membership in the vocabulary mapping is the same test as
        # membership in the feature-name list.
        known = getattr(self.vectorizer, 'vocabulary_', ())

        new_X = [
            [tok if tok in known else '<<<OOV>>>' for tok in self.tokenizer(x)]
            for x in X
        ]

        return self.vectorizer.transform(new_X)

In [45]:
# Demo: fit on one sentence, then transform a sentence containing an unseen
# lemma; the unseen lemma is counted in the '<<<OOV>>>' column.
vect_oov = CountVectorizerOov(tokenizer=lemmatize, lowercase=False)
print(vect_oov.fit_transform([nlp('я люблю котів')]).toarray())
print(vect_oov.get_feature_names())
print(vect_oov.transform([nlp('я люблю мишей')]).toarray())

[[1 1 1 0]]
['кіт', 'любити', 'я', '<<<OOV>>>']
[[0 1 1 1]]


In [24]:
# Glued-negation lemmas plus the out-of-vocabulary feature.
gluing_classifier_oov = make_classifier({'vect__tokenizer': lemmatize_and_glue_negatives}, vect=CountVectorizerOov())
cross_validation_report(gluing_classifier_oov)

positive
	precision:	0.88 (+/- 0.01)
	recall:		0.94 (+/- 0.01)
	f1:		0.91 (+/- 0.00)

negative
	precision:	0.75 (+/- 0.02)
	recall:		0.59 (+/- 0.04)
	f1:		0.66 (+/- 0.02)


In [68]:
# Same features, with a larger smoothing alpha and fit_prior switched off.
gluing_classifier_oov_tuned = make_classifier({'vect__tokenizer': lemmatize_and_glue_negatives, 
                                               'nb__alpha': 0.5, 
                                               'nb__fit_prior': False,
                                               }, 
                                              vect=CountVectorizerOov())
cross_validation_report(gluing_classifier_oov_tuned)

positive
	precision:	0.94 (+/- 0.01)
	recall:		0.90 (+/- 0.03)
	f1:		0.92 (+/- 0.01)

negative
	precision:	0.72 (+/- 0.05)
	recall:		0.80 (+/- 0.05)
	f1:		0.76 (+/- 0.03)


In [69]:
# Final check of the tuned model on the held-out test split.
gluing_classifier_oov_tuned.fit(X_train, y_train)
print(classification_report(y_test, gluing_classifier_oov_tuned.predict(X_test)))

              precision    recall  f1-score   support

    negative       0.71      0.79      0.75       455
    positive       0.93      0.90      0.91      1408

    accuracy                           0.87      1863
   macro avg       0.82      0.84      0.83      1863
weighted avg       0.88      0.87      0.87      1863



Things that I also tried:
* bigrams - better precision, but much lower recall - f1 score is worse;
* a tonal (sentiment) dictionary, used in two ways:
  * appending tone markers to the tokenized sentence, so that a sentence like "мені подобається бити котів" is tokenized into "я подобатися бити кіт <<positive\>\> <<very negative\>\>", because "подобається" and "бити" have tone scores of 1 and -2 respectively. Result: no improvement, the f1 score remained the same.
  * replacing words with their tone markers, so that "мені подобається бити котів" is tokenized into "я <<positive\>\> <<very negative\>\> кіт". Result: worse precision and recall.