In [1]:
import pandas as pd
import numpy as np
from langdetect import detect
import re
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import csv
import pymorphy2

morph = pymorphy2.MorphAnalyzer(lang='uk')

In [2]:
df_comments = pd.read_csv('comments.csv', usecols=[0,1,3,4,5,6,7,8,9,10])
df_comments.head()

Unnamed: 0,category_name,comment_id,dignity,is_from_buyer,product_name,mark,percent_dignity,replies_number,shortcomings,text
0,Мінеральна вода,45534917,,True,Упаковка минеральной лечебно-столовой сильнога...,5.0,0,0,,"Все класно, тільки довго везуть. Привезли чере..."
1,Мінеральна вода,45503604,,False,Упаковка минеральной лечебно-столовой сильнога...,0.0,100,0,,Тара стекло или пэт-пластик?
2,Мінеральна вода,45412834,,True,Упаковка минеральной лечебно-столовой сильнога...,5.0,0,1,,Рекомендую
3,Мінеральна вода,41870818,,True,Упаковка минеральной лечебно-столовой сильнога...,5.0,100,1,,Да водичка супер!
4,Мінеральна вода,41169471,,True,Упаковка минеральной лечебно-столовой сильнога...,5.0,100,1,,Подскажите это оригинальная вода ???


In [3]:
#drop rows with no marks
df_comments = df_comments[df_comments.mark != 0.0]
df_comments = df_comments[df_comments['mark'].notna()]
# drop rows with no text
df_comments = df_comments.dropna(subset=['text', 'dignity', 'shortcomings'], how='all').reset_index(drop=True)
df_comments.drop_duplicates(inplace = True)
df_comments['mark'].value_counts()

5.0    24514
4.0     5794
3.0     2106
2.0     1256
1.0     1202
Name: mark, dtype: int64

###### Detect language for every comment 

In [5]:
def detect_lang (text):
    if text and re.search(r'[А-я]+', text):
        return detect(text)

df_comments['lang'] = df_comments['text'].fillna('').apply(detect_lang)
df_comments['lang'].value_counts()

ru    27024
uk     5314
bg      500
mk      386
en        4
it        2
fr        1
ca        1
et        1
de        1
af        1
ro        1
Name: lang, dtype: int64

###### Leave comments in Ukrainian

In [6]:
df_comments = df_comments[df_comments.lang == 'uk'].reset_index(drop=True)
df_comments['mark'].value_counts()

5.0    3626
4.0     979
3.0     350
2.0     181
1.0     178
Name: mark, dtype: int64

###### Add label to every comment

In [7]:
def choose_label (label):
    if label:
        if 5 >= label >= 4:
            return 'positive'
        else: return 'negative'

df_comments['label'] = df_comments['mark'].apply(choose_label)
df_comments['label'].value_counts()

positive    4605
negative     709
Name: label, dtype: int64

In [9]:
def clean_text (text):
    return BeautifulSoup(text, 'lxml').get_text().lower()

# make downsampling 

# Shuffle the Dataset.
shuffled_df = df_comments.sample(frac=1,random_state=4)
shuffled_df

neg_comments = shuffled_df.loc[shuffled_df['label'] == 'negative']
pos_comments = shuffled_df.loc[shuffled_df['label'] == 'positive'].sample(n=709,random_state=42)

# Concatenate both dataframes again
normalized_df = pd.concat([pos_comments, neg_comments])
normalized_df['clean_text'] = normalized_df['text'].apply(clean_text)
normalized_df['tokens'] = normalized_df['text'].apply(word_tokenize)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(normalized_df['clean_text'], normalized_df['label'],
                                                    test_size = 0.3, random_state = 42,
                                                    stratify = normalized_df['label'])

### 1. Baseline 

In [11]:
vectorizer = CountVectorizer()
train_vec = vectorizer.fit_transform(X_train).toarray()

test_vec = vectorizer.transform(X_test).toarray()

In [12]:
lrc = LogisticRegression(solver='sag', max_iter = 1000, random_state=42)
lrc.fit(train_vec, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
predicted_y = lrc.predict(test_vec)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

    negative       0.76      0.74      0.75       213
    positive       0.75      0.77      0.76       213

    accuracy                           0.75       426
   macro avg       0.75      0.75      0.75       426
weighted avg       0.75      0.75      0.75       426



### 2. Improved decision with lemmas, bi-grams

In [16]:
def get_lemmas (doc):
    doc_lemmas = []
    for text in doc:
        sent_lemmas = []
        for token in word_tokenize(text):
            lemma = morph.parse(token)[0].normal_form
            sent_lemmas.append(lemma)
        sent = " ".join(sent_lemmas)
        doc_lemmas.append(sent)
    return doc_lemmas

X_train_lemmas = get_lemmas(X_train)
X_test_lemmas = get_lemmas(X_test)
        
vectorizer = CountVectorizer(analyzer = 'char', ngram_range = (1,5))

train_vec = vectorizer.fit_transform(X_train_lemmas).toarray()

test_vec = vectorizer.transform(X_test_lemmas).toarray()

In [17]:
lrc = LogisticRegression(solver='sag', max_iter = 1000, random_state=42)
lrc.fit(train_vec, y_train)

predicted_y = lrc.predict(test_vec)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

    negative       0.78      0.73      0.75       213
    positive       0.75      0.79      0.77       213

    accuracy                           0.76       426
   macro avg       0.76      0.76      0.76       426
weighted avg       0.76      0.76      0.76       426





### 3. Improved decision with lemmas, bi-grams + negations, count pos/neg words 

In [18]:
# positive/negative words that are not in tone-dict, but used in -alco domain

negative_nouns = {'шмурдяк', 'пойло', 'пійло', 'бурда', 'бодяга', 'бадяга', 'моча', 'лайно'}
negative_adjs = {'незбалансований'}
positive_nouns = {'агонь', 'вогонь', 'топ', 'топчік'}
positive_adjs = {'насичений', 'ароматний', 'запашний', 'духм\'яний', 'тягучий'}

with open('tone-dict-uk.tsv', "r") as f:
    f = csv.reader(f, delimiter="\t", quotechar='"')
    for word in f:
        
        token = word[0]
        label = word[1]
        pos_tag = morph.parse(token)[0].tag
        
        if (label == '-2' or label == '-1') and 'ADJF' in pos_tag:
            negative_adjs.add(token)
        elif (label == '-2' or label == '-1') and 'NOUN' in pos_tag:
            negative_nouns.add(token)
        elif (label == '2' or label == '1') and 'ADJF' in pos_tag:
            positive_adjs.add(token)
        elif (label == '2' or label == '1') and 'NOUN' in pos_tag:
            positive_nouns.add(token)

In [20]:
def prepare_data (data):
    "Returns data with processed negations and counts the freq of negative and positive tokens is text"
    data_w_negation = []
    sentiment = []
    
    # process negation in text
    for sent in data:
        sent = re.sub(r'(\bне) (\w+)', r'\1_\2', sent)
        data_w_negation.append(sent)

    # count negative/positive tokens in every comment
        tokenized_text = sent.split()
        count_pos = 0
        count_neg = 0

        for token in tokenized_text:
            if token in positive_adjs or token in positive_nouns:
                count_pos += 1
            if token in negative_adjs or token in negative_nouns:
                count_neg += 1

        sentiment.append([count_pos, count_neg])
    sentiment = np.array(sentiment)
        
    return data_w_negation, sentiment
    

X_train, train_sentiment = prepare_data(X_train_lemmas)
X_test, test_sentiment = prepare_data(X_test_lemmas)


vectorizer = CountVectorizer(analyzer = 'char', ngram_range = (1,5))
train_vec = vectorizer.fit_transform(X_train).toarray()
test_vec = vectorizer.transform(X_test).toarray()

# add sentiment count to vector
train_vec = np.hstack((train_vec, train_sentiment))
test_vec = np.hstack((test_vec, test_sentiment))

In [21]:
lrc = LogisticRegression(solver='sag', max_iter = 1000, random_state=42)
lrc.fit(train_vec, y_train)

predicted_y = lrc.predict(test_vec)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

    negative       0.81      0.77      0.79       213
    positive       0.78      0.82      0.80       213

    accuracy                           0.79       426
   macro avg       0.79      0.79      0.79       426
weighted avg       0.79      0.79      0.79       426





### Cross-validation 

In [22]:
from sklearn.model_selection import cross_val_score

log_reg = LogisticRegression(solver = 'sag', random_state=42)

scores = cross_val_score(log_reg, train_vec, y_train, cv=5, scoring='f1_macro', n_jobs = -1)
sum(scores) / len(scores)

0.7801286899942095

### Feature extraction

In [23]:
verbs = {'сподобатися', 'подобатися', 'купувати', 'купити'} 
properties = {'смак', 'колір', 'запах', 'аромат', 'консистенція', 'якість', 'пляшка', 'бутилка'
              'нотка', 'нота', 'післясмак', 'ціна', 'вигляд', 'пляшка'}
drink_name = {'напій', 'вино', 'винчік', 'винішко', 'винцо', 'лікер', 'віскі', 'коньяк', 'коньчок',
              'водка', 'горілка', 'ром', 'джин', 'текіла', 'самбука', 'грапа', 'граппа', 'кальвадос',
              'пиво', 'пивас', 'пивасік', 'тонік', 'товар', 'шампанське', 
              'шампунь', 'шампусік'}
neg_verbs = {'зкурвитися', 'погіршитися'}

def has_verb_negation (text):
    for i in range(len(text)):
        if morph.parse(text[i])[0].normal_form in verbs and text[i-1] == 'не':
            return True
    return False
        
def count_pos_words (text):
    count = 0
    for i in range(len(text)):
        normal_form = morph.parse(text[i])[0].normal_form
        if (normal_form in positive_adjs or normal_form in positive_nouns) and text[i-1] != 'не':
            count += 1        
    return count

def count_neg_words (text):
    count = 0
    for i in range(len(text)):
        normal_form = morph.parse(text[i])[0].normal_form
        if (normal_form in negative_adjs or normal_form in negative_nouns) and text[i-1] != 'не':
            count += 1        
    return count

def has_neg_property (text):
    for i in range(len(text)):
        if morph.parse(text[i])[0].normal_form in properties and morph.parse(text[i-1])[0].normal_form in negative_adjs:
            return True
    return False

def is_good_drink (text):
    for i in range(len(text)):
        if morph.parse(text[i])[0].normal_form in drink_name \
        and morph.parse(text[i-1])[0].normal_form in positive_adjs:
            return True
    return False

In [24]:
features = pd.DataFrame()
features['count_neg_words'] = normalized_df['tokens'].apply(count_neg_words)
features['count_pos_words'] = normalized_df['tokens'].apply(count_pos_words)
features['has_verb_negation'] = normalized_df['tokens'].apply(has_verb_negation)
features['has_neg_property'] = normalized_df['tokens'].apply(has_neg_property)
features['is_good_drink'] = normalized_df['tokens'].apply(is_good_drink)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(features, normalized_df['label'],
                                                    test_size = 0.3, random_state = 42,
                                                    stratify = normalized_df['label'])

lrc1 = LogisticRegression(random_state=42, solver='sag')
lrc1.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
predicted_y = lrc1.predict(X_test)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

    negative       0.64      0.68      0.66       213
    positive       0.66      0.61      0.63       213

    accuracy                           0.65       426
   macro avg       0.65      0.65      0.65       426
weighted avg       0.65      0.65      0.65       426

