In [102]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def read_text(name_txt):
  with open(name_txt, 'r') as file:
    text = file.readlines()
  return ' '.join(text)

In [205]:
data = pd.read_csv('encoded-museums.csv', delimiter='\\t', engine='python')

In [206]:
data.head()

Unnamed: 0,"""text","stars"""
0,"""Надзвичайно містичне та історичне місце , чуд...","5"""
1,"""Цікаве місце. Нам дуже пощастило поспілкувати...","5"""
2,"""В цьому музеї ви не проведете багато часу. В ...","5"""
3,"""Не музей. Просто стара хата. Навіть частина. ...","4"""
4,"""Надзвичайно містичне та історичне місце , чуд...","5"""


In [207]:
data = data.dropna()
train_data = data['"text']
train_label = data['stars"']
train_x = [sentence[1:] for sentence in train_data]
Y = [sentence[:-1] for sentence in train_label]
train_y = []
for mark in Y:
  if mark in ['5']:
    train_y.append('positive')
  else:
    train_y.append('negative')


In [208]:
def tokenize(data):
  tokens = []
  for line in data:
    tokens.append(nltk.word_tokenize(line.lower()))
  return tokens

train_x = tokenize(train_x)

In [209]:
def remove_punctuation(data):
  punctuation = [',', '.', '/', '\\', '|', '\'', '\'\'', '\"', '«', '»', '-', '—', '%', '`', '``', '(', ')', '!', '?', ':', '’'] 
  tokens_without_punct = []
  for token_sentence in data:
    tok = [token for token in token_sentence if token not in punctuation]
    tokens_without_punct.append(tok)
  return tokens_without_punct

train_x = remove_punctuation(train_x)

In [210]:
def remove_stopwords(data):
  uk_stop_words = set(read_text('stopwords-uk.txt').split())
  tokens_without_sw = []
  for token_sentence in data:
    tok = [token for token in token_sentence if token not in uk_stop_words]
    tokens_without_sw.append(tok)
  return tokens_without_sw

train_x = remove_stopwords(train_x)

In [211]:
def list_to_string(data):
  list_text = []
  for text in data:
    list_text.append(' '.join(text))
  return list_text

train_x = list_to_string(train_x)

In [212]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [213]:
train_x, test_x, train_y, test_y = train_test_split(train_x, train_y, train_size = .75, stratify=train_y)

In [214]:
abc_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('sgd_clf', AdaBoostClassifier())])
rfc_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knb_clf', RandomForestClassifier())])
gbc_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('sgd_clf', GradientBoostingClassifier())])

In [215]:
abc_ppl_clf.fit(train_x, train_y)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('sgd_clf', AdaBoostClassifier())])

In [217]:
rfc_ppl_clf.fit(train_x, train_y)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('knb_clf', RandomForestClassifier())])

In [218]:
gbc_ppl_clf.fit(train_x, train_y)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('sgd_clf', GradientBoostingClassifier())])

In [216]:
predicted_abc = abc_ppl_clf.predict(test_x)
print(metrics.classification_report(predicted_abc, test_y))

              precision    recall  f1-score   support

    negative       0.22      0.63      0.33       131
    positive       0.96      0.81      0.88      1511

    accuracy                           0.79      1642
   macro avg       0.59      0.72      0.60      1642
weighted avg       0.90      0.79      0.83      1642



In [219]:
predicted_rfc = rfc_ppl_clf.predict(test_x)
print(metrics.classification_report(predicted_rfc, test_y))

              precision    recall  f1-score   support

    negative       0.26      0.68      0.38       145
    positive       0.96      0.82      0.88      1497

    accuracy                           0.80      1642
   macro avg       0.61      0.75      0.63      1642
weighted avg       0.90      0.80      0.84      1642



In [220]:
predicted_gbc = gbc_ppl_clf.predict(test_x)
print(metrics.classification_report(predicted_gbc, test_y))

              precision    recall  f1-score   support

    negative       0.17      0.75      0.27        83
    positive       0.98      0.80      0.88      1559

    accuracy                           0.80      1642
   macro avg       0.58      0.77      0.58      1642
weighted avg       0.94      0.80      0.85      1642

