In [4]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.metrics import classification_report, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


In [5]:
# Split configuration: fraction of rows held out for validation and the seed
# that makes the split reproducible.
VALIDATION_SPLIT = 0.1
RANDOM_SEED = 42

# The corpus is a '|'-separated CSV; the columns used below are
# 'opinion_text' (model input) and 'opinion_rating' (target label).
train = pd.read_csv("../data/ukrainian_reviews_corpus.csv", sep='|')

# Stratify on the rating so the training and validation parts keep the same
# class proportions. The call is wrapped in parentheses instead of relying on
# backslash continuations (the original ended with a stray trailing '\').
data_train, data_val, labels_train, labels_val = train_test_split(
    train['opinion_text'],
    train['opinion_rating'],
    test_size=VALIDATION_SPLIT,
    random_state=RANDOM_SEED,
    stratify=train['opinion_rating'],
)


In [6]:
# Preview the first five rows of the loaded corpus to sanity-check the columns.
train.head(n=5)

Unnamed: 0,opinion_rating,opinion_text
0,0,"хороше місце щоб провести вечір, кальяни смачн..."
1,0,взуттям задоволена. беру не перший раз. мінус...
2,0,"відмінний магазин та чемний продавець, єдиним ..."
3,0,"замовляв дві штуки, а відправили 1"
4,0,замовляю вже втретє в цьому магазині і планую ...


In [7]:
# Class balance check: number of reviews per rating label.
rating_column = train['opinion_rating']
rating_column.value_counts()

0    15182
1     9380
Name: opinion_rating, dtype: int64

In [13]:
# Character-level TF-IDF features: n-grams of 1 to 7 characters, with the
# vocabulary capped at one million entries.
char_tfidf = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 7),
    max_features=1000000,
)

# Logistic-regression classifier with C=50.
logreg = LogisticRegression(C=50)

# NOTE(review): the final step is labelled 'svm' although the estimator is a
# LogisticRegression. The label is kept unchanged so that any lookup of the
# step or its parameters by name keeps working.
pipeline_lr = Pipeline(steps=[('tfidf', char_tfidf), ('svm', logreg)])

In [14]:
# Fit the vectorizer and the classifier on the training part only; the
# validation texts are never seen during fitting.
pipeline_lr.fit(X=data_train, y=labels_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000000, min_df=1,
        ngram_range=(1, 7), norm='l2', preprocessor=None, smooth_idf=True...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [15]:
# Predict rating labels for the held-out validation texts.
y_pred = pipeline_lr.predict(X=data_val)

In [16]:
# Per-class precision / recall / F1 on the validation split. The report is a
# pre-formatted string, so it is printed rather than displayed as a repr.
report_text = classification_report(labels_val, y_pred)
print(report_text)

             precision    recall  f1-score   support

          0       0.95      0.95      0.95      1519
          1       0.93      0.92      0.93       938

avg / total       0.94      0.94      0.94      2457



In [17]:
# F1 of the positive class (label 1) only, spelled out explicitly; this is
# not the class-averaged figure from the classification report.
f1_score(labels_val, y_pred, average='binary', pos_label=1)

0.92529348986125926