In [119]:
import re
import warnings
warnings.filterwarnings('ignore')
import pickle

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from sklearn.linear_model import (
    LogisticRegression
)
from sklearn.model_selection import (
    cross_val_score,
    RandomizedSearchCV,
    train_test_split
)
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score
)
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer
)
from sklearn.linear_model import (
    LogisticRegression,
    SGDClassifier
)
from sklearn.svm import (
    SVC,
    LinearSVC
)
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier
)
from sklearn.naive_bayes import (
    GaussianNB
)
from sklearn.neural_network import (
    MLPClassifier
)
from sklearn.neighbors import (
    KNeighborsClassifier
)

import pymystem3


In [2]:
df = pd.read_json("train_yandex_market.json", lines=True)
df.head(5)

Unnamed: 0,text,rating
0,"несколько месяцев Смартфон, действительно очен...",5
1,"менее месяца яркость экрана, 5000 мач, удобно ...",5
2,"менее месяца Удобно лежит в руке, отличные фот...",5
3,"менее месяца Обычный смартфон, хорошее качеств...",4
4,несколько месяцев Хорошая связь. 4g_3g мобильн...,4


In [3]:
X = df.text
y = df.rating

In [4]:
y13_45 = y.map(lambda x: 1 if x > 3 else 0)
y14_5 = y.map(lambda x: 1 if x > 4 else 0)

In [5]:
print(y13_45.value_counts(), end="\n\n")
print(y14_5.value_counts())

1    54057
0     9431
Name: rating, dtype: int64

1    46100
0    17388
Name: rating, dtype: int64


In [6]:
regex_spec_char = re.compile(r"\W")
regex_multi_spaces = re.compile(r"\s+")
regex_digits = re.compile(r"\d+")

In [102]:
# Convert everything to lowercase
X_proc = X.map(lambda sen: sen.lower())

# Remove special characters
X_proc = X_proc.map(lambda sen: regex_spec_char.sub(r" ", sen))

# Remove all digits
X_proc = X_proc.map(lambda sen: regex_digits.sub(r" ", sen))

# Convert all spaces to one
X_proc = X_proc.map(lambda sen: regex_multi_spaces.sub(r" ", sen))

X_proc.head(10)

0    несколько месяцев смартфон действительно очень...
1    менее месяца яркость экрана мач удобно лежит в...
2    менее месяца удобно лежит в руке отличные фото...
3    менее месяца обычный смартфон хорошее качество...
4    несколько месяцев хорошая связь g_ g мобильная...
5    несколько месяцев быстрый хорошая камера не на...
6    менее месяца отличный экран приятный современн...
7    несколько месяцев в комплекте есть прозрачный ...
8    несколько месяцев телефон супер нет отличный т...
9    несколько месяцев экран супер герц очень радуе...
Name: text, dtype: object

In [8]:
%%time
snow_stemmer = SnowballStemmer("russian")
lem = pymystem3.Mystem()

X_stem = X_proc.map(lambda sentence: ' '.join(snow_stemmer.stem(word) for word in nltk.word_tokenize(sentence)))
X_lem = X_proc.map(lambda sentence: ''.join(lem.lemmatize(sentence)))

CPU times: user 3min 1s, sys: 1.21 s, total: 3min 2s
Wall time: 5min 15s


In [103]:
X_stem.head(5)

0    нескольк месяц смартфон действительн очен шуст...
1    мен месяц яркост экра мач удобн леж в рук огро...
2    мен месяц удобн леж в рук отличн фот за сво де...
3    мен месяц обычн смартфон хорош качеств сборк э...
4    нескольк месяц хорош связ g_ g мобильн связ бе...
Name: text, dtype: object

In [104]:
X_lem.head(5)

0    несколько месяц смартфон действительно очень ш...
1    менее месяц яркость экран мач удобно лежать в ...
2    мало месяц удобно лежать в рука отличный фото ...
3    менее месяц обычный смартфон хороший качество ...
4    несколько месяц хороший связь g_ g мобильный с...
Name: text, dtype: object

In [9]:
custom_stopwords = ['во', 'что', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да',
                    'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'еще',
                    'о', 'из', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 'или', 'него', 'до',
                    'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 'потом', 'себя', 'ничего', 'ей', 'может',
                    'они', 'тут', 'где', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 'чем', 'была', 'сам', 'чтоб',
                    'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 'кто', 'этот', 'того',
                    'потому', 'этого', 'какой', 'совсем', 'ним', 'здесь', 'этом', 'один', 'почти', 'мой', 'тем',
                    'чтобы', 'нее', 'сейчас', 'куда', 'всех', 'можно', 'при', 'наконец', 'два', 'об', 'другой',
                    'хоть', 'после', 'над', 'больше', 'тот', 'через', 'эти', 'нас', 'про', 'всего', 'них', 'какая',
                    'разве', 'три', 'эту', 'моя', 'впрочем', 'свою', 'этой', 'перед', 'иногда', 'чуть', 'том', 'такой', 'им', 'всегда', 'всю', 'между']

In [61]:
parameters_grid_vectorizer_bow = {
    'vectorizer__max_df' : [0.8, 0.9, 1.0],
    'vectorizer__analyzer': ('word', 'char', 'char_wb'),
    'vectorizer__min_df' : [1, 2, 3], 
    'vectorizer__ngram_range' : [(1, 1), (1, 2), (2, 2), (2, 3)],
    'vectorizer__stop_words' : [custom_stopwords, None],
    'vectorizer__binary': (False, True),
    'vectorizer__lowercase': (False, True)
}

parameters_grid_vectorizer_tfidf = {
    'vectorizer__max_df' : [0.8, 0.9, 1.0],
    'vectorizer__analyzer': ('word', 'char', 'char_wb'),
    'vectorizer__min_df' : [1, 2, 3], 
    'vectorizer__ngram_range' : [(1, 1), (1, 2), (2, 2), (2, 3)],
    'vectorizer__stop_words' : [custom_stopwords, None],
    'vectorizer__use_idf': (False, True),
    'vectorizer__binary': (False, True),
    'vectorizer__smooth_idf': (False, True),
    'vectorizer__sublinear_tf': (False, True),
}


parameters_grid_sgd = {
    'classifier__loss': ('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'),
    'classifier__penalty': ('l2', 'l1', 'elasticnet'),
    'classifier__class_weight': ('balanced', None)   
}
parameters_grid_svc = {
    'classifier__C': (0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4),
    'classifier__kernel': ('linear', 'poly', 'rbf'),
    'classifier__degree': (2, 3, 4, 5, 6),
    'classifier__tol': (1e-4, 1e-5),
    'classifier__gamma': ('scale', 'auto'),
    'classifier__class_weight': ('balanced', None),
}
parameters_grid_linsvc = {
    'classifier__penalty': ('l1', 'l2'),
    'classifier__loss': ('hinge', 'squared_hinge'),
    'classifier__C': (0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4),
    'classifier__multi_class': ('ovr', 'crammer_singer'),
    'classifier__intercept_scaling': (1.0, 1.1, 1.2, 1.3),
    'classifier__class_weight': ('balanced', None)   
}
parameters_grid_randforest = {
    'classifier__n_estimators': (50, 100, 200),
    'classifier__criterion': ('gini', 'entropy'),
    'classifier__max_features': ('auto', 'sqrt', 'log2'),
    'classifier__class_weight': ('balanced', 'balanced_subsample', None)
}
parameters_grid_logreg = {
    'classifier__C' : (0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, ), 
    'classifier__max_iter' : (50, 100, 150, 200, 250, 300), 
    'classifier__solver' : ('lbfgs', 'liblinear'), 
    'classifier__penalty' : (None, 'l2', 'l1'), 
}

In [11]:
def make_pipeline(vectorizer, classifier):
    return Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', classifier)
        ])


def make_estimator(vectorizer, classifier, params_grid, scorer, data, labels):
    pipeline = make_pipeline(vectorizer, classifier)
    grid_cv = RandomizedSearchCV(pipeline, params_grid, scoring=scorer, cv=5, random_state=42, n_iter=100)
    grid_cv.fit(data, labels)
    return grid_cv

In [62]:
X_test_stem = X_stem[:6000]
X_test_lem = X_lem[:6000]

y_test_13_45 = y13_45[:6000]
y_test_14_5 = y14_5[:6000]

## Log. Reg.

In [122]:
%%time
log_reg_tfidf_stem_13_45 = make_estimator(
    TfidfVectorizer(),
    LogisticRegression(random_state=111),
    {**parameters_grid_vectorizer_tfidf, **parameters_grid_logreg},
    'accuracy',
    X_test_stem,
    y_test_13_45
)
log_reg_tfidf_stem_13_45.best_score_

CPU times: user 5min 55s, sys: 17 s, total: 6min 12s
Wall time: 5min 21s


0.9123333333333333

In [123]:
%%time
log_reg_tfidf_stem_14_5 = make_estimator(
    TfidfVectorizer(),
    LogisticRegression(random_state=111),
    {**parameters_grid_vectorizer_tfidf, **parameters_grid_logreg},
    'accuracy',
    X_test_stem,
    y_test_14_5
)
log_reg_tfidf_stem_14_5.best_score_

CPU times: user 6min 12s, sys: 16.8 s, total: 6min 28s
Wall time: 5min 30s


0.8438333333333334

In [124]:
%%time
log_reg_tfidf_lem_13_45 = make_estimator(
    TfidfVectorizer(),
    LogisticRegression(random_state=111),
    {**parameters_grid_vectorizer_tfidf, **parameters_grid_logreg},
    'accuracy',
    X_test_lem,
    y_test_13_45
)
log_reg_tfidf_lem_13_45.best_score_

CPU times: user 6min 47s, sys: 17.5 s, total: 7min 5s
Wall time: 6min 11s


0.9123333333333334

In [125]:
%%time
log_reg_tfidf_lem_14_5 = make_estimator(
    TfidfVectorizer(),
    LogisticRegression(random_state=111),
    {**parameters_grid_vectorizer_tfidf, **parameters_grid_logreg},
    'accuracy',
    X_test_lem,
    y_test_14_5
)
log_reg_tfidf_lem_14_5.best_score_

CPU times: user 6min 40s, sys: 17.4 s, total: 6min 57s
Wall time: 5min 56s


0.8470000000000001

## SGD

In [19]:
%%time
sgd_clf_tfidf_stem_13_45 = make_estimator(
    TfidfVectorizer(),
    SGDClassifier(random_state=111),
    {**parameters_grid_vectorizer_tfidf, **parameters_grid_sgd},
    'accuracy',
    X_test_stem,
    y_test_13_45
)
sgd_clf_tfidf_stem_13_45.best_score_

CPU times: user 10min 21s, sys: 13.6 s, total: 10min 34s
Wall time: 9min 54s


0.925

In [127]:
%%time
sgd_clf_tfidf_stem_14_5 = make_estimator(
    TfidfVectorizer(),
    SGDClassifier(random_state=111),
    {**parameters_grid_vectorizer_tfidf, **parameters_grid_sgd},
    'accuracy',
    X_test_stem,
    y_test_14_5
)
sgd_clf_tfidf_stem_14_5.best_score_

CPU times: user 12min 32s, sys: 15.5 s, total: 12min 48s
Wall time: 12min 10s


0.8634999999999999

In [20]:
%%time
sgd_clf_tfidf_lem_13_45 = make_estimator(
    TfidfVectorizer(),
    SGDClassifier(random_state=111),
    {**parameters_grid_vectorizer_tfidf, **parameters_grid_sgd},
    'accuracy',
    X_test_lem,
    y_test_13_45
)
sgd_clf_tfidf_lem_13_45.best_score_

CPU times: user 11min 43s, sys: 17.9 s, total: 12min 1s
Wall time: 11min 30s


0.9251666666666667

In [26]:
sgd_clf_tfidf_lem_13_45.best_params_

{'vectorizer__use_idf': True,
 'vectorizer__sublinear_tf': True,
 'vectorizer__stop_words': None,
 'vectorizer__smooth_idf': False,
 'vectorizer__ngram_range': (1, 2),
 'vectorizer__min_df': 1,
 'vectorizer__max_df': 1.0,
 'vectorizer__binary': True,
 'vectorizer__analyzer': 'word',
 'classifier__penalty': 'elasticnet',
 'classifier__loss': 'hinge',
 'classifier__class_weight': None}

In [129]:
%%time
sgd_clf_tfidf_lem_14_5 = make_estimator(
    TfidfVectorizer(),
    SGDClassifier(random_state=111),
    {**parameters_grid_vectorizer_tfidf, **parameters_grid_sgd},
    'accuracy',
    X_test_lem,
    y_test_14_5
)
sgd_clf_tfidf_lem_14_5.best_score_

CPU times: user 13min 34s, sys: 17.1 s, total: 13min 51s
Wall time: 13min 50s


0.8603333333333334

## lin. svc

In [17]:
linsvc_clf_tfidf_stem_13_45 = make_estimator(
    TfidfVectorizer(),
    LinearSVC(random_state=111),
    {**parameters_grid_vectorizer_tfidf, **parameters_grid_linsvc},
    'accuracy',
    X_test_stem,
    y_test_13_45
)
linsvc_clf_tfidf_stem_13_45.best_score_

0.9269999999999999

In [131]:
linsvc_clf_tfidf_stem_14_5 = make_estimator(
    TfidfVectorizer(),
    LinearSVC(random_state=111),
    {**parameters_grid_vectorizer_tfidf, **parameters_grid_linsvc},
    'accuracy',
    X_test_stem,
    y_test_14_5
)
linsvc_clf_tfidf_stem_14_5.best_score_

0.8591666666666666

In [18]:
linsvc_clf_tfidf_lem_13_45 = make_estimator(
    TfidfVectorizer(),
    LinearSVC(random_state=111),
    {**parameters_grid_vectorizer_tfidf, **parameters_grid_linsvc},
    'accuracy',
    X_test_lem,
    y_test_13_45
)
linsvc_clf_tfidf_lem_13_45.best_score_

0.9271666666666667

In [21]:
linsvc_clf_tfidf_lem_13_45.best_params_

{'vectorizer__use_idf': False,
 'vectorizer__sublinear_tf': True,
 'vectorizer__stop_words': None,
 'vectorizer__smooth_idf': False,
 'vectorizer__ngram_range': (1, 2),
 'vectorizer__min_df': 1,
 'vectorizer__max_df': 1.0,
 'vectorizer__binary': True,
 'vectorizer__analyzer': 'word',
 'classifier__penalty': 'l1',
 'classifier__multi_class': 'crammer_singer',
 'classifier__loss': 'squared_hinge',
 'classifier__intercept_scaling': 1.0,
 'classifier__class_weight': 'balanced',
 'classifier__C': 1.3}

In [133]:
linsvc_clf_tfidf_lem_14_5 = make_estimator(
    TfidfVectorizer(),
    LinearSVC(random_state=111),
    {**parameters_grid_vectorizer_tfidf, **parameters_grid_linsvc},
    'accuracy',
    X_test_lem,
    y_test_14_5
)
linsvc_clf_tfidf_lem_14_5.best_score_

0.8576666666666668

## Random Forest

In [13]:
rf_clf_tfidf_13_45 = make_estimator(
    TfidfVectorizer(),
    RandomForestClassifier(random_state=111, n_jobs=4),
    {**parameters_grid_vectorizer_tfidf, **parameters_grid_randforest},
    'accuracy',
    X_test_stem,
    y_test_13_45 
)
rf_clf_tfidf_13_45.best_score_

0.9008333333333333

In [14]:
rf_clf_tfidf_14_5 = make_estimator(
    TfidfVectorizer(),
    RandomForestClassifier(random_state=111, n_jobs=4),
    {**parameters_grid_vectorizer_tfidf, **parameters_grid_randforest},
    'accuracy',
    X_test_stem,
    y_test_14_5
)
rf_clf_tfidf_14_5.best_score_

0.8246666666666667

In [15]:
rf_clf_tfidf_lem_13_45 = make_estimator(
    TfidfVectorizer(),
    RandomForestClassifier(random_state=111, n_jobs=4),
    {**parameters_grid_vectorizer_tfidf, **parameters_grid_randforest},
    'accuracy',
    X_test_lem,
    y_test_13_45
)
rf_clf_tfidf_lem_13_45.best_score_

0.9006666666666666

In [16]:
rf_clf_tfidf_lem_14_5 = make_estimator(
    TfidfVectorizer(),
    RandomForestClassifier(random_state=111, n_jobs=4),
    {**parameters_grid_vectorizer_tfidf, **parameters_grid_randforest},
    'accuracy',
    X_test_lem,
    y_test_14_5
)
rf_clf_tfidf_lem_14_5.best_score_

0.8234999999999999

## Trainig

In [106]:
%%time
pipe_1 = make_pipeline(
    vectorizer=TfidfVectorizer(ngram_range=(1, 3)),
    classifier=LinearSVC(class_weight='balanced')
)

print(cross_val_score(pipe_1, X_lem, y13_45, cv=4, n_jobs=4).mean())

0.925907258064516
CPU times: user 157 ms, sys: 125 ms, total: 282 ms
Wall time: 37.1 s


In [111]:
X_train, X_test, y_train, y_test = train_test_split(X_lem, y13_45, test_size=0.2)

In [113]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
X_transf = vectorizer.fit_transform(X_lem)

X_train, X_test, y_train, y_test = train_test_split(X_transf, y13_45, test_size=0.2)

linsvc_clf = LinearSVC(class_weight='balanced')
linsvc_clf.fit(X_train, y_train)

In [115]:
print(classification_report(y_test, linsvc_clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.81      0.72      0.76      1908
           1       0.95      0.97      0.96     10790

    accuracy                           0.93     12698
   macro avg       0.88      0.84      0.86     12698
weighted avg       0.93      0.93      0.93     12698



In [117]:
print(f1_score(y_test, linsvc_clf.predict(X_test)))

0.9604587155963302


In [122]:
linsvc_clf.fit(X_transf, y13_45)

In [123]:
with open("linsvc_clf.pkl", "wb") as file:
    pickle.dump(linsvc_clf, file)

In [124]:
with open("tfidf_vectorizer.pkl", "wb") as file:
    pickle.dump(vectorizer, file)