In [7]:
import string
import re
import pandas as pd 
import numpy as np
import pymorphy2
import pickle
import tokenize_uk as tk
import gensim
import dill
from collections import defaultdict

from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.utils import shuffle, resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix

from data import lemmatize, delete_stop_words
from stop_words import get_stop_words

# Show full cell contents when rendering DataFrames (sentences are long).
# `None` is the supported "no limit" value; the legacy sentinel -1 is
# deprecated in pandas >= 1.0 and rejected by newer releases, so try None
# first and fall back to -1 only on old pandas that does not accept None.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation warnings from pandas / scikit-learn.
warnings.filterwarnings('ignore')

In [8]:
# Load the labelled dataset; later cells read its `sentence_uk` (text) and
# `y` (label) columns.
df = pd.read_csv('../../data/golden_data.csv')

In [9]:
# Row counts per label value, displayed to check class balance.
df.groupby(['y']).count()

Unnamed: 0_level_0,sentence_uk
y,Unnamed: 1_level_1
0,11114
1,11056


In [10]:
# Labels as a plain array, sentences as a Series (index preserved).
y = df['y'].values
text = df['sentence_uk']

# Hold out 20% of the rows for evaluation; stratifying on the label keeps the
# class ratio the same in both splits, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    text,
    y,
    test_size=0.2,
    random_state=1234,
    stratify=y,
)

In [11]:
# Stop-word list for language code 'uk'; passed to `delete_stop_words` before vectorizing.
stop_words = get_stop_words('uk')
# Morphological analyzer for language 'uk'; passed to `lemmatize` during preprocessing.
morph = pymorphy2.MorphAnalyzer(lang='uk')

# Punctuation and symbol characters removed from every token. Written as a raw
# string so the regex escapes (\-, \[, \|, \]) are not interpreted as (invalid)
# Python string escapes, and compiled once instead of on every call.
_PUNCT_RE = re.compile(r"""["#$%&*\-+/:;<=>@^`~…\(\)⟨⟩{}\[\|\]‒–—―«»“”‘’№]""")


def clean_data(text):
    """Strip punctuation from tokens, drop empty tokens and lowercase the rest.

    Parameters
    ----------
    text : pandas.Series
        Each element is an iterable of string tokens (one tokenized sentence).

    Returns
    -------
    pandas.Series
        Same index as ``text``; each element is a list of cleaned tokens.
        Characters matched by ``_PUNCT_RE`` are deleted from each token,
        tokens that are empty or whitespace-only after that are discarded,
        and the surviving tokens are lowercased.
    """
    def _clean_sentence(sent):
        # Remove unwanted characters first: a token consisting only of
        # punctuation becomes '' and is then filtered out below.
        stripped = (_PUNCT_RE.sub('', token) for token in sent)
        return [token.lower() for token in stripped if token.strip()]

    # Single pass over the Series instead of three consecutive `apply` calls.
    return text.apply(_clean_sentence)

In [12]:
def _tokenize_clean_lemmatize(sentences):
    """Run one split through the shared preprocessing chain.

    Each sentence is tokenized with ``tk.tokenize_words``, the token lists are
    passed through ``clean_data``, and every cleaned list is then handed to
    the project's ``lemmatize`` helper together with ``morph``.
    """
    token_lists = sentences.apply(tk.tokenize_words)
    cleaned = clean_data(token_lists)
    return cleaned.apply(lambda x: lemmatize(x, morph))


# Identical preprocessing for both splits.
X_train_clean = _tokenize_clean_lemmatize(X_train)
X_test_clean = _tokenize_clean_lemmatize(X_test)

In [13]:
# Inputs are already token lists, so the tokenizer is an identity function and
# lowercasing is switched off; the vocabulary is capped at 10000 entries.
vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=False,
    tokenizer=lambda x: x,
    max_features=10000,
)

# Stop words are removed from each split before it reaches the vectorizer.
train_tokens = X_train_clean.apply(lambda x: delete_stop_words(x, stop_words))
test_tokens = X_test_clean.apply(lambda x: delete_stop_words(x, stop_words))

# Vocabulary is learned from the training split only, then reused for test.
X_train_vec = vectorizer.fit_transform(train_tokens)
X_test_vec = vectorizer.transform(test_tokens)

In [14]:
# Linear baseline trained with SGD: log loss with an elastic-net penalty.
# NOTE(review): recent scikit-learn releases appear to spell this loss
# 'log_loss' — confirm against the installed version before upgrading.
clf = SGDClassifier(
    loss='log',
    penalty='elasticnet',
    learning_rate='optimal',
    n_jobs=-1,
    random_state=1234,
)

# `fit` returns the estimator itself, so training and prediction can be chained.
y_pred = clf.fit(X_train_vec, y_train).predict(X_test_vec)

# F1 on the held-out split (displayed as the cell output).
f1_score(y_test, y_pred)

0.8246696035242291

In [15]:
# Per-class precision / recall / F1 for the held-out predictions.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.84      0.79      0.82      2223
          1       0.80      0.85      0.82      2211

avg / total       0.82      0.82      0.82      4434



In [16]:
# Confusion matrix for the held-out predictions; in the recorded output each
# row sums to the support of one true class.
confusion_matrix(y_test, y_pred)

array([[1766,  457],
       [ 339, 1872]])

### Retrain on full data

In [17]:
# Stitch the preprocessed splits back together, train rows first and test rows
# second for both features and labels, so the two stay aligned by position.
y_all = np.concatenate((y_train, y_test))
X_all = pd.concat((X_train_clean, X_test_clean))

In [18]:
# Fresh vectorizer with the same settings as the evaluation run; this time the
# vocabulary is learned from all available sentences.
vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=False,
    tokenizer=lambda x: x,
    max_features=10000,
)
all_tokens = X_all.apply(lambda x: delete_stop_words(x, stop_words))
X_all_vec = vectorizer.fit_transform(all_tokens)

In [19]:
# Same hyper-parameters as the evaluated baseline, refit on the full dataset.
clf = SGDClassifier(
    loss='log',
    penalty='elasticnet',
    learning_rate='optimal',
    n_jobs=-1,
    random_state=1234,
)
clf.fit(X_all_vec, y_all)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=None, n_iter=None,
       n_jobs=-1, penalty='elasticnet', power_t=0.5, random_state=1234,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [24]:
# Persist the final classifier and vectorizer.
# dill (imported at the top of the notebook) is used explicitly rather than
# rebinding the stdlib name `pickle`: the vectorizer holds a lambda tokenizer,
# which the standard pickle module cannot serialize.
with open('../../bin/baseline_clf.pkl', 'wb') as f:
    dill.dump(clf, f)
with open('../../bin/bow_vect.pkl', 'wb') as f:
    dill.dump(vectorizer, f)