Приведение всех букв к нижнему регистру

In [1]:
import io

with io.open("sentences.txt", encoding='cp1251') as f, open ("lower_sentences.txt", "w") as w:
    for line in f:
        w.write(line.lower().encode('cp1251'))

f.close()
w.close()

Составление списка предложений

In [2]:
import re
with io.open("lower_sentences.txt", encoding='cp1251') as f:

    sent_list = []

    for line in list(f):
        sent_list.append(re.split(u'[^а-я]', line))

f.close()

Приведение всех слов к начальной форме, составление списка всех слов

In [4]:
import pymorphy2

def norm(x):
    morph = pymorphy2.MorphAnalyzer()
    p = morph.parse(x)[0]
    return p.normal_form


words_list = []
norm_sent_list = []

for i, sent in enumerate(sent_list):
    norm_sent_list.append([])
    for word in sent:
        norm_word = norm(word)
        norm_sent_list[i].append(norm_word)
        if norm_word != '':
            words_list.append(norm_word)

Составление словаря: слово (ключ) - сколько раз оно встречается в предложениях (значение)

In [5]:
words_dict = {}

for word in words_list:
    if word not in words_dict:
        words_dict[word] = 1
    else:
        words_dict[word] += 1

Посчитаем количество всех предложений и уникальных слов

In [6]:
print len(norm_sent_list)  # sent num = 120
print len(words_dict)  # unique words num = 1078

120
1078


Составление списка списков: [сколько раз встречается слово, слово]

In [7]:
words_freq = []

for word in words_dict:
    words_freq.append([words_dict[word], word])
    
words_freq.sort(reverse=True)

Выберем 70 наиболее встречающихся во всех предложениях слов

In [8]:
top_words = words_freq[:70]

Составим матрицу, отражающую, сколько раз каждое из этих 70 слов встретилось в каждом предложении

In [9]:
X = []

for i, sent in enumerate(norm_sent_list):
    X.append([])
    for word in top_words:
        X[i].append(sent.count(word[1]))

Считаем из файла ответы и разобьем полученные данные на обучающую и тестовую выборки

In [10]:
from sklearn.model_selection import train_test_split

with io.open("y.txt") as f:
    y = []
    for line in list(f):
        y.append(int(line))
f.close()

(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.3, shuffle=True)

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Multinomial Naive Bayes

In [12]:
# Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB

param_grid = {'alpha': [0.0001, 0.001, 0.01, 1, 10, 100, 1000], 'fit_prior': [True, False]}

multi = MultinomialNB()
optimizer = GridSearchCV(multi, param_grid, cv=4, scoring='f1')
optimizer.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [0.0001, 0.001, 0.01, 1, 10, 100, 1000], 'fit_prior': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [13]:
# MNB performance evaluation

print optimizer.best_params_

y_predicted = optimizer.predict(X_test)

print 'test accuracy score =', accuracy_score(y_test, y_predicted)
print "test f1 score =", f1_score(y_test, y_predicted)

y_predicted = optimizer.predict(X_train)

print 'train accuracy score =', accuracy_score(y_train, y_predicted)
print "train f1 score =", f1_score(y_train, y_predicted)

{'alpha': 1, 'fit_prior': True}
test accuracy score = 0.7777777777777778
test f1 score = 0.7777777777777778
train accuracy score = 0.9166666666666666
train f1 score = 0.9195402298850575


# Logistic Regression

In [14]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

param_grid = {'C': [0.001, 0.01, 0.1, 0.05, 0.5, 1, 5, 10, 100, 1000], 'penalty': ['l1', 'l2'], 'fit_intercept': [True, False], 'intercept_scaling': [0.5, 0.7, 1, 1.2, 1.5, 2]}

lr = LogisticRegression()
optimizer = GridSearchCV(lr, param_grid, cv=4, n_jobs=5, scoring='f1')
optimizer.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=5,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 0.05, 0.5, 1, 5, 10, 100, 1000], 'intercept_scaling': [0.5, 0.7, 1, 1.2, 1.5, 2], 'fit_intercept': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [15]:
# LR performance evaluation

print optimizer.best_params_

y_predicted = optimizer.predict(X_test)

print 'test accuracy score =', accuracy_score(y_test, y_predicted)
print "test f1 score =", f1_score(y_test, y_predicted)

y_predicted = optimizer.predict(X_train)

print 'train accuracy score =', accuracy_score(y_train, y_predicted)
print "train f1 score =", f1_score(y_train, y_predicted)

{'penalty': 'l2', 'C': 0.1, 'intercept_scaling': 0.5, 'fit_intercept': False}
test accuracy score = 0.8055555555555556
test f1 score = 0.8108108108108107
train accuracy score = 0.9285714285714286
train f1 score = 0.9318181818181819


# SVM

In [16]:
# SVM

from sklearn.svm import SVC

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'gamma': [0.0001, 0.001, 0.01, 0.1, 1], 'kernel': ['rbf', 'linear']}

clf = SVC()
optimizer = GridSearchCV(clf, param_grid, cv=4, n_jobs=5, scoring='f1')
optimizer.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=5,
       param_grid={'kernel': ['rbf', 'linear'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'gamma': [0.0001, 0.001, 0.01, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [17]:
# SVM performance evaluation

print optimizer.best_params_

y_predicted = optimizer.predict(X_test)

print 'test accuracy score =', accuracy_score(y_test, y_predicted)
print "test f1 score =", f1_score(y_test, y_predicted)

y_predicted = optimizer.predict(X_train)

print 'train accuracy score =', accuracy_score(y_train, y_predicted)
print "train f1 score =", f1_score(y_train, y_predicted)

{'kernel': 'rbf', 'C': 10, 'gamma': 0.01}
test accuracy score = 0.8333333333333334
test f1 score = 0.8125
train accuracy score = 0.9642857142857143
train f1 score = 0.963855421686747


# kNN

In [18]:
# kNN

from sklearn.neighbors import KNeighborsClassifier

param_grid = dict(n_neighbors=list(range(1, 31)), weights=['uniform', 'distance'], p=[1, 2, 3], algorithm=['auto'])

clf = KNeighborsClassifier()
optimizer = GridSearchCV(clf, param_grid, cv=4, n_jobs=5, scoring='f1')
optimizer.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=5,
       param_grid={'p': [1, 2, 3], 'weights': ['uniform', 'distance'], 'algorithm': ['auto'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [19]:
# kNN performance evaluation

print optimizer.best_params_

y_predicted = optimizer.predict(X_test)

print 'test accuracy score =', accuracy_score(y_test, y_predicted)
print "test f1 score =", f1_score(y_test, y_predicted)

y_predicted = optimizer.predict(X_train)

print 'train accuracy score =', accuracy_score(y_train, y_predicted)
print "train f1 score =", f1_score(y_train, y_predicted)

{'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'auto', 'p': 3}
test accuracy score = 0.6666666666666666
test f1 score = 0.625
train accuracy score = 1.0
train f1 score = 1.0
