# Различные методы машинного обучения, проверенные на одинаковых датасетах и их сравнение

In [7]:
import os
import pymorphy2

not_normilized_fs = os.listdir('normilized_categories/')
analyzer = pymorphy2.MorphAnalyzer()

with open('train_data.txt', 'w') as td, open('test_data.txt', 'w') as test:
    for file in not_normilized_fs:
        with open('normilized_categories/' + file, 'r') as f:
            count_documents = sum(1 for _ in f)
        with open('normilized_categories/' + file) as f: 
            count_lines = 0
            for line in f.readlines():

                if count_lines < count_documents * 0.8:

                    td.write(file.replace('.txt', '') + '\t' + line)
                    count_lines = count_lines + 1
                else:
                    test.write(file.replace('.txt', '') + '\t' + line)

** SVM (linear kernel) **

In [1]:
# Обучаем SVM

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

def get_learned_svm():
    docs = []
    y = []
    with open('train_data.txt', 'r') as f:
        for line in f.readlines():
            docs.append(line.split('\t')[1])
            y.append(line.split('\t')[0])
    vectorizer = TfidfVectorizer(min_df=1)
    X = vectorizer.fit_transform(docs)
    svm = SVC(kernel='linear', verbose=True)
    svm.fit(X, y)
    return svm, vectorizer.vocabulary_

svm, tf_basis = get_learned_svm()

[LibSVM]

In [7]:
docs_test = []
y_test = []
with open('test_data.txt', 'r') as f:
    for line in f.readlines():
        if (line.split('\t')[0] != 'cat_doc'):
            docs_test.append(line.split('\t')[1])
            y_test.append(line.split('\t')[0])

#vectorizer_test = TfidfVectorizer(min_df=1, vocabulary=tf_basis)
#X_test = vectorizer_test.fit_transform(docs_test)
#pred = svm.predict(X_test)
#print('На тестовых данных: ')
#print(svm.score(X_test, y_test))

На тестовых данных: 
0.479414322992


In [58]:
from tqdm import tqdm_notebook

docs = []
y = []
with open('train_data.txt', 'r') as f:
    for line in tqdm_notebook(f.readlines()):
        if (line.split('\t')[0] != 'cat_doc'):
            docs.append(line.split('\t')[1])
            y.append(line.split('\t')[0])

A Jupyter Widget




** Naive bayes ** 

In [21]:
from sklearn.naive_bayes import MultinomialNB

# Формируем тренировочное множество
X = []
y = []
with open('train_data.txt', 'r') as f:
    for line in f.readlines():
        if (len(line.split('\t')) > 1 and line.split('\t')[0] != 'cat_doc'):
            X.append(line.split('\t')[1])
            y.append(line.split('\t')[0])
        
        
# Переводим его в векторы
X_train_tfidf = TfidfVectorizer(min_df=1).fit_transform(X)


# Обучение
clf = MultinomialNB().fit(X_train_tfidf, y)

In [25]:
import numpy as np

docs_test = []
y_test = []
with open('test_data.txt', 'r') as f:
    for line in f.readlines():
        docs_test.append(line.split('\t')[1])
        y_test.append(line.split('\t')[0])

vectorizer_test = TfidfVectorizer(min_df=1, vocabulary=tf_basis)
X_test = vectorizer_test.fit_transform(docs_test)

predicted = clf.predict(X_test)
np.mean(predicted == y_test)

0.30864018556103218

In [59]:
docs_test = []
y_test = []
with open('test_data.txt', 'r') as f:
    for line in f.readlines():
        if (line.split('\t')[0] != 'cat_doc'):
            docs_test.append(line.split('\t')[1])
            y_test.append(line.split('\t')[0])

** SVM (rbf kernel) ** 

In [26]:
def get_learned_svm_def():
    docs = []
    y = []
    with open('train_data.txt', 'r') as f:
        for line in f.readlines():
            docs.append(line.split('\t')[1])
            y.append(line.split('\t')[0])
    vectorizer = TfidfVectorizer(min_df=1)
    X = vectorizer.fit_transform(docs)
    svm = SVC(verbose=True)
    svm.fit(X, y)
    return svm, vectorizer.vocabulary_
svm_def, tfidf_basis_def = get_learned_svm_def()

[LibSVM]

In [28]:
docs_test = []
y_test = []
with open('test_data.txt', 'r') as f:
    for line in f.readlines():
        docs_test.append(line.split('\t')[1])
        y_test.append(line.split('\t')[0])

vectorizer_test = TfidfVectorizer(min_df=1, vocabulary=tfidf_basis_def)
X_test = vectorizer_test.fit_transform(docs_test)
pred = svm_def.predict(X_test)
print('На тестовых данных: ')
print(svm_def.score(X_test, y_test))

На тестовых данных: 
0.140330530589


** SVM (polynomial kernel) **  

In [29]:
def get_learned_svm_poly():
    docs = []
    y = []
    with open('train_data.txt', 'r') as f:
        for line in f.readlines():
            docs.append(line.split('\t')[1])
            y.append(line.split('\t')[0])
    vectorizer = TfidfVectorizer(min_df=1)
    X = vectorizer.fit_transform(docs)
    svm = SVC(kernel='poly', degree=2, verbose=True)
    svm.fit(X, y)
    return svm, vectorizer.vocabulary_
svm_poly, tfidf_basis_poly = get_learned_svm_poly()

[LibSVM]

In [30]:
docs_test = []
y_test = []
with open('test_data.txt', 'r') as f:
    for line in f.readlines():
        docs_test.append(line.split('\t')[1])
        y_test.append(line.split('\t')[0])

vectorizer_test = TfidfVectorizer(min_df=1, vocabulary=tfidf_basis_poly)
X_test = vectorizer_test.fit_transform(docs_test)
pred = svm_poly.predict(X_test)
print('На тестовых данных: ')
print(svm_poly.score(X_test, y_test))

На тестовых данных: 
0.140330530589


**Linear SVM**

In [60]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier())#loss='hinge', penalty='l2',
                                           # alpha=1e-3, random_state=42,
                                           # max_iter=5, tol=None)),
])
text_clf.fit(docs, y)



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))])

In [61]:
predicted = text_clf.predict(docs_test)
np.mean(predicted == y_test)

0.54418212478920747

** Данные на которых проводилось исследование: **
* Новости за 2 года по всем категории новостного портала fontanka.ru 
* 27621 документ | 17 классов - обучение 
* 6898 документов             - тестирование 

** Результаты: **
* SVM (linear kernel)                - 47.9% 
* Naive Bayes                        - 30.8% 
* SVM (RBF kernel)                   - 14%   
* SVM (polynom with degree=2 kernel) - 14%   
* Linear SVM                         - 54.42%

##### Telegram Interface

In [None]:
import telebot
import re
import pymorphy2

#token ВСТАВИТЬ СВОЙ ТОКЕН!
bot = telebot.TeleBot(token)
analyzer = pymorphy2.MorphAnalyzer()

@bot.message_handler(content_types=["text"])
def repeat_all_messages(message): # TODO: текст сообщения нормализовать
    line = message.text
    words = (word for word in re.split('\W+', line) if len(word) > 0)
    norm_form = (analyzer.normal_forms(word)[0] for word in words)
    text = ' '.join(norm_form)
    cat = text_clf.predict([text])
    bot.send_message(message.chat.id, cat[0])

if __name__ == '__main__':
    bot.polling(none_stop=True)