In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import pymorphy2
import pymystem3
import re
import numpy as np

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [4]:
from joblib import Parallel, delayed

In [5]:
def set_answer(predictions, file_name):
    """Write predictions to ``data/<file_name>.csv`` as a submission file.

    Parameters
    ----------
    predictions : array-like
        One value per test row. It is expected to hold 170179 entries so
        that it lines up with the fixed id index 200000..370178.
    file_name : str
        Base name of the output file, without the ``.csv`` extension.

    The resulting CSV has an ``id`` index column and a single ``target`` column.
    """
    dataframe = pd.DataFrame(predictions, columns=['target'], index=np.arange(200000, 370179))
    # Build the path from the ``file_name`` argument; the earlier code referenced
    # an undefined ``name`` and failed with NameError.
    dataframe.to_csv('data/' + str(file_name) + '.csv', index_label='id')

In [6]:
def get_data(processed=False):
    """Read the train and test tables and return them as a ``(train, test)`` tuple.

    Both files are tab-separated and indexed by their ``id`` column.
    The ``processed`` argument is accepted but not used.
    """
    paths = ('data/train.csv', 'data/test.csv')
    train_data, test_data = (
        pd.read_csv(path, sep='\t', index_col='id') for path in paths
    )
    return train_data, test_data

In [126]:
## Tokenizer

# Module-level pymorphy2 analyzer, created once and used by get_lemmas below.
morph = pymorphy2.MorphAnalyzer()

def get_lemmas(text):
    """Return the space-joined lemmas of ``text`` after stripping markup.

    The text is parsed with BeautifulSoup (lxml) to drop HTML, split on
    whitespace, and every token is replaced by the normal form of its first
    pymorphy2 parse. Only lemmas made of digits and lowercase Latin/Cyrillic
    letters, optionally joined by single hyphens, are kept.
    """
    plain = BeautifulSoup(text, 'lxml').get_text()
    kept = []
    for token in plain.split():
        lemma = morph.parse(token)[0].normal_form
        if re.match('^[0-9a-zа-яё]+(-[0-9a-zа-яё]+)*$', lemma):
            kept.append(lemma)
    return " ".join(kept)
       
def getVacanciesLemmas(texts):
    """Apply ``get_lemmas`` to every element of ``texts``.

    ``texts`` is converted to a NumPy array first; the result is a NumPy
    array of strings with one lemmatized entry per input element.
    """
    lemmatize_all = np.vectorize(get_lemmas, otypes=[str])
    return lemmatize_all(np.array(texts))

In [56]:
def print_test_results(SearchCV):
    """Return the ten best parameter combinations of a fitted search.

    ``SearchCV.cv_results_`` is turned into a DataFrame and ordered by
    ``mean_test_score`` from highest to lowest; the top 10 rows are returned.
    """
    results = pd.DataFrame(SearchCV.cv_results_)
    ranked = results.sort_values(by='mean_test_score', ascending=False)
    return ranked.head(10)

In [19]:
# Load the train and test tables via get_data (reads data/train.csv and data/test.csv).
train_data, test_data = get_data()

In [None]:
## GridSearchCV with n_jobs=-1 fails when the tokenizer is passed in as an external function:
## https://github.com/scikit-learn/scikit-learn/issues/5115
## The function also crashes when it tries to call myStem,
## so lemmatization is done up front, before the search.
## Only the first 10000 rows are used; name and description are joined with a space.
%time X = getVacanciesLemmas(train_data['name'][:10000]+' '+train_data['description'][:10000])

In [None]:
# Labels: the last column of train_data for the first 10000 rows, as a NumPy array.
y = train_data.iloc[:10000, -1].values

## Оценка SGD Classifier

In [None]:
# Token counts (terms seen in at least 3 documents) -> TF-IDF -> SGD-trained linear classifier.
sgd_clf = Pipeline(steps=[
    ('vect', CountVectorizer(min_df=3)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(random_state=42)),
])

In [None]:
# Grid for the SGD pipeline: n-gram ranges (1, 1), (1, 2) and (1, 3).
parameters = dict(
    vect__ngram_range=[(1, upper) for upper in range(1, 4)],
)

In [None]:
# Grid search over `parameters` for the SGD pipeline, scored by ROC AUC, with n_jobs=-1.
sgd_gs_clf = GridSearchCV(
    estimator=sgd_clf,
    param_grid=parameters,
    scoring='roc_auc',
    n_jobs=-1,
)

In [None]:
%%time

# Fit the SGD grid search on the lemmatized texts and show its best score (scoring is ROC AUC).
sgd_gs_clf.fit(X, y).best_score_

In [None]:
# Show the top-scoring parameter combinations of the SGD search.
print_test_results(sgd_gs_clf)

## Оценка Forest

In [None]:
# Uni- and bi-gram counts (terms seen in at least 3 documents) -> TF-IDF -> 50-tree random forest.
frst_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2), min_df=3)),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(random_state=42, n_estimators=50)),
])

In [None]:
# Grid for the forest pipeline: upper document-frequency cut-offs for the vectorizer.
parameters = dict(
    vect__max_df=(1.0, 0.9, 0.8),
)

In [None]:
# Grid search over `parameters` for the forest pipeline, scored by ROC AUC, with n_jobs=-1.
frst_gs_clf = GridSearchCV(
    estimator=frst_clf,
    param_grid=parameters,
    scoring='roc_auc',
    n_jobs=-1,
)

In [None]:
%%time

# Fit the forest grid search on the lemmatized texts and show its best score (scoring is ROC AUC).
frst_gs_clf.fit(X, y).best_score_

In [None]:
# Show the top-scoring parameter combinations of the forest search.
print_test_results(frst_gs_clf)

## Оценка логистической регрессии

In [None]:
# Token counts -> TF-IDF -> logistic regression.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(random_state=42)),
])

In [None]:
# Hyper-parameter grid for the logistic-regression pipeline.
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              # Floats are document-frequency proportions. An integer 1 would be an
              # absolute count and drop every term found in more than one document.
              'vect__max_df': (1.0, 0.9, 0.8),
              'tfidf__use_idf': (True, False),
              'tfidf__smooth_idf': (True, False),
              # LogisticRegression has no ``alpha``; its regularization knob is ``C``
              # (inverse strength). The values are the reciprocals of the alphas
              # listed before - adjust as needed.
              'clf__C': (1e2, 1e3),
              'clf__max_iter': (5, 10, 50)
}

In [None]:
# Grid search over `parameters` for the logistic-regression pipeline, scored by ROC AUC, with n_jobs=-1.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    scoring='roc_auc',
    n_jobs=-1,
)

In [None]:
%%time
# Fit on the first 400 samples only and show the best score (scoring is ROC AUC).
gs_clf.fit(X[:400], y[:400]).best_score_

In [None]:
# Collect the logistic-regression search results into a DataFrame for inspection.
fr = pd.DataFrame(gs_clf.cv_results_)

In [None]:
# List the columns available in the search results.
fr.columns

In [None]:
# Show the five parameter combinations with the highest mean test score.
fr.sort_values(by='mean_test_score', ascending=False).head()

In [None]:
# `mystem` is never created in this notebook (lemmatization uses pymorphy2), so an
# unconditional close() raises NameError on a fresh kernel. Close it only if an
# instance exists in the session.
if 'mystem' in globals():
    mystem.close()