In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import pymorphy2
import pymystem3
import re
import numpy as np

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [3]:
from joblib import Parallel, delayed

ModuleNotFoundError: No module named 'joblib'

In [3]:
def set_answer(predictions, file_name):
    dataframe = pd.DataFrame(predictions, columns=['target'], index=np.arange(200000, 370179))
    dataframe.to_csv('data/'+str(name)+'.csv', index_label='id')

In [4]:
def get_data(processed=False):
    train_data = pd.read_csv('data/train.csv', sep='\t', index_col='id')
    test_data = pd.read_csv('data/test.csv', sep='\t', index_col='id') 
    return (train_data, test_data)

In [5]:
## Токенизатор 

morph = pymorphy2.MorphAnalyzer()

def get_lemmas(text):    
    text = BeautifulSoup(text, 'lxml').get_text()       
    lemms = [morph.parse(w)[0].normal_form for w in text.split()]    
    return " ".join([x for x in lemms if re.match('^[0-9a-zа-яё]+(-[0-9a-zа-яё]+)*$', x)])
       
def getVacanciesLemmas(texts):  
    texts = np.array(texts)    
    processed = np.vectorize(get_lemmas, otypes=[str])              
    return processed(texts)

In [6]:
## Функция для красивого вывода наиболее оптимальных параметров обучения
def print_test_results(SearchCV):
    return pd.DataFrame(SearchCV.cv_results_).sort_values(by='mean_test_score', ascending=False).head(10)

In [7]:
train_data, test_data = get_data()

In [8]:
## при вызове GridSeachCV c n_jobs=-1 не сработает при вызове токенайзера в качестве сторонней функции
##https://github.com/scikit-learn/scikit-learn/issues/5115
##функция так-же падает при попытке вызвать myStem
##поэтому лемматизацию делаем предварительно
%time X = getVacanciesLemmas(train_data['name'][:10000]+' '+train_data['description'][:10000])

Wall time: 11min 28s


In [9]:
pd.DataFrame.to_csv(pd.DataFrame(X), 'data/X_test.csv')

In [10]:
y = train_data.iloc[:10000, -1].values

## Оценка SGD Classifier

In [40]:
sgd_clf = Pipeline([
    ('vect', CountVectorizer(min_df=3, max_df=0.9, ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(random_state=42, )),
])

In [41]:
parameters = {
    'clf__alpha': [0.0001, 0.01, 0.1, 1]
}

In [42]:
sgd_gs_clf = GridSearchCV(sgd_clf, parameters, n_jobs=-1, scoring='roc_auc')

In [43]:
%%time

sgd_gs_clf.fit(X, y).best_score_

Wall time: 1min 10s




0.98137416012631418

In [44]:
print_test_results(sgd_gs_clf)



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_clf__alpha,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,11.985031,2.3119,0.981374,0.999209,0.0001,{'clf__alpha': 0.0001},1,0.979353,0.999646,0.983122,0.998869,0.981648,0.999114,0.092318,0.093001,0.001551,0.000324
1,10.510724,2.030659,0.968695,0.972146,0.01,{'clf__alpha': 0.01},2,0.969015,0.972185,0.970444,0.971524,0.966626,0.97273,1.254313,0.014418,0.001575,0.000493
2,9.780778,2.337755,0.92935,0.935249,0.1,{'clf__alpha': 0.1},3,0.927802,0.934025,0.934062,0.934801,0.926188,0.93692,1.456447,0.400191,0.003396,0.001223
3,11.553452,2.967613,0.922653,0.928979,1.0,{'clf__alpha': 1},4,0.922505,0.928802,0.927222,0.928345,0.918231,0.929789,0.205387,0.031134,0.003672,0.000602


In [25]:
sgd_clf2 = Pipeline([('vect', CountVectorizer(min_df=3, ngram_range=(1, 2))),                     
                     ('clf', SGDClassifier(random_state=42)),
])

In [18]:
parameters = {
    'vect__max_df': [1.0, 0.9, 0.8]
}

In [22]:
sgd_gs_clf2 = GridSearchCV(sgd_clf2, parameters, n_jobs=-1, scoring='roc_auc')

In [23]:
%%time

sgd_gs_clf2.fit(X, y).best_score_



Wall time: 54.1 s


0.97237440183294455

In [24]:
print_test_results(sgd_gs_clf2)



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_vect__max_df,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
1,9.412436,2.098744,0.972374,0.999365,0.9,{'vect__max_df': 0.9},1,0.968646,0.999625,0.974731,0.999302,0.973748,0.999166,0.832519,0.251012,0.002667,0.000192
0,10.964054,2.41157,0.971579,0.999171,1.0,{'vect__max_df': 1.0},2,0.969302,0.998693,0.972582,0.999291,0.972854,0.999527,0.157469,0.044805,0.001614,0.000351
2,7.473973,1.770915,0.96996,0.99939,0.8,{'vect__max_df': 0.8},3,0.971509,0.999784,0.969395,0.999105,0.968976,0.99928,0.839273,0.268936,0.001108,0.000288


## Оценка Forest

In [10]:
frst_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier(random_state=42))
])

In [11]:
parameters = {    
    'vect__max_df': (1.0, 0.9, 0.8),
    'vect__min_df': (1, 3, 10),
    'vect__ngram_range': [(1,1), (1,2)],
    'clf__n_estimators': (5, 10, 50, 100),
    'clf__max_depth': (5, 10, 50)
}

In [12]:
frst_gs_clf = RandomizedSearchCV(frst_clf, parameters, n_jobs=-1, scoring='roc_auc', random_state=42)

In [None]:
%%time

frst_gs_clf.fit(X, y).best_score_

In [None]:
print_test_results(frst_gs_clf)

## Оценка логистической регрессии

In [None]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LogisticRegression(random_state=42)),
])

In [None]:
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'vect__max_df': (1, 0.9, 0.8),
              'tfidf__use_idf': (True, False),
              'tfidf__smooth_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
              'clf__max_iter': (5, 10, 50)
}

In [None]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, scoring='roc_auc')

In [None]:
%%time
gs_clf.fit(X[:400], y[:400]).best_score_

In [None]:
fr = pd.DataFrame(gs_clf.cv_results_)

In [None]:
fr.columns

In [None]:
fr.sort_values(by='mean_test_score', ascending=False).head()

In [None]:
mystem.close()