In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import pymorphy2
import pymystem3
import re
import numpy as np

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [190]:
import multiprocessing
from numba import jit

In [3]:
def set_answer(predictions, file_name, index_start=200000, index_stop=370179):
    """Write predictions to ``data/<file_name>.csv`` in submission format.

    Parameters
    ----------
    predictions : array-like
        One predicted value per id. Its length must equal
        ``index_stop - index_start``, otherwise pandas raises ``ValueError``.
    file_name : str
        Base name (without extension) of the CSV file created under ``data/``.
    index_start, index_stop : int, optional
        Half-open range of ids used as the index of the output file.
        The defaults reproduce the originally hard-coded range.

    The file has an ``id`` index column and a single ``target`` column.
    """
    dataframe = pd.DataFrame(predictions, columns=['target'],
                             index=np.arange(index_start, index_stop))
    # The original code referenced an undefined global ``name`` here, which
    # raised NameError; the file name comes from the ``file_name`` argument.
    dataframe.to_csv('data/' + str(file_name) + '.csv', index_label='id')

In [4]:
def get_data(processed=False):
    """Load the train and test tables and return them as ``(train, test)``.

    Both files are tab-separated and indexed by their ``id`` column.
    The ``processed`` argument is accepted but not used by this function.
    """
    frames = tuple(
        pd.read_csv(path, sep='\t', index_col='id')
        for path in ('data/train.csv', 'data/test.csv')
    )
    return frames

In [54]:
# Suffix appended to a lemma for each part-of-speech tag kept by getVacancyVector.
posConv={'ADJF':'_ADJ','NOUN':'_NOUN','VERB':'_VERB', 'PRTF': '_PRT', 'GRND': '_GRND'}
# Shared pymorphy2 analyzer, created once and reused by the text-processing functions.
morph = pymorphy2.MorphAnalyzer()

def getVacancyVector(texts):
    """Convert each HTML text into a string of POS-tagged lemmas.

    For every text the markup is stripped, word tokens (optionally
    hyphenated) are extracted, and each token is analysed with ``morph``.
    Only tokens whose first parse has one of the parts of speech listed in
    ``kept_pos`` are retained; each is emitted as its normal form followed
    by the matching suffix from ``posConv``.

    Returns a list with one space-joined string per input text.
    """
    token_pattern = "([0-9a-zA-ZА-ЯЁа-яё]+(-[0-9a-zA-ZА-ЯЁа-яё]+)*)"
    kept_pos = ['ADJF', 'NOUN', 'VERB', 'PRTF', 'GRND']

    processed = []
    for raw in texts:
        plain = BeautifulSoup(raw, 'lxml').get_text()
        tagged = []
        # findall returns (whole_token, last_hyphen_part) tuples; only the
        # whole token is analysed.
        for groups in re.findall(token_pattern, plain):
            best_parse = morph.parse(groups[0])[0]
            pos = best_parse.tag.POS
            if pos in kept_pos:
                tagged.append(best_parse.normal_form + posConv[pos])
        processed.append(" ".join(tagged))

    return processed

In [156]:
# MyStem analyzer instance (alternative to the pymorphy2 analyzer);
# it is closed with mystem.close() at the end of the notebook.
mystem = pymystem3.Mystem()

In [200]:
## Tokenizer
def get_lemmas(text):
    """Strip HTML from ``text`` and return its lemmas as one space-joined string.

    Parameters
    ----------
    text : str
        Raw (possibly HTML) text.

    Returns
    -------
    str
        Normal forms of the word tokens found in the text, separated by
        single spaces. Lemmas that do not consist solely of lowercase
        Latin/Cyrillic letters and digits (optionally hyphenated) are dropped.
    """
    text = BeautifulSoup(text, 'lxml').get_text()
    # Split into word tokens before analysis. The previous version iterated
    # over the string itself, which fed single characters (not words) to the
    # morphological analyzer.
    words = re.findall('[0-9a-zA-ZА-ЯЁа-яё]+(?:-[0-9a-zA-ZА-ЯЁа-яё]+)*', text)
    # pymorphy2 is used here; MyStem (mystem.lemmatize) was switched off
    # because it ran slower.
    lemms = [morph.parse(w)[0].normal_form for w in words]
    return " ".join([x for x in lemms if re.match('^[0-9a-zа-яё]+(-[0-9a-zа-яё]+)*$', x)])
       
def getVacanciesLemmas(texts):
    """Apply ``get_lemmas`` to every element of ``texts``.

    The input is converted to a NumPy array and ``get_lemmas`` is mapped over
    it with ``np.vectorize``; the result is an array of strings with one
    lemmatized text per input element.
    """
    lemmatize_each = np.vectorize(get_lemmas, otypes=[str])
    return lemmatize_each(np.array(texts))

In [6]:
## Helper that shows the best-scoring parameter combinations of a search as a table
def print_test_results(SearchCV):
    """Return the ten best rows of ``SearchCV.cv_results_`` as a DataFrame.

    Rows are ordered by ``mean_test_score``, highest first.
    """
    results = pd.DataFrame(SearchCV.cv_results_)
    ranked = results.sort_values(by='mean_test_score', ascending=False)
    return ranked.head(10)

In [7]:
train_data, test_data = get_data()

In [None]:
## GridSearchCV with n_jobs=-1 does not work when the tokenizer is supplied as an external function
## https://github.com/scikit-learn/scikit-learn/issues/5115
## the function also crashes when it tries to call myStem
## so lemmatization is done beforehand

%time X = getVacanciesLemmas(train_data['name'][:1000]+' '+train_data['description'][:1000])

In [173]:
# Take labels for exactly the rows that were lemmatized into X (the last column
# of train_data is used as the target). X is built from the first rows of
# train_data, so slicing by len(X) keeps X and y the same length; the previous
# fixed slice of 10000 rows did not match the 1000 rows in X.
y = train_data.iloc[:len(X), -1].values

## Оценка SGD Classifier

In [130]:
# Pipeline: token counts (terms must occur in at least 3 documents) -> TF-IDF -> SGD classifier.
sgd_clf = Pipeline([('vect', CountVectorizer(min_df=3)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(random_state=42)),
])

In [131]:
# Grid for the SGD pipeline: unigrams only, up to bigrams, up to trigrams.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)]
}

In [132]:
# Grid search over the SGD pipeline, scored by ROC AUC, using all available cores.
sgd_gs_clf = GridSearchCV(sgd_clf, parameters, n_jobs=-1, scoring='roc_auc')

In [133]:
%%time

# Fit the grid search and display the best mean cross-validated score.
sgd_gs_clf.fit(X, y).best_score_

Wall time: 6.17 s




0.9808415676399818

In [146]:
print_test_results(sgd_gs_clf)



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_vect__ngram_range,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
1,0.85915,0.223546,0.980842,0.999539,"(1, 2)","{'vect__ngram_range': (1, 2)}",1,0.980104,0.999656,0.985266,0.999124,0.977157,0.999837,0.008662,0.003094,0.00335,0.000303
2,1.393658,0.289608,0.980392,0.999375,"(1, 3)","{'vect__ngram_range': (1, 3)}",2,0.980446,0.999746,0.986028,0.999442,0.974701,0.998936,0.082234,0.009039,0.004622,0.000334
0,0.213533,0.085749,0.97977,0.999535,"(1, 1)","{'vect__ngram_range': (1, 1)}",3,0.98041,0.999719,0.985366,0.999099,0.973532,0.999788,0.029969,0.011036,0.00485,0.00031


## Оценка Forest

In [150]:
# Pipeline: unigram+bigram counts (min_df=3) -> TF-IDF -> random forest with 50 trees.
frst_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2), min_df=3)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier(random_state=42, n_estimators=50))
])

In [151]:
# Grid for the forest pipeline: only the vectorizer's max_df is varied.
parameters = {    
    'vect__max_df': (1.0, 0.9, 0.8)    
}

In [152]:
# Grid search over the forest pipeline, scored by ROC AUC, using all available cores.
frst_gs_clf = GridSearchCV(frst_clf, parameters, n_jobs=-1, scoring='roc_auc')

In [153]:
%%time

# Fit the grid search and display the best mean cross-validated score.
frst_gs_clf.fit(X, y).best_score_

Wall time: 8.64 s


0.9724382535428812

In [154]:
print_test_results(frst_gs_clf)



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_vect__max_df,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,1.50109,0.250906,0.972438,0.999372,1.0,{'vect__max_df': 1.0},1,0.976988,0.999429,0.981932,0.998686,0.958382,1.0,0.078316,0.006134,0.010135,0.000538
2,1.437367,0.193851,0.972373,0.999272,0.8,{'vect__max_df': 0.8},2,0.97724,0.999311,0.982299,0.998505,0.957566,1.0,0.257644,0.050705,0.010664,0.000611
1,1.577833,0.271592,0.971623,0.999293,0.9,{'vect__max_df': 0.9},3,0.977551,0.999293,0.98253,0.998586,0.954771,1.0,0.091163,0.031232,0.01208,0.000577


## Оценка логистической регрессии

In [None]:
# Pipeline: token counts -> TF-IDF -> logistic regression.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LogisticRegression(random_state=42)),
])

In [None]:
# Grid for the logistic-regression pipeline.
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              # max_df must be a float to mean "fraction of documents"; an
              # integer 1 would mean "appears in at most one document".
              'vect__max_df': (1.0, 0.9, 0.8),
              'tfidf__use_idf': (True, False),
              'tfidf__smooth_idf': (True, False),
              # LogisticRegression has no ``alpha`` parameter (that belongs to
              # SGDClassifier); its regularization is controlled by ``C``, the
              # inverse regularization strength. The values are the inverses
              # of the alphas previously listed (1e-2, 1e-3).
              'clf__C': (1e2, 1e3),
              'clf__max_iter': (5, 10, 50)
}

In [None]:
# Grid search over the logistic-regression pipeline, scored by ROC AUC, using all available cores.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, scoring='roc_auc')

In [None]:
%%time
# Fit on the first 400 samples only and display the best mean cross-validated score.
gs_clf.fit(X[:400], y[:400]).best_score_

In [None]:
fr = pd.DataFrame(gs_clf.cv_results_)

In [None]:
fr.columns

In [None]:
fr.sort_values(by='mean_test_score', ascending=False).head()

In [None]:
mystem.close()