In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer

from sklearn.metrics import roc_auc_score
#модели
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [9]:
train_data = pd.read_csv('train_2.txt', sep = '\t')
test_data = pd.read_csv('test_2.txt', sep = '\t')

In [10]:
train_data.head()

Unnamed: 0,id,name,description,target
0,0,Заведующий отделом/секцией в магазин YORK (Уру...,<p><strong>В НОВЫЙ МАГАЗИН YORK (хозтовары) пр...,1
1,1,Наладчик станков и манипуляторов с ПУ,Обязанности:работа на токарных станках с ЧПУ T...,0
2,2,Разработчик С++ (Криптограф),<strong>Требования:</strong> <ul> <li>Опыт про...,0
3,3,Фрезеровщик,<p>Условия:</p> <ul> <li>На работу вахтовым ме...,0
4,4,Мерчендайзер/продавец-консультант,<p><strong>Компания Палладиум Стандарт - призн...,1


In [11]:
test_data.head()

Unnamed: 0,id,name,description
0,200000,Дизайнер-консультант мебели,<p><strong>Обязанности:</strong></p> <ul> <li>...
1,200001,Продавец-консультант (ТЦ на Пушкина),<p><strong>Обязанности</strong>:</p> <p>∙ конс...
2,200002,Менеджер по продажам,<p>Торговый Дом «Форт» это ведущая компания Пе...
3,200003,Продавец-консультант в магазин одежды (ТЦ Волн...,<p><strong>Требуются продавцы консультанты в м...
4,200004,Специалист по охране труда,<strong>Обязанности:</strong> <ul> <li> <p>осу...


## Baseline submission

Для baseline будем использовать только наименование вакансии

In [12]:
# train, test
train_df = train_data.drop(labels = ['id', 'description', 'target'], axis = 1)
y = train_data['target']
test_df = test_data.drop(labels = ['id', 'description'], axis = 1)

In [13]:
# Выделим выборку под кросс-валидацию и holdout выборку
X_train, X_holdout, y_train, y_holdout = train_test_split(train_df, y, test_size=0.3, random_state=42,
                                                         shuffle = True, stratify = y)

### Fit

In [14]:
# pipeline
def text_classifier(vectorizer, transformer, classifier):
    return Pipeline(
            [("vectorizer", vectorizer),
            ("transformer", transformer),
            ("classifier", classifier)]
        )

In [15]:
for clf in [LogisticRegression, SGDClassifier, RandomForestClassifier, GradientBoostingClassifier]:
    print(clf)
    print(cross_val_score(text_classifier(CountVectorizer(), TfidfTransformer(), clf()), 
                           list(X_train['name'].values), list(y_train.values), scoring = 'roc_auc', cv = 5).mean())
    print("\n")

<class 'sklearn.linear_model.logistic.LogisticRegression'>
0.9823727181375345


<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>




0.9774045105052688


<class 'sklearn.ensemble.forest.RandomForestClassifier'>
0.9837360643918848


<class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>
0.9557107471440934




Лучшая модель на кросс-валдидации это Случайный лес. Применим случайный лес к отложенной выборке и посмотрим на ROC AUC

In [16]:
rf_text_clf_pip = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', RandomForestClassifier(random_state = 42)),
])

In [19]:
rf_text_clf_pip.fit(list(X_train['name'].values), list(y_train.values))

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...stimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

In [20]:
y_pred = rf_text_clf_pip.predict(list(X_holdout['name'].values))
print('ROC AUC: {}'.format(roc_auc_score(y_holdout, y_pred)))

ROC AUC: 0.9545458118845204


Обучим rf_text_clf_pip на всей тренировочной выборке: train_df, y

In [21]:
rf_text_clf_pip.fit(list(train_df['name'].values), list(y.values))

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...stimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

### Predict and Submit

In [22]:
# функция записи файла отправки
def write_submission_file(ids, targets, filename):
    submission = pd.DataFrame(data = ids, columns =['id'])
    submission['target'] = targets
    submission.to_csv(filename, sep = ',', header = True, index = False)

In [23]:
ids = test_data['id'].values
y_test_pred = rf_text_clf_pip.predict(list(test_df['name'].values))
write_submission_file(ids, y_test_pred, 'baseline_sub_0.csv')

In [24]:
preprocessing = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                      ('tfidf', TfidfTransformer())])

In [25]:
train_index = len(train_df)
all_data = pd.concat((train_df['name'], test_df['name']), axis = 0, ignore_index = True)

In [26]:
X_data = preprocessing.fit_transform(list(all_data.values))

In [27]:
X_data_train = X_data[:train_index]
X_data_test = X_data[train_index:]

In [28]:
X_train_2, X_holdout_2, y_train_2, y_holdout_2 = train_test_split(X_data_train, y, test_size=0.3, random_state=42,
                                                         shuffle = True, stratify = y)

In [29]:
model = RandomForestClassifier(random_state=10)

In [30]:
n_estimators = np.arange(85, 90, 5)
max_depth = np.arange(1, 11, 1)

params = {
   'n_estimators': n_estimators,
   'max_depth': max_depth
}


In [31]:
grid_rf = GridSearchCV(model, params, scoring = 'roc_auc', n_jobs = -1, verbose = True)

In [32]:
grid_rf.fit(X_train_2, y_train_2)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=10, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': array([85]), 'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=True)

In [33]:
grid_rf.best_score_

0.9430791312499536

In [34]:
grid_rf.best_params_

{'max_depth': 10, 'n_estimators': 85}

In [35]:
grid_rf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=85, n_jobs=1,
            oob_score=False, random_state=10, verbose=0, warm_start=False)

In [36]:
y_pred_2 = grid_rf.best_estimator_.predict(X_holdout_2)
print('ROC AUC: {}'.format(roc_auc_score(y_holdout_2, y_pred_2)))

ROC AUC: 0.7640298279168589


In [37]:
grid_rf.best_estimator_.fit(X_data_train, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=85, n_jobs=1,
            oob_score=False, random_state=10, verbose=0, warm_start=False)

In [38]:
ids = test_data['id'].values
y_test_pred_2 = grid_rf.best_estimator_.predict(X_data_test)
write_submission_file(ids, y_test_pred_2, 'baseline_sub_bestmodel.csv')