In [2]:
import pandas as pd
import numpy as np

from __future__ import division, print_function
# отключим всякие предупреждения Anaconda
from __future__ import division, print_function
# отключим всякие предупреждения Anaconda
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 18, 5
import seaborn as sns

from natasha import NamesExtractor
import pymystem3

from scipy.sparse import csr_matrix
from scipy.sparse import hstack

## Считывание данных

In [3]:
# Load the labelled training set and the unlabelled test set.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Separate the target, then stack the train and test features into a single
# frame so that every feature below is computed the same way for both parts.
y_train = train_df['Label']
# DataFrame.append was deprecated and later removed from pandas (2.0);
# pd.concat is the supported equivalent and, like append, keeps the original
# row labels and puts the train rows first.
full_df = pd.concat([train_df.drop('Label', axis=1), test_df])

In [4]:
# Number of training rows; used later to cut the stacked feature matrices
# back into their train and test parts.
idx_split = len(train_df)

## Получение признаков

Из pymorphy2:

In [5]:
from sklearn.preprocessing import LabelEncoder
import pymorphy2
# One shared pymorphy2 analyzer instance; the feature cells below call
# morph.parse() and morph.tag() on it for every word.
morph = pymorphy2.MorphAnalyzer()

In [6]:
# Normal form (lemma) taken from the first parse pymorphy2 returns for the word.
full_df['Normal form'] = full_df['Word'].apply(
    lambda word: morph.parse(word)[0].normal_form)

# First tag pymorphy2 returns for each word.
all_data = full_df['Word'].apply(lambda word: morph.tag(word)[0])

# Copy the grammemes of interest out of the tag into their own columns
# (the attribute is None when the tag does not carry that grammeme).
for grammeme in ('animacy', 'POS', 'case', 'number', 'gender'):
    full_df['Pymorphy_' + grammeme] = all_data.apply(
        lambda tag, name=grammeme: getattr(tag, name))

In [7]:
# Morphological columns to convert into integer codes. Note that, despite the
# variable name, LabelEncoder assigns one integer per distinct value; it does
# not produce one-hot columns.
columns_to_one_hot = [
    'Pymorphy_animacy',
    'Pymorphy_POS',
    'Pymorphy_case',
    'Pymorphy_number',
    'Pymorphy_gender',
]

for col in columns_to_one_hot:
    # Missing grammemes are replaced by the literal string 'nan' so that they
    # receive a code of their own.
    filled_values = list(full_df[col].fillna('nan'))
    encoder = LabelEncoder()
    full_df[col] = encoder.fit_transform(filled_values)

Из Наташи:

In [8]:
# Natasha name extractor, created once and reused for every word.
func = NamesExtractor()


def function_natasha(word):
    """Return 1 if the Natasha extractor yields a truthy result for `word`, else 0."""
    matches = func(word)
    if matches:
        return 1
    return 0


full_df['natasha_person'] = full_df['Word'].apply(function_natasha)

Из pymystem:

In [9]:
# Mystem analyzer, created once and reused for every word.
mystem = pymystem3.Mystem()


def _mystem_grammemes(word):
    """Return the comma-separated 'gr' tokens of the first Mystem analysis of `word`.

    Best-effort: when the analysis is missing or malformed (no tokens, no
    'analysis' entry, no 'gr' field, non-string input, ...) an empty list is
    returned, so callers treat the word as carrying no grammemes.
    """
    try:
        return mystem.analyze(word)[0]['analysis'][0]['gr'].split(',')
    except Exception:
        # `except Exception` keeps the original "never fail on a word"
        # behaviour but, unlike a bare `except:`, lets KeyboardInterrupt and
        # SystemExit through so a long run can still be interrupted.
        return []


def name_from_pymystem(word):
    """Return 1 if Mystem marks `word` with the 'имя' grammeme, else 0."""
    return 1 if 'имя' in _mystem_grammemes(word) else 0


def surn_from_pymystem(word):
    """Return 1 if Mystem marks `word` with the 'фам' grammeme, else 0."""
    return 1 if 'фам' in _mystem_grammemes(word) else 0


# Analyse every word once and derive both flags from that single result,
# instead of running Mystem twice per word.
word_grammemes = full_df['Word'].apply(_mystem_grammemes)
full_df['name_from_pymystem'] = word_grammemes.apply(
    lambda grammemes: 1 if 'имя' in grammemes else 0)
full_df['surn_from_pymystem'] = word_grammemes.apply(
    lambda grammemes: 1 if 'фам' in grammemes else 0)

Из свойств самих слов и таблицы:

In [10]:
# Number of rows of the combined frame that share the same normal form.
full_df['count'] = full_df.groupby('Normal form')['Normal form'].transform('count')
# Length of the normal form in characters.
full_df['length'] = full_df['Normal form'].apply(len)
# 1 when the original word is title-cased according to str.istitle(), else 0.
full_df['first_letter'] = full_df['Word'].apply(lambda word: int(word.istitle()))

Также хотелось бы учесть, что фамилии имеют характерные окончания: -ов, -ова, -ко, -дзе и т. д.
Для этого выделим четыре последние 2-граммы нормальной формы каждого слова и перейдём к разреженной матрице.

In [11]:
def cutter(s, i):
    """Return the two-character slice of `s` that starts `i` characters from its end.

    s -- the string to cut.
    i -- distance of the slice start from the end of the string.

    When `s` has fewer than `i` characters the placeholder '0' is returned.
    """
    if len(s) < i:
        return '0'
    if i == 2:
        # s[-2:0] would be empty, so the trailing pair needs an open-ended slice.
        return s[-2:]
    return s[-i:-i + 2]

In [12]:
# Columns '1'..'4' hold the 2-grams of the normal form that start 2, 3, 4 and
# 5 characters from its end ('0' when the form is too short).
for gram_column, offset in zip(['1', '2', '3', '4'], [2, 3, 4, 5]):
    full_df[gram_column] = full_df['Normal form'].apply(
        lambda form, i=offset: cutter(form, i))

full_df1 = full_df[['1', '2', '3', '4']]

Ставим в соответствие каждой 2-грамме число

In [13]:
# Every cell of the four 2-gram columns as one flat array.
grams_flatten = full_df1.values.flatten()
# Sorted distinct 2-grams; a gram's position in this list serves as its id.
a = np.unique(grams_flatten).tolist()

In [14]:
# Build the gram -> position lookup once. Calling a.index() for every cell
# scans the whole vocabulary each time, which is needlessly slow on a large
# table; a dict lookup gives the same positions in constant time.
gram_to_index = {}
for position, gram in enumerate(a):
    # setdefault keeps the first position, matching list.index() semantics.
    gram_to_index.setdefault(gram, position)

for gram_column, index_column in [('1', 'index1'), ('2', 'index2'),
                                  ('3', 'index3'), ('4', 'index4')]:
    # Plain indexing raises KeyError for a gram missing from the vocabulary
    # instead of silently producing NaN.
    full_df[index_column] = full_df[gram_column].apply(
        lambda gram: gram_to_index[gram])

Получаем разреженную матрицу

In [15]:
full_df2 = full_df[['index1', 'index2', 'index3', 'index4']]
grams_flatten = full_df2.values.flatten()

# CSR layout: every word (row) owns the same number of consecutive entries of
# grams_flatten, so the row pointer simply advances by that number.
grams_per_word = full_df2.shape[1]
row_pointer = np.arange(0, grams_flatten.shape[0] + grams_per_word, grams_per_word)
gram_counts = csr_matrix(
    (np.ones(grams_flatten.shape[0], dtype=int), grams_flatten, row_pointer),
    # Explicit shape: one column per vocabulary entry, independent of which
    # indices happen to occur in the data.
    shape=(full_df2.shape[0], len(a)))

# The sought matrix: drop the column of the '0' placeholder that cutter()
# returns for words that are too short. The column is located by value rather
# than assumed to be column 0, because `a` is sorted and any gram beginning
# with a character that sorts before '0' (e.g. '-' or a space) would take the
# first position, in which case `[:, 1:]` would discard a real 2-gram.
pad_gram = '0'
kept_columns = [position for position, gram in enumerate(a) if gram != pad_gram]
df_sparse = gram_counts[:, kept_columns]

Добавляем ранее полученные признаки

In [17]:
# Dense (non-2-gram) features, in the column order used by the model.
dense_feature_columns = [
    'Pymorphy_animacy', 'Pymorphy_POS', 'Pymorphy_case', 'Pymorphy_number',
    'Pymorphy_gender', 'count', 'length', 'first_letter', 'natasha_person',
    'name_from_pymystem', 'surn_from_pymystem',
]
# DataFrame.as_matrix() was deprecated and then removed in pandas 1.0;
# .values returns the same NumPy array and works on old and new pandas alike.
tmp = full_df[dense_feature_columns].values

# Sparse 2-gram indicators first, dense features after them; the first
# idx_split rows form the training part and the remaining rows the test part.
X_train = csr_matrix(hstack([df_sparse[:idx_split, :], tmp[:idx_split, :]]))
X_test = csr_matrix(hstack([df_sparse[idx_split:, :], tmp[idx_split:, :]]))

## Построение модели

In [18]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [19]:
import xgboost as xgb

In [20]:
def validate(x, y, estimators=600, rate=0.1, seed=17,depth=10, subs=0.8, colsample=0.8,alph=0):
    """Cross-validate an XGBoost classifier and print the mean and std of its ROC AUC.

    x, y       -- feature matrix and target passed to cross_val_score.
    estimators -- n_estimators of the classifier.
    rate       -- learning_rate.
    seed       -- random_state of the classifier.
    depth      -- max_depth.
    subs       -- subsample.
    colsample  -- colsample_bytree.
    alph       -- reg_alpha.

    Uses a shuffled 4-fold StratifiedKFold with random_state=99 and a fixed
    min_child_weight of 0.5. Nothing is returned; the result is only printed.
    """
    xgb_params = dict(
        n_estimators=estimators,
        learning_rate=rate,
        random_state=seed,
        subsample=subs,
        colsample_bytree=colsample,
        max_depth=depth,
        min_child_weight=0.5,
        reg_alpha=alph,
    )
    classifier = xgb.XGBClassifier(**xgb_params)
    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=99)
    fold_scores = cross_val_score(classifier, x, y, scoring='roc_auc', cv=folds)
    print(fold_scores.mean(), fold_scores.std(), '\n')

0.9585178564786234

In [21]:
%%time
# Cross-validate on the training matrix with colsample_bytree lowered to 0.7.
validate(x=X_train, y=y_train, subs=0.8, colsample=0.7)

0.9582570062940432 0.0012969266916602804 

CPU times: user 3min 50s, sys: 200 ms, total: 3min 50s
Wall time: 3min 50s


In [23]:
# Train the final classifier on the whole training matrix.
my_xgb = xgb.XGBClassifier(
    n_estimators=600,
    learning_rate=0.1,
    max_depth=10,
    min_child_weight=0.5,
    subsample=0.8,
    colsample_bytree=0.7,
    random_state=17,
    n_jobs=-1,
)
my_xgb.fit(X_train, y_train)

# Predict for the test set, keeping column 1 of predict_proba
# (the probability of the second class).
y_test_xgb = my_xgb.predict_proba(X_test)[:, 1]

In [24]:
# Use the sample submission, indexed by 'Id', as the template and put the
# predicted probabilities into its 'Prediction' column.
submission_template = pd.read_csv('sample_submission.csv', index_col='Id')
result = submission_template.assign(Prediction=y_test_xgb)
result.to_csv('xgb.csv', index_label='Id')