In [932]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import pymorphy2
from xgboost import XGBClassifier

In [960]:
df_train=pd.read_csv('train.csv')

In [961]:
df_train.head()

Unnamed: 0,Word,Label
0,Аалтонен,1
1,Аар,0
2,Аарон,0
3,ААРОН,0
4,Аарона,0


In [962]:
df_train.shape

(101408, 2)

In [963]:
df_test=pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,Word
0,Аалто
1,ААР
2,Аара
3,Ааре
4,Аарон


In [964]:
df_test.shape

(188920, 1)

In [965]:
df_train['Label'].value_counts()

0    90770
1    10638
Name: Label, dtype: int64

Удалим повторяющиеся строки

In [971]:
df_train.drop_duplicates(inplace=True)

In [972]:
df_train.shape

(101407, 2)

## Признаки

In [771]:
morph = pymorphy2.MorphAnalyzer()

In [593]:
df_train['pymorphy']=[morph.parse(i)[0].normal_form for i,j in df_train[['Word','Label']].values]
df_test['pymorphy']=[morph.parse(i)[0].normal_form for i in df_test['Word'].values]

In [973]:
df_train['pymorphy2']=df_train['Word'].apply(lambda x: morph.tag(x)[0])
df_train['pymorphy_animacy'] = df_train['pymorphy2'].apply(lambda x: x.animacy)
df_train['pymorphy_case'] = df_train['pymorphy2'].apply(lambda x: x.case)
df_train['pymorphy_number'] = df_train['pymorphy2'].apply(lambda x: x.number)
df_train['pymorphy_gender'] = df_train['pymorphy2'].apply(lambda x: x.gender)
df_train['pymorphy_POS'] = df_train['pymorphy2'].apply(lambda x: x.POS)

In [974]:
df_test['pymorphy2']=df_test['Word'].apply(lambda x: morph.tag(x)[0])
df_test['pymorphy_animacy'] = df_test['pymorphy2'].apply(lambda x: x.animacy)
df_test['pymorphy_case'] = df_test['pymorphy2'].apply(lambda x: x.case)
df_test['pymorphy_number'] = df_test['pymorphy2'].apply(lambda x: x.number)
df_test['pymorphy_gender'] = df_test['pymorphy2'].apply(lambda x: x.gender)
df_test['pymorphy_POS'] = df_test['pymorphy2'].apply(lambda x: x.POS)

In [977]:
df_train['end']=df_train['pymorphy'].apply(lambda x: x[-3:].lower())
df_train['end1']=df_train['pymorphy'].apply(lambda x: x[-2:].lower())
df_train['vowels'] = df_train['Word'].apply(lambda x: len(re.findall('[уеыаоэяию]', x, re.IGNORECASE)))
df_train['upper'] = df_train['Word'].apply(lambda x: 1 if x[0].isupper() else 0)
df_train['Lower'] = df_train['Word'].apply(lambda x: 1 if (x.islower()) else 0)
df_train['All_upper'] = df_train['Word'].apply(lambda x: 1 if (x.isupper()) else 0)
df_train['len']=df_train['pymorphy'].apply(lambda x: len(x))
df_train['diff'] = df_train['len'] - df_train['vowels'] 

In [978]:
df_test['end']=df_test['pymorphy'].apply(lambda x: x[-3:].lower())
df_test['end1']=df_test['pymorphy'].apply(lambda x: x[-2:].lower())
df_test['vowels'] = df_test['Word'].apply(lambda x: len(re.findall('[уеыаоэяию]', x, re.IGNORECASE)))
df_test['upper'] = df_test['Word'].apply(lambda x: 1 if x[0].isupper() else 0)
df_test['Lower'] = df_test['Word'].apply(lambda x: 1 if (x.islower()) else 0)
df_test['All_upper'] = df_test['pymorphy'].apply(lambda x: 1 if (x.isupper()) else 0)
df_test['len']=df_test['pymorphy'].apply(lambda x: len(x))
df_test['diff'] = df_test['len'] - df_test['vowels'] 

In [664]:
y=df_train['Label']
X=df_train['end']

In [991]:
X_test=df_test['end']

In [914]:
cat_features=['end1','end','pymorphy_animacy','pymorphy_POS','pymorphy_case','pymorphy_number','pymorphy_gender']

In [906]:
le = LabelEncoder()
for i in cat_features:
    df_train[i]=le.fit_transform(list(df_train[i].fillna('nan')))

In [834]:
X_train.shape

(76055, 12)

## Обучение модели

In [907]:
X_train,X_valid,y_train,y_valid=train_test_split(np.hstack((df_train[cat_features].values,df_train[['len','upper','vowels','diff','Lower','All_upper']].values)) \
                                                 ,y,shuffle=True,stratify=y,random_state=17)

In [909]:
model = XGBClassifier(n_jobs=-1,max_depth = 10 , n_estimators=400, learning_rate=0.1 , colsample_bytree=0.9 , colsample_bylevel=0.6)
cv=StratifiedKFold(n_splits=4,shuffle=True,random_state=17)
np.mean(cross_val_score(model,X_train,y_train,scoring='roc_auc',cv=cv,n_jobs=-1))

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x000001B03D66C240>>
Traceback (most recent call last):
  File "C:\Users\Mormaks\Anaconda3\lib\site-packages\xgboost\core.py", line 368, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


0.92494665938475695

In [910]:
model.fit(X_train,y_train)
fcst_model = model.predict_proba(X_valid)[:,1]
roc_auc_score(y_valid,fcst_model)

0.9316253351517344

## Прогноз для тестовой выборки

In [979]:
df_full = pd.concat([df_train , df_test])

In [980]:
le = LabelEncoder()
for i in cat_features:
    df_full[i]=le.fit_transform(list(df_full[i].fillna('nan')))

In [981]:
new_train = df_full[df_full['Label'].notnull()]
new_test = df_full[df_full['Label'].isnull()]

In [986]:
model = XGBClassifier(n_jobs=-1,max_depth = 10 , n_estimators=400, learning_rate=0.1 , colsample_bytree=0.9 , colsample_bylevel=0.6)
model.fit(new_train.drop(['Label','Word','pymorphy','pymorphy2'],axis=1),new_train['Label'])
fcst_model = model.predict_proba(new_test.drop(['Word','pymorphy','pymorphy2','Label'],axis=1))[:,1]

In [987]:
df_pred=pd.read_csv('sample_submission.csv')

In [988]:
df_pred['Prediction']=fcst_model

In [989]:
df_pred.head()

Unnamed: 0,Id,Prediction
0,0,0.102282
1,1,0.095187
2,2,0.031436
3,3,0.046061
4,4,0.34182


In [990]:
df_pred.to_csv('pred.csv',index=False)