In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import re
import pymorphy2
from catboost import CatBoostClassifier

# Show every column and row when a DataFrame is displayed.
# Options must be set by calling pd.set_option(name, value); assigning an
# attribute on the set_option function itself does not change any option.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Load the test and train tables from CSV files in the working directory
# (test is read first, then train).
test, train = (pd.read_csv(csv_name) for csv_name in ('test.csv', 'train.csv'))

In [3]:
# Remember the row indices of duplicated word pairs in the test set.
# In train, the first word of such a pair always has label 0 and the second has 1.
# We assume the same holds in test. Spoiler: it does.

# Number of rows sharing each exact (case-sensitive) spelling, aligned to test's index.
word_counts = test.groupby('Word')['Word'].transform('count')
# 0 for the first row with a given spelling, 1 for the second, and so on.
occurrence = test.groupby('Word').cumcount()
in_pair = word_counts == 2

# Pick first/second members by their occurrence number rather than by taking
# every other index, so the split stays correct even if the two rows of a pair
# are not adjacent in the file.
limi = test.index[in_pair & (occurrence == 0)]  # first member of each pair
li = test.index[in_pair & (occurrence == 1)]    # second member of each pair

In [4]:
# Now remove the duplicated word pairs from the training set.

# Number of rows sharing each exact (case-sensitive) spelling, aligned to train's index.
word_counts = train.groupby('Word')['Word'].transform('count')

# Keep every row whose spelling does not occur exactly twice, then renumber the rows.
train = train[word_counts != 2].reset_index(drop=True)

In [5]:
# Generation of additional features

# Not referenced by the feature code in this cell (vowels are counted with a
# regex); kept so the name stays defined for any other code that may use it.
vowels = set('уеыаоэяиюУЕЫАОЭЯИЮ')


def add_word_features(frame):
    """Return a copy of ``frame`` with spelling-based features added.

    The same transformation is applied to train and test so the two feature
    sets cannot drift apart.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a string column ``'Word'``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``frame`` where ``'Word'`` is lower-cased and the columns
        Upper, Lower, All_upper, Mixed, Length, Vow, Con, Noncyr, Non-cyr,
        ConL, VowL, Num, Last_3, Last_2, Last_22, Last_4 are appended
        (in that order).
    """
    out = frame.copy()
    word = out['Word']

    # --- Case-shape indicators, computed on the original spelling. ---
    is_lower = word.str.islower()
    is_upper = word.str.isupper()
    # First character upper-case and the rest lower-case. Slicing with [:1]
    # (instead of indexing [0]) yields 0 for an empty string instead of raising.
    capitalized = word.str[:1].str.isupper() & word.str[1:].str.islower()

    out['Upper'] = capitalized.astype('int64')
    out['Lower'] = is_lower.astype('int64')
    out['All_upper'] = is_upper.astype('int64')
    # Neither all-upper nor all-lower; capitalized words are subtracted so that
    # they are reported only by 'Upper'.
    out['Mixed'] = (~is_upper & ~is_lower).astype('int64') - out['Upper']

    # --- Character counts. ---
    out['Length'] = word.str.len()
    # NOTE(review): 'ё' is not in this character class, so it is counted in
    # 'Con' rather than 'Vow' — confirm this is intended.
    out['Vow'] = word.str.count('[уеыаоэяию]', flags=re.IGNORECASE)
    # Everything that is not a matched vowel (this includes digits, hyphens
    # and punctuation, not only consonant letters).
    out['Con'] = out['Length'] - out['Vow']
    # Number of listed punctuation/symbol characters.
    out['Noncyr'] = word.str.count(r"""[!.><:;'@#~{}\[\]_+=£$%^&()?]""", flags=re.IGNORECASE)
    # Number of hyphens.
    out['Non-cyr'] = word.str.count(r"""[\-]""", flags=re.IGNORECASE)
    out['ConL'] = out['Con'] / out['Length']
    out['VowL'] = out['Vow'] / out['Length']
    # Number of ASCII digits ('1'-'9' plus '0').
    out['Num'] = word.str.count('[1-90]', flags=re.IGNORECASE)

    # --- Suffix features, taken from the lower-cased word. ---
    lowered = word.str.lower()
    out['Word'] = lowered
    out['Last_3'] = lowered.str[-3:]
    # NOTE(review): despite its name this holds three characters — the ones
    # preceding the final character.
    out['Last_2'] = lowered.str[-4:-1]
    out['Last_22'] = lowered.str[-2:]
    out['Last_4'] = lowered.str[-4:]
    return out


train = add_word_features(train)
test = add_word_features(test)

In [6]:
# Continue on independent copies of the feature frames.
train_full, test_full = train.copy(), test.copy()

# Report (rows, columns) for both frames, then preview the test features.
print(train_full.shape, test_full.shape)
test_full.head(n=10)

(99958, 18) (188920, 17)


Unnamed: 0,Word,Upper,Lower,All_upper,Mixed,Length,Vow,Con,Noncyr,Non-cyr,ConL,VowL,Num,Last_3,Last_2,Last_22,Last_4
0,аалто,1,0,0,0,5,3,2,0,0,0.4,0.6,0,лто,алт,то,алто
1,аар,0,0,1,0,3,2,1,0,0,0.333333,0.666667,0,аар,аа,ар,аар
2,аара,1,0,0,0,4,3,1,0,0,0.25,0.75,0,ара,аар,ра,аара
3,ааре,1,0,0,0,4,3,1,0,0,0.25,0.75,0,аре,аар,ре,ааре
4,аарон,1,0,0,0,5,3,2,0,0,0.4,0.6,0,рон,аро,он,арон
5,аароне,1,0,0,0,6,4,2,0,0,0.333333,0.666667,0,оне,рон,не,роне
6,ааронов,1,0,0,0,7,4,3,0,0,0.428571,0.571429,0,нов,оно,ов,онов
7,аароном,1,0,0,0,7,4,3,0,0,0.428571,0.571429,0,ном,оно,ом,оном
8,аароном,1,0,0,0,7,4,3,0,0,0.428571,0.571429,0,ном,оно,ом,оном
9,аарону,1,0,0,0,6,4,2,0,0,0.333333,0.666667,0,ону,рон,ну,рону


In [7]:
# Feature generation with the pymorphy2 library

# One analyzer instance is enough; it is reused for both tagging and
# normal-form lookup below.
morph = pymorphy2.MorphAnalyzer()

# Stack train and test so the tag features are computed in a single pass;
# `ind` marks where the train rows end.
ind = len(train_full)
all_data = pd.concat([train_full, test_full]).reset_index(drop=True)

# First tag returned by the analyzer for each word.
first_tag = all_data['Word'].apply(lambda word: morph.tag(word)[0])

# One column per tag attribute; the attribute is None when the tag does not carry it.
for grammeme in ('animacy', 'POS', 'case', 'number', 'gender'):
    all_data['pymorphy_' + grammeme] = first_tag.apply(
        lambda tag, attr=grammeme: getattr(tag, attr)
    )

# Split back into train and test; the test part has no real labels, so the
# 'Label' column is removed from it.
train_full = all_data[:ind].reset_index(drop=True)
test_full = all_data[ind:].reset_index(drop=True).drop('Label', axis=1)

# Normal form of the first parse of each word.
train_full['norm'] = train_full['Word'].apply(lambda word: morph.parse(word)[0].normal_form)
test_full['norm'] = test_full['Word'].apply(lambda word: morph.parse(word)[0].normal_form)

In [8]:
# Preview the first rows of the test feature frame.
test_full.head()

Unnamed: 0,All_upper,Con,ConL,Last_2,Last_22,Last_3,Last_4,Length,Lower,Mixed,...,Upper,Vow,VowL,Word,pymorphy_animacy,pymorphy_POS,pymorphy_case,pymorphy_number,pymorphy_gender,norm
0,0,2,0.4,алт,то,лто,алто,5,0,0,...,1,3,0.6,аалто,,ADJS,,sing,neut,аалтый
1,1,1,0.333333,аа,ар,аар,аар,3,0,0,...,0,2,0.666667,аар,,,,,,аар
2,0,1,0.25,аар,ра,ара,аара,4,0,0,...,1,3,0.75,аара,inan,NOUN,gent,sing,masc,аар
3,0,1,0.25,аар,ре,аре,ааре,4,0,0,...,1,3,0.75,ааре,inan,NOUN,nomn,sing,femn,ааре
4,0,2,0.4,аро,он,рон,арон,5,0,0,...,1,3,0.6,аарон,anim,NOUN,nomn,sing,masc,аарон


In [9]:
# Remove the raw word (and, for train, the target) from the model inputs and
# replace missing values with the literal string 'nan'.
train_full = train_full.drop(['Word', 'Label'], axis=1).fillna('nan')
test_full = test_full.drop(['Word'], axis=1).fillna('nan')

# Select the categorical features for CatBoost: positional indices of the
# object-dtype columns of the training frame.
cat_features = [pos for pos, dtype in enumerate(train_full.dtypes) if dtype == 'object']
train_full.columns[cat_features]

Index(['Last_2', 'Last_22', 'Last_3', 'Last_4', 'pymorphy_animacy',
       'pymorphy_POS', 'pymorphy_case', 'pymorphy_number', 'pymorphy_gender',
       'norm'],
      dtype='object')

In [10]:
# Hyper-parameters of the classifier, one per line for readability.
# NOTE(review): fit() below receives no eval_set, so the od_type/od_pval
# overfitting-detector settings presumably have nothing to monitor — confirm
# against the CatBoost documentation.
catboost_params = dict(
    random_seed=777,
    iterations=1000,
    loss_function='CrossEntropy',
    eval_metric='AUC',
    od_type='IncToDec',
    od_pval=0.01,
    max_depth=4,
    learning_rate=1,
    metric_period=10,
)
ctb = CatBoostClassifier(**catboost_params)

# Fit on all training rows; the target comes from `train`, the inputs from `train_full`.
ctb.fit(train_full, train['Label'], cat_features=cat_features)

0:	learn: 0.8564098	total: 209ms	remaining: 3m 28s
10:	learn: 0.9522805	total: 1.42s	remaining: 2m 8s
20:	learn: 0.9563019	total: 2.7s	remaining: 2m 5s
30:	learn: 0.9572253	total: 4.06s	remaining: 2m 7s
40:	learn: 0.9585026	total: 5.46s	remaining: 2m 7s
50:	learn: 0.9604299	total: 6.75s	remaining: 2m 5s
60:	learn: 0.9616829	total: 8.38s	remaining: 2m 8s
70:	learn: 0.9620339	total: 9.71s	remaining: 2m 7s
80:	learn: 0.9626448	total: 11s	remaining: 2m 5s
90:	learn: 0.9641309	total: 12.4s	remaining: 2m 3s
100:	learn: 0.9645556	total: 13.7s	remaining: 2m 1s
110:	learn: 0.9648009	total: 15s	remaining: 2m
120:	learn: 0.9650949	total: 16.4s	remaining: 1m 58s
130:	learn: 0.9652716	total: 17.8s	remaining: 1m 57s
140:	learn: 0.9655462	total: 19.1s	remaining: 1m 56s
150:	learn: 0.9659745	total: 20.5s	remaining: 1m 55s
160:	learn: 0.9662229	total: 21.8s	remaining: 1m 53s
170:	learn: 0.9664349	total: 23.2s	remaining: 1m 52s
180:	learn: 0.9667152	total: 24.5s	remaining: 1m 50s
190:	learn: 0.9669368	t

<catboost.core.CatBoostClassifier at 0x7f972bf0f780>

In [11]:
# AUC computed on the same rows the model was fitted on (an in-sample score,
# not a validation score).
train_scores = ctb.predict_proba(train_full)[:, 1]
print('CAT AUC: ', roc_auc_score(train['Label'], train_scores))

CAT AUC:  0.987855808274


In [12]:
# Load the sample submission file into the frame that will hold the predictions.
sub = pd.read_csv('sample_submission.csv')

In [13]:
# Second column of predict_proba for every test row.
test_scores = ctb.predict_proba(test_full)[:, 1]

# Renumber the submission rows from 0 and store the scores in 'Prediction'.
sub = sub.reset_index(drop=True).assign(Prediction=test_scores)

In [14]:
# Post-processing of the predictions — this is why the duplicate-pair indices
# were remembered at the start.

ma = sub['Prediction'].max()
mi = sub['Prediction'].min()

# Assign through a single .loc call on the frame itself. The chained form
# sub['Prediction'].loc[rows] = value may write to a temporary copy and
# triggers pandas' SettingWithCopyWarning.
sub.loc[li, 'Prediction'] = ma    # rows in `li` get the highest score
sub.loc[limi, 'Prediction'] = mi  # rows in `limi` get the lowest score

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [15]:
# Show the first 10 rows of the submission frame.
sub.head(10)

Unnamed: 0,Id,Prediction
0,0,0.005130481
1,1,0.004442368
2,2,0.2247837
3,3,0.1971415
4,4,0.03695602
5,5,0.002008027
6,6,0.08727521
7,7,5.31377e-10
8,8,1.0
9,9,5.31377e-10


In [16]:
# Write only the Id and Prediction columns to CSV, without the index column.
sub.to_csv('submission_catboost.csv', columns=['Id', 'Prediction'], index=False)