# Решение соревнования по классификации фамилий на 4 место

https://www.kaggle.com/c/dmia-surnames-classification

## Загрузка данных

In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('./data/train.csv')
print(train.shape)
train.head()

(101408, 2)


Unnamed: 0,Word,Label
0,Аалтонен,1
1,Аар,0
2,Аарон,0
3,ААРОН,0
4,Аарона,0


In [3]:
test = pd.read_csv('./data/test.csv')
print(test.shape)
test.head()

(101408, 2)


Unnamed: 0,Word
0,Аалто
1,ААР
2,Аара
3,Ааре
4,Аарон


## Генерация признаков

Так как pymorphy может возвращать несколько значений к одному слову, то было принято решение кодировать признаки (часть речи, род и число) суммарными score каждого возвращаемого слова.

Кроме того, на семинаре была предложена отличная идея для признака: на предобученной модели w2v оценивать близость вектора каждого слова выборки с вектором слова __"Фамилия"__.

Функция *pymorhy_features* возвращает эти признаки:

In [4]:
def pymorhy_features(word, py_morphy_analyzer, gensim_w2v):
    
    # Часть речи
    noun = 0
    verb = 0
    adjf = 0

    # Род
    masc = 0
    femn = 0
    neut = 0

    # Число
    sing = 0
    plur = 0

    # Фамилия
    pymorphy = 0
    
    pymorphy_parse = py_morphy_analyzer.parse(word)
    
    pymorphy_values = len(pymorphy_parse)
    normal_form = pymorphy_parse[0].normal_form
    
    w2v_surname = 0
    if pymorphy_parse[0].tag.POS is not None:
        w2v_word = word.lower() + '_' + pymorphy_parse[0].tag.POS
        try: w2v_surname = model.similarity('фамилия_NOUN', w2v_word)
        except: pass
    
    for p in pymorphy_parse:

        pos = p.tag.POS
        if pos == 'NOUN': noun += p.score
        elif pos in ('VERB', 'INFN', 'GRND'): verb += p.score
        elif pos in ('ADJF', 'ADJS', 'PRTF', 'PRTS'): adjf += p.score

        gender = p.tag.gender
        if gender == 'masc': masc += p.score
        elif gender == 'femn': femn += p.score
        elif gender == 'neut': neut += p.score

        number = p.tag.number
        if number == 'sing': sing += p.score
        elif number == 'plur': plur += p.score

        if 'Surn' in p.tag: pymorphy = 1
            
    return w2v_surname, noun, verb, adjf, masc, femn, neut, sing, plur, pymorphy, pymorphy_values, normal_form

Mystem - библиотека, реализующая функционал аналогичный pymorphy, поэтому с помощью нее проверялось является ли слово фамилией или нет. Как правило, в большинстве моделей данный признак был незначимый.

In [5]:
def mystem_features(word, mystem_analyzer):
    try: 
        return int('фам' in mystem_analyzeranalyze(word)[0]['analysis'][0]['gr'].split(','))
    except: 
        return 0

Основная функция подготовки признаков. 

Идея с шипящими, свистящими звонкими и глухими была честно подсмотрена [в бейзлайне](http://nbviewer.jupyter.org/urls/kaggle2.blob.core.windows.net/forum-message-attachments/285519/8604/Baseline_SGD_ngrams%26wordfeatures.ipynb) любезно предоставленным [Nikk Reppin](https://www.kaggle.com/nikrepp)

Идея с категориальным признаком "2end" также была подсмотрена у Mikhail Solomennik [в этом бенчмарке](https://github.com/solomennikm/notes/blob/master/Easy_catboost_starter.ipynb). На основе нее было принято решение использовать Catboost с двумя категориальными фичами, который поднял мой скор примерно на 0,02.

In [6]:
import re
from natasha import NamesExtractor
import pymorphy2
from pymystem3 import Mystem
import pyphen

def prepare_dataset(df, gensim_w2v, verbose=True):
    
    print('Prepare features based on word...')
    vowels = set('аоиеёэыуюя')
    consonant = set('бвгджзйклмнпрстфхцчшщ')
    
    df.loc[:, 'word'] = df['Word'].apply(lambda x: ''.join(filter(None, re.split('\W', x))))
    
    df.loc[:, 'len'] = df['Word'].apply(len)
    df.loc[:, 'len_word'] = df['word'].apply(len)
    
    df.loc[:, 'vowels'] = df['word'].apply(lambda word: sum(letter in vowels for letter in word.lower()))
    df.loc[:, 'consonant'] = df['word'].apply(lambda word: sum(letter in consonant for letter in word.lower()))
    
    df.loc[:, 'ship'] = df['word'].apply(lambda word: sum(letter in set('жчшщ') for letter in word.lower()))
    df.loc[:, 'swist'] = df['word'].apply(lambda word: sum(letter in set('цзс') for letter in word.lower()))
    
    df.loc[:, 'zvon_parn'] = df['word'].apply(lambda word: sum(letter in set('бвгджз') for letter in word.lower()))
    df.loc[:, 'glukh_parn'] = df['word'].apply(lambda word: sum(letter in set('пфктшс') for letter in word.lower()))
    df.loc[:, 'zvon_neparn'] = df['word'].apply(lambda word: sum(letter in set('лмнрй') for letter in word.lower()))
    df.loc[:, 'glukh_neparn'] = df['word'].apply(lambda word: sum(letter in set('хцчщ') for letter in word.lower()))
    
    df.loc[:, 'capitals'] = df['Word'].apply(lambda x: sum(1 for c in x if c.isupper()))
    
    df.loc[:, 'ratio_vowels'] = df['vowels'] / df['len_word']
    df.loc[:, 'ratio_consonant'] = df['consonant'] / df['len_word']
    df.loc[:, 'ratio_capitals'] = df['capitals'] / df['len_word']
    
    df.loc[:, 'istitle'] = df['Word'].apply(lambda x: 1 if x.istitle() else 0)
    
    df.loc[:, 'cyrillic'] = df['word'].apply(
        lambda word: int(sum(letter in vowels.union(consonant) for letter in word.lower()) == len(word))
    )
    
    # pymorphy2
    print('Prepare features based on pymorphy2...')
    morph = pymorphy2.MorphAnalyzer()
    
    df.loc[:, 'w2v_surname'], \
    df.loc[:, 'noun'], df.loc[:, 'verb'], df.loc[:, 'adjf'], df.loc[:, 'masc'], \
    df.loc[:, 'femn'], df.loc[:, 'neut'], df.loc[:, 'sing'], df.loc[:, 'plur'], \
    df.loc[:, 'pymorphy'], df.loc[:, 'pymorphy_values'], df.loc[:, 'normal_form'] \
        = zip(*df['Word'].apply(lambda x: pymorhy_features(x, morph, gensim_w2v)))
    
    # pymystem3
    print('Prepare features based on pymystem3...')
    m = Mystem()
    
    df.loc[:, 'mystem'] = df.loc[:, 'word'].apply(lambda word: mystem_features(word, m))
    
    # Natasha
    print('Prepare features based on Natasha...')
    extractor = NamesExtractor()

    df.loc[:, 'Natasha'] = df['word'].apply(lambda x: 0 if len(list(extractor(x.capitalize()))) == 0 else 1)
    
    df.loc[:, '2end'] = df['Word'].str[-2:].str.lower()
    
    return df

Загрузим [предобученную модель W2V](http://rusvectores.org/ru/models/):

In [7]:
import gensim

model = gensim.models.Word2Vec.load_word2vec_format('./ruwikiruscorpora_upos_skipgram_300_2_2018.vec.gz')  

## Подготовка датасетов

Подготовим train, добавив признак "кол-во слов с аналогичной начальной формой слова"

In [8]:
%%time
train = prepare_dataset(train, model)

train = train.merge(
    train.groupby('normal_form', as_index=False).count()[['normal_form', 'Word']]\
        .rename(columns={'Word': 'count_normal'}),
    how='left',
    on='normal_form')

Prepare features based on word...
Prepare features based on pymorphy2...
Prepare features based on pymystem3...
Prepare features based on Natasha...
CPU times: user 2min 38s, sys: 2.49 s, total: 2min 41s
Wall time: 2min 46s


In [9]:
train.to_csv('./data/processed_train.csv', index=False)

Аналогично подготовим test

In [10]:
%%time
test = prepare_dataset(test, model)

test = test.merge(
    train.groupby('normal_form', as_index=False).count()[['normal_form', 'Word']]\
        .rename(columns={'Word': 'count_normal'}),
    how='left',
    on='normal_form').fillna(0)

Prepare features based on word...
Prepare features based on pymorphy2...
Prepare features based on pymystem3...
Prepare features based on Natasha...
CPU times: user 4min 55s, sys: 3.6 s, total: 4min 59s
Wall time: 5min 10s


In [11]:
test.to_csv('./data/processed_test.csv', index=False)

## Локальная валидация

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    train.drop(['Label', 'word', 'Word'], axis=1), train['Label'].values.flatten(), 
    test_size=0.2, stratify=train['Label'], random_state=777
)

Определим категориальные признаки для catboost

In [13]:
cat_features = np.where(X_train.dtypes == 'object')[0].tolist()
train.columns[cat_features]

Index(['plur', 'normal_form'], dtype='object')

In [20]:
from catboost import CatBoostClassifier

ctb = CatBoostClassifier(eval_metric='AUC', random_seed=777, iterations=2000, metric_period=100)
ctb.fit(X_train, y_train, cat_features=cat_features)

0:	learn: 0.9133941	total: 95.1ms	remaining: 3m 10s
100:	learn: 0.9514141	total: 10.8s	remaining: 3m 22s
200:	learn: 0.9589788	total: 21.5s	remaining: 3m 12s
300:	learn: 0.9615960	total: 31.6s	remaining: 2m 58s
400:	learn: 0.9632501	total: 41.3s	remaining: 2m 44s
500:	learn: 0.9643556	total: 51.1s	remaining: 2m 32s
600:	learn: 0.9652963	total: 1m 1s	remaining: 2m 22s
700:	learn: 0.9661202	total: 1m 10s	remaining: 2m 11s
800:	learn: 0.9669694	total: 1m 20s	remaining: 2m
900:	learn: 0.9677741	total: 1m 30s	remaining: 1m 50s
1000:	learn: 0.9685043	total: 1m 40s	remaining: 1m 40s
1100:	learn: 0.9692169	total: 1m 50s	remaining: 1m 30s
1200:	learn: 0.9698792	total: 1m 59s	remaining: 1m 19s
1300:	learn: 0.9704543	total: 2m 9s	remaining: 1m 9s
1400:	learn: 0.9710583	total: 2m 19s	remaining: 59.5s
1500:	learn: 0.9716391	total: 2m 29s	remaining: 49.6s
1600:	learn: 0.9721533	total: 2m 38s	remaining: 39.6s
1700:	learn: 0.9725818	total: 2m 48s	remaining: 29.6s
1800:	learn: 0.9731964	total: 2m 58s	r

<catboost.core.CatBoostClassifier at 0x10fcb24e0>

Проверим качество на отложенной выборке:

In [21]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_val, ctb.predict_proba(X_val)[:,1])

0.9726341794016273

Посмотрим важность признаков:

In [16]:
importances = pd.DataFrame(
    data={
        'features': X_train.columns,
        'fscore': ctb.feature_importances_
    }
)
importances = importances.reindex(importances.fscore.abs().sort_values(ascending=False).index)

importances

Unnamed: 0,features,fscore
13,ratio_capitals,22.846306
16,w2v_surname,13.039769
31,count_normal,10.223563
14,istitle,8.410677
27,normal_form,8.271064
30,2end,7.239061
10,capitals,6.242655
2,vowels,3.682739
25,pymorphy,3.681037
26,pymorphy_values,2.604975


## Применение на тестовых данных

In [22]:
%%time

cat_features = np.where(
    train.drop(['Word', 'word', 'Label'], axis=1).dtypes == 'object'
)[0].tolist()

ctb = CatBoostClassifier(eval_metric='AUC', random_seed=777, iterations=2000, metric_period=100)\
    .fit(train.drop(['Word', 'word', 'Label'], axis=1), 
         train['Label'], 
         cat_features=cat_features)

0:	learn: 0.9098501	total: 119ms	remaining: 3m 57s
100:	learn: 0.9572516	total: 11.6s	remaining: 3m 37s
200:	learn: 0.9626323	total: 23.8s	remaining: 3m 33s
300:	learn: 0.9652306	total: 35.9s	remaining: 3m 22s
400:	learn: 0.9664383	total: 47.9s	remaining: 3m 11s
500:	learn: 0.9674953	total: 1m	remaining: 3m
600:	learn: 0.9683387	total: 1m 12s	remaining: 2m 48s
700:	learn: 0.9691390	total: 1m 24s	remaining: 2m 36s
800:	learn: 0.9699549	total: 1m 36s	remaining: 2m 24s
900:	learn: 0.9705615	total: 1m 48s	remaining: 2m 12s
1000:	learn: 0.9712139	total: 2m 1s	remaining: 2m 1s
1100:	learn: 0.9718591	total: 2m 13s	remaining: 1m 48s
1200:	learn: 0.9724258	total: 2m 25s	remaining: 1m 36s
1300:	learn: 0.9729738	total: 2m 38s	remaining: 1m 24s
1400:	learn: 0.9734288	total: 2m 50s	remaining: 1m 12s
1500:	learn: 0.9738434	total: 3m 2s	remaining: 1m
1600:	learn: 0.9741983	total: 3m 14s	remaining: 48.6s
1700:	learn: 0.9746036	total: 3m 27s	remaining: 36.4s
1800:	learn: 0.9750934	total: 3m 39s	remaini

Запишем результаты

In [23]:
test = test.drop('Id', axis=1)

In [24]:
test['Prediction'] = ctb.predict_proba(test.drop(['Word', 'word'], axis=1).values)[:, 1]
test = test.reset_index().rename(columns={'index': 'Id'})
test[['Id', 'Prediction']].to_csv('./submissions/sub22.csv', index=False)
test.head()

Unnamed: 0,Id,Word,word,len,len_word,vowels,consonant,ship,swist,zvon_parn,...,sing,plur,pymorphy,pymorphy_values,normal_form,mystem,Natasha,2end,count_normal,Prediction
0,0,Аалто,Аалто,5,5,3,2,0,0,0,...,1.0,0.0,0,1,аалтый,0,0,то,0.0,0.235761
1,1,ААР,ААР,3,3,2,1,0,0,0,...,0.0,0.0,0,1,аар,0,0,ар,1.0,0.025434
2,2,Аара,Аара,4,4,3,1,0,0,0,...,0.565217,0.434783,0,17,аар,0,0,ра,1.0,0.291601
3,3,Ааре,Ааре,4,4,3,1,0,0,0,...,0.5,0.5,0,12,ааре,0,0,ре,0.0,0.339099
4,4,Аарон,Аарон,5,5,3,2,0,0,0,...,1.0,0.0,1,7,аарон,0,1,он,6.0,0.760981


In [19]:
importances = pd.DataFrame(
    data={
        'features': train.drop(['Word', 'word', 'Label'], axis=1).columns,
        'fscore': ctb.feature_importances_
    }
)
importances = importances.reindex(importances.fscore.abs().sort_values(ascending=False).index)

importances

Unnamed: 0,features,fscore
13,ratio_capitals,23.416657
14,istitle,13.116555
16,w2v_surname,12.526096
27,normal_form,8.209293
31,count_normal,8.082563
30,2end,6.998043
10,capitals,5.167944
2,vowels,4.655005
25,pymorphy,3.54497
26,pymorphy_values,2.186932


# Итоги

Что не получилось:

* Признак флага "Дубликат из другого класса" не дал прироста к качеству
* Не нашел лик, так как и не искал его - хороший урок, если добавить leak к моему решению скор на паблике был бы 0.98280: второе место)
* Target encoding в отличии от catboost не дал заметных приростов (возможно это связано с кривой реализацией)
* К сожалению, я не приручил catboost с CSR матрицами - очень хотелось добавить к данным признакам CountVectorizer, так как xgboost с такими фичами и count_vectorizer(analyzer='char_wb', ngram_range=(3, 5), lowercase=True) выдавал очень неплохой бейзлайн со скором на публичном лидерборде в районе 0.919.