# KNN
## Features:
### кол-во гласных
### кол-во согласных 
### признак принимающий значения 0,1,2 в зависимости от того написано слово капсом, маленькими буквами или с заглавной
### частота встречи последних 3х букв(окончание слова) в фамилии
### частота встречи последних 3х букв(окончание слова) в нефамилии

In [1]:
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
# Load the labelled training data from the working directory.
data_train = pd.read_csv('train.csv')
#data_test = pd.read_csv('test.csv')

# Split the rows by label value and keep every column except the label itself:
# rows with Label == 1 go to `surnames`, rows with Label == 0 go to `words`.
surnames = data_train.loc[data_train['Label'] == 1].drop(columns=['Label'])
words = data_train.loc[data_train['Label'] == 0].drop(columns=['Label'])

In [2]:
# Lower-case Cyrillic letters counted as vowels.
vowel = 'уеёыаоэяию'
# Lower-case Cyrillic letters counted as consonants; note that the signs
# 'ъ' and 'ь' are part of this set and are therefore counted as well.
consonant = 'йцкнгшщзхъфвпрлджчсмтьб'


def _count_letters(word, alphabet):
    """Return how many characters of ``str(word).lower()`` occur in ``alphabet``."""
    return sum(1 for letter in str(word).lower() if letter in alphabet)


def count_vow(word):
    """Count the vowels in ``word``.

    ``word`` is converted with ``str`` and lower-cased first, so the count is
    case-insensitive and non-string values do not raise. Only the letters
    listed in ``vowel`` are counted.

    Returns:
        int: number of characters of the word found in ``vowel``.
    """
    return _count_letters(word, vowel)


def count_con(word):
    """Count the consonants in ``word``.

    ``word`` is converted with ``str`` and lower-cased first, so the count is
    case-insensitive and non-string values do not raise. Only the letters
    listed in ``consonant`` are counted.

    Returns:
        int: number of characters of the word found in ``consonant``.
    """
    return _count_letters(word, consonant)

def upper_lower_case(df):
    """Encode the letter case of every entry of ``df.Word``.

    Each value is converted with ``str`` and mapped to a code:
    0 if it is entirely lower case, 1 if it is entirely upper case,
    2 otherwise (for example capitalised or mixed case).

    Returns:
        list: one integer code per row, in row order.
    """
    def case_code(text):
        if text.islower():
            return 0
        return 1 if text.isupper() else 2

    return [case_code(str(word)) for word in df.Word]

def word_format(df):
    """Add a ``Letter_size`` column holding the case code of each word.

    The codes come from ``upper_lower_case``. ``df`` is modified in place and
    also returned.
    """
    case_codes = upper_lower_case(df)
    df['Letter_size'] = case_codes
    return df

def word_vowels_cons(df):
    """Add ``Vowel_count`` and ``Consonant_count`` columns computed from ``df.Word``.

    The counts come from ``count_vow`` and ``count_con`` applied to each word.
    ``df`` is modified in place and also returned.
    """
    word_column = df.Word
    df['Vowel_count'] = word_column.apply(count_vow)
    df['Consonant_count'] = word_column.apply(count_con)
    return df

def ending_extract(x):
    """Return the lower-cased ending of ``x``: its last three characters.

    Shorter inputs yield whatever characters they have (two, one, or an empty
    string). The value is converted with ``str`` first, like the other feature
    helpers in this notebook do, so a non-string cell (e.g. a missing value
    read as NaN) no longer raises ``TypeError`` and an empty string no longer
    raises ``IndexError``.

    Returns:
        str: the lower-cased suffix of at most three characters.
    """
    # Slicing never raises for short strings, which replaces the
    # length-dependent branches.
    return str(x)[-3:].lower()

def ending_surname_freq_extract(x):
    """Look up how often the ending of ``x`` occurs in ``endings_surname_freq``.

    Returns 0 when the ending is not present in that table.
    """
    ending = ending_extract(x)
    return endings_surname_freq.get(ending, 0)

def ending_word_freq_extract(x):
    """Look up how often the ending of ``x`` occurs in ``endings_word_freq``.

    Returns 0 when the ending is not present in that table.
    """
    ending = ending_extract(x)
    return endings_word_freq.get(ending, 0)
    
def word_ending_freq(df):
    """Add the two ending-frequency columns computed from ``df.Word``.

    ``Surname_ending_freq`` is filled by ``ending_surname_freq_extract`` and
    ``Word_ending_freq`` by ``ending_word_freq_extract``. ``df`` is modified
    in place and also returned.
    """
    word_column = df.Word
    df['Surname_ending_freq'] = word_column.apply(ending_surname_freq_extract)
    df['Word_ending_freq'] = word_column.apply(ending_word_freq_extract)
    return df

def drop_features(df):
    """Return a copy of ``df`` without the ``Word`` column.

    The input frame itself is left unchanged.
    """
    return df.drop(columns=['Word'])

def transform_features(df):
    """Run the whole feature pipeline on ``df`` and return the result.

    The steps run in this order: letter-case code, vowel/consonant counts,
    ending frequencies, and finally removal of the ``Word`` column. The steps
    that add columns modify the passed frame in place.
    """
    pipeline = (word_format, word_vowels_cons, word_ending_freq, drop_features)
    for step in pipeline:
        df = step(df)
    return df

# Lookup tables read by ending_surname_freq_extract / ending_word_freq_extract:
# ending -> number of occurrences among the rows of `surnames` and `words`.
# NOTE(review): both tables are counted over every row taken from data_train,
# i.e. before the later train/validation split, so held-out rows contribute to
# their own features. The validation scores are presumably optimistic because
# of that - confirm, and if so build the counters from the training part only.
endings_sur_list = [ending_extract(word) for word in surnames.Word]
endings_surname_freq = Counter(endings_sur_list)

endings_word_list = [ending_extract(word) for word in words.Word]
endings_word_freq = Counter(endings_word_list)

# data_train is replaced by its feature frame (the Word column is dropped), so
# this statement cannot be run a second time without reloading data_train.
data_train = transform_features(data_train)
#data_test = transform_features(data_test)


In [3]:
# Separate the engineered features from the target column.
X_all = data_train.drop(columns=['Label'])
y_all = data_train['Label']
print(X_all)

# Letter_size holds a category code rather than a magnitude, so it is cast to
# str and expanded into one indicator column per value.
X_all = X_all.astype({'Letter_size': str})
X_all = pd.get_dummies(X_all, columns=['Letter_size'])
print(X_all)

# Hold out a fraction of the rows for validation; random_state fixes the split.
num_test = 0.1
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=num_test, random_state=23
)

knn = KNeighborsClassifier(n_neighbors=55)
knn.fit(X_train, y_train)

# ROC AUC is computed from the second column of predict_proba.
predictions = knn.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, predictions)

0.91732123211659633

In [4]:
# sklearn.cross_validation was removed from scikit-learn; KFold lives in
# sklearn.model_selection, the module this notebook already uses for
# train_test_split.
from sklearn.model_selection import KFold
from numpy import mean

def run_kfold(clf, n_splits=10):
    """Cross-validate ``clf`` on ``X_all`` / ``y_all`` and print the score per fold.

    The data is cut into ``n_splits`` consecutive folds (no shuffling). For
    each fold ``clf`` is refit on the remaining rows and scored with ROC AUC on
    the second column of ``predict_proba``. The printed labels say "accuracy",
    but the reported value is that ROC AUC.

    Args:
        clf: estimator exposing ``fit`` and ``predict_proba``; it is refit in
            place on every fold.
        n_splits: number of folds; defaults to 10.

    Returns:
        None. The per-fold scores and their mean are printed.
    """
    features, labels = X_all.values, y_all.values
    # The number of rows is taken from the data itself instead of a hard-coded
    # row count, so the function keeps working when the dataset changes.
    kf = KFold(n_splits=n_splits)
    outcomes = []
    for fold, (train_index, test_index) in enumerate(kf.split(features), start=1):
        fold_X_train, fold_X_test = features[train_index], features[test_index]
        fold_y_train, fold_y_test = labels[train_index], labels[test_index]
        clf.fit(fold_X_train, fold_y_train)
        predictions = clf.predict_proba(fold_X_test)[:, 1]
        auc = roc_auc_score(fold_y_test, predictions)
        outcomes.append(auc)
        print("Fold {0} accuracy: {1}".format(fold, auc))
    mean_outcome = mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome))

run_kfold(knn)



Fold 1 accuracy: 0.9128162685509515
Fold 2 accuracy: 0.9248060133565259
Fold 3 accuracy: 0.9122564841294727
Fold 4 accuracy: 0.9187556358076213
Fold 5 accuracy: 0.899155628493313
Fold 6 accuracy: 0.9078610234517592
Fold 7 accuracy: 0.8601110865636807
Fold 8 accuracy: 0.9040434758812292
Fold 9 accuracy: 0.9114323572357947
Fold 10 accuracy: 0.9088748693588734
Mean Accuracy: 0.9060112842829222
