In [24]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy import sparse
import re
from natasha import NamesExtractor
import pymorphy2

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
%matplotlib inline

In [25]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [26]:
# Binary flag: 1 when the first character of the word is uppercase, otherwise 0.
for frame in (train, test):
    frame['first_upper'] = frame['Word'].apply(lambda word: int(word[0].isupper()))

In [27]:
# Binary flag: 1 when str.isupper() holds for the whole word, otherwise 0.
for frame in (train, test):
    frame['all_upper'] = frame['Word'].apply(lambda word: int(word.isupper()))

In [28]:
# Characters counted as "special". The backslash is written as '\\' because
# a bare backslash before ']' is not a valid escape sequence in a normal
# string literal; the resulting set of characters is unchanged.
_SPECIAL_CHARS = frozenset('#@%&*)(!?/\\][$,.:; ')


def find_char(string):
    """Flag text that contains at least one special character.

    Args:
        string: text to inspect (any iterable of single characters works).

    Returns:
        1 if any character of ``string`` is a member of ``_SPECIAL_CHARS``
        (punctuation-like symbols and the space character), otherwise 0.
        An empty string yields 0.
    """
    return int(any(c in _SPECIAL_CHARS for c in string))
# Store the special-character flag on both frames (column name kept exactly as spelled).
for frame in (train, test):
    frame['has_specail_char'] = frame['Word'].apply(find_char)

In [29]:
def has_numbers(string):
    """Return True when `string` contains at least one character matched by the regex ``\\d``."""
    digit_match = re.search(r'\d', string)
    return digit_match is not None
# Convert the boolean digit check into a 0/1 column on both frames.
for frame in (train, test):
    digit_flags = frame['Word'].apply(has_numbers)
    frame['has_numbers'] = digit_flags.map({True: 1, False: 0})

In [30]:
# Matches the listed Russian vowel letters; IGNORECASE makes it case-insensitive.
vowel_pattern = re.compile('[ауоыиэяюёе]', re.IGNORECASE)

for frame in (train, test):
    # Total number of characters in the word.
    frame['length'] = frame['Word'].apply(len)
    # Number of vowel letters found in the word.
    frame['vow'] = frame['Word'].apply(lambda word: len(vowel_pattern.findall(word)))
    # Everything that is not one of the vowels above (so not strictly consonants only).
    frame['con'] = frame['length'] - frame['vow']

In [31]:
# Series.duplicated() is called with its defaults, so the first occurrence of a
# word gets 0 and only the repeated occurrences within the same frame get 1.
for frame in (train, test):
    repeat_mask = frame['Word'].duplicated()
    frame['duplicated'] = repeat_mask.map({True: 1, False: 0})

In [32]:
# One shared extractor instance, reused for every word.
func = NamesExtractor()


def function_natasha(word):
    """Return 1 when the names extractor gives a truthy result for `word`, else 0."""
    return int(bool(func(word)))


for frame in (train, test):
    frame['natasha_person'] = frame['Word'].apply(function_natasha)

In [33]:
morph = pymorphy2.MorphAnalyzer()

# (column name, tag attribute) pairs, listed in the order the columns are created.
tag_columns = [
    ('pymorphy_animacy', 'animacy'),
    ('pymorphy_POS', 'POS'),
    ('pymorphy_case', 'case'),
    ('pymorphy_number', 'number'),
    ('pymorphy_gender', 'gender'),
]

for frame in (train, test):
    # First tag returned by the analyzer for each word, kept in a temporary column.
    frame['pymorphy'] = frame['Word'].apply(lambda word: morph.tag(word)[0])
    for column_name, attribute in tag_columns:
        frame[column_name] = frame['pymorphy'].apply(lambda tag: getattr(tag, attribute))
    # The tag objects themselves are not used as a feature.
    frame.drop('pymorphy', axis=1, inplace=True)

columns_to_one_hot = ['pymorphy_animacy', 'pymorphy_POS', 'pymorphy_case','pymorphy_number', 'pymorphy_gender']

# Integer-encode each grammatical column (despite the list's name, this is
# label encoding, not one-hot encoding). Missing values become the string 'nan'.
#
# A single encoder per column is fitted on the train and test values together.
# Fitting two independent encoders would assign codes by each frame's own sorted
# category set, so the same category could receive different integers in train
# and test whenever one frame contains a category the other lacks.
for col in columns_to_one_hot:
    train_values = list(train[col].fillna('nan'))
    test_values = list(test[col].fillna('nan'))
    encoder = LabelEncoder().fit(train_values + test_values)
    train[col] = encoder.transform(train_values)
    test[col] = encoder.transform(test_values)

In [34]:
# Sanity check: (rows, columns) of both frames after feature engineering.
(train.shape, test.shape)

((101408, 16), (188920, 15))

In [35]:
# Preview the first ten rows of the engineered training frame.
train.head(n=10)

Unnamed: 0,Word,Label,first_upper,all_upper,has_specail_char,has_numbers,length,vow,con,duplicated,natasha_person,pymorphy_animacy,pymorphy_POS,pymorphy_case,pymorphy_number,pymorphy_gender
0,Аалтонен,1,1,0,0,0,8,4,4,0,0,2,1,7,2,1
1,Аар,0,1,0,0,0,3,2,1,0,0,2,17,7,0,2
2,Аарон,0,1,0,0,0,5,3,2,0,1,0,8,8,2,1
3,ААРОН,0,1,1,0,0,5,3,2,0,1,0,8,8,2,1
4,Аарона,0,1,0,0,0,6,4,2,0,1,0,8,4,2,1
5,Аарона,1,1,0,0,0,6,4,2,1,1,0,8,4,2,1
6,Аароне,0,1,0,0,0,6,4,2,0,1,0,8,6,2,1
7,Ааронов,0,1,0,0,0,7,4,3,0,0,0,8,4,1,2
8,Аахена,0,1,0,0,0,6,4,2,0,0,1,8,4,2,1
9,Абабков,1,1,0,0,0,7,3,4,0,1,0,8,8,2,1


In [36]:
# Columns that go through StandardScaler: the character counts plus the
# integer-coded pymorphy attributes.
num_cols = [
    'length', 'vow', 'con',
    'pymorphy_animacy', 'pymorphy_POS', 'pymorphy_case', 'pymorphy_number', 'pymorphy_gender',
]
numeric_train, numeric_test = train[num_cols], test[num_cols]

In [37]:
# Learn the scaling statistics from the training columns only, then apply the
# same transformation to both train and test.
scaler = StandardScaler().fit(numeric_train)
numeric_train_scaled = scaler.transform(numeric_train)
numeric_test_scaled = scaler.transform(numeric_test)

In [44]:
# Raw words (input to the n-gram vectorizer) and the training target.
words_train, words_test = train['Word'], test['Word']
y_train = train['Label']

# Whatever remains after removing the scaled columns, the raw word and (for
# train) the label is passed to the model unscaled.
train_bool_features = train.drop(num_cols + ['Label', 'Word'], axis=1)
test_bool_features = test.drop(num_cols + ['Word'], axis=1)

In [46]:
# Character n-grams (lengths 2 through 8) using the 'char_wb' analyzer;
# the vocabulary is learned from the training words only.
count_vect = CountVectorizer(
    analyzer='char_wb',
    ngram_range=(2, 8),
)
words_train_counts = count_vect.fit_transform(words_train)
words_test_counts = count_vect.transform(words_test)
tuple(matrix.shape for matrix in (words_train_counts, words_test_counts))

((101408, 714496), (188920, 714496))

In [47]:
# Re-weight the raw n-gram counts with tf-idf; the weights are fitted on the
# training counts and reused for the test counts.
tfidf_transformer = TfidfTransformer(use_idf=True)
words_train_tfidf = tfidf_transformer.fit_transform(words_train_counts)
words_test_tfidf = tfidf_transformer.transform(words_test_counts)
tuple(matrix.shape for matrix in (words_train_tfidf, words_test_tfidf))

((101408, 714496), (188920, 714496))

In [48]:
# Put the unscaled flag columns to the right of the tf-idf n-gram columns.
train_flag_matrix = train_bool_features.values
test_flag_matrix = test_bool_features.values
X_train = sparse.hstack([words_train_tfidf, train_flag_matrix])
X_test = sparse.hstack([words_test_tfidf, test_flag_matrix])

In [49]:
# Final design matrices: tf-idf n-grams | unscaled flag columns | scaled numeric columns.
#
# The matrices are assembled from their three source parts rather than by
# appending to the existing X_train / X_test. Appending made this cell
# self-referential: running it a second time would add the numeric columns
# again. Built this way, the column layout is the same on every run.
X_train = sparse.hstack((words_train_tfidf, train_bool_features.values, numeric_train_scaled))
X_test = sparse.hstack((words_test_tfidf, test_bool_features.values, numeric_test_scaled))

In [50]:
# Fit the classifier on the full training matrix (this fitted model is the one
# used for prediction), then report the mean ROC AUC over 3 cross-validation folds.
LR = LogisticRegression(random_state=777)
LR.fit(X_train, y_train)
cv_auc_scores = cross_val_score(LR, X_train, y_train, scoring='roc_auc', cv=3)
np.mean(cv_auc_scores)

0.89189588546634047

In [52]:
# Predicted probability of the class in the second column of predict_proba.
y_pred = LR.predict_proba(X_test)[:,1]
test['Prediction'] = y_pred

# Expose the row index as 'Id' in a column of its own. Overwriting 'Word' and
# renaming it in place would destroy the word column, so that re-running this
# cell yields duplicate 'Id' columns and re-running earlier cells fails on the
# missing 'Word'. A plain assignment is safe to repeat.
test['Id'] = test.index

# Submission file: one row per test item, without the DataFrame index.
test[['Id', 'Prediction']].to_csv('predictions.csv', index=False)