In [1]:
import pandas as pd

from sklearn import tree
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier

from natasha import NamesExtractor

import xgboost as xgb

import pymorphy2

## Read the data

In [2]:
train = pd.read_csv('train.csv')

In [3]:
train.head()

Unnamed: 0,Word,Label
0,Аалтонен,1
1,Аар,0
2,Аарон,0
3,ААРОН,0
4,Аарона,0


In [4]:
morph = pymorphy2.MorphAnalyzer()

vowel = u'уеёыаоэяию'
consonant = u'йцкнгшщзхъфвпрлджчсмтьб'
ru_letter = vowel + consonant + u"-’'"

def count_vow(word):
    vow = 0
    for i in word:
        if i in vowel:
            vow += 1
    return vow

def count_con(word):
    cons = 0
    for i in word:
        if i in consonant:
            cons += 1
    return cons

def is_only_ru(word):
    return len(set(word) - set(ru_letter)) == 0

def feature_extraction(data):
    df = data.copy()
    
    df['starts_with_upper'] = df.Word.map(lambda x: x[0].isupper())
    df['length'] = df.Word.map(len)
    df['caps'] = df.Word.str.isupper()
    
    df['lower_word'] = df.Word.str.lower()
    
    df['1s'] = df.lower_word.map(lambda x: x[-1])
    df['2s'] = df.lower_word.map(lambda x: x[-2:-1])
    df['3s'] = df.lower_word.map(lambda x: x[-3:-2])
    
    df['2gr'] = df.lower_word.map(lambda x: x[-2:])
    df['4gr'] = df.lower_word.map(lambda x: x[-4:])
    df['POS'] = df.lower_word.map(lambda x: morph.parse(x)[0].tag.POS)
    df['Number'] = df.lower_word.map(lambda x: morph.parse(x)[0].tag.number)
    
    df['2gr_norm'] = df.Word.map(lambda x: morph.parse(x)[0].normal_form)
    
    df['vow_count'] = data['Word'].apply(lambda x: count_vow(x))
    df['cons_count'] = data['Word'].apply(lambda x: count_con(x))
    df['russian']  = data['Word'].apply(lambda x: is_only_ru(x)==1)
    
    extractor = NamesExtractor()
    df['NatashaName'] = df.Word.map(lambda text: bool(extractor(text)))
    
    return df

In [5]:
COLS_TO_RETAIN = ['1s', '2s', '3s', '2gr', 'caps', 'length', 'starts_with_upper', 'POS', 'NatashaName', 'Number',
                 'vow_count', 'cons_count', 'russian']

def transform_data(data, transformer = None):
    cat_dict = data[COLS_TO_RETAIN].to_dict(orient = 'records')

    if not transformer:
        transformer = DictVectorizer(sparse=False)
        transformer.fit(cat_dict)
    X = transformer.transform(cat_dict)

    X = pd.DataFrame(X)
    X.fillna(0, inplace=True)
    
    return X, transformer

In [6]:
data = feature_extraction(train)
X, transformer = transform_data(data)

In [None]:
clf = RandomForestClassifier()

In [None]:
%time
param_grid = {
    'max_depth': [20, 25, 30],
    'criterion': ['entropy', 'gini'],
    'class_weight': [{1: 1}, {1: 2} {1: 10}]
}

print('Accuracy best params and score')
result = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc').fit(X, data.Label)
print('\tParams:', result.best_params_)
print('\tScore:', result.best_score_)

In [None]:
clf = RandomForestClassifier(max_depth=25, criterion='entropy', class_weight={1: 1})
cross_val_score(clf, X, data.Label, cv=5, scoring='roc_auc' )

array([ 0.91429764,  0.90971751,  0.9067754 ,  0.87286923,  0.90549927])

In [None]:
gb_clf = xgb.XGBClassifier(colsample_bytree=0.97, max_depth=10, n_estimators=90, subsample=0.97,
                          tree_method = 'gpu_hist', predictor = 'gpu_predictor')
#gb_clf = GradientBoostingClassifier(random_state=1488, max_depth=10, n_estimators=90, subsample=0.97)
cross_val_score(gb_clf, X, data.Label, cv=5, scoring='roc_auc' )

In [None]:
#from sklearn.ensemble import GradientBoostingClassifier
#gb_clf = GradientBoostingClassifier(random_state=1488)

param_grid = {
    'loss' : ['deviance', 'exponential'],
    'n_estimators': [20, 50, 100, 150, 200],
    'max_depth': [3, 5, 7]
}

print('Accuracy best params and score')
result = GridSearchCV(gb_clf, param_grid, cv=5, scoring='roc_auc').fit(X, data.Label)
print('\tParams:', result.best_params_)
print('\tScore:', result.best_score_)

In [None]:
cross_val_score(gb_clf, X, data.Label, cv=5, scoring='roc_auc' )

In [None]:
clf = RandomForestClassifier(criterion='entropy', class_weight={1: 5}, max_depth=25)

In [8]:
test = pd.read_csv('test.csv')
test.columns = ['Word']

test_data = feature_extraction(test)

In [9]:
X_test, _ = transform_data(test_data, transformer)

In [10]:
clf.fit(X, data.Label)

NameError: name 'clf' is not defined

In [None]:
predict = clf.predict(X_test)

In [None]:
test['Label'] = predict

In [None]:
result = pd.DataFrame()
result['Prediction'] = test.Label
result['Id'] = test.index

In [None]:
result.to_csv('result19.02_xgb.csv', index=False)