In [1]:
import pandas as pd
import matplotlib.pyplot as plt, matplotlib.image as mpimg
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import make_scorer, roc_auc_score
import pymorphy2
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer as tfidfv

import xgboost as xgb
%matplotlib inline

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
morph = pymorphy2.MorphAnalyzer()

In [4]:
def transform_data(df):
    df['word'] = df.Word.apply(lambda i : morph.parse(i.lower())[0][2])
    df['len'] = df.Word.apply(lambda i : len(i))

    df['POS'] = df.Word.apply(lambda i : str(morph.parse(i)[0].tag.POS))
    le = preprocessing.LabelEncoder()
    le = le.fit(df.POS)
    df.POS = le.transform(df.POS)

    vow = 'уеэоаыяию'
    conson = 'йцкнгшщзхъждлрпвфчсмтьб'

    def vowels(word):
        return sum(word.count(v) for v in vow + vow.upper())

    df['vowel'] = df.Word.apply(lambda i: vowels(i))

    def consonants(word):
        return sum(word.count(v) for v in conson + conson.upper())

    df['consonant'] = df.Word.apply(lambda i: consonants(i))

    def extra_c(word):
        for c in word:
            if not c.isalpha():
                return 0
        return 1
    
    df['extra_c'] = df.Word.apply(lambda i: extra_c(i))
    
    for char in vow + conson:
        df[char] = df.word.apply(lambda i: sum(1 if c == char else 0 for c in i))
    
    line = list(df.word.apply(lambda i: i[-3:]))
    vectorizer = tfidfv(min_df=1)
    X = vectorizer.fit_transform(line)
    d = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
    idf = list(d[w] if w in d else -1 for w in df.word)
    df['idf'] = idf
    
    return df

train_data = transform_data(train_data)
test_data = transform_data(test_data)

In [5]:
train_data.head()

Unnamed: 0,Word,Label,word,len,POS,vowel,consonant,extra_c,у,е,...,п,в,ф,ч,с,м,т,ь,б,idf
0,Аалтонен,1,аалтонный,8,1,4,4,1,0,0,...,0,0,0,0,0,0,1,0,0,-1.0
1,Аар,0,аар,3,11,2,1,1,0,0,...,0,0,0,0,0,0,0,0,0,11.83377
2,Аарон,0,аарон,5,8,3,2,1,0,0,...,0,0,0,0,0,0,0,0,0,-1.0
3,ААРОН,0,аарон,5,8,3,2,1,0,0,...,0,0,0,0,0,0,0,0,0,-1.0
4,Аарона,0,аарон,6,8,4,2,1,0,0,...,0,0,0,0,0,0,0,0,0,-1.0


In [6]:
test_data.head()

Unnamed: 0,Word,word,len,POS,vowel,consonant,extra_c,у,е,э,...,п,в,ф,ч,с,м,т,ь,б,idf
0,Аалто,аалтый,5,1,3,2,1,0,0,0,...,0,0,0,0,0,0,1,0,0,-1.0
1,ААР,аар,3,11,2,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,11.357325
2,Аара,аар,4,8,3,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,11.357325
3,Ааре,ааре,4,8,3,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,-1.0
4,Аарон,аарон,5,8,3,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,-1.0


In [7]:
X_all = train_data.drop(['Word', 'word', 'Label'], axis=1)
y_all = train_data['Label']

num_test = 0.2

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, 
                                                  test_size=num_test, 
                                                  random_state=42)

In [12]:
xgb_train = xgb.DMatrix(X_train, label=y_train)
xgb_test = xgb.DMatrix(X_test, label=y_test)

In [13]:
params = {'eval_metric' : 'auc'}
model = xgb.train(params, xgb_train)

In [14]:
roc_auc_score(y_test, model.predict(xgb_test))

0.7859141942695943

In [18]:
xgb_req = xgb.DMatrix(test_data.drop(['Word', 'word'], axis=1))
predictions = model.predict(xgb_req)

In [22]:
data = {'Id' : list(range(len(predictions))), 'Prediction' : predictions}
result= pd.DataFrame(data)

In [23]:
result.head()

Unnamed: 0,Id,Prediction
0,0,0.200425
1,1,0.284759
2,2,0.149665
3,3,0.159611
4,4,0.180168


In [24]:
result.to_csv('result.csv', columns=['Id', 'Prediction'], index=False)

In [75]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

In [77]:
%%time

clf_ = xgb.XGBClassifier()

params = {
    'max_depth' : [3, 5, 7],
    'n_estimators' : [7, 10, 15]
}

scorer = make_scorer(roc_auc_score)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 27.9 µs


In [78]:
grid_obj = GridSearchCV(clf_, params, scoring=scorer)

In [79]:
grid_obj.fit(X_train, y_train)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


GridSearchCV(cv=None, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [3, 5, 7], 'n_estimators': [7, 10, 15]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=0)

In [81]:
clf = grid_obj.best_estimator_
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=15,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [82]:
predixtions = clf.predict(X_test)
print(roc_auc_score(y_test, predixtions))

0.527964833133968


  if diff:


In [85]:
predict = clf.predict(test_data.drop(['Word', 'word'], axis=1))

  if diff:


In [84]:
test_data.head()

Unnamed: 0,Word,word,len,POS,vowel,consonant,idf,extra_c,у,е,...,р,п,в,ф,ч,с,м,т,ь,б
0,Аалто,аалтый,5,1,3,2,12.455937,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,ААР,аар,3,11,2,1,12.050472,1,0,0,...,1,0,0,0,0,0,0,0,0,0
2,Аара,аар,4,8,3,1,12.050472,1,0,0,...,1,0,0,0,0,0,0,0,0,0
3,Ааре,ааре,4,8,3,1,12.455937,1,0,1,...,1,0,0,0,0,0,0,0,0,0
4,Аарон,аарон,5,8,3,2,11.069643,1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [86]:
predict

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
pd.DataFrame.to_csv(predict, 'output.csv')