In [412]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import pipeline
from sklearn.ensemble import RandomForestClassifier

In [327]:
# Load the labelled training set: one text token and its isSecondName label per row.
# The file is read without a header row, so the column names are supplied here.
df = pd.read_csv(
    './secondname/linear_train.txt',
    header=None,
    names=['text', 'isSecondName'],
)

In [328]:
# Work on an independent copy so the raw frame `df` is never modified.
X = df.copy(deep=True)

In [329]:
# Detach the target column: `pop` removes it from X in place and returns it.
# Note: re-running this cell without re-creating X raises KeyError.
y = X.pop(item='isSecondName')

In [330]:
# Split the data into train and hold-out test parts (rows are shuffled by default).
# random_state pins the shuffle so the split -- and every score computed from it --
# is reproducible across kernel restarts; stratify=y keeps the isSecondName class
# ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [943]:
# Build a pipeline: TF-IDF vectorizer followed by an XGBoost classifier.
#   ngram_range=(min, max) - split words into pieces of length min..max
#   analyzer='char_wb'     - vectorize by characters inside each word
#   min_df                 - minimum allowed count (or frequency) of an n-gram
#                            in the vocabulary
#   max_df                 - same, but the upper bound
# The concrete numbers are arbitrary picks, not tuned values.
ppln = pipeline.make_pipeline(
    TfidfVectorizer(
        analyzer='char_wb',
        ngram_range=(3, 3),
        lowercase=True,
        min_df=5,
        max_df=0.8,
    ),
    XGBClassifier(
        n_estimators=500,
        max_depth=5,
    ),
)

In [944]:
# Compute ROC AUC with 5-fold cross-validation on the training part only.
cross_val_score(
    estimator=ppln,
    X=X_train['text'],
    y=y_train,
    scoring='roc_auc',
    cv=5,
)

array([0.88609515, 0.88457931, 0.88551177, 0.88996341, 0.8900861 ])

In [945]:
# Fit the whole pipeline (vectorizer + classifier) on the full training part.
ppln.fit(X_train['text'], y_train)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='char_wb', max_df=0.8, min_df=5,
                                 ngram_range=(3, 3))),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=5, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=500,
                               n_jobs=0, num_parallel_tree=1, random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               subsample=1, tree_method='exact',
                            

In [946]:
# Pipeline.score delegates to the final estimator's own `score` on the hold-out part
# (presumably mean accuracy, the sklearn classifier default -- confirm for XGBClassifier).
ppln.score(X_test['text'], y_test)

0.9220968759861154

In [947]:
# Keep only the second predict_proba column
# (presumably the probability of isSecondName == 1 -- confirm via ppln.classes_).
pred = ppln.predict_proba(X_test['text'])[:, 1]

In [948]:
# ROC AUC of the hold-out probabilities against the true labels.
roc_auc_score(y_true=y_test, y_score=pred)

0.8877969258550228

In [949]:
# Load the unlabelled set to be scored; the column name is supplied here
# because the file is read without a header row.
X_test_final = pd.read_csv(
    './secondname/linear_test.txt',
    header=None,
    names=['text'],
)

In [950]:
# Peek at the first five rows to sanity-check the load.
X_test_final.head(n=5)

Unnamed: 0,text
0,Аалто
1,ААР
2,Аара
3,Ааре
4,Аарон


In [951]:
# Score the unlabelled set, keeping only the second predict_proba column.
# Note: this rebinds `pred`, discarding the hold-out predictions computed earlier.
pred = ppln.predict_proba(X_test_final['text'])[:, 1]

In [952]:
# Display the predicted probabilities for the unlabelled set.
pred

array([0.11942149, 0.10362686, 0.12305202, ..., 0.06026328, 0.01258462,
       0.00270013], dtype=float32)

In [953]:
# Wrap the probabilities in a one-column frame named 'Answer', cast to float.
answ = pd.DataFrame({'Answer': pred}, dtype=float)

In [954]:
# Label the row index 'Id' so it is written as a named column in the CSV.
answ.index = answ.index.rename('Id')

In [955]:
# Write the answer file; the index is kept on purpose so each row has an Id.
answ.to_csv('answer.csv', index=True)

In [956]:
# `Answer` holds predict_proba output (floats), so exact equality with 1 would
# match almost no rows and leave positive.csv empty. Select the rows whose
# predicted probability is above the 0.5 decision threshold instead.
# The boolean mask is applied positionally: answ was built from predictions on
# X_test_final, so both have the same row order.
X_test_final[(answ['Answer'] > 0.5).to_numpy()].to_csv('positive.csv')

In [957]:
# `Answer` holds predict_proba output (floats), so exact equality with 0 would
# match almost no rows and leave negative.csv empty. Select the rows whose
# predicted probability is at or below the 0.5 decision threshold instead.
# The boolean mask is applied positionally: answ was built from predictions on
# X_test_final, so both have the same row order.
X_test_final[(answ['Answer'] <= 0.5).to_numpy()].to_csv('negative.csv')