In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
import re
import pymorphy2

pd.set_option.max_columns = None
pd.set_option.max_rows = None

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [3]:
train_df = pd.read_csv('train.csv')

In [4]:
test_df = pd.read_csv('test.csv')

In [5]:
# подготовка данных для ускорения вычислений

# вычисляем окончания фамилий из 2-х и 3-х букв, которые встречаются более одного раза
end2 = [word[-2:].lower() for word in train_df[train_df['Label'] == 1]['Word'] if len(word) > 3]
end2counts = pd.DataFrame(end2)[0].value_counts()
end2unic = end2counts[end2counts.values>1]

end3 = [word[-3:].lower() for word in train_df[train_df['Label'] == 1]['Word'] if len(word) > 4]
end3counts = pd.DataFrame(end3)[0].value_counts()
end3unic = end3counts[end3counts.values>1]

# создаём словари
train_all_word = {'zzz':0}
for i, row in train_df.iterrows():
    train_all_word[row[0]] = row[1]
test_all_word = {'zzz':0}
for i, row in test_df.iterrows():
    test_all_word[row[0]] = 0

In [6]:
# pymorphy2 для вычисления фамилий, которые он узнаёт
morph = pymorphy2.MorphAnalyzer()

In [7]:
# всё слово в нижнем регистре как признак
train_df['lower'] = train_df['Word'].str.lower()
train_df['isfamily'] = [True in ['Surn' in v.tag for v in morph.parse(word)] for word in train_df['Word']]
# удаляем небуквенные символы
train_df['Word'] = train_df['Word'].str.replace('-', '')
train_df['Word'] = train_df['Word'].str.replace(' ', '')
# регистр слова
train_df['istitle'] = train_df['Word'].str.istitle()
train_df['islower'] = train_df['Word'].str.islower()
train_df['isupper'] = train_df['Word'].str.isupper()
# падежные окончания наиболее распространнённых русских фамилий в муж. и жен. роде и мн. числе
train_df['islast1'] = train_df['Word'].str[-1:].isin(['а', 'е', 'у', 'и', 'ы'])
train_df['islast2'] = train_df['Word'].str[-2:].isin(['ым', 'ой', 'ых', 'ом', 'ов', 'ам', 'ах'])
train_df['islast3'] = train_df['Word'].str[-3:].isin(['ыми', 'ами'])
# окончание входит в число часто встречающихся окончаний фамилий
train_df['is2end'] = [(word[-2:].lower() in end2unic) for word in train_df['Word']]
train_df['is3end'] = [(word[-3:].lower() in end3unic) for word in train_df['Word']]
# окончания как признаки
train_df['1end'] = train_df['Word'].str[-1:].str.lower()
train_df['2end'] = train_df['Word'].str[-2:].str.lower()
train_df['3end'] = train_df['Word'].str[-3:].str.lower()
# основы без окончаний
train_df['without1end'] = train_df['Word'].str[:-1].str.lower()
train_df['without2end'] = train_df['Word'].str[:-2].str.lower()
train_df['without3end'] = train_df['Word'].str[:-3].str.lower()
# признаки от Валентины Бирюковой
train_df['Length'] = train_df['Word'].str.len()
train_df['Vow'] = train_df['Word'].apply(lambda x: len(re.findall('[уеыаоэяию]', x, re.IGNORECASE)))
train_df['Con'] = train_df['Length'] - train_df['Vow'] 
# слово встречается в test или в train в нижнем регистре
train_df['same_in_lower'] =  [(word.lower() in train_all_word or word.lower() in test_all_word) for word in train_df['Word']]

In [8]:
test_df['lower'] = test_df['Word'].str.lower()
test_df['isfamily'] = [True in ['Surn' in v.tag for v in morph.parse(word)] for word in test_df['Word']]
test_df['Word'] = test_df['Word'].str.replace('-', '')
test_df['Word'] = test_df['Word'].str.replace(' ', '')
test_df['istitle'] = test_df['Word'].str.istitle()
test_df['islower'] = test_df['Word'].str.islower()
test_df['isupper'] = test_df['Word'].str.isupper()
test_df['islast1'] = test_df['Word'].str[-1:].isin(['а', 'е', 'у', 'и', 'ы'])
test_df['islast2'] = test_df['Word'].str[-1:].isin(['ым', 'ой', 'ых', 'ом', 'ов', 'ам', 'ах'])
test_df['islast3'] = test_df['Word'].str[-1:].isin(['ыми', 'ами'])
test_df['is2end'] = [(word[-2:].lower() in end2unic) for word in test_df['Word']]
test_df['is3end'] = [(word[-3:].lower() in end3unic) for word in test_df['Word']]
test_df['1end'] = test_df['Word'].str[-1:].str.lower()
test_df['2end'] = test_df['Word'].str[-2:].str.lower()
test_df['3end'] = test_df['Word'].str[-3:].str.lower()
test_df['without1end'] = test_df['Word'].str[:-1].str.lower()
test_df['without2end'] = test_df['Word'].str[:-2].str.lower()
test_df['without3end'] = test_df['Word'].str[:-3].str.lower()
test_df['Length'] = test_df['Word'].str.len()
test_df['Vow'] = test_df['Word'].apply(lambda x: len(re.findall('[уеыаоэяию]', x, re.IGNORECASE)))
test_df['Con'] = test_df['Length'] - test_df['Vow'] 
test_df['same_in_lower'] =  [(word.lower() in train_all_word or word.lower() in test_all_word) for word in test_df['Word']]

In [9]:
# tfidf от Кирилла Тушина
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
all_data = pd.concat([train , test])
bigram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1,10), max_features = 100 , encoding='utf8')
for_tf_idf = bigram_vectorizer.fit_transform(all_data.Word.apply(str.lower).tolist())
tfidf_sparse = TfidfTransformer().fit_transform(for_tf_idf)
tfidf = pd.DataFrame(tfidf_sparse.toarray() , index=all_data.index, columns=['tf_idf_' + str(i) for i in range(tfidf_sparse.shape[1])])

In [10]:
train_df = pd.concat([train_df, tfidf.iloc[train.index]], axis=1)
test_df = pd.concat([test_df, tfidf.iloc[len(train.index)+test.index]], axis=1)

In [11]:
train_df.head(10)

Unnamed: 0,Word,Label,lower,isfamily,istitle,islower,isupper,islast1,islast2,islast3,...,tf_idf_90,tf_idf_91,tf_idf_92,tf_idf_93,tf_idf_94,tf_idf_95,tf_idf_96,tf_idf_97,tf_idf_98,tf_idf_99
0,Аалтонен,1,аалтонен,False,True,False,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Аар,0,аар,False,True,False,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Аарон,0,аарон,True,True,False,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ААРОН,0,аарон,True,False,False,True,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Аарона,0,аарона,True,True,False,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Аарона,1,аарона,True,True,False,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Аароне,0,аароне,True,True,False,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Ааронов,0,ааронов,True,True,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Аахена,0,аахена,False,True,False,False,True,False,False,...,0.351067,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Абабков,1,абабков,True,True,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
test_df.head(10)

Unnamed: 0,Word,lower,isfamily,istitle,islower,isupper,islast1,islast2,islast3,is2end,...,tf_idf_90,tf_idf_91,tf_idf_92,tf_idf_93,tf_idf_94,tf_idf_95,tf_idf_96,tf_idf_97,tf_idf_98,tf_idf_99
0,Аалто,аалто,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ААР,аар,False,False,False,True,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Аара,аара,False,True,False,False,True,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Ааре,ааре,False,True,False,False,True,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Аарон,аарон,True,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Аароне,аароне,True,True,False,False,True,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Ааронов,ааронов,True,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Аароном,аароном,True,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Аароном,аароном,True,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Аарону,аарону,True,True,False,False,True,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
y = train_df['Label']
X = train_df.drop('Label', axis=1)
X = X.drop('Word', axis=1)

In [14]:
test_df = test_df.drop('Word', axis=1)

In [15]:
cat_features = np.where(X.dtypes == 'object')[0].tolist()
print("\n".join(X.columns[cat_features]))

lower
1end
2end
3end
without1end
without2end
without3end


In [16]:
ctb = CatBoostClassifier(random_seed=17, iterations=4000, loss_function='CrossEntropy', eval_metric='AUC')
ctb.fit(X, y, cat_features=cat_features);

In [17]:
pd.DataFrame(X.columns, ctb.feature_importances_).sort_index(ascending=False)

Unnamed: 0,0
38.687142,istitle
22.387268,islower
3.633440,same_in_lower
3.585233,lower
3.450394,without1end
3.064164,Vow
2.063866,2end
1.709094,is3end
1.679409,3end
1.565601,isfamily


In [18]:
result = ctb.predict_proba(test_df)

In [19]:
result = pd.DataFrame(result[:,1].astype(float),columns=['Prediction'])
result.to_csv('result_catboost12_prob.csv', index_label='Id'  )
dfa = pd.read_csv('test.csv', sep=',')
dfa['Prediction'] = result['Prediction'] 
dfa.to_csv('result_catboost_words12_prob.csv', index_label='Id')

In [20]:
test_a = pd.read_csv('test.csv', sep=',')
for i, row in test_a.iterrows():
    # если в test подряд идёт два одинаковых слова, то первому ставим 0, второму 1
    if i > 0 and test_a.iloc[i-1, 0] == row[0]:
        result.iloc[i-1] = 0
        result.iloc[i] = 1
    else:
        # если слово из test есть в train, то ставим ему противоположное значение
        if row[0] in train_all_word:
            if train_all_word[row[0]] == 0:
                result.iloc[i] = 1
            else:
                result.iloc[i] = 0

In [21]:
result.to_csv('result_catboost12_prob+post_step.csv', index_label='Id')
dfa['Prediction'] = result['Prediction'] 
dfa.to_csv('result_catboost_words12_prob+post_step.csv', index_label='Id')

In [None]:
df1 = pd.read_csv('result_catboost15_prob+post_step.csv', sep=',')
df2 = pd.read_csv('result_catboost14_prob+post_step.csv', sep=',')
df3 = pd.read_csv('result_catboost13_prob+post_step.csv', sep=',')
df4 = pd.read_csv('result_catboost12_prob+post_step.csv', sep=',')
df5 = pd.read_csv('result_catboost11_prob+post_step.csv', sep=',')
df6 = pd.read_csv('submission_xgb_post_proc_1.csv', sep=',')

result_catboost12_prob+post_step.csv	LB: 0.97259<br>
result_catboost11_prob+post_step.csv	LB: 0.97212<br>
result_catboost14_prob+post_step.csv	LB: 0.95288<br>
result_catboost13_prob+post_step.csv	LB: 0.95207<br>
submission_xgb_post_proc_1.csv 	LB: 0.94748<br>
result_catboost15_prob+post_step.csv	LB: 0.94554<br>

result_average.csv LB: 0.97285<br>
result_average2.csv LB: 0.97291 - эту посылку я не делал (ранее убедился, что усреднение по двум значениям не лучшая идея, но в данном случае оно даёт чуть лучший результат)

In [None]:
aver = (df1.Prediction+df2.Prediction+df3.Prediction+df4.Prediction+df5.Prediction+df6.Prediction)/6

In [None]:
dfa = pd.read_csv('sample_submission.csv', sep=',')
dfa['Prediction'] = aver
dfa.to_csv('result_average.csv', index=False)

In [None]:
aver = (df4.Prediction+df5.Prediction)/2

In [None]:
dfa = pd.read_csv('sample_submission.csv', sep=',')
dfa['Prediction'] = aver
dfa.to_csv('result_average2.csv', index=False)