In [1]:
import pandas as pd
import numpy as np
import re
import string
import warnings
import ast
import xgboost
import pymorphy2
import nltk

from collections import Counter
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from mlxtend.classifier import StackingClassifier
from sklearn import model_selection
from sklearn.model_selection import StratifiedKFold

tqdm.pandas()

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)

warnings.simplefilter('ignore')

# Load an preprocessing data

## Load some preprocessing data (with "tokens_text" and	"part_of_speech" columns)

In [2]:
tmp_tmp = pd.read_csv('tex_data_part_of_speech.csv', index_col='ID')
tmp_tmp.head()

Unnamed: 0_level_0,Question,tokens_text,part_of_speech
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Как зовут лодочника на реке Стикс в древнегреческой мифологии?,"['Как', 'зовут', 'лодочника', 'реке', 'Стикс', 'древнегреческой', 'мифологии', '?']","['INTJ', 'NOUN', 'NOUN', 'CONJ', 'ADP', 'ADP', 'NOUN', 'PUNCT']"
2,Как в химии обозначается свинец?,"['Как', 'химии', 'обозначается', 'свинец', '?']","['INTJ', 'CONJ', 'INTJ', 'ADP', 'PUNCT']"
3,Какой химический элемент преобладает в составе каменного угля?,"['Какой', 'химический', 'элемент', 'преобладает', 'составе', 'каменного', 'угля', '?']","['INTJ', 'CONJ', 'CONJ', 'CONJ', 'ADP', 'ADP', 'ADP', 'PUNCT']"
4,Кто из перечисленных был пажом во времена Екатерины II?,"['Кто', 'перечисленных', 'пажом', 'времена', 'Екатерины', 'II', '?']","['ADP', 'CONJ', 'INTJ', 'ADP', 'INTJ', 'CONJ', 'PUNCT']"
5,Когда началась 2 мировая война?,"['Когда', 'началась', '2', 'мировая', 'война', '?']","['ADP', 'DET', 'NUM', 'PART', 'ADP', 'PUNCT']"


## Separation data on parts of speech for new features 

In [3]:
morph = pymorphy2.MorphAnalyzer()

def text_tag_pos(text):
    lst = []
    lst_normal_form = []
    tokens = nltk.wordpunct_tokenize(text)
    for i in tokens:
        p = morph.parse(i)[0]
        lst.append(p.tag.POS)
        lst_normal_form.append(p.normal_form)
    return Counter(lst)

In [4]:
tmp_tmp['part_of_speech'] = tmp_tmp['Question'].progress_apply(text_tag_pos)

100%|███████████████████████████████████████████████████████████████████████████| 41086/41086 [00:57<00:00, 709.11it/s]


In [5]:
tmp_tmp.head(2)

Unnamed: 0_level_0,Question,tokens_text,part_of_speech
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Как зовут лодочника на реке Стикс в древнегреческой мифологии?,"['Как', 'зовут', 'лодочника', 'реке', 'Стикс', 'древнегреческой', 'мифологии', '?']","{'CONJ': 1, 'VERB': 1, 'NOUN': 4, 'PREP': 2, 'ADJF': 1, None: 1}"
2,Как в химии обозначается свинец?,"['Как', 'химии', 'обозначается', 'свинец', '?']","{'CONJ': 1, 'PREP': 1, 'NOUN': 2, 'VERB': 1, None: 1}"


In [6]:
def check_pos_tag(x: dict, key: str) -> int:
    if key in x:
        return x[key]
    return 0

In [7]:
tmp_tmp['NOUN'] = tmp_tmp['part_of_speech'].apply(lambda x: check_pos_tag(x, 'NOUN'))
tmp_tmp['ADJF'] = tmp_tmp['part_of_speech'].apply(lambda x: check_pos_tag(x, 'ADJF'))
tmp_tmp['ADJS'] = tmp_tmp['part_of_speech'].apply(lambda x: check_pos_tag(x, 'ADJS'))
tmp_tmp['COMP'] = tmp_tmp['part_of_speech'].apply(lambda x: check_pos_tag(x, 'COMP'))
tmp_tmp['VERB'] = tmp_tmp['part_of_speech'].apply(lambda x: check_pos_tag(x, 'VERB'))
tmp_tmp['INFN'] = tmp_tmp['part_of_speech'].apply(lambda x: check_pos_tag(x, 'INFN'))
tmp_tmp['PRTF'] = tmp_tmp['part_of_speech'].apply(lambda x: check_pos_tag(x, 'NPRTF'))
tmp_tmp['PRTS'] = tmp_tmp['part_of_speech'].apply(lambda x: check_pos_tag(x, 'PRTS'))
tmp_tmp['GRND'] = tmp_tmp['part_of_speech'].apply(lambda x: check_pos_tag(x, 'GRND'))
tmp_tmp['NUMR'] = tmp_tmp['part_of_speech'].apply(lambda x: check_pos_tag(x, 'NUMR'))
tmp_tmp['ADVB'] = tmp_tmp['part_of_speech'].apply(lambda x: check_pos_tag(x, 'ADVB'))
tmp_tmp['NPRO'] = tmp_tmp['part_of_speech'].apply(lambda x: check_pos_tag(x, 'NPRO'))
tmp_tmp['PRED'] = tmp_tmp['part_of_speech'].apply(lambda x: check_pos_tag(x, 'PRED'))
tmp_tmp['PREP'] = tmp_tmp['part_of_speech'].apply(lambda x: check_pos_tag(x, 'PREP'))
tmp_tmp['CONJ'] = tmp_tmp['part_of_speech'].apply(lambda x: check_pos_tag(x, 'CONJ'))
tmp_tmp['PRCL'] = tmp_tmp['part_of_speech'].apply(lambda x: check_pos_tag(x, 'PRCL'))
tmp_tmp['INTJ'] = tmp_tmp['part_of_speech'].apply(lambda x: check_pos_tag(x, 'INTJ'))
tmp_tmp['None'] = tmp_tmp['part_of_speech'].apply(lambda x: check_pos_tag(x, None))

In [8]:
tmp_tmp.head(2)

Unnamed: 0_level_0,Question,tokens_text,part_of_speech,NOUN,ADJF,ADJS,COMP,VERB,INFN,PRTF,PRTS,GRND,NUMR,ADVB,NPRO,PRED,PREP,CONJ,PRCL,INTJ,None
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Как зовут лодочника на реке Стикс в древнегреческой мифологии?,"['Как', 'зовут', 'лодочника', 'реке', 'Стикс', 'древнегреческой', 'мифологии', '?']","{'CONJ': 1, 'VERB': 1, 'NOUN': 4, 'PREP': 2, 'ADJF': 1, None: 1}",4,1,0,0,1,0,0,0,0,0,0,0,0,2,1,0,0,1
2,Как в химии обозначается свинец?,"['Как', 'химии', 'обозначается', 'свинец', '?']","{'CONJ': 1, 'PREP': 1, 'NOUN': 2, 'VERB': 1, None: 1}",2,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1


In [9]:
tmp_tmp.shape

(41086, 21)

## Load second part of data

In [10]:
text_data = pd.read_csv('data.csv', comment='#', sep=';', index_col='ID')
text_data.head()

Unnamed: 0_level_0,Question
ID,Unnamed: 1_level_1
1,Как зовут лодочника на реке Стикс в древнегреческой мифологии?
2,Как в химии обозначается свинец?
3,Какой химический элемент преобладает в составе каменного угля?
4,Кто из перечисленных был пажом во времена Екатерины II?
5,Когда началась 2 мировая война?


In [11]:
text_data.tail(2)

Unnamed: 0_level_0,Question
ID,Unnamed: 1_level_1
41086,Сколько раз магнитогорский металлург становился обладателем кубка гагарина
41087,Какая численность людей в 2018 году?


In [12]:
text_data.shape

(41087, 1)

In [13]:
test_id = pd.read_csv('test.csv')
test_id.head()

Unnamed: 0,ID
0,30001
1,30002
2,30003
3,30004
4,30005


In [14]:
test_id.shape

(11087, 1)

In [15]:
train_id_answer = pd.read_csv('train.csv', sep=';', index_col='ID')
train_id_answer.head()

Unnamed: 0_level_0,Answer
ID,Unnamed: 1_level_1
1,0
2,1
3,0
4,0
5,0


In [16]:
train_id_answer.shape

(30000, 1)

In [17]:
test = text_data.iloc[30000:]

In [18]:
test.shape

(11087, 1)

In [19]:
test.head()

Unnamed: 0_level_0,Question
ID,Unnamed: 1_level_1
30001,"Что в литературе обозначает слово ""эпитет""?"
30002,Когда отмечается ДЕНЬ ШОКОЛАДА
30003,Кто из этих бодибилдеров играл Халка
30004,"Последняя на данный момент страна, официально происоединившаяся к Европейскому союзу"
30005,Как зовут лютоволка Арьи Старк из сериала «Игра престолов»?


In [20]:
train = text_data.iloc[:30000]

In [21]:
train.shape

(30000, 1)

In [22]:
train.head()

Unnamed: 0_level_0,Question
ID,Unnamed: 1_level_1
1,Как зовут лодочника на реке Стикс в древнегреческой мифологии?
2,Как в химии обозначается свинец?
3,Какой химический элемент преобладает в составе каменного угля?
4,Кто из перечисленных был пажом во времена Екатерины II?
5,Когда началась 2 мировая война?


In [23]:
train.tail(2)

Unnamed: 0_level_0,Question
ID,Unnamed: 1_level_1
29999,Сколько лет пролежал на печи известный богатырь Илья Муромец?
30000,В каком году основали Московский Государственный Университет имени М.В. Ломоносова?


In [24]:
train = pd.concat([train, train_id_answer['Answer']], axis=1)
train.head()

Unnamed: 0_level_0,Question,Answer
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Как зовут лодочника на реке Стикс в древнегреческой мифологии?,0
2,Как в химии обозначается свинец?,1
3,Какой химический элемент преобладает в составе каменного угля?,0
4,Кто из перечисленных был пажом во времена Екатерины II?,0
5,Когда началась 2 мировая война?,0


In [25]:
train.shape

(30000, 2)

In [26]:
train.dropna(inplace=True)

In [27]:
train.shape

(29999, 2)

# Feature engineering with second part of data

In [28]:
def comp(x):
    if x == ' ' or x == '.' or x == '!' or '?' or '.' or '"' or '»':
        return 0
    return 1

In [29]:
rus_quest_words = ['Кто', 'Что', 'Чем', 'Как', 'Где', 'Какое', 'Какой', 'Сколько',
                  'Когда', 'Чему', 'Какого', 'Какая', 'Кем', 'Какова', 'В', 'К', 'Во', 'Из',
                 'На', 'У', 'Кого', 'За']

In [30]:
def first_quest_word(x: str) -> str:
    x = x.split()[0]
    if x in rus_quest_words:
        return 1
    return 0

In [31]:
def data_preprocessing(df: pd.DataFrame) -> pd.DataFrame:
    regexp = re.compile(u'[A-ZА-ЯЁ]{3}')

    df['Last_symbol'] = df['Question'].apply(lambda x: x[-1])
    df['is_HE_in_question'] = df['Question'].apply(lambda x: 1 if re.search(' НЕ ', x) else 0)
    df['is_CAPS_in_question'] = df['Question'].apply(lambda x: 1 if re.search(regexp, x) else 0)
    df['is_HET_in_question'] = df['Question'].apply(lambda x: 1 if re.search(' НЕТ ', x) else 0)
    df['is_«_in_question'] = df['Question'].apply(lambda x: 1 if re.search('«', x) else 0)
    df['is_"_in_question'] = df['Question'].apply(lambda x: 1 if re.search('"', x) else 0)
    df['is_ё_in_question'] = df['Question'].apply(lambda x: 1 if re.search(u'ё', x) else 0)
    df['is_first_word_CAPS_in_question'] = df['Question'].apply(lambda x: 1 if x[0].isupper() == True else 0)
    df['is_)_in_question'] = df['Question'].apply(lambda x: 1 if re.search('\)', x) else 0)
    df['is_..._in_question'] = df['Question'].apply(lambda x: 1 if re.search(r'(\w+)\.{3,}', x) else 0)
    df['is_,_in_question'] = df['Question'].apply(lambda x: 1 if re.search(',', x) else 0)
    df['count_of_,_in_question'] = df['Question'].apply(lambda x: 1 if x.count(',') > 1 else 0)
    df['is_>_in_question'] = df['Question'].apply(lambda x: 1 if re.search('>', x) else 0)
    df['is_!_in_question'] = df['Question'].apply(lambda x: 1 if re.search('!', x) else 0)
    df['is_\n_in_question'] = df['Question'].apply(lambda x: 1 if re.search('\\n', x) else 0)
    df['is_.>=1_in_question'] = df['Question'].apply(lambda x: 1 if re.search(r'.{1,}', x) else 0)
    df['is_.__in_question'] = df['Question'].apply(lambda x: 1 if re.search(r'. ', x) else 0)
    df['is_(){}[]_in_question'] = df['Question'].apply(lambda x: 1 if re.search(r"[^(){}[\]]+", x) else 0)
    df['is__?_in_question'] = df['Question'].apply(lambda x: 1 if re.search(r' \?', x) else 0)
    df['is_.-_in_question'] = df['Question'].apply(lambda x: 1 if re.search(r'.-', x) else 0)
    df['Last_symbol_new'] = df['Last_symbol'].apply(comp)
    df['first_quest_word'] = df['Question'].apply(lambda x: 1 if x.split()[0] in rus_quest_words else 0)
    df['Last_symbol'] = df['Last_symbol'].apply(lambda x: 1 if x == '?' else 0)
    
    df['char_count'] = df['Question'].apply(len)
    df['word_count'] = df['Question'].apply(lambda x: len(x.split()))
    df['word_density'] = df['char_count'] / (df['word_count']+1)
    df['punctuation_count'] = df['Question'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
    df['title_word_count'] = df['Question'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
    df['upper_case_word_count'] = df['Question'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))
    
    return df

In [32]:
train = data_preprocessing(train)
train.head(2)

Unnamed: 0_level_0,Question,Answer,Last_symbol,is_HE_in_question,is_CAPS_in_question,is_HET_in_question,is_«_in_question,"is_""_in_question",is_ё_in_question,is_first_word_CAPS_in_question,is_)_in_question,is_..._in_question,"is_,_in_question","count_of_,_in_question",is_>_in_question,is_!_in_question,is_ _in_question,is_.>=1_in_question,is_.__in_question,is_(){}[]_in_question,is__?_in_question,is_.-_in_question,Last_symbol_new,first_quest_word,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
1,Как зовут лодочника на реке Стикс в древнегреческой мифологии?,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,62,9,6.2,1,2,0
2,Как в химии обозначается свинец?,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,32,5,5.333333,1,1,0


In [33]:
test = data_preprocessing(test)
test.head(2)

Unnamed: 0_level_0,Question,Last_symbol,is_HE_in_question,is_CAPS_in_question,is_HET_in_question,is_«_in_question,"is_""_in_question",is_ё_in_question,is_first_word_CAPS_in_question,is_)_in_question,is_..._in_question,"is_,_in_question","count_of_,_in_question",is_>_in_question,is_!_in_question,is_ _in_question,is_.>=1_in_question,is_.__in_question,is_(){}[]_in_question,is__?_in_question,is_.-_in_question,Last_symbol_new,first_quest_word,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
30001,"Что в литературе обозначает слово ""эпитет""?",1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,43,6,6.142857,3,1,0
30002,Когда отмечается ДЕНЬ ШОКОЛАДА,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,30,4,6.0,0,1,2


In [34]:
train.shape, test.shape

((29999, 30), (11087, 29))

# Target label

In [35]:
train_y_true = train['Answer']

# First and second parts of data concat

In [36]:
full_train = pd.concat([train, tmp_tmp.iloc[:29999]], axis=1)
full_train.head(2)

Unnamed: 0_level_0,Question,Answer,Last_symbol,is_HE_in_question,is_CAPS_in_question,is_HET_in_question,is_«_in_question,"is_""_in_question",is_ё_in_question,is_first_word_CAPS_in_question,is_)_in_question,is_..._in_question,"is_,_in_question","count_of_,_in_question",is_>_in_question,is_!_in_question,is_ _in_question,is_.>=1_in_question,is_.__in_question,is_(){}[]_in_question,is__?_in_question,is_.-_in_question,Last_symbol_new,first_quest_word,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Question,tokens_text,part_of_speech,NOUN,ADJF,ADJS,COMP,VERB,INFN,PRTF,PRTS,GRND,NUMR,ADVB,NPRO,PRED,PREP,CONJ,PRCL,INTJ,None
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
1,Как зовут лодочника на реке Стикс в древнегреческой мифологии?,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,62,9,6.2,1,2,0,Как зовут лодочника на реке Стикс в древнегреческой мифологии?,"['Как', 'зовут', 'лодочника', 'реке', 'Стикс', 'древнегреческой', 'мифологии', '?']","{'CONJ': 1, 'VERB': 1, 'NOUN': 4, 'PREP': 2, 'ADJF': 1, None: 1}",4,1,0,0,1,0,0,0,0,0,0,0,0,2,1,0,0,1
2,Как в химии обозначается свинец?,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,32,5,5.333333,1,1,0,Как в химии обозначается свинец?,"['Как', 'химии', 'обозначается', 'свинец', '?']","{'CONJ': 1, 'PREP': 1, 'NOUN': 2, 'VERB': 1, None: 1}",2,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1


In [37]:
full_train.shape

(29999, 51)

In [38]:
full_train.tail(2)

Unnamed: 0_level_0,Question,Answer,Last_symbol,is_HE_in_question,is_CAPS_in_question,is_HET_in_question,is_«_in_question,"is_""_in_question",is_ё_in_question,is_first_word_CAPS_in_question,is_)_in_question,is_..._in_question,"is_,_in_question","count_of_,_in_question",is_>_in_question,is_!_in_question,is_ _in_question,is_.>=1_in_question,is_.__in_question,is_(){}[]_in_question,is__?_in_question,is_.-_in_question,Last_symbol_new,first_quest_word,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Question,tokens_text,part_of_speech,NOUN,ADJF,ADJS,COMP,VERB,INFN,PRTF,PRTS,GRND,NUMR,ADVB,NPRO,PRED,PREP,CONJ,PRCL,INTJ,None
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
29999,Сколько лет пролежал на печи известный богатырь Илья Муромец?,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,61,9,6.1,1,3,0,Сколько лет пролежал на печи известный богатырь Илья Муромец?,"['Сколько', 'лет', 'пролежал', 'печи', 'известный', 'богатырь', 'Илья', 'Муромец', '?']","{'CONJ': 1, 'NOUN': 5, 'VERB': 1, 'PREP': 1, 'ADJF': 1, None: 1}",5,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1
30000,В каком году основали Московский Государственный Университет имени М.В. Ломоносова?,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,84,10,7.636364,3,6,2,В каком году основали Московский Государственный Университет имени М.В. Ломоносова?,"['В', 'каком', 'году', 'основали', 'Московский', 'Государственный', 'Университет', 'имени', 'М', '.', 'В', '.', 'Ломоносова', '?']","{'PREP': 2, 'ADJF': 3, 'NOUN': 5, 'VERB': 1, None: 3}",5,3,0,0,1,0,0,0,0,0,0,0,0,2,0,0,0,3


In [39]:
full_test = pd.concat([test, tmp_tmp.iloc[29999:]], axis=1)
full_test.head(2)

Unnamed: 0_level_0,Question,Last_symbol,is_HE_in_question,is_CAPS_in_question,is_HET_in_question,is_«_in_question,"is_""_in_question",is_ё_in_question,is_first_word_CAPS_in_question,is_)_in_question,is_..._in_question,"is_,_in_question","count_of_,_in_question",is_>_in_question,is_!_in_question,is_ _in_question,is_.>=1_in_question,is_.__in_question,is_(){}[]_in_question,is__?_in_question,is_.-_in_question,Last_symbol_new,first_quest_word,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Question,tokens_text,part_of_speech,NOUN,ADJF,ADJS,COMP,VERB,INFN,PRTF,PRTS,GRND,NUMR,ADVB,NPRO,PRED,PREP,CONJ,PRCL,INTJ,None
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
30001,"Что в литературе обозначает слово ""эпитет""?",1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,43,6,6.142857,3,1,0,"Что в литературе обозначает слово ""эпитет""?","['Что', 'литературе', 'обозначает', 'слово', '""', 'эпитет', '""?']","{'CONJ': 1, 'PREP': 1, 'NOUN': 3, 'VERB': 1, None: 2}",3,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,2
30002,Когда отмечается ДЕНЬ ШОКОЛАДА,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,30,4,6.0,0,1,2,Когда отмечается ДЕНЬ ШОКОЛАДА,"['Когда', 'отмечается', 'ДЕНЬ', 'ШОКОЛАДА']","{'CONJ': 1, 'VERB': 1, 'NOUN': 2}",2,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0


In [40]:
full_test.tail(2)

Unnamed: 0_level_0,Question,Last_symbol,is_HE_in_question,is_CAPS_in_question,is_HET_in_question,is_«_in_question,"is_""_in_question",is_ё_in_question,is_first_word_CAPS_in_question,is_)_in_question,is_..._in_question,"is_,_in_question","count_of_,_in_question",is_>_in_question,is_!_in_question,is_ _in_question,is_.>=1_in_question,is_.__in_question,is_(){}[]_in_question,is__?_in_question,is_.-_in_question,Last_symbol_new,first_quest_word,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Question,tokens_text,part_of_speech,NOUN,ADJF,ADJS,COMP,VERB,INFN,PRTF,PRTS,GRND,NUMR,ADVB,NPRO,PRED,PREP,CONJ,PRCL,INTJ,None
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
41086,Сколько раз магнитогорский металлург становился обладателем кубка гагарина,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,74,8,8.222222,0,1,0,Сколько раз магнитогорский металлург становился обладателем кубка гагарина,"['Сколько', 'магнитогорский', 'металлург', 'становился', 'обладателем', 'кубка', 'гагарина']","{'CONJ': 1, 'NOUN': 5, 'ADJF': 1, 'VERB': 1}",5,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
41087,Какая численность людей в 2018 году?,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,36,6,5.142857,1,1,0,Какая численность людей в 2018 году?,"['Какая', 'численность', 'людей', '2018', 'году', '?']","{'ADJF': 1, 'NOUN': 3, 'PREP': 1, None: 2}",3,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2


In [41]:
full_test.shape, full_train.shape

((11087, 50), (29999, 51))

# Data drop columns

In [42]:
full_train.drop('Answer', inplace=True, axis=1)

In [43]:
train_y_true.shape

(29999,)

In [44]:
train_full = full_train.drop(['Question', 'tokens_text', 'part_of_speech'], axis=1)
test_full = full_test.drop(['Question', 'tokens_text', 'part_of_speech'], axis=1)

In [45]:
train_full.head()

Unnamed: 0_level_0,Last_symbol,is_HE_in_question,is_CAPS_in_question,is_HET_in_question,is_«_in_question,"is_""_in_question",is_ё_in_question,is_first_word_CAPS_in_question,is_)_in_question,is_..._in_question,"is_,_in_question","count_of_,_in_question",is_>_in_question,is_!_in_question,is_ _in_question,is_.>=1_in_question,is_.__in_question,is_(){}[]_in_question,is__?_in_question,is_.-_in_question,Last_symbol_new,first_quest_word,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,NOUN,ADJF,ADJS,COMP,VERB,INFN,PRTF,PRTS,GRND,NUMR,ADVB,NPRO,PRED,PREP,CONJ,PRCL,INTJ,None
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1
1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,62,9,6.2,1,2,0,4,1,0,0,1,0,0,0,0,0,0,0,0,2,1,0,0,1
2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,32,5,5.333333,1,1,0,2,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1
3,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,62,8,6.888889,1,1,0,3,3,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1
4,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,55,9,5.5,1,2,1,3,1,0,0,1,0,0,0,0,0,0,1,0,2,0,0,0,2
5,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,31,5,5.166667,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,2


In [46]:
test_full.head()

Unnamed: 0_level_0,Last_symbol,is_HE_in_question,is_CAPS_in_question,is_HET_in_question,is_«_in_question,"is_""_in_question",is_ё_in_question,is_first_word_CAPS_in_question,is_)_in_question,is_..._in_question,"is_,_in_question","count_of_,_in_question",is_>_in_question,is_!_in_question,is_ _in_question,is_.>=1_in_question,is_.__in_question,is_(){}[]_in_question,is__?_in_question,is_.-_in_question,Last_symbol_new,first_quest_word,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,NOUN,ADJF,ADJS,COMP,VERB,INFN,PRTF,PRTS,GRND,NUMR,ADVB,NPRO,PRED,PREP,CONJ,PRCL,INTJ,None
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1
30001,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,43,6,6.142857,3,1,0,3,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,2
30002,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,30,4,6.0,0,1,2,2,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
30003,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,36,6,5.142857,0,2,0,2,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0
30004,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,1,1,0,0,0,0,84,10,7.636364,1,2,0,3,3,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,1
30005,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,60,9,6.0,1,4,0,6,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,2


In [47]:
full_test.shape, full_train.shape

((11087, 50), (29999, 50))

In [48]:
train_full = full_train.drop(['Question', 'tokens_text', 'part_of_speech'], axis=1)
test_full = full_test.drop(['Question', 'tokens_text', 'part_of_speech'], axis=1)

# Data stemming

In [49]:
import nltk
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from string import punctuation

nltk.download("stopwords")
russian_stopwords = stopwords.words("russian")

trash_symbols = ['«',  '"?', '»?', '?', '»', '\/', '...', '-', '(', ')', '>', 
                 '???', '?????', '??', '????', '..', '?(', ')?',  '..."', '>>',
                '<<', '?!', 'г',  'кг', '|||', '||', '|', '!»', '.?', ',,?',
                 '»,', '!»', '...»', '“', '”', '.>>?', ',,', '\'', '\n', '...?',
                 '?"', '.)', '".', '!!!!', '!!', '!!!', '....', ':"', '\/?',
                 '(-', ':?', '):']
def tokenize_text(text):
    tokens = nltk.wordpunct_tokenize(text)
    stemer = SnowballStemmer('russian')
    tokens = [stemer.stem(token) for token in tokens if token not in russian_stopwords\
              and token != " " and token.strip() not in punctuation and token not in trash_symbols]
    return ' '.join(tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [50]:
preproc_Question_train = train['Question'].progress_apply(tokenize_text)

100%|██████████████████████████████████████████████████████████████████████████| 29999/29999 [00:09<00:00, 3177.02it/s]


In [51]:
preproc_Question_train.head(2)

ID
1    как зовут лодочник рек стикс древнегреческ мифолог
2    как хим обознача свинец                           
Name: Question, dtype: object

In [52]:
preproc_Question_test = test['Question'].progress_apply(tokenize_text)

100%|██████████████████████████████████████████████████████████████████████████| 11087/11087 [00:03<00:00, 3178.77it/s]


In [53]:
preproc_Question_test.head(2)

ID
30001    что литератур обознача слов эпитет
30002    когд отмеча ден шоколад           
Name: Question, dtype: object

# CountVectorize

In [54]:
vectorizer = CountVectorizer(ngram_range=(1,1), analyzer='word', max_df=1.0, min_df=1)
X_train = vectorizer.fit_transform(preproc_Question_train )
X_test = vectorizer.transform(preproc_Question_test)

In [55]:
X_train.shape, X_test.shape

((29999, 18932), (11087, 18932))

In [56]:
ttrain = hstack((train_full.values, X_train))
ttest = hstack((test_full.values, X_test))
ttrain.shape, ttest.shape

((29999, 18978), (11087, 18978))

# Stacking

## Meta-model 

In [57]:
logreg = LogisticRegression(C=0.1, penalty="l2")

## Models (10 xgboost and 5 catboost classifiers)

### Xgboost classifiers

In [58]:
xgb_1 = xgboost.XGBClassifier(max_depth=5,
                              learning_rate=0.1, 
                              nthread=6,
                              subsample=0.5,
                              colsample_bytree=0.7,
                              gamma=0.5,
                              n_jobs=-1,
                              seed=42,
                              n_estimator=129)

In [59]:
xgb_2 = xgboost.XGBClassifier(max_depth=5,
                              learning_rate=0.1, 
                              nthread=6,
                              subsample=0.5,
                              colsample_bytree=0.7,
                              gamma=0.5,
                              n_jobs=-1,
                              seed=542,
                              n_estimator=129)

In [60]:
xgb_3 = xgboost.XGBClassifier(max_depth=5,
                              learning_rate=0.1, 
                              nthread=6,
                              subsample=0.5,
                              colsample_bytree=0.7,
                              gamma=0.5,
                              n_jobs=-1,
                              seed=1342,
                              n_estimator=129)

In [61]:
xgb_4 = xgboost.XGBClassifier(max_depth=5,
                              learning_rate=0.1, 
                              nthread=6,
                              subsample=0.5,
                              colsample_bytree=0.7,
                              gamma=0.5,
                              n_jobs=-1,
                              seed=2342,
                              n_estimator=129)

In [62]:
xgb_5 = xgboost.XGBClassifier(max_depth=5,
                              learning_rate=0.1, 
                              nthread=6,
                              subsample=0.5,
                              colsample_bytree=0.7,
                              gamma=0.5,
                              n_jobs=-1,
                              seed=1,
                              n_estimator=129)

In [63]:
xgb_6 = xgboost.XGBClassifier(max_depth=5,
                              learning_rate=0.1, 
                              nthread=6,
                              subsample=0.5,
                              colsample_bytree=0.7,
                              gamma=0.5,
                              n_jobs=-1,
                              seed=12342,
                              n_estimator=129)

In [64]:
xgb_7 = xgboost.XGBClassifier(max_depth=5,
                              learning_rate=0.1, 
                              nthread=6,
                              subsample=0.5,
                              colsample_bytree=0.7,
                              gamma=0.5,
                              n_jobs=-1,
                              seed=22342,
                              n_estimator=129)

In [65]:
xgb_8 = xgboost.XGBClassifier(max_depth=5,
                              learning_rate=0.1, 
                              nthread=6,
                              subsample=0.5,
                              colsample_bytree=0.7,
                              gamma=0.5,
                              n_jobs=-1,
                              seed=23,
                              n_estimator=129)

In [66]:
xgb_9 = xgboost.XGBClassifier(max_depth=5,
                              learning_rate=0.1, 
                              nthread=6,
                              subsample=0.5,
                              colsample_bytree=0.7,
                              gamma=0.5,
                              n_jobs=-1,
                              seed=0,
                              n_estimator=129)

In [67]:
xgb_10 = xgboost.XGBClassifier(max_depth=5,
                              learning_rate=0.1, 
                              nthread=6,
                              subsample=0.5,
                              colsample_bytree=0.7,
                              gamma=0.5,
                              n_jobs=-1,
                              seed=1111111110,
                              n_estimator=129)

### Catboost classifiers

In [68]:
ctb_1 = CatBoostClassifier(iterations=158,
                           learning_rate=0.1,
                           random_seed=42,
                           depth=5,
                           task_type="GPU",
                           custom_loss='AUC'
                          )

In [69]:
ctb_2 = CatBoostClassifier(iterations=158,
                           learning_rate=0.1,
                           random_seed=542,
                           depth=5,
                           task_type="GPU",
                           custom_loss='AUC'
                          )

In [70]:
ctb_3 = CatBoostClassifier(iterations=158,
                           learning_rate=0.1,
                           random_seed=1542,
                           depth=5,
                           task_type="GPU",
                           custom_loss='AUC'
                          )

In [71]:
ctb_4 = CatBoostClassifier(iterations=158,
                           learning_rate=0.1,
                           random_seed=221542,
                           depth=5,
                           task_type="GPU",
                           custom_loss='AUC'
                          )

In [72]:
ctb_5 = CatBoostClassifier(iterations=158,
                           learning_rate=0.1,
                           random_seed=156565542,
                           depth=5,
                           task_type="GPU",
                           custom_loss='AUC'
                          )

### Stacking  

In [74]:
sclf = StackingClassifier(classifiers=[xgb_1, xgb_2, xgb_3, xgb_4, xgb_5, xgb_6, xgb_7, xgb_8, xgb_9, xgb_10,
                                      ctb_1, ctb_2, ctb_3, ctb_4, ctb_5],
                          meta_classifier=logreg,
                          use_probas=True)

### Cross-validation 

In [75]:
scores = model_selection.cross_val_score(sclf,
                                         ttrain,
                                         train_y_true,
                                         cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=True),
                                         scoring='roc_auc',
                                         n_jobs=-1,
                                         verbose=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  9.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  9.3min finished


### Results of cross-validation

In [79]:
print('ROC-AUC of cross-validation on 10 folds: {}'.format(scores))

ROC-AUC of cross-validation on 10 folds: [0.79341584 0.81413653 0.81320503 0.81992259 0.80509708 0.80869331
 0.80450747 0.81758034 0.80071127 0.79639651]


In [80]:
print('Mean ROC-AUC of cross-validation: {}'.format(scores.mean()))

Mean ROC-AUC of cross-validation: 0.8073665986338021


### Fitting stacking

In [81]:
sclf.fit(ttrain, train_y_true)

0:	learn: 0.5694170	total: 14.7ms	remaining: 2.3s
1:	learn: 0.5119787	total: 30.3ms	remaining: 2.36s
2:	learn: 0.4637319	total: 44.2ms	remaining: 2.29s
3:	learn: 0.4083451	total: 59.5ms	remaining: 2.29s
4:	learn: 0.3749397	total: 76.8ms	remaining: 2.35s
5:	learn: 0.3520221	total: 92.4ms	remaining: 2.34s
6:	learn: 0.3361845	total: 109ms	remaining: 2.34s
7:	learn: 0.3234516	total: 125ms	remaining: 2.35s
8:	learn: 0.3142091	total: 141ms	remaining: 2.33s
9:	learn: 0.3074911	total: 157ms	remaining: 2.33s
10:	learn: 0.3020737	total: 174ms	remaining: 2.32s
11:	learn: 0.2975509	total: 189ms	remaining: 2.29s
12:	learn: 0.2940978	total: 205ms	remaining: 2.29s
13:	learn: 0.2906676	total: 220ms	remaining: 2.26s
14:	learn: 0.2882050	total: 237ms	remaining: 2.26s
15:	learn: 0.2862402	total: 254ms	remaining: 2.26s
16:	learn: 0.2848813	total: 271ms	remaining: 2.25s
17:	learn: 0.2837964	total: 285ms	remaining: 2.22s
18:	learn: 0.2828697	total: 302ms	remaining: 2.21s
19:	learn: 0.2813630	total: 317ms	re

2:	learn: 0.4608741	total: 40.8ms	remaining: 2.1s
3:	learn: 0.4233200	total: 57.4ms	remaining: 2.21s
4:	learn: 0.3828823	total: 71.6ms	remaining: 2.19s
5:	learn: 0.3562252	total: 88.6ms	remaining: 2.24s
6:	learn: 0.3379202	total: 103ms	remaining: 2.22s
7:	learn: 0.3248365	total: 119ms	remaining: 2.22s
8:	learn: 0.3152407	total: 135ms	remaining: 2.23s
9:	learn: 0.3080124	total: 152ms	remaining: 2.25s
10:	learn: 0.3020359	total: 170ms	remaining: 2.27s
11:	learn: 0.2971246	total: 185ms	remaining: 2.25s
12:	learn: 0.2932790	total: 203ms	remaining: 2.27s
13:	learn: 0.2905513	total: 220ms	remaining: 2.26s
14:	learn: 0.2882989	total: 235ms	remaining: 2.24s
15:	learn: 0.2866844	total: 251ms	remaining: 2.23s
16:	learn: 0.2851034	total: 268ms	remaining: 2.22s
17:	learn: 0.2838425	total: 283ms	remaining: 2.2s
18:	learn: 0.2826141	total: 299ms	remaining: 2.19s
19:	learn: 0.2817755	total: 315ms	remaining: 2.17s
20:	learn: 0.2811837	total: 328ms	remaining: 2.14s
21:	learn: 0.2806814	total: 341ms	rem

4:	learn: 0.3851679	total: 71.5ms	remaining: 2.19s
5:	learn: 0.3612770	total: 85.8ms	remaining: 2.17s
6:	learn: 0.3414588	total: 101ms	remaining: 2.18s
7:	learn: 0.3266520	total: 118ms	remaining: 2.2s
8:	learn: 0.3164693	total: 134ms	remaining: 2.22s
9:	learn: 0.3077890	total: 149ms	remaining: 2.2s
10:	learn: 0.3021266	total: 165ms	remaining: 2.21s
11:	learn: 0.2982464	total: 180ms	remaining: 2.19s
12:	learn: 0.2947532	total: 197ms	remaining: 2.2s
13:	learn: 0.2917659	total: 217ms	remaining: 2.23s
14:	learn: 0.2893082	total: 234ms	remaining: 2.23s
15:	learn: 0.2876034	total: 249ms	remaining: 2.21s
16:	learn: 0.2861800	total: 263ms	remaining: 2.19s
17:	learn: 0.2848378	total: 280ms	remaining: 2.18s
18:	learn: 0.2837124	total: 294ms	remaining: 2.15s
19:	learn: 0.2828272	total: 309ms	remaining: 2.13s
20:	learn: 0.2815425	total: 324ms	remaining: 2.11s
21:	learn: 0.2806464	total: 340ms	remaining: 2.1s
22:	learn: 0.2798809	total: 353ms	remaining: 2.07s
23:	learn: 0.2795550	total: 366ms	remai

7:	learn: 0.3483716	total: 116ms	remaining: 2.18s
8:	learn: 0.3281104	total: 133ms	remaining: 2.2s
9:	learn: 0.3153213	total: 148ms	remaining: 2.19s
10:	learn: 0.3055478	total: 165ms	remaining: 2.21s
11:	learn: 0.3019818	total: 180ms	remaining: 2.19s
12:	learn: 0.2966378	total: 196ms	remaining: 2.19s
13:	learn: 0.2930205	total: 212ms	remaining: 2.18s
14:	learn: 0.2900908	total: 227ms	remaining: 2.17s
15:	learn: 0.2878977	total: 244ms	remaining: 2.17s
16:	learn: 0.2859474	total: 261ms	remaining: 2.16s
17:	learn: 0.2847613	total: 275ms	remaining: 2.14s
18:	learn: 0.2832469	total: 289ms	remaining: 2.11s
19:	learn: 0.2821987	total: 305ms	remaining: 2.11s
20:	learn: 0.2812965	total: 320ms	remaining: 2.09s
21:	learn: 0.2804568	total: 335ms	remaining: 2.07s
22:	learn: 0.2799284	total: 350ms	remaining: 2.06s
23:	learn: 0.2790646	total: 366ms	remaining: 2.04s
24:	learn: 0.2786133	total: 380ms	remaining: 2.02s
25:	learn: 0.2784076	total: 393ms	remaining: 1.99s
26:	learn: 0.2781749	total: 407ms	r

9:	learn: 0.3089464	total: 154ms	remaining: 2.29s
10:	learn: 0.3033725	total: 170ms	remaining: 2.28s
11:	learn: 0.2988997	total: 185ms	remaining: 2.25s
12:	learn: 0.2951430	total: 202ms	remaining: 2.25s
13:	learn: 0.2918941	total: 218ms	remaining: 2.24s
14:	learn: 0.2888879	total: 234ms	remaining: 2.23s
15:	learn: 0.2872229	total: 249ms	remaining: 2.21s
16:	learn: 0.2855600	total: 266ms	remaining: 2.21s
17:	learn: 0.2841981	total: 283ms	remaining: 2.2s
18:	learn: 0.2830978	total: 301ms	remaining: 2.2s
19:	learn: 0.2823353	total: 316ms	remaining: 2.18s
20:	learn: 0.2809004	total: 332ms	remaining: 2.16s
21:	learn: 0.2798411	total: 348ms	remaining: 2.15s
22:	learn: 0.2793169	total: 362ms	remaining: 2.12s
23:	learn: 0.2789011	total: 377ms	remaining: 2.11s
24:	learn: 0.2783555	total: 391ms	remaining: 2.08s
25:	learn: 0.2775157	total: 407ms	remaining: 2.07s
26:	learn: 0.2771275	total: 421ms	remaining: 2.04s
27:	learn: 0.2766143	total: 438ms	remaining: 2.03s
28:	learn: 0.2764727	total: 450ms	

StackingClassifier(average_probas=False,
                   classifiers=[XGBClassifier(base_score=0.5, booster='gbtree',
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=0.7, gamma=0.5,
                                              learning_rate=0.1,
                                              max_delta_step=0, max_depth=5,
                                              min_child_weight=1, missing=None,
                                              n_estimator=129, n_estimators=100,
                                              n_jobs=-1, nthread=6,
                                              objective='binary:logistic',
                                              random_state=0, reg_al...
                   meta_classifier=LogisticRegression(C=0.1, class_weight=None,
                                                      dual=False,
   

### Create predictions 

In [82]:
pred = sclf.predict_proba(ttest)

# Create a dataframe of submission and dump to ".csv"

In [83]:
submission = pd.DataFrame({'ID': test_id['ID'], 'y': pred[:, 1]})
submission.head()

Unnamed: 0,ID,y
0,30001,0.029728
1,30002,0.02739
2,30003,0.027381
3,30004,0.02841
4,30005,0.878109


In [84]:
submission.to_csv('xgboost_catboost_stacking.csv', index=False, header=False)