In [1]:
import pandas as pd, numpy as np
from catboost import CatBoostClassifier
import pickle
import sys
from autogluon.tabular import TabularDataset, TabularPredictor

  from .autonotebook import tqdm as notebook_tqdm


# Load Train Data and Labels

In [2]:
# Column -> dtype mapping handed to pd.read_csv(dtype=...) by the cells that load the
# per-level-group training CSVs. Narrow numeric types and 'category' for the text-like
# columns keep the loaded frames small.
dtypes = {"session_id": 'int64',
          "index": np.int16,
          "elapsed_time": np.int32,
          "event_name": 'category',
          "name": 'category',
          "level": np.int8,
          "page": np.float16,
          "room_coor_x": np.float16,
          "room_coor_y": np.float16,
          "screen_coor_x": np.float16,
          "screen_coor_y": np.float16,
          "hover_duration": np.float32,
          "text": 'category',
          "fqid": 'category',
          "room_fqid": 'category',
          "text_fqid": 'category',
          "fullscreen": np.int8,
          "hq": np.int8,
          "music": np.int8,
          "level_group": 'category'
          }
# List of column names; NOTE(review): no cell visible in this notebook references it
# (the read_csv calls below do not pass usecols) -- confirm whether it is still needed.
use_col = ['session_id', 'index', 'elapsed_time', 'event_name', 'name', 'level', 'page',
           'room_coor_x', 'room_coor_y', 'hover_duration', 'text', 'fqid', 'room_fqid', 'text_fqid', 'level_group']

In [3]:
targets = pd.read_csv('./Data/train_labels.csv')
# session_id is "<session>_q<question>": split it once per derived column into an
# integer session key and an integer question number.
targets['session'] = targets['session_id'].str.split('_').str[0].astype('int64')
targets['q'] = targets['session_id'].str.split('_').str[-1].str[1:].astype('int64')
print( targets.shape )
targets.head()

(424116, 4)


Unnamed: 0,session_id,correct,session,q
0,20090312431273200_q1,1,20090312431273200,1
1,20090312433251036_q1,0,20090312433251036,1
2,20090312455206810_q1,1,20090312455206810,1
3,20090313091715820_q1,0,20090313091715820,1
4,20090313571836404_q1,1,20090313571836404,1


In [4]:
# Feature recipe table. The feature builders below read its 'quest', 'kol_col', 'col1',
# 'val1', 'col2' and 'val2' columns: each row names one (or two) column == value filters,
# and the rows for a question are consumed in file order.
feature_df = pd.read_csv('./CatVersionData/feature_sort.csv')

# Feature Engineering

In [5]:
def delt_time_def(df):
    """Add per-session time-delta columns to an event log.

    The frame is ordered by (session_id, elapsed_time) and three columns are added:

    * ``d_time``         -- elapsed_time minus the previous event's elapsed_time
                            within the same session (0 for a session's first event).
    * ``delt_time``      -- ``d_time`` clipped to the range [0, 103000].
    * ``delt_time_next`` -- ``delt_time`` of the following event in the same session
                            (NaN for a session's last event).

    Deltas are computed per session. A plain frame-wide diff/shift would give the
    first event of every session a large negative ``d_time`` borrowed from the
    previous session, which distorts the d_time quantile features and makes
    multi-session (training) frames disagree with single-session (inference) frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``session_id`` and ``elapsed_time``.

    Returns
    -------
    pandas.DataFrame
        A new, sorted frame with the three columns added. The input frame is left
        untouched, so passing a filtered slice is safe.
    """
    # sort_values without inplace returns an independent frame we are free to extend.
    ordered = df.sort_values(by=['session_id', 'elapsed_time'])
    session_key = ordered['session_id']

    ordered['d_time'] = ordered['elapsed_time'].groupby(session_key, sort=False).diff(1).fillna(0)
    ordered['delt_time'] = ordered['d_time'].clip(0, 103000)
    ordered['delt_time_next'] = ordered['delt_time'].groupby(session_key, sort=False).shift(-1)
    return ordered

In [6]:
def feature_engineer(train, kol_f):
    """Build the base per-session feature frame.

    Resets the module-level counters ``kol_col`` (to 9, the last base feature slot)
    and ``kol_col_max`` (to ``11 + kol_f * 2``), then returns a frame indexed by
    session_id with integer column labels ``0 .. kol_col_max - 1``:

    * 0-3  : d_time quantiles at 0.3, 0.8, 0.5 and 0.65
    * 4, 5 : mean and standard deviation of hover_duration
    * 6-8  : digits [0:2], [2:4] (+1) and [4:6] of the session id
             (labelled "year", "month", "day" by the original author)
    * 9    : digits [6:8] plus digits [8:10] divided by 60
    * 10   : constant 0 (temporarily holds the session id while 6-9 are derived)
    * 11+  : placeholders

    Every missing value, placeholders included, is filled with -1.
    """
    global kol_col, kol_col_max
    kol_col = 9
    kol_col_max = 11 + kol_f * 2

    new_train = pd.DataFrame(
        index=train['session_id'].unique(),
        columns=list(range(kol_col_max)),
        dtype=np.float16,
    )
    new_train[10] = new_train.index  # "session_id"

    by_session = train.groupby(['session_id'])
    for slot, level in enumerate((0.3, 0.8, 0.5, 0.65)):
        new_train[slot] = by_session['d_time'].quantile(q=level)
    new_train[4] = by_session['hover_duration'].agg('mean')
    new_train[5] = by_session['hover_duration'].agg('std')

    def id_digits(start, stop, offset=0):
        # Integer value of one slice of the decimal session id, stored as uint8.
        return new_train[10].apply(lambda sid: int(str(sid)[start:stop]) + offset).astype(np.uint8)

    new_train[6] = id_digits(0, 2)            # "year"
    new_train[7] = id_digits(2, 4, offset=1)  # "month"
    new_train[8] = id_digits(4, 6)            # "day"
    new_train[9] = id_digits(6, 8) + id_digits(8, 10) / 60
    new_train[10] = 0

    return new_train.fillna(-1)

In [7]:
def feature_next_t(row_f, new_train, train, gran_1, gran_2, i):
    """Append the per-session features described by one recipe row.

    ``row_f`` supplies ``col1``/``val1`` and, when ``row_f['kol_col'] == 2``, also
    ``col2``/``val2``. Events of ``train`` matching every listed ``column == value``
    condition are grouped by session_id and written into ``new_train`` under the next
    free integer column labels, tracked by the module-level counter ``kol_col``:

    * always        : sum of ``delt_time_next``
    * if ``gran_1`` : mean of ``delt_time``
    * if ``gran_2`` : count of ``index``

    The counter advances by one even when ``row_f['kol_col']`` is neither 1 nor 2, in
    which case no column is written. ``i`` is accepted but unused.
    Returns ``new_train`` (modified in place).
    """
    global kol_col
    kol_col += 1

    selector = train[row_f['col1']] == row_f['val1']
    n_conditions = row_f['kol_col']
    if n_conditions == 2:
        selector = selector & (train[row_f['col2']] == row_f['val2'])
    elif n_conditions != 1:
        return new_train

    per_session = train[selector].groupby(['session_id'])
    new_train[kol_col] = per_session['delt_time_next'].sum()
    if gran_1:
        kol_col += 1
        new_train[kol_col] = per_session['delt_time'].mean()
    if gran_2:
        kol_col += 1
        new_train[kol_col] = per_session['index'].count()
    return new_train

In [8]:
def feature_next_t_otvet(row_f, new_train, train, gran_1, gran_2, i):
    """Append the features for one recipe row without grouping by session.

    Same recipe handling as ``feature_next_t``, but the aggregates are taken over all
    matching rows of ``train`` at once and broadcast as scalars into ``new_train``
    (the inference loop passes the events of a single session):

    * always        : sum of ``delt_time_next``
    * if ``gran_1`` : mean of ``delt_time``
    * if ``gran_2`` : count of ``index``

    The module-level counter ``kol_col`` advances by one for every column written and
    also once when ``row_f['kol_col']`` is neither 1 nor 2 (no column is written then).
    ``i`` is accepted but unused. Returns ``new_train`` (modified in place).
    """
    global kol_col
    kol_col += 1

    selector = train[row_f['col1']] == row_f['val1']
    n_conditions = row_f['kol_col']
    if n_conditions == 2:
        selector = selector & (train[row_f['col2']] == row_f['val2'])
    elif n_conditions != 1:
        return new_train

    new_train[kol_col] = train[selector]['delt_time_next'].sum()
    if gran_1:
        kol_col += 1
        new_train[kol_col] = train[selector]['delt_time'].mean()
    if gran_2:
        kol_col += 1
        new_train[kol_col] = train[selector]['index'].count()
    return new_train

In [9]:
def experiment_feature_next_t_otvet(row_f, new_train, train, gran_1, gran_2, i):
    """Experimental variant of ``feature_next_t_otvet`` (aggregates over all rows).

    ``row_f['kol_col']`` selects how many ``column == value`` conditions apply:
    1 uses ``col1``/``val1``; 2 additionally requires ``col2``/``val2``. For the
    matching rows of ``train`` the following scalars are written into ``new_train``
    under the next free integer column labels (module-level counter ``kol_col``):

    * always        : sum of ``delt_time_next``
    * if ``gran_1`` : mean of ``delt_time``
    * if ``gran_2`` : count of ``index``

    For any other ``row_f['kol_col']`` the counter still advances by one but nothing
    is written and ``train`` is not inspected. ``i`` is accepted but unused.
    Returns ``new_train`` (modified in place).

    Fix: the two-condition branch used the names ``col1``/``val1`` without ever
    defining them, so it raised NameError; both conditions are now read from ``row_f``.
    """
    global kol_col
    kol_col += 1

    n_conditions = row_f['kol_col']
    if n_conditions not in (1, 2):
        return new_train

    maska = train[row_f['col1']] == row_f['val1']
    if n_conditions == 2:
        maska = maska & (train[row_f['col2']] == row_f['val2'])

    matched = train[maska]
    new_train[kol_col] = matched['delt_time_next'].sum()
    if gran_1:
        kol_col += 1
        new_train[kol_col] = matched['delt_time'].mean()
    if gran_2:
        kol_col += 1
        new_train[kol_col] = matched['index'].count()
    return new_train

In [10]:
def feature_quest_otvet(new_train, train, quest, kol_f):
    """Build the inference-time feature row(s) for one question.

    Resets the module-level counter ``kol_col`` to 9, takes the rows of ``feature_df``
    whose ``quest`` equals ``quest`` and applies the first ``kol_f`` of them with
    ``feature_next_t_otvet``. The first ``round(kol_f * 0.7)`` recipes also get the
    mean feature and the first ``round(kol_f * 0.3)`` also get the count feature.

    Returns ``new_train`` restricted to the integer columns ``0 .. kol_col``.
    """
    global kol_col
    kol_col = 9
    mean_share, count_share = 0.7, 0.3

    recipes = feature_df[feature_df['quest'] == quest].reset_index(drop=True)

    mean_cutoff = round(kol_f * mean_share)
    count_cutoff = round(kol_f * count_share)
    for pos in range(kol_f):
        new_train = feature_next_t_otvet(
            recipes.loc[pos], new_train, train, pos < mean_cutoff, pos < count_cutoff, pos
        )
    return new_train[list(range(kol_col + 1))]

In [11]:
def feature_engineer_new(new_train, train, feature_q, kol_f):
    """Apply the first ``kol_f`` recipe rows of ``feature_q`` with ``feature_next_t``.

    ``feature_q`` is looked up by label ``0 .. kol_f - 1``. The first
    ``round(kol_f * 0.7)`` recipes also get the mean feature and the first
    ``round(kol_f * 0.3)`` also get the count feature. Returns the extended frame.
    """
    mean_share, count_share = 0.7, 0.3
    mean_cutoff = round(kol_f * mean_share)
    count_cutoff = round(kol_f * count_share)
    for pos in range(kol_f):
        recipe = feature_q.loc[pos]
        new_train = feature_next_t(recipe, new_train, train, pos < mean_cutoff, pos < count_cutoff, pos)
    return new_train

In [12]:
def feature_quest(new_train, train, quest, kol_f):
    """Build the training feature frame for one question.

    Resets the module-level counter ``kol_col`` to 9, selects the rows of
    ``feature_df`` whose ``quest`` equals ``quest`` (re-indexed from 0) and lets
    ``feature_engineer_new`` append the per-session features for the first ``kol_f``
    of them. Returns ``new_train`` restricted to the integer columns ``0 .. kol_col``.
    """
    global kol_col
    kol_col = 9
    recipes = feature_df[feature_df['quest'] == quest].reset_index(drop=True)
    new_train = feature_engineer_new(new_train, train, recipes, kol_f)
    return new_train[list(range(kol_col + 1))]

In [13]:
def create_model(old_train, quests, models, list_kol_f, num_gpus=1):
    """Train one AutoGluon predictor per question and store it in ``models``.

    Parameters
    ----------
    old_train : pandas.DataFrame
        Event log of one level group, already passed through ``delt_time_def``.
    quests : iterable of int
        Question numbers to train for.
    models : dict
        Updated in place; each predictor is stored under the *string* key ``f'{q}'``.
    list_kol_f : mapping
        Question number -> number of feature recipes to use for that question.
    num_gpus : int, default 1
        Forwarded to ``TabularPredictor.fit``. The default keeps the original
        behaviour; pass 0 on a CPU-only machine.

    Returns
    -------
    dict
        The same ``models`` object that was passed in.
    """
    # ITERATE THRU QUESTIONS
    for q in quests:
        print('### quest ', q, end='')
        new_train = feature_engineer(old_train, list_kol_f[q])
        train_x = feature_quest(new_train, old_train, q, list_kol_f[q])
        print (' ---- ', 'train_q.shape = ', train_x.shape)

        # TRAIN DATA: labels of this question, aligned to the feature frame by session.
        train_users = train_x.index.values
        train_y = targets.loc[targets.q==q].set_index('session').loc[train_users]

        train_ag = TabularDataset(train_x.join(train_y, how = 'left'))

        # TRAIN MODEL: drop the label-file bookkeeping columns so that only the
        # features and the 'correct' label reach AutoGluon.
        predictor = TabularPredictor(label='correct').fit(
            train_ag.drop(['session_id','q'], axis = 1),
            num_gpus=num_gpus,
        )

        # KEEP MODEL (string key; the inference loop looks models up the same way)
        models[f'{q}'] = predictor
    print('***')

    return models

In [14]:
# Registry of trained models; create_model() fills it in place under the key f'{q}'.
models = {}
# Probability cut-off: the inference loop answers 1 when the predicted probability
# of class 1 is strictly greater than this value.
best_threshold = 0.63

In [15]:
# Question number -> how many recipe rows of feature_df are used for that question.
list_kol_f = {
    # questions trained on the 0-4 level group
    1: 140,
    3: 110,
    # questions trained on the 5-12 level group
    4: 120,
    5: 220,
    6: 130,
    7: 110,
    8: 110,
    9: 100,
    10: 140,
    11: 120,
    # questions trained on the 13-22 level group
    14: 160,
    15: 160,
    16: 130,
    17: 140,
}

In [16]:
# Level group 0-4: load the event log with the compact dtypes.
df0_4 = pd.read_csv('./CatVersionData/train_0_4t.csv', dtype=dtypes) 
# Sessions with fewer than 5 distinct `level` values are flagged ...
kol_lvl = (df0_4 .groupby(['session_id'])['level'].agg('nunique') < 5)
list_session = kol_lvl[kol_lvl].index
# ... and removed from the training frame.
df0_4  = df0_4 [~df0_4 ['session_id'].isin(list_session)]
# Add the d_time / delt_time / delt_time_next columns.
df0_4 = delt_time_def(df0_4)

# Questions trained on this level group.
quests_0_4 = [1, 3] 
# list_kol_f = {1:140,3:110}

# One predictor per question, stored in `models` under f'{q}'.
models = create_model(df0_4, quests_0_4, models, list_kol_f)
# Release the event log before the next level group is loaded.
del df0_4

### quest  1

No path specified. Models will be saved in: "AutogluonModels/ag-20230619_141511/"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230619_141511/"
AutoGluon Version:  0.7.0
Python Version:     3.9.16
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #82-Ubuntu SMP Tue Jun 6 23:10:23 UTC 2023
Train Data Rows:    23562
Train Data Columns: 290
Label Column: correct
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected cla

 ----  train_q.shape =  (23562, 290)


	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 269 | ['0', '1', '2', '3', '4', ...]
		('int', [])   :  21 | ['6', '7', '8', '12', '15', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('float', []) : 269 | ['0', '1', '2', '3', '4', ...]
		('int', [])   :  21 | ['6', '7', '8', '12', '15', ...]
	0.4s = Fit runtime
	290 features in original data used to generate 290 features in processed data.
	Train Data (Processed) Memory Usage: 54.07 MB (0.2% of available memory)
Data preprocessing and feature engineering runtime = 0.5s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating trai

### quest  3

No path specified. Models will be saved in: "AutogluonModels/ag-20230619_141620/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230619_141620/"
AutoGluon Version:  0.7.0
Python Version:     3.9.16
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #82-Ubuntu SMP Tue Jun 6 23:10:23 UTC 2023
Train Data Rows:    23562
Train Data Columns: 230
Label Column: correct
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    26094.59 MB
	Train Data (

 ----  train_q.shape =  (23562, 230)


	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Useless Original Features (Count: 15): ['215', '216', '217', '218', '219', '220', '221', '222', '223', '224', '225', '226', '227', '228', '229']
		These features carry no predictive signal and should be manually investigated.
		This is typically a feature which has the same value for all rows.
		These features do not need to be present at inference time.
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 201 | ['0', '1', '2', '3', '4', ...]
		('int', [])   :  14 | ['6', '7', '8', '12', '15', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('float', []) : 201 | ['0', '1', '2', '3', '4', ...]
		('int', [])   :  14 | ['6', '7', '8', '12', '15', ...]
	0.3s = Fit runtime
	215 features in original data used to generate 215 features in processed data.
	Train Dat

***


In [23]:
# Level group 5-12: load the event log with the compact dtypes.
df5_12 = pd.read_csv('./CatVersionData/train_5_12t.csv', dtype=dtypes)
# Sessions with fewer than 8 distinct `level` values are flagged ...
kol_lvl = (df5_12.groupby(['session_id'])['level'].agg('nunique') < 8)
list_session = kol_lvl[kol_lvl].index
# ... and removed from the training frame.
df5_12 = df5_12[~df5_12['session_id'].isin(list_session)]
# Add the d_time / delt_time / delt_time_next columns.
df5_12 = delt_time_def(df5_12)
# Questions trained on this level group.
quests_5_12 = [4, 5, 6, 7, 8, 9, 10, 11] 

# list_kol_f = {4:110, 5:220, 6:120, 7:110, 8:110, 9:100, 10:140, 11:120}

# One predictor per question, stored in `models` under f'{q}'.
models = create_model(df5_12, quests_5_12, models, list_kol_f)
# Release the event log before the next level group is loaded.
del df5_12

### quest  4

No path specified. Models will be saved in: "AutogluonModels/ag-20230619_144031/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230619_144031/"
AutoGluon Version:  0.7.0
Python Version:     3.9.16
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #82-Ubuntu SMP Tue Jun 6 23:10:23 UTC 2023
Train Data Rows:    23561
Train Data Columns: 250
Label Column: correct
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    25594.74 MB
	Train Data (

 ----  train_q.shape =  (23561, 250)


			Note: Converting 3 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 219 | ['0', '1', '2', '3', '4', ...]
		('int', [])   :  31 | ['6', '7', '8', '12', '15', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 216 | ['0', '1', '2', '3', '4', ...]
		('int', [])       :  31 | ['6', '7', '8', '12', '15', ...]
		('int', ['bool']) :   3 | ['49', '50', '51']
	0.4s = Fit runtime
	250 features in original data used to generate 250 features in processed data.
	Train Data (Processed) Memory Usage: 46.04 MB (0.2% of available memory)
Data preprocessing and feature engineering runtime = 0.42s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
	To change

### quest  5

No path specified. Models will be saved in: "AutogluonModels/ag-20230619_144139/"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230619_144139/"
AutoGluon Version:  0.7.0
Python Version:     3.9.16
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #82-Ubuntu SMP Tue Jun 6 23:10:23 UTC 2023
Train Data Rows:    23561
Train Data Columns: 450
Label Column: correct
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected cla

 ----  train_q.shape =  (23561, 450)


	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 391 | ['0', '1', '2', '3', '4', ...]
		('int', [])   :  59 | ['6', '7', '8', '12', '15', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('float', []) : 391 | ['0', '1', '2', '3', '4', ...]
		('int', [])   :  59 | ['6', '7', '8', '12', '15', ...]
	0.6s = Fit runtime
	450 features in original data used to generate 450 features in processed data.
	Train Data (Processed) Memory Usage: 84.23 MB (0.3% of available memory)
Data preprocessing and feature engineering runtime = 0.7s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating trai

### quest  6

No path specified. Models will be saved in: "AutogluonModels/ag-20230619_144306/"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230619_144306/"
AutoGluon Version:  0.7.0
Python Version:     3.9.16
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #82-Ubuntu SMP Tue Jun 6 23:10:23 UTC 2023
Train Data Rows:    23561
Train Data Columns: 270
Label Column: correct
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected cla

 ----  train_q.shape =  (23561, 270)


	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 234 | ['0', '1', '2', '3', '4', ...]
		('int', [])   :  36 | ['6', '7', '8', '12', '15', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('float', []) : 234 | ['0', '1', '2', '3', '4', ...]
		('int', [])   :  36 | ['6', '7', '8', '12', '15', ...]
	0.4s = Fit runtime
	270 features in original data used to generate 270 features in processed data.
	Train Data (Processed) Memory Usage: 50.3 MB (0.2% of available memory)
Data preprocessing and feature engineering runtime = 0.46s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating trai

### quest  7

No path specified. Models will be saved in: "AutogluonModels/ag-20230619_144413/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230619_144413/"
AutoGluon Version:  0.7.0
Python Version:     3.9.16
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #82-Ubuntu SMP Tue Jun 6 23:10:23 UTC 2023
Train Data Rows:    23561
Train Data Columns: 230
Label Column: correct
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    25455.34 MB
	Train Data (

 ----  train_q.shape =  (23561, 230)


	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 201 | ['0', '1', '2', '3', '4', ...]
		('int', [])   :  29 | ['6', '7', '8', '12', '15', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('float', []) : 201 | ['0', '1', '2', '3', '4', ...]
		('int', [])   :  29 | ['6', '7', '8', '12', '15', ...]
	0.3s = Fit runtime
	230 features in original data used to generate 230 features in processed data.
	Train Data (Processed) Memory Usage: 42.76 MB (0.2% of available memory)
Data preprocessing and feature engineering runtime = 0.4s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 212

### quest  8

No path specified. Models will be saved in: "AutogluonModels/ag-20230619_144522/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230619_144522/"
AutoGluon Version:  0.7.0
Python Version:     3.9.16
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #82-Ubuntu SMP Tue Jun 6 23:10:23 UTC 2023
Train Data Rows:    23561
Train Data Columns: 230
Label Column: correct
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    25388.38 MB
	Train Data (

 ----  train_q.shape =  (23561, 230)


			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 202 | ['0', '1', '2', '3', '4', ...]
		('int', [])   :  28 | ['6', '7', '8', '12', '15', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 198 | ['0', '1', '2', '3', '4', ...]
		('int', [])       :  28 | ['6', '7', '8', '12', '15', ...]
		('int', ['bool']) :   4 | ['41', '42', '45', '48']
	0.4s = Fit runtime
	230 features in original data used to generate 230 features in processed data.
	Train Data (Processed) Memory Usage: 42.1 MB (0.2% of available memory)
Data preprocessing and feature engineering runtime = 0.43s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
	To c

### quest  9

No path specified. Models will be saved in: "AutogluonModels/ag-20230619_144627/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230619_144627/"
AutoGluon Version:  0.7.0
Python Version:     3.9.16
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #82-Ubuntu SMP Tue Jun 6 23:10:23 UTC 2023
Train Data Rows:    23561
Train Data Columns: 210
Label Column: correct
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    25240.21 MB
	Train Data (

 ----  train_q.shape =  (23561, 210)


	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 183 | ['0', '1', '2', '3', '4', ...]
		('int', [])   :  27 | ['6', '7', '8', '12', '15', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('float', []) : 183 | ['0', '1', '2', '3', '4', ...]
		('int', [])   :  27 | ['6', '7', '8', '12', '15', ...]
	0.3s = Fit runtime
	210 features in original data used to generate 210 features in processed data.
	Train Data (Processed) Memory Usage: 38.99 MB (0.2% of available memory)
Data preprocessing and feature engineering runtime = 0.37s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 21

### quest  10

No path specified. Models will be saved in: "AutogluonModels/ag-20230619_144731/"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230619_144731/"
AutoGluon Version:  0.7.0
Python Version:     3.9.16
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #82-Ubuntu SMP Tue Jun 6 23:10:23 UTC 2023
Train Data Rows:    23561
Train Data Columns: 290
Label Column: correct
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected cla

 ----  train_q.shape =  (23561, 290)


	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 3 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Useless Original Features (Count: 3): ['287', '288', '289']
		These features carry no predictive signal and should be manually investigated.
		This is typically a feature which has the same value for all rows.
		These features do not need to be present at inference time.
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 248 | ['0', '1', '2', '3', '4', ...]
		('int', [])   :  39 | ['6', '7', '8', '12', '15', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 245 | ['0', '1', '2', '3', '4', ...]
		('int', [])       :  39 | ['6', '7', '8', '12', '15', ...]
		('int', ['bool']) :   3 | ['281', '282', '

### quest  11

No path specified. Models will be saved in: "AutogluonModels/ag-20230619_144843/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230619_144843/"
AutoGluon Version:  0.7.0
Python Version:     3.9.16
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #82-Ubuntu SMP Tue Jun 6 23:10:23 UTC 2023
Train Data Rows:    23561
Train Data Columns: 250
Label Column: correct
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    25174.58 MB
	Train Data (

 ----  train_q.shape =  (23561, 250)


	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 223 | ['0', '1', '2', '3', '4', ...]
		('int', [])   :  27 | ['6', '7', '8', '12', '15', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('float', []) : 223 | ['0', '1', '2', '3', '4', ...]
		('int', [])   :  27 | ['6', '7', '8', '12', '15', ...]
	0.4s = Fit runtime
	250 features in original data used to generate 250 features in processed data.
	Train Data (Processed) Memory Usage: 46.53 MB (0.2% of available memory)
Data preprocessing and feature engineering runtime = 0.43s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating tra

***


In [18]:
# Level group 13-22: load the event log with the compact dtypes.
df13_22 = pd.read_csv('./CatVersionData/train_13_22t.csv', dtype=dtypes) 
# Sessions with fewer than 10 distinct `level` values are flagged ...
kol_lvl = (df13_22 .groupby(['session_id'])['level'].agg('nunique') < 10)
list_session = kol_lvl[kol_lvl].index
# ... and removed from the training frame.
df13_22  = df13_22 [~df13_22 ['session_id'].isin(list_session)]
# Add the d_time / delt_time / delt_time_next columns.
df13_22 = delt_time_def(df13_22)

# Questions trained on this level group.
quests_13_22 = [14, 15, 16, 17] 
# list_kol_f = {14: 160, 15:160, 16:105, 17:140}

# One predictor per question, stored in `models` under f'{q}'.
# NOTE(review): unlike the two cells above, df13_22 is not deleted afterwards.
models = create_model(df13_22, quests_13_22, models, list_kol_f)


### quest  14 ----  train_q.shape =  (22986, 330)
### quest  15 ----  train_q.shape =  (22986, 330)
### quest  16 ----  train_q.shape =  (22986, 270)
### quest  17 ----  train_q.shape =  (22986, 290)
***


In [19]:
# Saving a Model
# create_model() stores every predictor under the string key f'{q}', so the lookup
# must use the same key (models[q] with an int raises KeyError).
for q in quests_0_4 + quests_5_12 + quests_13_22:
    predictor = models[f'{q}']
    if hasattr(predictor, 'save_model'):
        # CatBoost-style model: write it to a single file.
        predictor.save_model(f'cat_model_{q}.bin')
    else:
        # AutoGluon TabularPredictor has no save_model(); save() persists it
        # under its own output directory (predictor.path).
        predictor.save()

In [20]:
# Model Reading
# Named model_dir rather than `dir` so the builtin dir() is not shadowed.
model_dir = '/kaggle/input/catbust/'
for q in quests_0_4 + quests_5_12 + quests_13_22:
    # Store under f'{q}': that is the key the inference loop reads (models[f'{q}']);
    # with an int key the loaded model would never be used.
    models[f'{q}'] = CatBoostClassifier().load_model(model_dir + f'cat_model_{q}.bin')

**Infer Test Data**

In [21]:
import jo_wilder

# Best-effort reset of the jo_wilder API state so this cell can be re-run in the same
# kernel. On a fresh kernel `env` does not exist yet, so the NameError raised below is
# expected and deliberately ignored. Only Exception is caught, so KeyboardInterrupt
# and SystemExit still propagate.
try:
    jo_wilder.make_env.__called__ = False
    env.__called__ = False
    type(env)._state = type(type(env)._state).__dict__['INIT']
except Exception:
    pass

env = jo_wilder.make_env()
iter_test = env.iter_test()

In [22]:
import time

In [23]:
g_end4 = 0  # cumulative seconds spent building features
g_end5 = 0  # cumulative seconds spent in predict_proba

# level_group value -> questions answered from that group's events
list_q = {'0-4':quests_0_4, '5-12':quests_5_12, '13-22':quests_13_22}
for (test, sam_sub) in iter_test:
    sam_sub['question'] = [int(label.split('_')[1][1:]) for label in sam_sub['session_id']]
    grp = test.level_group.values[0]

    # Fallback answers; they survive only for questions that have no model below.
    sam_sub['correct'] = 1
    sam_sub.loc[sam_sub.question.isin([5, 8, 10, 13, 15]), 'correct'] = 0

    # delt_time_def adds columns, so give it an independent copy, not a filtered
    # slice of `test`.
    old_train = delt_time_def(test[test.level_group == grp].copy())

    for q in list_q[grp]:

        start4 = time.time()
        new_train = feature_engineer(old_train, list_kol_f[q])
        new_train = feature_quest_otvet(new_train, old_train, q, list_kol_f[q])
        end4 = time.time() - start4
        g_end4 += end4

        start5 = time.time()
        clf = models[f'{q}']
        # np.asarray makes the column selection work whether predict_proba returns
        # a NumPy array (CatBoost) or a pandas DataFrame (AutoGluon); a bare
        # `[:, 1]` fails on a DataFrame.
        p = np.asarray(clf.predict_proba(new_train.astype('float32')))[:, 1]
        end5 = time.time() - start5
        g_end5 += end5

        mask = sam_sub.question == q
        x = int(p[0] > best_threshold)
        sam_sub.loc[mask, 'correct'] = x

    sam_sub = sam_sub[['session_id', 'correct']]
    env.predict(sam_sub)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


# EDA submission.csv

In [24]:
# df = pd.read_csv('submission.csv')
# print( df.shape )
# df.head(60)