In [3]:
import numpy as np
import pandas as pd
import os
import glob
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
from sklearn.model_selection import *
from sklearn.metrics import *
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML
from lightautoml.tasks import Task

In [4]:
# Load the training table from its Feather file.
train = pd.read_feather('../input/radiant/train_all_2.ftr')

# LGB

In [None]:
# Correlation of every feature column with the target. It is computed here so
# this cell (and the later cells that read `corr_train`) work on a fresh kernel;
# previously `corr_train` was first assigned much further down the notebook.
# Columns 0 and 1 are skipped, mirroring the later cells that keep
# 'field_id' and 'label' separately.
corr_train = train[train.columns[2:]].corrwith(train['label'])

# Keep features with |corr| > 0.1 and drop anything whose name contains
# 'nearest' or 'dist', plus the raw 'x' / 'y' columns.
cols = list(corr_train[(corr_train > 0.1) | (corr_train < -0.1)].index)
cols = [x for x in cols if 'nearest' not in x and 'dist' not in x and x != 'x' and x != 'y']
# NOTE(review): the split in the next cell uses every column, not `cols` - confirm intent.
print(len(cols))

In [None]:
# Stratified 85/15 holdout over every column from position 2 onward.
# A fixed random_state makes the holdout (and the report computed on it)
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(train[train.columns[2:]],
                                                  train['label'],
                                                  stratify=train['label'],
                                                  test_size=0.15,
                                                  random_state=42)

In [None]:
# Baseline LightGBM classifier with default hyper-parameters; the holdout
# split is passed as the evaluation set.
clf = lgb.LGBMClassifier()
clf.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
)

In [None]:
# Print the classification report for the baseline on the holdout split.
print(classification_report(y_val, clf.predict(X_val)))

# SKF nogeo

In [None]:
# Features with |corr| > 0.075 against the label, minus every geo-related
# column (names containing 'nearest' or 'dist', and the raw 'x' / 'y').
_geo_exact = ('x', 'y')
_geo_parts = ('nearest', 'dist')
cols_no_geo = [
    name
    for name in corr_train[corr_train.abs() > 0.075].index
    if name not in _geo_exact and not any(part in name for part in _geo_parts)
]
print(len(cols_no_geo))
# cat_features = [x for x in train_columns if 'nearest' in x and 'count' not in x]
# print(len(cat_features))

In [None]:
# 5-fold stratified CV with CatBoost on the no-geo feature set.
# Each fold's model is kept in `models_nogeo` and saved as '3nogeo<fold>'.
models_nogeo = []
features_nogeo = train[cols_no_geo]
target_nogeo = train['label']

for k, (train_index, val_index) in enumerate(
        StratifiedKFold(n_splits=5).split(train, target_nogeo)):

    X_train, y_train = features_nogeo.iloc[train_index], target_nogeo.iloc[train_index]
    X_val, y_val = features_nogeo.iloc[val_index], target_nogeo.iloc[val_index]

    model = CatBoostClassifier(iterations=5000, learning_rate=0.15,
                               early_stopping_rounds=250,
                               task_type='GPU', verbose=100)
    # The held-out fold is the eval set used with early_stopping_rounds.
    model.fit(X_train, y_train, eval_set=(X_val, y_val))

    models_nogeo.append(model)
    model.save_model('3nogeo{}'.format(k))

# Leave the counter at the number of folds, as the manual counter did.
k = len(models_nogeo)

# SKF geo

In [None]:
# Every feature with |corr| > 0.075 against the label; geo columns are kept here.
cols_geo = corr_train[corr_train.abs() > 0.075].index.tolist()
print(len(cols_geo))
# cat_features = [x for x in train_columns if 'nearest' in x and 'count' not in x]
# print(len(cat_features))

In [None]:
# 5-fold stratified CV with CatBoost on the geo-inclusive feature set.
# Fold models are collected in `models_geo` and written to '3geo<fold>'.
models_geo = []
k = 0
geo_splitter = StratifiedKFold(n_splits=5)

for train_index, val_index in geo_splitter.split(train, train['label']):

    fold_train = train.iloc[train_index]
    fold_val = train.iloc[val_index]
    X_train, X_val = fold_train[cols_geo], fold_val[cols_geo]
    y_train, y_val = fold_train['label'], fold_val['label']

    model = CatBoostClassifier(
        task_type='GPU',
        iterations=5000,
        learning_rate=0.15,
        early_stopping_rounds=250,
        verbose=100,
    )
    model.fit(X_train, y_train, eval_set=(X_val, y_val))
    models_geo.append(model)

    model.save_model(f'3geo{k}')
    k += 1

# LAMA

In [None]:
# Keep the id, the label and every feature whose absolute correlation with
# the label exceeds 0.178. Note that this replaces `train` with the subset.
corr_train = train[train.columns[2:]].corrwith(train['label']).abs()
cols = ['field_id', 'label'] + corr_train[corr_train > 0.178].index.tolist()
# print(len(cols))
train = train[cols]
train.shape

In [None]:
# Multiclass task, with cross-entropy as both loss and metric.
task = Task('multiclass', loss='crossentropy', metric='crossentropy')

# Column roles: 'label' is the target, 'field_id' is dropped.
roles = dict(target='label', drop='field_id')

In [None]:
# Run settings for the AutoML cell below.
N_THREADS = 4            # passed as cpu_limit and as the reader's n_jobs
N_FOLDS = 5              # passed as the reader's cv
RANDOM_STATE = 42        # passed as the reader's random_state
# TEST_SIZE = 0.1        # test size for metric check (unused)
TIMEOUT = 60 * 60        # AutoML time budget in seconds (1 hour)

In [None]:
# Configure the AutoML preset: CatBoost on GPU, one level made of the plain
# and tuned CatBoost / LightGBM models.
automl_kwargs = dict(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    # lgb_params={'default_params': {'device': 'gpu'}},
    cb_params={'default_params': {'task_type': 'GPU'}},
    general_params={'use_algos': [['cb', 'lgb', 'cb_tuned', 'lgb_tuned']]},
    reader_params={'n_jobs': N_THREADS,
                   'cv': N_FOLDS,
                   'random_state': RANDOM_STATE},
)
automl = TabularUtilizedAutoML(**automl_kwargs)

# Fit on the filtered training frame; the returned predictions are kept in oof_pred.
oof_pred = automl.fit_predict(train, roles=roles)

In [None]:
# Load the test table from its Feather file.
test = pd.read_feather('../input/radiant/test_all_2.ftr')

In [None]:
# Predict on the test table and keep the `.data` payload of the result.
preds = automl.predict(test).data

In [None]:
# Fetch the reader's class mapping from the first inner model; it is used
# below to put prediction columns back in label order.
# NOTE(review): this walks internal attributes of the fitted AutoML object -
# confirm it matches the installed LightAutoML version.
class_map = automl.outer_pipes[0].ml_algos[0].models[0][0].reader.class_mapping

In [None]:
# Display the mapping for inspection.
class_map

In [None]:
# Zero-filled buffer, same shape as `preds`, to hold the reordered predictions.
preds_ = np.zeros(preds.shape)

In [None]:
# Copy each prediction column to position (label - 1).
# Assumes labels are integers starting at 1 - TODO confirm.
for label_value, pred_column in class_map.items():
    preds_[:, label_value - 1] = preds[:, pred_column]

In [None]:
# Build the submission: one probability column per class, with rows ordered
# like the sample submission file.
sub = pd.read_csv('../input/radiant/SampleSubmission.csv')
class_columns = sub.columns[1:]

pred_sub = pd.DataFrame({'Field ID': test.field_id})
pred_sub[class_columns] = preds_

# The left join keeps the sample file's row order.
pred_sub = sub[['Field ID']].merge(pred_sub, on=['Field ID'], how='left')

In [None]:
# Display the submission frame for a visual check.
pred_sub

In [None]:
# Write the submission to CSV without the index column.
pred_sub.to_csv('sub_lama_3.csv', index=False)

# LAMA NOGEO

In [None]:
# Start again from the full feature table. The previous section replaced
# `train` with its own subset (|corr| > 0.178), so filtering that subset at
# the looser 0.13 threshold could never add a feature back.
train = pd.read_feather('../input/radiant/train_all_2.ftr')

# Absolute correlation of every feature (columns from position 2) with the label.
corr_train = np.abs(train[train.columns[2:]].corrwith(train['label']))

# Keep id + label + features with |corr| > 0.13, then drop geo-related
# columns (names containing 'nearest' or 'dist', and the raw 'x' / 'y').
cols = ['field_id', 'label'] + list(corr_train[corr_train > 0.13].index)
cols = [x for x in cols if 'nearest' not in x and 'dist' not in x and x != 'x' and x != 'y']

train = train[cols]
train.shape

In [None]:
# Multiclass task (cross-entropy loss and metric) and the column roles:
# 'label' is the target, 'field_id' is dropped.
task = Task('multiclass', loss='crossentropy', metric='crossentropy')
roles = {'target': 'label', 'drop': 'field_id'}

In [None]:
# Run settings for the no-geo AutoML cell below.
N_THREADS = 4            # passed as cpu_limit and as the reader's n_jobs
N_FOLDS = 5              # passed as the reader's cv
RANDOM_STATE = 42        # passed as the reader's random_state
# TEST_SIZE = 0.1        # test size for metric check (unused)
TIMEOUT = 4 * 3600       # AutoML time budget in seconds (4 hours)

In [None]:
# AutoML preset with CatBoost on GPU; no `general_params` is passed in this section.
reader_cfg = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE}
catboost_cfg = {'default_params': {'task_type': 'GPU'}}
# lgb_params = {'default_params': {'device': 'gpu'}}

automl = TabularUtilizedAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    cb_params=catboost_cfg,
    reader_params=reader_cfg,
)

# Fit on the no-geo training frame; the returned predictions are kept in oof_pred.
oof_pred = automl.fit_predict(train, roles=roles)

In [None]:
# Load the test table from its Feather file.
test = pd.read_feather('../input/radiant/test_all_2.ftr')

In [None]:
# Predict on the test table and keep the `.data` payload of the result.
preds = automl.predict(test).data

In [None]:
# Fetch the reader's class mapping from the first inner model.
# NOTE(review): depends on internal attributes of the fitted AutoML object - confirm.
class_map = automl.outer_pipes[0].ml_algos[0].models[0][0].reader.class_mapping

In [None]:
# Display the mapping for inspection.
class_map

In [None]:
# Zero-filled buffer, same shape as `preds`, to hold the reordered predictions.
preds_ = np.zeros(preds.shape)

In [None]:
# Move each prediction column to position (label - 1).
# Assumes labels are integers starting at 1 - TODO confirm.
for label_value in class_map.keys():
    source_column = class_map[label_value]
    preds_[:, label_value - 1] = preds[:, source_column]

In [None]:
# Assemble the submission in the row order of the sample submission file.
sub = pd.read_csv('../input/radiant/SampleSubmission.csv')

pred_sub = test[['field_id']].rename(columns={'field_id': 'Field ID'})
pred_sub[sub.columns[1:]] = preds_

# The left join on the id keeps the sample file's ordering.
pred_sub = pd.merge(sub[['Field ID']], pred_sub, how='left', on=['Field ID'])

In [None]:
# Display the submission frame for a visual check.
pred_sub

In [None]:
# Write the submission to CSV without the index column.
pred_sub.to_csv('sub_lama_nogeo_2.csv', index=False)

# LAMA GEO WITH GOOD FEATURES

In [None]:
# Start again from the full feature table. Earlier sections replaced `train`
# with their own filtered subsets, so the positional slice below would
# otherwise index into whatever columns happened to survive.
train = pd.read_feather('../input/radiant/train_all_2.ftr')

# Absolute correlation with the label, restricted to the columns at
# positions 2..1780.
# NOTE(review): 1781 is a hard-coded position in the raw table - confirm it
# still marks the end of the intended feature block.
corr_train = np.abs(train[train.columns[2:1781]].corrwith(train['label']))

# Keep id + label + features with |corr| > 0.15.
cols = ['field_id', 'label'] + list(corr_train[corr_train > 0.15].index)
# print(len(cols))
train = train[cols]
train.shape

In [None]:
# Multiclass task, with cross-entropy as both loss and metric.
task = Task('multiclass', loss='crossentropy', metric='crossentropy')

# 'label' is the target; 'field_id' is dropped.
roles = {}
roles['target'] = 'label'
roles['drop'] = 'field_id'

In [None]:
# Run settings for the AutoML cell below.
N_THREADS = 4            # passed as cpu_limit and as the reader's n_jobs
N_FOLDS = 5              # passed as the reader's cv
RANDOM_STATE = 42        # passed as the reader's random_state
# TEST_SIZE = 0.1        # test size for metric check (unused)
TIMEOUT = 2 * 3600       # AutoML time budget in seconds (2 hours)

In [None]:
# AutoML preset: CatBoost on GPU, one level made of the plain and tuned
# CatBoost / LightGBM models.
level_one = ['cb', 'lgb', 'cb_tuned', 'lgb_tuned']

automl = TabularUtilizedAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    # lgb_params={'default_params': {'device': 'gpu'}},
    cb_params={'default_params': {'task_type': 'GPU'}},
    general_params={'use_algos': [level_one]},
    reader_params={'n_jobs': N_THREADS,
                   'cv': N_FOLDS,
                   'random_state': RANDOM_STATE},
)

# Fit on the filtered training frame; the returned predictions are kept in oof_pred.
oof_pred = automl.fit_predict(train, roles=roles)

In [None]:
# Load the test table from its Feather file.
test = pd.read_feather('../input/radiant/test_all_2.ftr')

In [None]:
# Predict on the test table and keep the `.data` payload of the result.
preds = automl.predict(test).data

In [None]:
# Fetch the reader's class mapping from the first inner model.
# NOTE(review): depends on internal attributes of the fitted AutoML object - confirm.
class_map = automl.outer_pipes[0].ml_algos[0].models[0][0].reader.class_mapping

In [None]:
# Display the mapping for inspection.
class_map

In [None]:
# Zero-filled buffer, same shape as `preds`, to hold the reordered predictions.
preds_ = np.zeros(preds.shape)

In [None]:
# Copy each prediction column to position (label - 1).
# Assumes labels are integers starting at 1 - TODO confirm.
for label_value, pred_column in class_map.items():
    target_column = label_value - 1
    preds_[:, target_column] = preds[:, pred_column]

In [None]:
# Build the submission frame, aligned to the sample submission's ids.
sub = pd.read_csv('../input/radiant/SampleSubmission.csv')
id_column, class_columns = 'Field ID', sub.columns[1:]

pred_sub = pd.DataFrame()
pred_sub[id_column] = test.field_id
pred_sub[class_columns] = preds_

# The left join keeps the sample file's row order.
pred_sub = sub[[id_column]].merge(pred_sub, on=[id_column], how='left')

In [None]:
# Display the submission frame for a visual check.
pred_sub

In [None]:
# Write the submission to CSV without the index column.
pred_sub.to_csv('sub_lama_4.csv', index=False)

# Golden features

In [5]:
# Reload the full training table, discarding the filtering done by earlier sections.
train = pd.read_feather('../input/radiant/train_all_2.ftr')

In [8]:
# Keep the id, the label and every feature with |corr| > 0.15 against the label.
corr_train = train[train.columns[2:]].corrwith(train['label']).abs()
cols = ['field_id', 'label']
cols.extend(corr_train[corr_train > 0.15].index)
len(cols)  # not the cell's last expression, so the count is not displayed
train = train[cols]

In [9]:
# Draw a stratified 1% sample of the rows for the golden-feature search.
# A fixed random_state keeps the sample, and therefore the selected
# features, reproducible between runs.
temp, _ = train_test_split(train, test_size=0.99, stratify=train['label'],
                           random_state=42)

# Split the sample into the label Series and the feature frame (columns from
# position 2), both re-indexed from 0 so they stay row-aligned.
lab = temp['label'].reset_index(drop=True)
temp = temp[temp.columns[2:]].reset_index(drop=True)

In [10]:
# Show the size of the sampled feature frame.
temp.shape

Keep the generated features whose absolute correlation with the label exceeds the threshold `th` (set to 0.4 in the cell below).

In [11]:
# Golden-feature search on the 1% sample.
# For each anchor column cols[i] (i >= 27), combine it with every column from
# position i onward (itself included) using *, /, + and -, and keep the
# combinations whose absolute correlation with `lab` exceeds `th`.
# Outputs:
#   gold      - frame holding the kept candidate features, named
#               '<op>*<column>*<anchor>'
#   corr_gold - one-column frame (column 0) with their absolute correlations
# NOTE(review): the anchor loop starts at column 27; the reason is not visible
# in this notebook - confirm.
gold = pd.DataFrame()
corr_gold = pd.DataFrame()
cols = temp.columns

th = 0.4

# (name prefix, DataFrame arithmetic method) for each candidate operation.
pair_ops = (('multiply', 'mul'), ('divide', 'div'), ('plus', 'add'), ('minus', 'sub'))

# Collect the pieces and concatenate once at the end, rather than growing
# the frames inside the loop.
kept_corrs = []
kept_feats = []

for i in tqdm(range(27, len(cols))):

    anchor = temp[cols[i]]
    partners = temp[cols[i:]]

    for prefix, method in pair_ops:
        # Column-wise operation against the anchor, aligned on rows; this is
        # the vectorised form of the row-wise `x <op> x[0]`.
        gold_temp = getattr(partners, method)(anchor, axis=0)
        gold_temp.columns = ['{}*{}*{}'.format(prefix, x, cols[i]) for x in gold_temp.columns]

        corr_gold_temp = np.abs(gold_temp.corrwith(lab)).sort_values(ascending=False)
        strong = corr_gold_temp[corr_gold_temp > th]

        kept_corrs.append(strong)
        kept_feats.append(gold_temp[list(strong.index)])

if kept_corrs:
    # Column label 0 is what the later `corr_gold.sort_values(0, ...)` expects.
    corr_gold = pd.concat(kept_corrs).to_frame(0)
    gold = pd.concat(kept_feats, axis=1)
    

In [12]:
# Show the kept candidates, sorted by column 0 in descending order.
corr_gold.sort_values(0, ascending=False)

In [13]:
# Display the kept golden-feature columns (computed on the sample).
gold

In [22]:
# Show the first row; the next cell addresses columns by position (2..7).
train.head(1)

In [23]:
# Pack the values of the columns at positions 2..7 of `train` into single
# integers, using each value as one decimal digit:
#   nearest_combo16 uses all six columns, nearest_combo15 the first five,
#   ... down to nearest_combo12, which uses the first two.
# This uses named columns and vectorised arithmetic instead of a row-wise
# apply with positional `x[2]` indexing, which is deprecated on
# label-indexed Series.
combo_src = list(train.columns[2:8])
assert len(combo_src) == 6, 'expected six source columns at positions 2..7'

for n_used in (6, 5, 4, 3, 2):
    combo = 0
    for pos, col in enumerate(combo_src[:n_used]):
        # Leftmost column is the most significant digit.
        combo = combo + 10 ** (n_used - 1 - pos) * train[col]
    # Truncate to integer, as int() did in the row-wise version.
    train['nearest_combo1{}'.format(n_used)] = combo.astype('int64')

In [24]:
# Recreate every selected golden feature on the full training frame.
# A gold column name is '<op>*<left column>*<right column>'.
gold_cols = [x.split('*') for x in gold.columns]

_gold_ops = {
    'multiply': lambda left, right: left * right,
    'divide': lambda left, right: left / right,
    'plus': lambda left, right: left + right,
    'minus': lambda left, right: left - right,
}

for gc in gold_cols:
    combine = _gold_ops.get(gc[0])
    if combine is None:
        # Unknown prefix: nothing is added.
        continue
    train['*'.join(gc)] = combine(train[gc[1]], train[gc[2]])

In [25]:
# Display the training frame with the added combo and golden columns.
train

# CATBOOST SKF

In [28]:
# 6-fold stratified CV with CatBoost on every column except the label.
# Fold models are collected in `models` and saved as 'model<fold>'.
models = []
all_features = train.drop(['label'], axis=1)
all_labels = train['label']

for k, (train_index, val_index) in enumerate(
        StratifiedKFold(n_splits=6).split(train, all_labels)):

    X_train, X_val = all_features.iloc[train_index], all_features.iloc[val_index]
    y_train, y_val = all_labels.iloc[train_index], all_labels.iloc[val_index]

    model = CatBoostClassifier(iterations=3000, early_stopping_rounds=500,
                               task_type='GPU', verbose=100)
    # The held-out fold is the eval set used with early_stopping_rounds.
    model.fit(X_train, y_train, eval_set=(X_val, y_val))

    models.append(model)
    model.save_model('model{}'.format(k))

# Leave the counter at the number of folds, as the manual counter did.
k = len(models)

In [29]:
# Load the test table from its Feather file.
test = pd.read_feather('../input/radiant/test_all_2.ftr')

In [30]:
# Build the nearest_combo1N features on the test table from the SAME source
# columns that were used for `train`.
# `train` was reduced to ['field_id', 'label', <selected features>...] while
# `test` is the raw table, so positions 2..7 do not refer to the same columns
# in both frames. The source columns are therefore taken by name from `train`.
combo_src = list(train.columns[2:8])
assert len(combo_src) == 6, 'expected six source columns at positions 2..7'

for n_used in (6, 5, 4, 3, 2):
    combo = 0
    for pos, col in enumerate(combo_src[:n_used]):
        # Leftmost column is the most significant decimal digit.
        combo = combo + 10 ** (n_used - 1 - pos) * test[col]
    # Truncate to integer, as int() did in the row-wise version.
    test['nearest_combo1{}'.format(n_used)] = combo.astype('int64')

In [31]:
# Recreate every selected golden feature on the test table.
# A gold column name is '<op>*<left column>*<right column>'.
# Both operands must come from `test`: taking the right operand from `train`
# would combine unrelated train and test rows through index alignment.
gold_cols = [x.split('*') for x in gold.columns]

for gc in gold_cols:
    name = '*'.join(gc)
    if gc[0] == 'multiply':
        test[name] = test[gc[1]] * test[gc[2]]
    elif gc[0] == 'divide':
        test[name] = test[gc[1]] / test[gc[2]]
    elif gc[0] == 'plus':
        test[name] = test[gc[1]] + test[gc[2]]
    elif gc[0] == 'minus':
        test[name] = test[gc[1]] - test[gc[2]]

In [34]:
# fip = pd.DataFrame()
# fip['col'] = X_train.columns
# fip['val'] = model.get_feature_importance()
# fip.tail(20)

In [35]:
# Class probabilities on the test table, one array per fold model.
# NOTE(review): `test` is passed with all of its columns - confirm CatBoost
# lines them up with the features used during training.
preds = [fold_model.predict_proba(test) for fold_model in models]

In [36]:
# Average the fold probabilities and lay them out like the sample submission.
sub = pd.read_csv('../input/radiant/SampleSubmission.csv')
fold_average = np.mean(preds, axis=0)

pred_sub = pd.DataFrame({'Field ID': test.field_id})
pred_sub[sub.columns[1:]] = fold_average

# The left join keeps the sample file's row order.
pred_sub = sub[['Field ID']].merge(pred_sub, on=['Field ID'], how='left')

In [37]:
# Display the submission frame for a visual check.
pred_sub

In [38]:
# Write the submission to CSV without the index column.
pred_sub.to_csv('sub_catskf_golden.csv', index=False)

# LAMA GEO WITH FEATURE GENERATION

In [39]:
# corr_train = np.abs(train[train.columns[2:1781]].corrwith(train['label']))
# cols = ['field_id', 'label']
# _ = [cols.append(x) for x in list(corr_train[corr_train > 0.15].index)]
# # print(len(cols))
# train = train[cols]
# train.shape

In [40]:
# Multiclass task, with cross-entropy as both loss and metric.
task = Task('multiclass', loss='crossentropy', metric='crossentropy')

# Only the target role is declared; the 'drop' role for field_id is disabled here.
roles = dict(target='label')
#     'drop': 'field_id'

In [41]:
# Run settings for the AutoML cell below.
N_THREADS = 4            # passed as cpu_limit and as the reader's n_jobs
N_FOLDS = 5              # passed as the reader's cv
RANDOM_STATE = 42        # passed as the reader's random_state
# TEST_SIZE = 0.1        # test size for metric check (unused)
TIMEOUT = 2 * 3600       # AutoML time budget in seconds (2 hours)

In [42]:
# AutoML preset on the training frame with generated features (CatBoost on GPU).
# `use_algos` is given as a list of levels (one inner list per level), the
# same form the earlier AutoML cells use. A flat list of names would be read
# as one level per entry rather than as one level holding all four models.
automl = TabularUtilizedAutoML(task = task,
                               timeout = TIMEOUT,
                               cpu_limit = N_THREADS,
#                                lgb_params = {'default_params': {'device': 'gpu'}},
                               cb_params = {'default_params': {'task_type': 'GPU'}},
                               general_params={'use_algos': [['cb', 'lgb', 'cb_tuned', 'lgb_tuned']]},
                               reader_params = {'n_jobs': N_THREADS,
                                                'cv': N_FOLDS,
                                                'random_state': RANDOM_STATE})

# Fit on the training frame; the returned predictions are kept in oof_pred.
oof_pred = automl.fit_predict(train, roles = roles)

In [None]:
# Load the test table from its Feather file.
test = pd.read_feather('../input/radiant/test_all_2.ftr')

In [None]:
# Build the nearest_combo1N features on the test table from the SAME source
# columns that were used for `train`.
# `train` was reduced to ['field_id', 'label', <selected features>...] while
# `test` is the raw table, so positions 2..7 do not refer to the same columns
# in both frames. The source columns are therefore taken by name from `train`.
combo_src = list(train.columns[2:8])
assert len(combo_src) == 6, 'expected six source columns at positions 2..7'

for n_used in (6, 5, 4, 3, 2):
    combo = 0
    for pos, col in enumerate(combo_src[:n_used]):
        # Leftmost column is the most significant decimal digit.
        combo = combo + 10 ** (n_used - 1 - pos) * test[col]
    # Truncate to integer, as int() did in the row-wise version.
    test['nearest_combo1{}'.format(n_used)] = combo.astype('int64')

In [None]:
# Recreate every selected golden feature on the test table.
# A gold column name is '<op>*<left column>*<right column>'.
# Both operands must come from `test`: taking the right operand from `train`
# would combine unrelated train and test rows through index alignment.
gold_cols = [x.split('*') for x in gold.columns]

for gc in gold_cols:
    name = '*'.join(gc)
    if gc[0] == 'multiply':
        test[name] = test[gc[1]] * test[gc[2]]
    elif gc[0] == 'divide':
        test[name] = test[gc[1]] / test[gc[2]]
    elif gc[0] == 'plus':
        test[name] = test[gc[1]] + test[gc[2]]
    elif gc[0] == 'minus':
        test[name] = test[gc[1]] - test[gc[2]]

In [43]:
# Predict on the test table and keep the `.data` payload of the result.
preds = automl.predict(test).data

In [44]:
# Fetch the reader's class mapping from the first inner model.
# NOTE(review): depends on internal attributes of the fitted AutoML object - confirm.
class_map = automl.outer_pipes[0].ml_algos[0].models[0][0].reader.class_mapping

In [45]:
# Display the mapping for inspection.
class_map

In [46]:
# Zero-filled buffer, same shape as `preds`, to hold the reordered predictions.
preds_ = np.zeros(preds.shape)

In [47]:
# Copy each prediction column to position (label - 1).
# Assumes labels are integers starting at 1 - TODO confirm.
for label_value, pred_column in class_map.items():
    preds_[:, label_value - 1] = preds[:, pred_column]

In [48]:
# Build the submission: one probability column per class, with rows ordered
# like the sample submission file.
sub = pd.read_csv('../input/radiant/SampleSubmission.csv')
class_columns = sub.columns[1:]

pred_sub = pd.DataFrame({'Field ID': test.field_id})
pred_sub[class_columns] = preds_

# The left join keeps the sample file's row order.
pred_sub = sub[['Field ID']].merge(pred_sub, on=['Field ID'], how='left')

In [49]:
# Display the submission frame for a visual check.
pred_sub

In [50]:
# Write the submission to CSV without the index column.
pred_sub.to_csv('sub_lama_geo_golden.csv', index=False)

In [51]:
# Multiclass task (cross-entropy loss and metric) for the second golden run.
task = Task('multiclass', loss='crossentropy', metric='crossentropy')

# Only the target role is declared; the 'drop' role for field_id stays disabled.
roles = {'target': 'label'}
#     'drop': 'field_id'

In [52]:
# Run settings for the second golden AutoML cell below.
N_THREADS = 4            # passed as cpu_limit and as the reader's n_jobs
N_FOLDS = 5              # passed as the reader's cv
RANDOM_STATE = 42        # passed as the reader's random_state
# TEST_SIZE = 0.1        # test size for metric check (unused)
TIMEOUT = 3000           # AutoML time budget in seconds (50 minutes)

In [53]:
# Second AutoML run on the generated-feature frame, tuned models only.
# `use_algos` is given as a list of levels (one inner list per level), the
# same form the earlier AutoML cells use. A flat list of names would be read
# as one level per entry rather than as one level holding both models.
automl = TabularUtilizedAutoML(task = task,
                               timeout = TIMEOUT,
                               cpu_limit = N_THREADS,
#                                lgb_params = {'default_params': {'device': 'gpu'}},
                               cb_params = {'default_params': {'task_type': 'GPU'}},
                               general_params={'use_algos': [['cb_tuned', 'lgb_tuned']]},
                               reader_params = {'n_jobs': N_THREADS,
                                                'cv': N_FOLDS,
                                                'random_state': RANDOM_STATE})

# Fit on the training frame; the returned predictions are kept in oof_pred.
oof_pred = automl.fit_predict(train, roles = roles)

In [54]:
# Predict on the already-prepared test table and keep the `.data` payload.
preds = automl.predict(test).data

In [55]:
# Fetch the reader's class mapping from the first inner model.
# NOTE(review): depends on internal attributes of the fitted AutoML object - confirm.
class_map = automl.outer_pipes[0].ml_algos[0].models[0][0].reader.class_mapping

In [56]:
# Display the mapping for inspection.
class_map

In [57]:
# Zero-filled buffer, same shape as `preds`, to hold the reordered predictions.
preds_ = np.zeros(preds.shape)

In [58]:
# Move each prediction column to position (label - 1).
# Assumes labels are integers starting at 1 - TODO confirm.
for label_value in class_map.keys():
    source_column = class_map[label_value]
    preds_[:, label_value - 1] = preds[:, source_column]

In [59]:
# Assemble the submission in the row order of the sample submission file.
sub = pd.read_csv('../input/radiant/SampleSubmission.csv')
id_column, class_columns = 'Field ID', sub.columns[1:]

pred_sub = pd.DataFrame()
pred_sub[id_column] = test.field_id
pred_sub[class_columns] = preds_

# The left join on the id keeps the sample file's ordering.
pred_sub = pd.merge(sub[[id_column]], pred_sub, how='left', on=[id_column])

In [60]:
# Display the submission frame for a visual check.
pred_sub

In [61]:
# Write the submission to CSV without the index column.
pred_sub.to_csv('sub_lama_geo_golden_2.csv', index=False)