In [1]:
import pandas as pd
import numpy as np
import typing
import torch
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML
from lightautoml.tasks import Task

import phik
from phik.report import plot_correlation_matrix
from phik import report

In [2]:
# File locations, resolved relative to the notebook's working directory.
data_p1_link = "part1_compressed.pkl"        # training data, part 1 (pickle)
data_p2_link = "part2_compressed.pkl"        # training data, part 2 (pickle)
data_test_link = "test_data_compressed.pkl"  # test data (pickle)
submission_link = "submission.csv"           # CSV that receives the predictions

In [3]:
# Read both training parts and stack them row-wise into a single frame.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
data_p1, data_p2 = (pd.read_pickle(link) for link in (data_p1_link, data_p2_link))
data_full = pd.concat([data_p1, data_p2], axis=0)
# The per-part frames are no longer needed once merged; free their memory.
del data_p1, data_p2

In [4]:
# Remove a fixed set of columns from the training frame (mutates data_full in place).
data_full.drop(
    columns=['x_0', 'x_1', 'ID', 'x_25', 'x_26', 'x_27'],
    inplace=True,
)

In [5]:
# Number of rows in the combined training frame.
print(data_full.shape[0])

1527598


In [6]:
# Cast the two date columns to datetime. The resolution is spelled out
# explicitly because pandas >= 2.0 rejects the unit-less 'datetime64' dtype
# string in astype(); 'datetime64[ns]' works on both old and new versions.
for date_col in ('REPORT_DT', 'x_9'):
    data_full[date_col] = data_full[date_col].astype('datetime64[ns]')

In [7]:
# Display only the rows whose TARGET equals 1.
data_full.loc[data_full['TARGET'].eq(1)]

Unnamed: 0,REPORT_DT,x_2,x_3,x_4,x_5,x_7,x_9,x_10,x_11,x_12,...,x_638,x_639,x_640,x_641,x_642,x_643,x_644,x_645,x_646,TARGET
30,2018-06-05,1,1,600000.0,1.470256e+05,1.470256e+05,2017-05-25,880000.0,360,B1,...,0.000000,11.25,0.814865,0.814865,76.705284,9,5827.569824,0.171399,0.497395,1
50,2018-08-12,1,1,1539000.0,1.510639e+06,1.510639e+06,2017-03-01,1400000.0,240,B1,...,75000.000000,10.40,0.390584,0.390584,,16,15261.830078,0.113894,0.742840,1
239,2018-08-12,1,1,1080000.0,4.634063e+05,4.632652e+05,2013-07-02,1080000.0,180,B1,...,5320.919922,12.00,1.056370,1.056370,88.018318,59,12961.820312,0.231461,1.150102,1
249,2018-07-29,1,1,1450000.0,1.009675e+06,1.009675e+06,2014-08-13,1500000.0,120,D,...,0.000000,13.75,0.420288,0.420288,17.241379,47,22296.189453,0.256278,0.256278,1
323,2019-06-09,1,1,1760000.0,1.690328e+06,1.690328e+06,2017-01-13,1600000.0,240,A,...,0.000000,10.50,0.943669,0.943669,46.408504,29,17571.490234,0.251021,0.437199,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1943435,2018-05-14,1,1,865000.0,4.238303e+05,4.238303e+05,2014-12-19,870000.0,180,B1,...,0.000000,12.50,0.436243,0.436243,,37,10661.320312,0.266533,0.433736,1
1943457,2017-05-14,1,1,800000.0,6.906468e+05,6.906468e+05,2015-12-30,1100000.0,240,D,...,0.000000,13.95,0.472732,0.472732,38.126793,15,9919.129883,0.330638,0.343805,1
1943516,2018-05-13,1,1,940000.0,5.004962e+05,5.004962e+05,2016-02-11,1000000.0,240,B1,...,14612.750000,13.25,0.699172,0.699172,32.597645,25,11180.650391,0.223613,0.263472,1
1943517,2019-09-29,1,1,1600000.0,1.400450e+06,1.400450e+06,2015-05-05,1500000.0,150,D,...,0.000000,15.75,0.609451,0.609451,31.563166,52,24459.310547,0.470371,0.514675,1


In [8]:
# Bulk-remove the columns sitting at positions 22..607 of the current frame.
# NOTE: the selection is positional, so re-running this cell removes a
# different set of columns each time -- run it exactly once per kernel.
to_drop = data_full.columns[22:608]
data_full.drop(columns=to_drop, inplace=True)

In [9]:
# Per-column share of missing values; columns that are more than 90% missing
# are removed from the training frame in place.
nans = data_full.isna().mean(axis=0)
nan_cols = nans[nans > 0.9].index
data_full.drop(columns=nan_cols, inplace=True)

In [10]:
# Display the training frame as it stands after the column drops above.
data_full

Unnamed: 0,REPORT_DT,x_2,x_3,x_4,x_5,x_7,x_9,x_10,x_11,x_12,...,x_638,x_639,x_640,x_641,x_642,x_643,x_644,x_645,x_646,TARGET
0,2019-08-25,1,1,2100000.0,1.505829e+06,1.505829e+06,2017-03-20,1400000.0,240,B1,...,15000.000000,11.75,0.436268,0.436268,40.257000,28,22757.849609,0.505730,0.716184,0
1,2019-06-27,1,1,900000.0,3.635353e+05,3.635353e+05,2017-02-02,900000.0,60,B1,...,0.000000,12.25,0.321785,0.321785,12.000000,28,20133.890625,0.268452,0.268452,0
2,2018-01-09,1,1,1000000.0,9.909567e+05,9.820692e+05,2017-09-20,1000000.0,84,D,...,50000.000000,7.90,0.284704,0.284704,,4,15536.440430,0.154156,0.348582,0
3,2019-09-03,1,1,1400000.0,5.663386e+05,5.640454e+05,2015-02-06,1400000.0,180,N,...,0.000000,15.75,0.274333,0.274333,16.470589,55,20318.310547,0.239039,0.239039,0
4,2017-12-18,1,1,2114500.0,1.414694e+06,1.396392e+06,2014-10-13,1700000.0,240,A,...,100148.578125,13.00,0.382492,0.382492,16.700842,36,24772.970703,0.190561,0.238575,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1943522,2018-01-21,1,1,900000.0,8.691581e+05,8.691581e+05,2016-08-29,900000.0,180,D,...,35000.000000,11.90,0.459190,0.459190,36.075230,16,10743.679688,0.200442,0.595867,0
1943523,2018-11-05,1,1,1260000.0,1.058132e+06,1.053794e+06,2013-08-12,1360000.0,180,B1,...,20000.000000,12.00,0.495197,0.495197,36.773357,60,15122.120117,0.280039,0.458246,0
1943525,2018-09-26,1,1,884000.0,7.373164e+05,7.373164e+05,2014-03-03,765000.0,180,D,...,0.000000,13.25,0.382296,0.382296,29.826376,51,11330.549805,0.441771,0.441764,1
1943526,2018-02-20,2,2,1981000.0,1.472026e+06,1.373922e+06,2017-01-31,1981000.0,360,B1,...,0.000000,11.75,0.427794,0.427794,35.783009,11,19996.410156,0.266619,0.303610,0


In [27]:
# Re-read the first training part on its own for the manual pipeline below.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
data_p1 = pd.read_pickle(data_p1_link)

# Per-column share of missing values; here the cut-off is 60%.
nans = data_p1.isna().mean(axis=0)
nan_cols = nans[nans > 0.6].index
# Columns that process_data removes: the sparse ones plus three named columns.
to_drop = [*nan_cols, 'x_9', 'REPORT_DT', 'ID']

def calc_features(X):
    """Return a fixed positional subset of the columns of ``X``.

    The selection is, in this order: the first 20 columns, every 50th column
    from position 20 up to (but excluding) 470, and the last 49 columns taken
    from the end backwards.
    """
    leading = list(range(20))
    strided = list(range(20, 470, 50))
    trailing = list(range(-1, -50, -1))
    return X.iloc[:, leading + strided + trailing]

def manage_categorical(data_p1):
    """One-hot encode the object-typed columns of ``data_p1``.

    Object columns with fewer than two distinct non-null values carry no
    information and are removed first; the remaining object columns are
    replaced by their dummy indicators.

    Parameters
    ----------
    data_p1 : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        The frame with non-object columns untouched, constant object columns
        removed and every other object column expanded into dummy columns
        named ``<column>_<value>``.
    """
    obj_cols = data_p1.select_dtypes('object').columns
    # nunique() ignores NaN, matching len(value_counts()) in the old version.
    constant_cols = [col for col in obj_cols if data_p1[col].nunique() < 2]
    data_p1 = data_p1.drop(columns=constant_cols)

    obj_cols = data_p1.select_dtypes('object').columns
    # Pass the names via ``columns=``: the second positional argument of
    # get_dummies is ``prefix``. get_dummies already removes the encoded
    # source columns, so no follow-up drop is needed (it would raise KeyError).
    return pd.get_dummies(data_p1, columns=obj_cols)

def process_data(data_p1, test=False):
    """Clean a raw frame and reduce it to the columns picked by ``calc_features``.

    Steps: drop the columns listed in the module-level ``to_drop``, downcast
    float and integer columns, cast four named columns to bool, and fill the
    remaining missing values with -999.

    Parameters
    ----------
    data_p1 : pandas.DataFrame
        Frame to process. It is MODIFIED IN PLACE, so calling this twice on
        the same object fails because the dropped columns are already gone.
    test : bool, default False
        When False the frame must contain a ``TARGET`` column, which is split
        off. When True no target is expected.

    Returns
    -------
    (pandas.DataFrame, pandas.Series) when ``test`` is False: features, target.
    pandas.DataFrame when ``test`` is True: features only.
    """
    print("Dropping nans...")
    data_p1.drop(to_drop, axis=1, inplace=True)

    print("Optimizing memory size...")
    fcols = data_p1.select_dtypes('float').columns
    icols = data_p1.select_dtypes('integer').columns

    data_p1[fcols] = data_p1[fcols].apply(pd.to_numeric, downcast='float')
    data_p1[icols] = data_p1[icols].apply(pd.to_numeric, downcast='integer')

    print("Managing binary features...")
    binary = ['x_19', 'x_614', 'x_615', 'x_634']
    for col in binary:
        # NOTE(review): this cast runs before fillna, so a NaN in one of these
        # columns becomes True -- confirm that is the intended encoding.
        data_p1[col] = data_p1[col].astype('bool')

    data_p1.fillna(-999, inplace=True)
    if test:
        # There is no TARGET to split off; select features from the frame
        # itself (the old code referenced an unassigned local ``X`` here).
        return calc_features(data_p1)

    X = data_p1.drop(['TARGET'], axis=1)
    y = data_p1.TARGET
    return calc_features(X), y
    
# Training mode: returns the feature frame and the TARGET series.
X, y = process_data(data_p1, test=False)

Dropping nans...
Optimizing memory size...
Managing binary features...


In [32]:
# Class balance of the target: count of rows per TARGET value.
y.value_counts()

0    732535
1     31266
Name: TARGET, dtype: int64

In [11]:
# --- Run configuration -------------------------------------------------------
N_THREADS = 4      # thread count for lgbm and linear models
N_FOLDS = 5        # number of folds used by AutoML
RANDOM_STATE = 42  # fixed seed
# TEST_SIZE = 0.1  # test size for a metric check (currently disabled)
TIMEOUT = 1000     # AutoML time budget in seconds; use TIMEOUT = 1700 for perfect score

# Seed numpy's global RNG and set the number of threads torch may use.
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [12]:
# Binary task trained with the 'logloss' loss and scored with the 'auc' metric.
task = Task('binary', loss='logloss', metric='auc')

# Column roles handed to AutoML.
# NOTE(review): 'ID' is already removed from data_full in an earlier cell --
# confirm the 'drop' role is still needed.
roles = dict(target='TARGET', drop='ID')

In [None]:
# Build the AutoML preset from the configured limits.
automl = TabularUtilizedAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params=dict(n_jobs=N_THREADS, cv=N_FOLDS, random_state=RANDOM_STATE),
)

# Fit on the full training frame and keep the predictions returned by the fit
# (named oof_pred -- presumably out-of-fold predictions).
oof_pred = automl.fit_predict(data_full, roles=roles)

Current random state: {'reader_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
Found reader_params in kwargs, need to combine
Merged variant for reader_params = {'n_jobs': 4, 'cv': 5, 'random_state': 42}
Start automl preset with listed constraints:
- time: 999.9963881969452 seconds
- cpus: 4 cores
- memory: 16 gb

Train data shape: (1527598, 55)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 979.3445963859558 secs
Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====

Linear model: C = 1e-05 score = 0.8744611579036184
Linear model: C = 5e-05 score = 0.8772951430907521
Linear model: C = 0.0001 score = 0.8778625763484516
Linear model: C = 0.0005 score = 0.8788351183529427
Linear model: C = 0.001 score = 0.8788351183529427
Linear model: C = 0.005 score = 0.8788351183529427

===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====

Time limit exceeded after calculating fold 3


Lvl_0_Pipe_0_Mod_0_LinearL2 fitting and predicting completed
Time left 859.4340851306915
Start fitting Lvl_0_Pipe_1_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.900352
[200]	valid's auc: 0.903489
[300]	valid's auc: 0.905481


In [108]:
# Load the test data and the submission CSV that will receive the predictions.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
data_test = pd.read_pickle(data_test_link)
submission = pd.read_csv(submission_link)

In [109]:
# Sanity check: predict on the first ten test rows and show the first output column.
automl.predict(data_test.head(10)).data[:, 0]

array([0.00167889, 0.01322867, 0.0001766 , 0.02840564, 0.00096806,
       0.00267912, 0.00119659, 0.00285337, 0.00749729, 0.00220884],
      dtype=float32)

In [110]:
# Predict on the whole test set and store the first output column as the
# submission's 'Probability' column.
test_pred = automl.predict(data_test)
submission['Probability'] = test_pred.data[:, 0]

In [111]:
# Write the filled-in submission without the index column.
# NOTE(review): the file name mentions 1700 while TIMEOUT in the config cell is
# 1000 -- confirm which run this file corresponds to.
submission.to_csv(
    path_or_buf='submission_full_55features_1700_5fold.csv',
    index=False,
)