In [1]:
import importlib.metadata
import os
import pickle
import typing

import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split

from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML
from lightautoml.tasks import Task

import phik
from phik.report import plot_correlation_matrix
from phik import report


In [2]:
# Input/output locations, relative to the notebook's working directory.
data_p1_link = "part1_compressed.pkl"          # pickled training data, first part
data_p2_link = "part2_compressed.pkl"          # pickled training data, second part
data_test_link = "test_data_compressed.pkl"    # pickled test data
submission_link = "submission.csv"             # CSV that receives the predicted probabilities

In [3]:
# Load both pickled training parts and stack them row-wise into one frame.
data_p1, data_p2 = (pd.read_pickle(link) for link in (data_p1_link, data_p2_link))
data_full = pd.concat([data_p1, data_p2], axis=0)
# The parts are no longer needed once concatenated; release the references.
del data_p1, data_p2

In [4]:
# Remove three columns from the training frame in place.
# NOTE: in-place drop — re-running this cell without reloading data_full raises KeyError.
data_full.drop(columns=['x_0', 'x_1', 'ID'], inplace=True)

In [5]:
# Bare last expression: renders the (truncated) frame as the cell output.
data_full

Unnamed: 0,REPORT_DT,x_2,x_3,x_4,x_5,x_7,x_9,x_10,x_11,x_12,...,x_638,x_639,x_640,x_641,x_642,x_643,x_644,x_645,x_646,TARGET
0,2019-08-25,1,1,2100000.0,1.505829e+06,1.505829e+06,2017-03-20,1400000.0,240,B1,...,15000.000000,11.75,0.436268,0.436268,40.257000,28,22757.849609,0.505730,0.716184,0
1,2019-06-27,1,1,900000.0,3.635353e+05,3.635353e+05,2017-02-02,900000.0,60,B1,...,0.000000,12.25,0.321785,0.321785,12.000000,28,20133.890625,0.268452,0.268452,0
2,2018-01-09,1,1,1000000.0,9.909567e+05,9.820692e+05,2017-09-20,1000000.0,84,D,...,50000.000000,7.90,0.284704,0.284704,,4,15536.440430,0.154156,0.348582,0
3,2019-09-03,1,1,1400000.0,5.663386e+05,5.640454e+05,2015-02-06,1400000.0,180,N,...,0.000000,15.75,0.274333,0.274333,16.470589,55,20318.310547,0.239039,0.239039,0
4,2017-12-18,1,1,2114500.0,1.414694e+06,1.396392e+06,2014-10-13,1700000.0,240,A,...,100148.578125,13.00,0.382492,0.382492,16.700842,36,24772.970703,0.190561,0.238575,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1943522,2018-01-21,1,1,900000.0,8.691581e+05,8.691581e+05,2016-08-29,900000.0,180,D,...,35000.000000,11.90,0.459190,0.459190,36.075230,16,10743.679688,0.200442,0.595867,0
1943523,2018-11-05,1,1,1260000.0,1.058132e+06,1.053794e+06,2013-08-12,1360000.0,180,B1,...,20000.000000,12.00,0.495197,0.495197,36.773357,60,15122.120117,0.280039,0.458246,0
1943525,2018-09-26,1,1,884000.0,7.373164e+05,7.373164e+05,2014-03-03,765000.0,180,D,...,0.000000,13.25,0.382296,0.382296,29.826376,51,11330.549805,0.441771,0.441764,1
1943526,2018-02-20,2,2,1981000.0,1.472026e+06,1.373922e+06,2017-01-31,1981000.0,360,B1,...,0.000000,11.75,0.427794,0.427794,35.783009,11,19996.410156,0.266619,0.303610,0


In [6]:
# Parse the two date columns into pandas datetimes.
# pd.to_datetime is used instead of astype('datetime64'): pandas >= 2.0 rejects
# the unit-less 'datetime64' dtype string, while to_datetime works on old and
# new versions alike.
data_full['REPORT_DT'] = pd.to_datetime(data_full['REPORT_DT'])
data_full['x_9'] = pd.to_datetime(data_full['x_9'])

In [7]:
# Share of missing values per column, then the labels of columns that are
# more than 90% empty.
nans = data_full.isna().mean(axis=0)
nan_cols = nans[nans > 0.9].index
nan_cols

Index(['x_17', 'x_25', 'x_26', 'x_27', 'x_101', 'x_102', 'x_103', 'x_105',
       'x_106', 'x_107', 'x_112', 'x_113', 'x_114', 'x_116', 'x_117', 'x_118',
       'x_204', 'x_205', 'x_206', 'x_208', 'x_209', 'x_210', 'x_215', 'x_216',
       'x_217', 'x_219', 'x_220', 'x_221', 'x_225', 'x_226', 'x_307', 'x_308',
       'x_309', 'x_311', 'x_312', 'x_313', 'x_318', 'x_319', 'x_320', 'x_322',
       'x_323', 'x_324', 'x_343', 'x_344', 'x_345', 'x_355', 'x_356', 'x_357',
       'x_367', 'x_368', 'x_369', 'x_492', 'x_494', 'x_496', 'x_498', 'x_499'],
      dtype='object')

In [8]:
# Discard the mostly-empty columns found above (in place).
data_full.drop(columns=nan_cols, inplace=True)

In [9]:
# Labels of the columns pandas stores with the generic object dtype.
obj_cols = data_full.select_dtypes(include='object').columns
obj_cols

Index(['x_12', 'x_13', 'x_18', 'x_19', 'x_21', 'x_614', 'x_615', 'x_617',
       'x_618', 'x_625', 'x_628', 'x_634'],
      dtype='object')

In [23]:
def make_submission(clf, process_data, model_name='baseline'):
    """Predict on the test set and archive the submission, model, features and requirements.

    Everything is written to ``submissions/<model_name>/``:
    ``submission_<model_name>.csv``, ``model_<model_name>.pkl``,
    ``features_<model_name>.pkl`` and ``requirements.txt``.

    Parameters
    ----------
    clf
        Fitted classifier exposing ``predict_proba``; column 1 of its output
        is stored in the ``Probability`` column of the submission.
    process_data : callable
        Called as ``process_data(data_test)``. Its return value is passed
        unchanged to ``clf.predict_proba`` and pickled as the feature matrix.
    model_name : str
        Suffix used for the output folder and file names.

    Notes
    -----
    * Reads the module-level ``data_test_link`` and ``submission_link`` paths.
    * ``get_imports`` is not defined in the visible part of this notebook; it
      must be provided elsewhere and return an iterable of distribution names.
    * Installed versions are looked up with the standard-library
      ``importlib.metadata`` (Python 3.8+).
    """
    print("Reading test data...")
    data_test = pd.read_pickle(data_test_link)

    print("Preprocessing test data...")
    X = process_data(data_test)
    submission = pd.read_csv(submission_link)

    print("Making predictions...")
    submission['Probability'] = clf.predict_proba(X)[:, 1]

    # makedirs creates the parent "submissions" folder when missing and does
    # not fail when the same model name is submitted again.
    submission_folder = f'submissions/{model_name}'
    os.makedirs(submission_folder, exist_ok=True)

    print("Saving submission...")
    submission.to_csv(
        os.path.join(submission_folder, f"submission_{model_name}.csv"),
        index=False,
    )

    print("Saving model...")
    with open(os.path.join(submission_folder, f'model_{model_name}.pkl'), 'wb') as fh:
        pickle.dump(clf, fh)

    print("Saving features...")
    with open(os.path.join(submission_folder, f'features_{model_name}.pkl'), 'wb') as fh:
        pickle.dump(X, fh)

    print("Saving requirements...")
    # Pin only the distributions reported by get_imports(); pip is skipped.
    imported = set(get_imports())
    pinned = {}
    for dist in importlib.metadata.distributions():
        dist_name = dist.metadata['Name']
        if dist_name in imported and dist_name != "pip":
            # A distribution can be discovered more than once; keep the first hit.
            pinned.setdefault(dist_name, dist.version)

    with open(os.path.join(submission_folder, 'requirements.txt'), 'w') as f:
        for requirement in sorted(pinned.items()):
            f.write("{}=={} \n".format(*requirement))

    print("Submission saved!")

In [27]:
# Reload the first training part on its own.
data_p1 = pd.read_pickle(data_p1_link)

# Columns that are more than 90% missing in this part, plus the two date
# columns and the identifier, form the drop list used by process_data.
nans = data_p1.isna().mean(axis=0)
nan_cols = nans[nans > 0.9].index
to_drop = [*nan_cols, 'x_9', 'REPORT_DT', 'ID']

def calc_features(X):
    """Select a fixed, position-based subset of columns from ``X``.

    The result contains, in this order: the first 20 columns, every 50th
    column from position 20 up to (not including) 470, and the last 49
    columns walked from the end backwards. Positions are hard-coded, so ``X``
    must be wide enough for all of them.
    """
    leading = list(range(20))
    strided = list(range(20, 470, 50))
    trailing = list(range(-1, -50, -1))
    return X.iloc[:, leading + strided + trailing]

def manage_categorical(data_p1):
    """One-hot encode the object-dtype columns of ``data_p1``.

    Object columns with fewer than two distinct non-null values carry no
    information and are removed; the remaining object columns are replaced by
    their dummy columns (named ``<column>_<value>``).

    Parameters
    ----------
    data_p1 : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        Frame with the non-object columns followed by the dummy columns.
    """
    obj_cols = data_p1.select_dtypes('object').columns

    # nunique() ignores NaN, matching len(value_counts()) in the earlier version.
    constant_cols = [col for col in obj_cols if data_p1[col].nunique(dropna=True) < 2]
    encode_cols = [col for col in obj_cols if col not in constant_cols]

    data = data_p1.drop(columns=constant_cols)
    if not encode_cols:
        return data

    # `columns=` must be passed by keyword: the second positional parameter of
    # get_dummies is `prefix`. get_dummies already removes the encoded source
    # columns, so no extra drop is needed afterwards.
    return pd.get_dummies(data, columns=encode_cols)

def process_data(data_p1, test=False):
    """Clean a raw frame and reduce it to the columns chosen by ``calc_features``.

    Steps: drop the module-level ``to_drop`` columns, downcast float and
    integer columns, cast the listed binary columns to ``bool``, and replace
    the remaining missing values with ``-999``.

    Parameters
    ----------
    data_p1 : pandas.DataFrame
        Raw frame. NOTE: it is modified in place (columns dropped, dtypes
        changed, NaNs filled), so reload it before calling this again.
    test : bool, default False
        When False the frame must contain ``TARGET`` and ``(X, y)`` is
        returned. When True only the feature frame is returned.

    Returns
    -------
    (pandas.DataFrame, pandas.Series) or pandas.DataFrame
    """
    print("Dropping nans...")
    data_p1.drop(to_drop, axis=1, inplace=True)

    print("Optimizing memory size...")
    fcols = data_p1.select_dtypes('float').columns
    icols = data_p1.select_dtypes('integer').columns

    data_p1[fcols] = data_p1[fcols].apply(pd.to_numeric, downcast='float')
    data_p1[icols] = data_p1[icols].apply(pd.to_numeric, downcast='integer')

    print("Managing binary features...")
    binary = ['x_19', 'x_614', 'x_615', 'x_634']
    for col in binary:
        # NOTE(review): the notebook output lists these columns as object dtype;
        # astype('bool') turns every non-empty string and NaN into True.
        # Confirm the raw values make this cast meaningful.
        data_p1[col] = data_p1[col].astype('bool')

    data_p1.fillna(-999, inplace=True)

    if test:
        # Test frames have no TARGET. Earlier this branch returned
        # calc_features(X) with X never assigned, which raised UnboundLocalError.
        return calc_features(data_p1)

    X = data_p1.drop(['TARGET'], axis=1)
    y = data_p1.TARGET
    return calc_features(X), y
    
# Training-mode preprocessing (test=False): returns the feature frame and the TARGET series.
X, y = process_data(data_p1)

Dropping nans...
Optimizing memory size...
Managing binary features...


In [10]:
# Bare last expression: renders the (truncated) frame as the cell output.
data_full

Unnamed: 0,REPORT_DT,x_2,x_3,x_4,x_5,x_7,x_9,x_10,x_11,x_12,...,x_638,x_639,x_640,x_641,x_642,x_643,x_644,x_645,x_646,TARGET
0,2019-08-25,1,1,2100000.0,1.505829e+06,1.505829e+06,2017-03-20,1400000.0,240,B1,...,15000.000000,11.75,0.436268,0.436268,40.257000,28,22757.849609,0.505730,0.716184,0
1,2019-06-27,1,1,900000.0,3.635353e+05,3.635353e+05,2017-02-02,900000.0,60,B1,...,0.000000,12.25,0.321785,0.321785,12.000000,28,20133.890625,0.268452,0.268452,0
2,2018-01-09,1,1,1000000.0,9.909567e+05,9.820692e+05,2017-09-20,1000000.0,84,D,...,50000.000000,7.90,0.284704,0.284704,,4,15536.440430,0.154156,0.348582,0
3,2019-09-03,1,1,1400000.0,5.663386e+05,5.640454e+05,2015-02-06,1400000.0,180,N,...,0.000000,15.75,0.274333,0.274333,16.470589,55,20318.310547,0.239039,0.239039,0
4,2017-12-18,1,1,2114500.0,1.414694e+06,1.396392e+06,2014-10-13,1700000.0,240,A,...,100148.578125,13.00,0.382492,0.382492,16.700842,36,24772.970703,0.190561,0.238575,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1943522,2018-01-21,1,1,900000.0,8.691581e+05,8.691581e+05,2016-08-29,900000.0,180,D,...,35000.000000,11.90,0.459190,0.459190,36.075230,16,10743.679688,0.200442,0.595867,0
1943523,2018-11-05,1,1,1260000.0,1.058132e+06,1.053794e+06,2013-08-12,1360000.0,180,B1,...,20000.000000,12.00,0.495197,0.495197,36.773357,60,15122.120117,0.280039,0.458246,0
1943525,2018-09-26,1,1,884000.0,7.373164e+05,7.373164e+05,2014-03-03,765000.0,180,D,...,0.000000,13.25,0.382296,0.382296,29.826376,51,11330.549805,0.441771,0.441764,1
1943526,2018-02-20,2,2,1981000.0,1.472026e+06,1.373922e+06,2017-01-31,1981000.0,360,B1,...,0.000000,11.75,0.427794,0.427794,35.783009,11,19996.410156,0.266619,0.303610,0


In [11]:
# Run configuration for the AutoML experiment.
RANDOM_STATE = 42  # seed shared by numpy and the AutoML reader
N_THREADS = 4      # threads for lgbm / linear models and for torch
N_FOLDS = 5        # number of cross-validation folds used by AutoML
#TEST_SIZE = 0.1 # Test size for metric check
# Time budget in seconds; use TIMEOUT = 1700 for the best score. The log saved
# below this notebook's fit cell shows a negative "Time left" with 100 seconds.
TIMEOUT = 100

np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [18]:
# Binary classification trained with log-loss and scored with ROC AUC.
task = Task('binary', loss='logloss', metric='auc')

# Column roles handed to AutoML: the label column and the column to ignore.
roles = dict(target='TARGET', drop='ID')

In [None]:
# Configure the preset with the time / CPU budget and the reader's CV settings.
automl = TabularUtilizedAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params=dict(n_jobs=N_THREADS, cv=N_FOLDS, random_state=RANDOM_STATE),
)

# Fit on the full training frame and keep what fit_predict returns.
oof_pred = automl.fit_predict(data_full, roles=roles)

Current random state: {'reader_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
Found reader_params in kwargs, need to combine
Merged variant for reader_params = {'n_jobs': 4, 'cv': 5, 'random_state': 42}
Start automl preset with listed constraints:
- time: 99.99722480773926 seconds
- cpus: 4 cores
- memory: 16 gb

Train data shape: (1527598, 589)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left -139.5373821258545 secs


In [50]:
# Load the pickled test frame and the CSV that will receive the predictions.
data_test = pd.read_pickle(data_test_link)
submission = pd.read_csv(submission_link)

In [54]:
# Sanity check: predict the first 10 test rows and show column 0 of the prediction array.
automl.predict(data_test[:10]).data[:, 0]

array([0.01389652, 0.00997681, 0.02127077, 0.09761693, 0.03590306,
       0.00954098, 0.02937284, 0.01050281, 0.01862199, 0.01106291],
      dtype=float32)

In [52]:
# Predict the whole test frame; column 0 of the prediction array becomes the Probability column.
submission['Probability'] = automl.predict(data_test).data[:, 0]

In [53]:
# Write the filled-in submission without the index column.
submission.to_csv('submission_full.csv',index=False)