In [1]:
import importlib.metadata
import os
import pickle
import typing

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from sklearn.model_selection import train_test_split

from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML
from lightautoml.tasks import Task

import phik
from phik.report import plot_correlation_matrix
from phik import report

In [2]:
# Input/output file locations, relative to the notebook's working directory.
# The two "part" pickles are concatenated into the training frame below.
data_p1_link = 'part1_compressed.pkl'
data_p2_link = 'part2_compressed.pkl'
# Pickled test frame that is scored for the submission.
data_test_link = 'test_data_compressed.pkl'
# CSV template the predictions are written into (read with pd.read_csv).
submission_link = 'submission.csv'

In [3]:
# Load both training parts and stack them row-wise into one frame; the
# individual parts are not kept around once they have been concatenated.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
data_full = pd.concat(
    [pd.read_pickle(part_link) for part_link in (data_p1_link, data_p2_link)],
    axis=0,
)

In [4]:
# Remove three columns from the training frame in place before any modelling.
data_full.drop(columns=['x_0', 'x_1', 'ID'], inplace=True)

In [5]:
# Cast the two date columns to datetime.
# The unit is spelled out on purpose: pandas >= 2.0 rejects the unit-less
# 'datetime64' dtype in astype(), while 'datetime64[ns]' works on old and new
# versions alike and yields the same result the unit-less form used to give.
data_full['REPORT_DT'] = data_full['REPORT_DT'].astype('datetime64[ns]')
data_full['x_9'] = data_full['x_9'].astype('datetime64[ns]')

In [6]:
# Share of missing values per column, then the columns that are more than
# 90% empty (displayed as the cell output).
nans = data_full.isna().mean()
nan_cols = nans.index[nans > 0.9]
nan_cols

Index(['x_17', 'x_25', 'x_26', 'x_27', 'x_101', 'x_102', 'x_103', 'x_105',
       'x_106', 'x_107', 'x_112', 'x_113', 'x_114', 'x_116', 'x_117', 'x_118',
       'x_204', 'x_205', 'x_206', 'x_208', 'x_209', 'x_210', 'x_215', 'x_216',
       'x_217', 'x_219', 'x_220', 'x_221', 'x_225', 'x_226', 'x_307', 'x_308',
       'x_309', 'x_311', 'x_312', 'x_313', 'x_318', 'x_319', 'x_320', 'x_322',
       'x_323', 'x_324', 'x_343', 'x_344', 'x_345', 'x_355', 'x_356', 'x_357',
       'x_367', 'x_368', 'x_369', 'x_492', 'x_494', 'x_496', 'x_498', 'x_499'],
      dtype='object')

In [7]:
# Drop the columns collected in `nan_cols` from the training frame in place.
data_full.drop(columns=nan_cols, inplace=True)

In [8]:
# List the remaining object-dtype columns (displayed as the cell output).
obj_cols = data_full.select_dtypes(include='object').columns
obj_cols

Index(['x_12', 'x_13', 'x_18', 'x_19', 'x_21', 'x_614', 'x_615', 'x_617',
       'x_618', 'x_625', 'x_628', 'x_634'],
      dtype='object')

In [9]:
def make_submission(clf, process_data, model_name='baseline'):
    """Score the test set and save the submission, model, features and requirements.

    All artefacts are written to ``submissions/<model_name>/``; the folder (and
    its parent) is created when missing and re-used when it already exists, so
    the function can be re-run for the same ``model_name``.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict_proba``; column 1 of its output is
        stored in the ``Probability`` column of the submission.
    process_data : callable
        Called with the raw test DataFrame; its return value is passed straight
        to ``clf.predict_proba``, so it must return the feature matrix only.
        For the ``process_data`` defined in this notebook that means wrapping
        it, e.g. ``lambda df: process_data(df, test=True)``.
    model_name : str, default 'baseline'
        Used for the output folder name and as a suffix of the file names.

    Notes
    -----
    Reads the module-level ``data_test_link`` and ``submission_link`` paths.

    NOTE(review): ``get_imports`` is not defined in the visible part of this
    notebook; it has to be provided elsewhere and is expected to yield the
    distribution names used by the notebook. It is called last, so a failure
    there no longer prevents the submission, model and features from being
    written.
    """
    print("Reading test data...")
    # NOTE(review): unpickling can execute arbitrary code - only load trusted files.
    data_test = pd.read_pickle(data_test_link)

    print("Preprocessing test data...")
    X = process_data(data_test)
    submission = pd.read_csv(submission_link)

    print("Making predictions...")
    submission['Probability'] = clf.predict_proba(X)[:, 1]

    submission_folder = f'submissions/{model_name}'
    # makedirs also creates the parent 'submissions' folder and tolerates re-runs;
    # a plain mkdir fails in both situations.
    os.makedirs(submission_folder, exist_ok=True)

    print("Saving submission...")
    submission.to_csv(os.path.join(submission_folder, f"submission_{model_name}.csv"), index=False)

    print("Saving model...")
    with open(os.path.join(submission_folder, f'model_{model_name}.pkl'), 'wb') as files:
        pickle.dump(clf, files)

    print("Saving features...")
    with open(os.path.join(submission_folder, f'features_{model_name}.pkl'), 'wb') as files:
        pickle.dump(X, files)

    print("Saving requirements...")
    imports = set(get_imports())
    # Pin every installed distribution the notebook imports (pip itself excluded).
    # A set removes duplicates reported for distributions visible on several
    # sys.path entries; sorting keeps the file stable between runs.
    requirements = set()
    for dist in importlib.metadata.distributions():
        name = dist.metadata["Name"]
        if name in imports and name != "pip":
            requirements.add((name, dist.version))

    with open(os.path.join(submission_folder, 'requirements.txt'), 'w') as f:
        for r in sorted(requirements):
            f.write("{}=={} \n".format(*r))

    print("Submission saved!")

In [None]:
# Recompute the missing-value share on the reduced frame and collect the
# columns that are more than 60% empty. `process_data` reads `to_drop` as a
# global, so this cell has to run before `process_data` is called.
# (A commented-out variant additionally dropped 'REPORT_DT' and 'x_9'.)
nans = data_full.isna().mean()
nan_cols = nans.index[nans > 0.6]
to_drop = nan_cols.tolist()

In [28]:
def calc_features(X):
    """Return a fixed, position-based subset of the columns of ``X``.

    The selection is, in this order: the first 20 columns, every 50th column
    from position 20 up to (not including) 470, and the last 49 columns counted
    from the end (-1, -2, ..., -49). Positions are resolved with ``iloc``, so a
    frame that is too narrow for them raises ``IndexError``.
    """
    leading = list(range(20))
    strided = list(range(20, 470, 50))
    trailing = list(range(-1, -50, -1))
    return X.iloc[:, leading + strided + trailing]

def manage_categorical(data_p1):
    """One-hot encode the object-dtype columns of ``data_p1``.

    Object columns with fewer than two distinct non-null values carry no
    information and are dropped first. That drop happens in place, so the
    caller's frame loses those columns as well. The remaining object columns
    are expanded with ``pd.get_dummies`` and the encoded frame is returned;
    columns of any other dtype are passed through unchanged.
    """
    obj_cols = data_p1.select_dtypes('object').columns
    # nunique() ignores NaN, like the value_counts() it replaces.
    constant_cols = [col for col in tqdm(obj_cols) if data_p1[col].nunique() < 2]
    if constant_cols:
        data_p1.drop(columns=constant_cols, inplace=True)

    obj_cols = data_p1.select_dtypes('object').columns
    # `columns` has to be passed by keyword: the second positional parameter of
    # get_dummies is `prefix`. Passed positionally, the list was used as prefixes
    # and the call raised a length-mismatch error as soon as the frame also
    # contained a category-dtype column.
    return pd.get_dummies(data_p1, columns=obj_cols)

def process_data(data_p1, test=False):
    """Turn a raw frame into the model's feature matrix (plus target for training).

    Steps, in order: drop the columns listed in the module-level ``to_drop``,
    downcast float/integer columns, cast the known binary columns to bool,
    one-hot encode object columns via ``manage_categorical``, replace +/-inf
    with NaN, fill NaN with each column's own minimum, and finally select
    columns by position with ``calc_features``.

    Parameters
    ----------
    data_p1 : pandas.DataFrame
        Raw frame. It is modified in place up to the encoding step, so calling
        this twice on the same frame raises ``KeyError`` when ``to_drop`` is
        non-empty.
    test : bool, default False
        ``False``: the frame must contain ``TARGET``; returns ``(X, y)``.
        ``True``: returns the feature frame only.

    Raises
    ------
    KeyError
        If ``test`` is False and the frame has no ``TARGET`` column. This is
        checked up front, before the frame is touched.

    Notes
    -----
    NOTE(review): features are picked by *position* after a per-frame one-hot
    encoding and NaN values are filled with per-frame minima, so the result is
    only comparable between two frames when they arrive with the same columns,
    dtypes and category values - verify this for train vs. test.
    """
    if not test and 'TARGET' not in data_p1.columns:
        # Fail before mutating the caller's frame and before the expensive steps.
        raise KeyError("'TARGET' column not found; pass test=True for unlabeled data")

    print("Dropping nans...")
    data_p1.drop(to_drop, axis=1, inplace=True)

    print("Optimizing memory size...")
    fcols = data_p1.select_dtypes('float').columns
    icols = data_p1.select_dtypes('integer').columns

    data_p1[fcols] = data_p1[fcols].apply(pd.to_numeric, downcast='float')
    data_p1[icols] = data_p1[icols].apply(pd.to_numeric, downcast='integer')

    print("Managing binary features...")
    binary = ['x_19', 'x_614', 'x_615', 'x_634']
    for col in binary:
        # NOTE(review): astype('bool') maps NaN and every non-empty string to
        # True - confirm these columns really hold 0/1-like values.
        data_p1[col] = data_p1[col].astype('bool')

    data_p1 = manage_categorical(data_p1)
    data_p1.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Fill missing values with the column minimum. Columns without any NaN are
    # skipped: filling them is a no-op and rewriting every column of a wide
    # frame is needlessly slow.
    for col in data_p1:
        column = data_p1[col]
        if column.isna().to_numpy().any():
            data_p1[col] = column.fillna(column.min())

    if test:
        return calc_features(data_p1)

    X = data_p1.drop(['TARGET'], axis=1)
    y = data_p1.TARGET
    X = calc_features(X)
    return X, y

In [None]:
# Build the training features and target. process_data drops the `to_drop`
# columns from data_full in place, so this cell cannot be re-run on the same
# frame without reloading the data.
X, y = process_data(data_full)

In [15]:
from sklearn.decomposition import PCA

In [16]:
# Compress the selected features into N_COMPONENTS principal components.
N_COMPONENTS = 25
# random_state is fixed explicitly: depending on the scikit-learn version and
# the data size PCA may use a randomized SVD solver, and the global numpy seed
# is only set in a later cell. 42 matches RANDOM_STATE defined there.
# NOTE(review): the features are not standardized before PCA, so the leading
# components are driven by the largest-magnitude columns - consider scaling.
pca = PCA(n_components=N_COMPONENTS, random_state=42)

principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data=principalComponents,
                           columns=[f'feat_{i}' for i in range(N_COMPONENTS)])

In [17]:
# Inspect the PCA-projected training features.
principalDf

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_15,feat_16,feat_17,feat_18,feat_19,feat_20,feat_21,feat_22,feat_23,feat_24
0,-1.882170e+05,-3.156643e+05,-584859.515127,-461691.264138,-16341.311714,-5567.110464,-3782.478100,51.588801,12.073609,-17.779703,...,8.695770,-10.447067,10.693456,4.099311,-6.936395,-0.260632,-1.222521,0.005835,1.150485,0.172734
1,-6.083839e+05,1.579804e+06,212858.851286,-43166.944334,-13572.018073,775.025644,7010.619687,-62.250522,-33.899142,27.582542,...,-1.842159,-7.082502,-6.271709,2.793002,14.043670,3.928277,0.196572,-0.015602,1.298343,0.254371
2,-1.097187e+05,1.082069e+06,-414793.871941,123423.180092,5267.112275,-4174.855260,2907.228469,-84.547178,67.895899,-23.590466,...,13.767333,21.866102,4.084819,0.200372,1.899789,-0.527774,0.055437,-0.011157,-1.921198,-0.267402
3,-6.144734e+05,6.515768e+05,440995.028447,-136799.039384,-20046.293105,-1059.572125,879.250979,43.491257,33.273924,-12.493222,...,-0.131080,-20.708132,-7.839162,5.144651,-0.917453,0.304193,-0.039059,-0.020866,0.476014,-0.541219
4,5.859008e+06,1.435509e+06,-373811.791839,-276599.252269,-10924.722365,12100.728823,-2802.063548,66.882598,-83.036727,14.265420,...,-11.704777,9.484274,10.792937,9.014152,-2.509462,-5.791207,-1.868457,-0.019029,-1.906201,0.651463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1527593,-2.018008e+06,7.029689e+05,-324022.682489,100458.750181,86.464249,-2679.851886,-872.656067,8.981095,14.868403,5.388994,...,13.898483,-3.968962,0.833477,22.423496,-3.909520,-0.086887,-0.545843,-0.028077,-1.906683,-0.146952
1527594,-9.352255e+05,2.540545e+05,-150943.304979,113553.339315,-2516.349711,-2341.025560,-964.964265,2.338165,-18.180007,19.063068,...,-22.950892,-19.070180,-10.186108,-6.885516,4.417872,-1.462161,-1.172860,-0.029032,1.038127,0.510312
1527595,-2.822460e+06,6.943688e+05,-268361.715678,-11399.327150,-2102.141563,-2960.867627,-434.650419,16.919036,-3.059102,22.622427,...,-4.737990,-10.445873,-11.144017,8.420415,2.279181,-2.762293,0.925595,-0.044601,0.450913,-1.054542
1527596,-3.715739e+05,-8.548423e+05,-6026.633007,-51614.049691,53643.791612,2509.103012,-5195.351635,194.370192,-84.217123,56.288018,...,13.299234,6.534893,-13.757871,5.176278,10.250627,1.155234,-1.250762,-0.071367,1.336033,0.361013


In [18]:
# Give the projected features the target's index so the column-wise concat
# with `y` in the next cell lines the rows up.
principalDf.index = y.index

In [19]:
# Training table for AutoML: the PCA features with the target column appended.
df_full = pd.concat((principalDf, y), axis='columns')

In [20]:
# Run configuration for the AutoML stage.
# Number of threads; passed to torch, to AutoML as cpu_limit and to the reader as n_jobs.
N_THREADS = 4 # threads cnt for lgbm and linear models
# Number of cross-validation folds handed to the AutoML reader.
N_FOLDS = 5 # folds cnt for AutoML
# Seed used for numpy below and for the AutoML reader.
RANDOM_STATE = 42 # fixed random state for various reasons
# NOTE(review): TEST_SIZE is not used anywhere in the visible notebook.
TEST_SIZE = 0.1 # Test size for metric check
# AutoML time budget in seconds. The captured log shows the 100 s budget being
# exceeded after the first fold of each model; the author recommends 1700 s
# for the best score.
TIMEOUT = 100 # Time in seconds for automl run USE TIMEOUT = 1700 for perfect score
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [21]:
# Binary classification, trained with log-loss and evaluated with ROC AUC.
task = Task('binary', loss = 'logloss', metric = 'auc')

# Column roles for AutoML.
# NOTE(review): df_full is built from the PCA features plus TARGET only (and
# 'ID' was dropped from the training frame earlier), so the 'drop' role
# presumably matches no column - confirm whether it is still needed.
roles = {
    'target': 'TARGET',
    'drop': 'ID'
}

In [22]:
# Configure the AutoML preset with the time/CPU budget and reader settings
# defined in the configuration cell.
automl = TabularUtilizedAutoML(task = task,
                               timeout = TIMEOUT,
                               cpu_limit = N_THREADS,
                               reader_params = {'n_jobs': N_THREADS,
                                                'cv': N_FOLDS,
                                                'random_state': RANDOM_STATE})

# Fit on the PCA features; the return value is kept as the out-of-fold predictions.
oof_pred = automl.fit_predict(df_full, roles = roles)

Current random state: {'reader_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
Found reader_params in kwargs, need to combine
Merged variant for reader_params = {'n_jobs': 4, 'cv': 5, 'random_state': 42}
Start automl preset with listed constraints:
- time: 99.99610877037048 seconds
- cpus: 4 cores
- memory: 16 gb

Train data shape: (1527598, 26)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 91.66211700439453 secs
Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====

Linear model: C = 1e-05 score = 0.7321298362790238
Linear model: C = 5e-05 score = 0.7355466122975516
Linear model: C = 0.0001 score = 0.7386454873806563
Linear model: C = 0.0005 score = 0.7473249242262413
Linear model: C = 0.001 score = 0.7473249242262413
Linear model: C = 0.005 score = 0.7672074951064396
Linear model: C = 0.01 score = 0.7673852274588933
Linear model: C = 0.

Time limit exceeded after calculating fold 0


Lvl_0_Pipe_0_Mod_0_LinearL2 fitting and predicting completed
Time left 75.53711700439453
Start fitting Lvl_0_Pipe_1_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.886918
[200]	valid's auc: 0.908039
[300]	valid's auc: 0.917408
[400]	valid's auc: 0.922665
[500]	valid's auc: 0.925904
[600]	valid's auc: 0.927991
[700]	valid's auc: 0.929664
[800]	valid's auc: 0.930731
[900]	valid's auc: 0.931431
[1000]	valid's auc: 0.932064
[1100]	valid's auc: 0.932488
[1200]	valid's auc: 0.932926
[1300]	valid's auc: 0.93318
[1400]	valid's auc: 0.933278
[1500]	valid's auc: 0.933358
[1600]	valid's auc: 0.933462
[1700]	valid's auc: 0.933554
Early stopping, best iteration is:
[1662]	valid's auc: 0.933581


Time limit exceeded after calculating fold 0


Lvl_0_Pipe_1_Mod_0_LightGBM fitting and predicting completed
Time left -58.975549936294556


Time limit exceeded. Last level models will be blended and unused pipelines will be pruned.


Blending: Optimization starts with equal weights and score 0.9167669743793699
Blending, iter 0: score = 0.933580865979721, weights = [0. 1.]
Blending, iter 1: score = 0.933580865979721, weights = [0. 1.]
No score update. Terminated

Automl preset training completed in 160.35 seconds.


In [24]:
# Release the large training frames before the test data is loaded.
# After this cell the training steps above cannot be re-run without reloading.
del df_full, data_full

In [30]:
# Load the submission template and the raw test frame.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
submission = pd.read_csv(submission_link)
data_test = pd.read_pickle(data_test_link)

In [31]:
# Build the test features (this overwrites the training `X`).
# NOTE(review): the training frame had 'x_0', 'x_1', 'ID' and the >90%-NaN
# columns dropped and 'REPORT_DT'/'x_9' cast to datetime before process_data
# was called; data_test comes straight from the pickle. Because calc_features
# picks columns by position, the test features may not line up with the
# training features unless the test pickle already has the same layout -
# verify the column names of both before trusting the predictions.
X = process_data(data_test, test=True)

Dropping nans...
Optimizing memory size...
Managing binary features...


100%|██████████| 12/12 [00:03<00:00,  3.38it/s]


In [32]:
# Project the test features with the PCA that was fitted on the training data.
principalComponents = pca.transform(X)
principalDf = pd.DataFrame(
    principalComponents,
    columns=[f'feat_{i}' for i in range(25)],
)

In [35]:
# Inspect the PCA-projected test features.
principalDf

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_15,feat_16,feat_17,feat_18,feat_19,feat_20,feat_21,feat_22,feat_23,feat_24
0,-4.357259e+06,1.474664e+06,-665438.441422,3.011375e+05,-7.627696e+05,65618.501646,1.940144e+06,1.750604e+04,-8.432848e+05,125255.685830,...,-4.682075e+05,-1.216449e+05,52017.931413,9.897519e+05,-253509.269939,-146705.418042,-21321.179501,1750.526332,-573.716526,-9438.677177
1,-3.873159e+06,-2.518513e+04,-680418.567390,1.146737e+06,-1.791460e+06,67163.179768,1.925661e+06,1.218724e+06,-2.538857e+06,436976.543838,...,-1.573644e+06,-5.505675e+05,-67734.696811,2.557776e+06,-310172.809026,-234411.926036,-98536.158138,5146.655734,-7121.635725,-33486.696354
2,-4.309931e+06,1.331033e+06,-628787.089312,3.844375e+05,-8.323232e+05,65721.871176,1.939833e+06,4.558659e+04,-9.574130e+05,147011.400790,...,-4.796740e+05,-9.120578e+04,105858.371892,1.197446e+06,-378959.531381,-207419.004211,-16480.408565,1955.258970,359.862330,-9131.853136
3,-4.335313e+06,1.442627e+06,-214806.673845,3.467436e+05,-4.418461e+05,65499.754082,1.936288e+06,3.316740e+05,-7.966590e+05,124825.815449,...,-6.342332e+05,-3.002719e+05,-149115.613428,5.996728e+05,151934.448217,38244.867349,-51973.617351,1694.125037,-5004.015531,-14803.171880
4,-4.188126e+06,9.771988e+05,-343292.231979,6.016086e+05,-8.552602e+05,66099.505068,1.933090e+06,6.004052e+05,-1.561031e+06,270917.282631,...,-7.816107e+05,-1.634710e+05,140792.907274,1.872933e+06,-549162.878381,-305615.480802,-30982.057244,3098.139994,-339.101993,-14865.626134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763804,-4.236159e+06,1.130479e+06,-289465.007688,4.970363e+05,-7.102192e+05,111424.329197,3.279729e+06,5.219928e+05,-1.294392e+06,204574.176678,...,-9.841414e+05,-4.448780e+05,-198091.023588,1.046029e+06,160238.211557,23207.326779,-77243.376864,2733.447662,-7170.546720,-22586.016237
763805,-4.285755e+06,1.237971e+06,-865247.415642,4.011969e+05,-1.075100e+06,113436.663158,3.284459e+06,-7.593246e+04,-2.254839e+06,276987.515157,...,-2.114805e+06,-1.113859e+06,-657473.245682,1.377818e+06,877081.225199,305860.837514,-187983.615683,5150.533806,-18911.694176,-51431.309803
763806,-4.330399e+06,1.417786e+06,-343654.481310,3.315681e+05,-5.565366e+05,112127.797680,3.282470e+06,1.777508e+05,-1.486829e+06,185957.169324,...,-1.544826e+06,-8.791510e+05,-587916.783033,6.477705e+05,882813.603058,339207.204915,-148741.377508,3436.963766,-15936.250197,-38520.080240
763807,-4.395601e+06,1.612284e+06,-436550.091307,2.161181e+05,-4.952618e+05,111280.049304,3.285652e+06,1.804630e+03,-9.991343e+05,136343.187113,...,-7.261885e+05,-3.011004e+05,-97145.441452,9.186395e+05,9625.343271,-34552.722312,-51344.514213,2167.421809,-4158.737143,-16334.105413


In [36]:
# Sanity check: raw model output for the first 10 test rows.
automl.predict(principalDf[:10]).data[:, 0]

array([0.83880115, 0.8303851 , 0.8525569 , 0.6021569 , 0.80024403,
       0.88971496, 0.80499923, 0.78403276, 0.7638874 , 0.7985945 ],
      dtype=float32)

In [40]:
# Store the complement of the model output as the submitted probability.
# NOTE(review): presumably data[:, 0] is the predicted probability of
# TARGET == 1; if so, `1 -` submits the probability of the opposite class.
# Confirm this inversion is intended and not a workaround for the train/test
# feature layout.
submission['Probability'] = 1-automl.predict(principalDf).data[:, 0]

In [41]:
# Inspect the submission table before saving.
submission

Unnamed: 0,ID,Probability
0,1943531,0.161199
1,1943532,0.169615
2,1943533,0.147443
3,1943534,0.397843
4,1943537,0.199756
...,...,...
763804,3290365,0.373659
763805,3290366,0.112065
763806,3290367,0.382868
763807,3290368,0.610013


In [44]:
# Write the submission next to the notebook, without the index column.
submission.to_csv('submission_PCA_automl_1-.csv',index=False)

In [45]:
# Show the table that was just written to disk.
submission

Unnamed: 0,ID,Probability
0,1943531,0.161199
1,1943532,0.169615
2,1943533,0.147443
3,1943534,0.397843
4,1943537,0.199756
...,...,...
763804,3290365,0.373659
763805,3290366,0.112065
763806,3290367,0.382868
763807,3290368,0.610013
