In [2]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import pickle
from shutil import copyfile
import os
SEED = 42
from tqdm.notebook import tqdm
import pkg_resources
import types


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score

from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDClassifier


def get_imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            # Split ensures you get root package, 
            # not just imported function
            name = val.__name__.split(".")[0]

        elif isinstance(val, type):
            name = val.__module__.split(".")[0]

        # Some packages are weird and have different
        # imported names vs. system names
        if name == "PIL":
            name = "Pillow"
        elif name == "sklearn":
            name = "scikit-learn"

        yield name
        
data_p1_link = 'part1_compressed.pkl'
data_p2_link = 'part2_compressed.pkl'
data_test_link = 'test_data_compressed.pkl'
submission_link = 'submission.csv'

def get_target(X):
    return X['TARGET']

In [3]:
data_p1 = pd.read_pickle(data_p1_link)
data_p2 = pd.read_pickle(data_p2_link)
data_full = pd.concat([data_p1, data_p2], axis=0)
del data_p1, data_p2

In [7]:
nans = pd.isna(data_full).mean(axis=0)
nan_cols = data_full.loc[:, nans > 0.9].columns
to_drop = list(nan_cols) + ['x_9', 'REPORT_DT', 'ID']

In [8]:
def calc_features(X):
    return X.iloc[:, [x for x in range(20)] + [x for x in range(20, 470, 50)] + [x for x in range(-1, -50, -1)]]

def manage_categorical(data_p1):
    obj_cols = data_p1.select_dtypes('object').columns
    for col in tqdm(obj_cols):
        if len(data_p1[col].value_counts()) < 2:
            data_p1.drop([col], axis=1, inplace=True)

    obj_cols = data_p1.select_dtypes('object').columns
    data_p1 = pd.get_dummies(data_p1, obj_cols)
    data_p1.drop(obj_cols, axis=1, inplace=True)
    return data_p1

def process_data(data_p1, test=False):
    print("Dropping nans...")
    data_p1.drop(to_drop, axis=1, inplace=True)
    
    print("Optimizing memory size...")
    fcols = data_p1.select_dtypes('float').columns
    icols = data_p1.select_dtypes('integer').columns

    data_p1[fcols] = data_p1[fcols].apply(pd.to_numeric, downcast='float')
    data_p1[icols] = data_p1[icols].apply(pd.to_numeric, downcast='integer')
    
    print("Managing binary features...")
    binary = ['x_19', 'x_614', 'x_615', 'x_634']
    for col in binary:
        data_p1[col] = data_p1[col].astype('bool')
    
    data_p1.fillna(-999, inplace=True)
    if not test:
        X = data_p1.drop(['TARGET'], axis=1)
        y = data_p1.TARGET
        X = calc_features(X)
        return X, y
    else:
        return calc_features(data_p1)

In [9]:
X = process_data(pd.read_pickle(data_test_link), test=True)

Dropping nans...
Optimizing memory size...
Managing binary features...


In [11]:
X

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_7,x_10,x_11,x_12,...,x_607,x_606,x_605,x_604,x_603,x_602,x_601,x_600,x_599,x_598
1943531,1,0,1,1,1100000.0,1.577739e+05,1.577739e+05,1000000.0,120,D,...,-1.0,-1.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
1943532,1,0,1,1,2580000.0,2.187426e+06,2.183868e+06,2600000.0,240,D,...,0.0,0.0,44991.0,11645.0,44991.0,11645.0,44991.0,11645.0,11.0,11.0
1943533,1,0,1,1,1200000.0,3.857632e+05,2.186560e+05,1200000.0,120,D,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
1943534,1,0,1,1,637500.0,5.953691e+05,5.953691e+05,637500.0,120,B1,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
1943537,1,0,1,1,1232500.0,1.118118e+06,1.118118e+06,1870000.0,300,D,...,0.0,1.0,32500.0,32500.0,35000.0,32500.0,30000.0,32500.0,10.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3290365,1,0,1,1,1021000.0,9.439924e+05,9.405004e+05,1100000.0,180,D,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
3290366,1,0,1,1,1546974.0,2.940406e+05,2.909023e+05,1550000.0,120,D,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
3290367,1,0,1,1,800000.0,4.986006e+05,4.986006e+05,784000.0,120,B1,...,0.0,0.0,252000.0,188333.0,252000.0,450000.0,252000.0,45000.0,46.0,86.0
3290368,1,0,1,1,712000.0,1.570837e+05,1.475048e+05,960000.0,120,B1,...,-1.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0


In [12]:
X, y = process_data(data_full)

Dropping nans...
Optimizing memory size...
Managing binary features...


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [14]:
from catboost import CatBoostClassifier, Pool

In [15]:
clf = Pipeline([
    ('pca', PCA(128)),
    ('logreg', LogisticRegression(random_state=SEED,  ))
])

In [16]:
clf.fit(X_train, y_train)
print('TRAIN SCORE', roc_auc_score(y_train,clf.predict_proba(X_train)[:,1]))
print('TEST SCORE', roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))

ValueError: could not convert string to float: 'B1'

In [24]:
cat_features_idxs = np.in1d(X.columns, X.select_dtypes('object').columns).nonzero()[0]

In [25]:
clf.fit(X_train, y_train, eval_set=Pool(X_test, y_test, cat_features=cat_features_idxs), cat_features=cat_features_idxs)
print('TRAIN SCORE', roc_auc_score(y_train,clf.predict_proba(X_train)[:,1]))
print('TEST SCORE', roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	test: 0.6521994	best: 0.6521994 (0)	total: 441ms	remaining: 7m 20s
1:	test: 0.6701185	best: 0.6701185 (1)	total: 964ms	remaining: 8m
2:	test: 0.6894989	best: 0.6894989 (2)	total: 1.48s	remaining: 8m 10s
3:	test: 0.6979937	best: 0.6979937 (3)	total: 1.92s	remaining: 7m 58s
4:	test: 0.7030216	best: 0.7030216 (4)	total: 2.39s	remaining: 7m 56s
5:	test: 0.7212712	best: 0.7212712 (5)	total: 2.81s	remaining: 7m 46s
6:	test: 0.7251966	best: 0.7251966 (6)	total: 3.28s	remaining: 7m 44s
7:	test: 0.7317295	best: 0.7317295 (7)	total: 3.79s	remaining: 7m 49s
8:	test: 0.7365233	best: 0.7365233 (8)	total: 4.25s	remaining: 7m 47s
9:	test: 0.7387935	best: 0.7387935 (9)	total: 4.73s	remaining: 7m 48s
10:	test: 0.7419455	best: 0.7419455 (10)	total: 5.2s	remaining: 7m 47s
11:	test: 0.7454774	best: 0.7454774 (11)	total: 5.63s	remaining: 7m 43s
12:	test: 0.7457157	best: 0.7457157 (12)	total: 6.08s	remaining: 7m 41s
13:	test: 0.7487451	best: 0.7487451 (13)	total: 6.55s	remaining: 7m 41s
14:	test: 0.74895

115:	test: 0.7860851	best: 0.7860851 (115)	total: 52.7s	remaining: 6m 41s
116:	test: 0.7861417	best: 0.7861417 (116)	total: 53.1s	remaining: 6m 40s
117:	test: 0.7863176	best: 0.7863176 (117)	total: 53.6s	remaining: 6m 40s
118:	test: 0.7864616	best: 0.7864616 (118)	total: 54s	remaining: 6m 39s
119:	test: 0.7865757	best: 0.7865757 (119)	total: 54.4s	remaining: 6m 39s
120:	test: 0.7866037	best: 0.7866037 (120)	total: 54.9s	remaining: 6m 38s
121:	test: 0.7868028	best: 0.7868028 (121)	total: 55.4s	remaining: 6m 38s
122:	test: 0.7869291	best: 0.7869291 (122)	total: 55.8s	remaining: 6m 38s
123:	test: 0.7869982	best: 0.7869982 (123)	total: 56.3s	remaining: 6m 37s
124:	test: 0.7871127	best: 0.7871127 (124)	total: 56.8s	remaining: 6m 37s
125:	test: 0.7873949	best: 0.7873949 (125)	total: 57.2s	remaining: 6m 36s
126:	test: 0.7874777	best: 0.7874777 (126)	total: 57.6s	remaining: 6m 36s
127:	test: 0.7876681	best: 0.7876681 (127)	total: 58.1s	remaining: 6m 35s
128:	test: 0.7877120	best: 0.7877120 (12

226:	test: 0.7992998	best: 0.7992998 (226)	total: 1m 42s	remaining: 5m 50s
227:	test: 0.7993636	best: 0.7993636 (227)	total: 1m 43s	remaining: 5m 49s
228:	test: 0.7993953	best: 0.7993953 (228)	total: 1m 43s	remaining: 5m 49s
229:	test: 0.7995209	best: 0.7995209 (229)	total: 1m 44s	remaining: 5m 48s
230:	test: 0.7996510	best: 0.7996510 (230)	total: 1m 44s	remaining: 5m 48s
231:	test: 0.7997274	best: 0.7997274 (231)	total: 1m 45s	remaining: 5m 47s
232:	test: 0.7997902	best: 0.7997902 (232)	total: 1m 45s	remaining: 5m 47s
233:	test: 0.8000090	best: 0.8000090 (233)	total: 1m 45s	remaining: 5m 46s
234:	test: 0.8001142	best: 0.8001142 (234)	total: 1m 46s	remaining: 5m 46s
235:	test: 0.8002359	best: 0.8002359 (235)	total: 1m 46s	remaining: 5m 45s
236:	test: 0.8003352	best: 0.8003352 (236)	total: 1m 47s	remaining: 5m 45s
237:	test: 0.8004558	best: 0.8004558 (237)	total: 1m 47s	remaining: 5m 44s
238:	test: 0.8004896	best: 0.8004896 (238)	total: 1m 48s	remaining: 5m 44s
239:	test: 0.8005824	best

336:	test: 0.8086281	best: 0.8086281 (336)	total: 2m 32s	remaining: 4m 59s
337:	test: 0.8086905	best: 0.8086905 (337)	total: 2m 32s	remaining: 4m 59s
338:	test: 0.8087612	best: 0.8087612 (338)	total: 2m 33s	remaining: 4m 58s
339:	test: 0.8088178	best: 0.8088178 (339)	total: 2m 33s	remaining: 4m 58s
340:	test: 0.8088723	best: 0.8088723 (340)	total: 2m 34s	remaining: 4m 57s
341:	test: 0.8088989	best: 0.8088989 (341)	total: 2m 34s	remaining: 4m 57s
342:	test: 0.8089618	best: 0.8089618 (342)	total: 2m 34s	remaining: 4m 56s
343:	test: 0.8090380	best: 0.8090380 (343)	total: 2m 35s	remaining: 4m 56s
344:	test: 0.8091875	best: 0.8091875 (344)	total: 2m 35s	remaining: 4m 55s
345:	test: 0.8093847	best: 0.8093847 (345)	total: 2m 36s	remaining: 4m 55s
346:	test: 0.8094304	best: 0.8094304 (346)	total: 2m 36s	remaining: 4m 54s
347:	test: 0.8095007	best: 0.8095007 (347)	total: 2m 37s	remaining: 4m 54s
348:	test: 0.8096232	best: 0.8096232 (348)	total: 2m 37s	remaining: 4m 53s
349:	test: 0.8096281	best

446:	test: 0.8161822	best: 0.8161822 (446)	total: 3m 22s	remaining: 4m 10s
447:	test: 0.8162435	best: 0.8162435 (447)	total: 3m 23s	remaining: 4m 10s
448:	test: 0.8163706	best: 0.8163706 (448)	total: 3m 23s	remaining: 4m 9s
449:	test: 0.8163220	best: 0.8163706 (448)	total: 3m 24s	remaining: 4m 9s
450:	test: 0.8163902	best: 0.8163902 (450)	total: 3m 24s	remaining: 4m 8s
451:	test: 0.8164203	best: 0.8164203 (451)	total: 3m 25s	remaining: 4m 8s
452:	test: 0.8165835	best: 0.8165835 (452)	total: 3m 25s	remaining: 4m 8s
453:	test: 0.8166304	best: 0.8166304 (453)	total: 3m 26s	remaining: 4m 7s
454:	test: 0.8166289	best: 0.8166304 (453)	total: 3m 26s	remaining: 4m 7s
455:	test: 0.8166861	best: 0.8166861 (455)	total: 3m 26s	remaining: 4m 6s
456:	test: 0.8167510	best: 0.8167510 (456)	total: 3m 27s	remaining: 4m 6s
457:	test: 0.8168509	best: 0.8168509 (457)	total: 3m 27s	remaining: 4m 6s
458:	test: 0.8169723	best: 0.8169723 (458)	total: 3m 28s	remaining: 4m 5s
459:	test: 0.8170265	best: 0.8170265

557:	test: 0.8220225	best: 0.8220353 (555)	total: 4m 14s	remaining: 3m 21s
558:	test: 0.8221186	best: 0.8221186 (558)	total: 4m 14s	remaining: 3m 21s
559:	test: 0.8221355	best: 0.8221355 (559)	total: 4m 15s	remaining: 3m 20s
560:	test: 0.8221410	best: 0.8221410 (560)	total: 4m 15s	remaining: 3m 20s
561:	test: 0.8221605	best: 0.8221605 (561)	total: 4m 16s	remaining: 3m 19s
562:	test: 0.8223418	best: 0.8223418 (562)	total: 4m 16s	remaining: 3m 19s
563:	test: 0.8223506	best: 0.8223506 (563)	total: 4m 17s	remaining: 3m 18s
564:	test: 0.8224249	best: 0.8224249 (564)	total: 4m 17s	remaining: 3m 18s
565:	test: 0.8224430	best: 0.8224430 (565)	total: 4m 17s	remaining: 3m 17s
566:	test: 0.8225221	best: 0.8225221 (566)	total: 4m 18s	remaining: 3m 17s
567:	test: 0.8225758	best: 0.8225758 (567)	total: 4m 18s	remaining: 3m 16s
568:	test: 0.8226028	best: 0.8226028 (568)	total: 4m 19s	remaining: 3m 16s
569:	test: 0.8226058	best: 0.8226058 (569)	total: 4m 19s	remaining: 3m 15s
570:	test: 0.8226663	best

667:	test: 0.8273071	best: 0.8273071 (667)	total: 5m 5s	remaining: 2m 31s
668:	test: 0.8273221	best: 0.8273221 (668)	total: 5m 5s	remaining: 2m 31s
669:	test: 0.8274057	best: 0.8274057 (669)	total: 5m 5s	remaining: 2m 30s
670:	test: 0.8274764	best: 0.8274764 (670)	total: 5m 6s	remaining: 2m 30s
671:	test: 0.8274853	best: 0.8274853 (671)	total: 5m 6s	remaining: 2m 29s
672:	test: 0.8275063	best: 0.8275063 (672)	total: 5m 7s	remaining: 2m 29s
673:	test: 0.8275162	best: 0.8275162 (673)	total: 5m 7s	remaining: 2m 28s
674:	test: 0.8275930	best: 0.8275930 (674)	total: 5m 8s	remaining: 2m 28s
675:	test: 0.8275704	best: 0.8275930 (674)	total: 5m 8s	remaining: 2m 27s
676:	test: 0.8276855	best: 0.8276855 (676)	total: 5m 9s	remaining: 2m 27s
677:	test: 0.8277177	best: 0.8277177 (677)	total: 5m 9s	remaining: 2m 27s
678:	test: 0.8278703	best: 0.8278703 (678)	total: 5m 10s	remaining: 2m 26s
679:	test: 0.8279204	best: 0.8279204 (679)	total: 5m 10s	remaining: 2m 26s
680:	test: 0.8279449	best: 0.8279449

777:	test: 0.8319827	best: 0.8319827 (777)	total: 6m	remaining: 1m 42s
778:	test: 0.8320336	best: 0.8320336 (778)	total: 6m	remaining: 1m 42s
779:	test: 0.8320988	best: 0.8320988 (779)	total: 6m 1s	remaining: 1m 41s
780:	test: 0.8320911	best: 0.8320988 (779)	total: 6m 1s	remaining: 1m 41s
781:	test: 0.8321350	best: 0.8321350 (781)	total: 6m 2s	remaining: 1m 40s
782:	test: 0.8321308	best: 0.8321350 (781)	total: 6m 2s	remaining: 1m 40s
783:	test: 0.8321468	best: 0.8321468 (783)	total: 6m 3s	remaining: 1m 40s
784:	test: 0.8321597	best: 0.8321597 (784)	total: 6m 3s	remaining: 1m 39s
785:	test: 0.8322228	best: 0.8322228 (785)	total: 6m 4s	remaining: 1m 39s
786:	test: 0.8322716	best: 0.8322716 (786)	total: 6m 4s	remaining: 1m 38s
787:	test: 0.8322849	best: 0.8322849 (787)	total: 6m 5s	remaining: 1m 38s
788:	test: 0.8323494	best: 0.8323494 (788)	total: 6m 5s	remaining: 1m 37s
789:	test: 0.8323799	best: 0.8323799 (789)	total: 6m 6s	remaining: 1m 37s
790:	test: 0.8323660	best: 0.8323799 (789)	t

888:	test: 0.8362747	best: 0.8362747 (888)	total: 6m 52s	remaining: 51.5s
889:	test: 0.8363350	best: 0.8363350 (889)	total: 6m 52s	remaining: 51s
890:	test: 0.8363430	best: 0.8363430 (890)	total: 6m 53s	remaining: 50.6s
891:	test: 0.8364039	best: 0.8364039 (891)	total: 6m 53s	remaining: 50.1s
892:	test: 0.8364033	best: 0.8364039 (891)	total: 6m 54s	remaining: 49.6s
893:	test: 0.8364585	best: 0.8364585 (893)	total: 6m 54s	remaining: 49.2s
894:	test: 0.8365383	best: 0.8365383 (894)	total: 6m 55s	remaining: 48.7s
895:	test: 0.8364917	best: 0.8365383 (894)	total: 6m 55s	remaining: 48.2s
896:	test: 0.8365257	best: 0.8365383 (894)	total: 6m 56s	remaining: 47.8s
897:	test: 0.8365479	best: 0.8365479 (897)	total: 6m 56s	remaining: 47.3s
898:	test: 0.8366583	best: 0.8366583 (898)	total: 6m 56s	remaining: 46.8s
899:	test: 0.8367042	best: 0.8367042 (899)	total: 6m 57s	remaining: 46.4s
900:	test: 0.8367513	best: 0.8367513 (900)	total: 6m 57s	remaining: 45.9s
901:	test: 0.8367818	best: 0.8367818 (90

TRAIN SCORE 0.8995115043018479
TEST SCORE 0.8396832610737756


In [42]:
clf

<catboost.core.CatBoostClassifier at 0x7f46ad5cfdc0>

In [31]:
def make_submission(clf, process_data, model_name='baseline'):
    
    print("Reading test data...")
    # Creating submission file
    data_test = pd.read_pickle(data_test_link)
    
    print("Preprocessing test data...")
    X = process_data(data_test)
    submission = pd.read_csv(submission_link)

    print("Making predictions...")
    submission['Probability'] = clf.predict_proba(X)[:,1]
    
    
    submission_folder = f'submissions/{model_name}'
    os.mkdir(submission_folder)
    
    # Saving submission
    print("Saving submission...")
    submission.to_csv(os.path.join(submission_folder, f"submission_{model_name}.csv"),index=False)
    
    
    print("Saving model...")
    # Saving the model
    with open(os.path.join(submission_folder, f'model_{model_name}.pkl'), 'wb') as files:
        pickle.dump(clf, files)
    
    
    print("Saving features...")
    # Saving features
    with open(os.path.join(submission_folder, f'features_{model_name}.pkl'), 'wb') as files:
        pickle.dump(X, files)  
    imports = list(set(get_imports()))

    
    print("Saving requirements...")
    # Updating requirements
    requirements = []
    for m in pkg_resources.working_set:
        if m.project_name in imports and m.project_name!="pip":
            requirements.append((m.project_name, m.version))
        
    with open(os.path.join(submission_folder, 'requirements.txt'), 'w') as f:
        for r in requirements:
            f.write("{}=={} \n".format(*r))
    
    print("Submission saved!")

In [35]:
del X_train, X_test, y_train, y_test
del X, y, data_p1

NameError: name 'X' is not defined

In [37]:
make_submission(clf, lambda x: process_data(x, test=True), model_name='catboost_v0')

Reading test data...
Preprocessing test data...
Dropping nans...
Optimizing memory size...
Managing binary features...
Making predictions...
Saving submission...
Saving model...
Saving features...
Saving requirements...
Submission saved!
