In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
import pandas as pd
from umap.umap_ import UMAP
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import time
import warnings
warnings.simplefilter('ignore', UserWarning)

In [2]:
# reduce memory
def reduce_mem(df):
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.isnull(c_min) or pd.isnull(c_max):
                continue
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,
                                                                                                           100*(start_mem-end_mem)/start_mem,
                                                                                                           (time.time()-starttime)/60))
    return df

In [3]:
train_df = reduce_mem(pd.read_csv('train.csv'))
train_df = train_df.sample(int(len(train_df)*0.3))
# fillna with median

feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
X, y = train_df[feats], train_df['TARGET']
for col in tqdm(X, total=len(X.columns)):
    X[col] = X[col].replace([np.inf, -np.inf], np.nan) 
    np_array = X[col].to_numpy()
    X[col] = X[col].fillna(0)
    
scaler = MinMaxScaler()
scaler.fit(X)
x = scaler.transform(X)
X = pd.DataFrame(x, columns = X.columns)
del train_df

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2,
)

-- Mem. usage decreased to 787.11 Mb (58.0% reduction),time spend:1.22 min


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].replace([np.inf, -np.inf], np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].fillna(0)
100%|██████████| 795/795 [00:00<00:00, 1717.73it/s]


In [4]:
print('Num train data:',len(X_train))

Num train data: 73801


In [5]:
clf = LGBMClassifier(
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], 
    eval_metric= 'auc', verbose= 200, early_stopping_rounds= 200)
print(
    "AUC on the test set with raw data: {:.3f}".format(roc_auc_score(y_valid, clf.predict_proba(X_valid, num_iteration=clf.best_iteration_)[:, 1]))
)

[200]	training's auc: 0.828537	training's binary_logloss: 0.227545	valid_1's auc: 0.768341	valid_1's binary_logloss: 0.248565
[400]	training's auc: 0.866747	training's binary_logloss: 0.210678	valid_1's auc: 0.773631	valid_1's binary_logloss: 0.246154
[600]	training's auc: 0.891425	training's binary_logloss: 0.198987	valid_1's auc: 0.77574	valid_1's binary_logloss: 0.245537
[800]	training's auc: 0.910579	training's binary_logloss: 0.189335	valid_1's auc: 0.776067	valid_1's binary_logloss: 0.245302
[1000]	training's auc: 0.926381	training's binary_logloss: 0.180267	valid_1's auc: 0.776945	valid_1's binary_logloss: 0.245118
AUC on the test set with raw data: 0.777


In [11]:
umap = UMAP(low_memory=True, n_jobs=-1, metric='hamming')
lgbm = clf = LGBMClassifier(
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )
pipeline = Pipeline([("umap", umap), ("lgbm", lgbm)])
params_grid_pipeline = {
    "umap__n_neighbors": [5, 8, 12, 20, 25],
    "umap__n_components": [50, 75, 100, 200, 300],
    #"umap__metric": ["euclidean", "manhattan", "chebyshev", 'minkowski', "canberra", "braycurtis", "mahalanobis", "wminkowski", "hellinger", "seuclidean", "cosine", "sokalmichener", "jaccard" ,"hamming"]
}

clf_pipeline = GridSearchCV(pipeline, params_grid_pipeline, n_jobs=8, scoring='roc_auc', verbose=1)
clf_pipeline.fit(X_train, y_train)
print('Best params:', clf_pipeline.best_params_, 'Best AUC:', clf_pipeline.best_score_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed: 73.7min
[Parallel(n_jobs=8)]: Done 125 out of 125 | elapsed: 414.0min finished


Best params: {'umap__n_components': 200, 'umap__n_neighbors': 5} Best AUC: 0.5382312059407495


In [9]:
clf_pipeline.cv_results_

{'mean_fit_time': array([158.18233433, 153.08295708, 155.86381807, 125.91379533]),
 'std_fit_time': array([ 8.9096794 ,  7.12616972, 13.35945777, 21.41018217]),
 'mean_score_time': array([718.49754596, 724.62426105, 725.25295849, 516.45084367]),
 'std_score_time': array([  2.90513694,  12.44778434,   3.11154637, 116.97013018]),
 'param_umap__n_components': masked_array(data=[20, 25, 30, 45],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'umap__n_components': 20},
  {'umap__n_components': 25},
  {'umap__n_components': 30},
  {'umap__n_components': 45}],
 'split0_test_score': array([0.50716228, 0.51772406, 0.50833215, 0.50669489]),
 'split1_test_score': array([0.51110058, 0.52147995, 0.51396434, 0.50478204]),
 'split2_test_score': array([0.52395165, 0.49916843, 0.52774932, 0.51958577]),
 'split3_test_score': array([0.50876129, 0.52486626, 0.51855189, 0.50756025]),
 'split4_test_score': array([0.52147675, 0.51288781, 0.50

In [12]:
clf_pipeline.cv_results_

{'mean_fit_time': array([321.14483647, 260.86814013, 265.33737822, 287.48063354,
        316.51656981, 337.04834895, 277.51453252, 297.34527078,
        285.76398306]),
 'std_fit_time': array([27.7970851 , 10.78746964, 17.00456529, 49.41398917, 12.40104568,
        20.93502246, 35.73049686, 25.60514184, 29.89160006]),
 'mean_score_time': array([104.58012695, 105.96834245, 142.40974455, 126.57583685,
        135.2874083 , 185.27865949, 126.98369012, 152.4379025 ,
        168.76102762]),
 'std_score_time': array([20.97290502,  8.51240244, 11.34233769, 24.27628864, 20.4023182 ,
        19.0090131 , 18.92411062,  6.83833047, 20.67780607]),
 'param_umap__n_components': masked_array(data=[8, 8, 8, 12, 12, 12, 15, 15, 15],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_umap__n_neighbors': masked_array(data=[5, 8, 12, 5, 8, 12, 5, 8, 12],
              mask=[False, False, False,