In [22]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
import pandas as pd
from umap.umap_ import UMAP
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import time
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
warnings.simplefilter('ignore', UserWarning)

In [2]:
# reduce memory
def reduce_mem(df):
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.isnull(c_min) or pd.isnull(c_max):
                continue
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,
                                                                                                           100*(start_mem-end_mem)/start_mem,
                                                                                                           (time.time()-starttime)/60))
    return df

In [3]:
train_df = reduce_mem(pd.read_csv('train.csv'))
# fillna with median

feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
X, y = train_df[feats], train_df['TARGET']
for col in tqdm(X, total=len(X.columns)):
    X[col] = X[col].replace([np.inf, -np.inf], np.nan) 
    np_array = X[col].to_numpy()
    X[col] = X[col].fillna(0)
    
scaler = MinMaxScaler()
scaler.fit(X)
x = scaler.transform(X)
X = pd.DataFrame(x, columns = X.columns)
del train_df

-- Mem. usage decreased to 521.42 Mb (72.1% reduction),time spend:1.23 min


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].replace([np.inf, -np.inf], np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].fillna(0)
100%|██████████| 795/795 [00:02<00:00, 286.31it/s]


In [27]:
params_grid_pipeline = {
    "max_depth": [13],
    "min_samples_leaf": [40],
    #"max_leaf_nodes": [30, 35, 40, 45]
}
rf = RandomForestClassifier(verbose=1)
clf_pipeline = GridSearchCV(rf, params_grid_pipeline, n_jobs=10, scoring='roc_auc', verbose=1, cv=10)
clf_pipeline.fit(X, y)
print('Best params:', clf_pipeline.best_params_, 'Best AUC:', clf_pipeline.best_score_)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  40 out of  40 | elapsed:  6.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Best params: {'max_depth': 13, 'max_leaf_nodes': 45, 'min_samples_leaf': 40} Best AUC: 0.7254760923256304


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.3min finished


In [28]:
rf = RandomForestClassifier(verbose=1,  min_samples_leaf=40, max_depth=13)
cv_results = cross_validate(rf, X, y, cv=10, scoring='roc_auc', return_train_score=True, n_jobs=10, verbose=1)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:  3.0min remaining: 12.1min
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:  3.1min finished


In [29]:
print(cv_results)

{'fit_time': array([170.91080499, 170.91193986, 174.6656611 , 172.51379371,
       170.53094673, 167.67706513, 175.51822948, 174.0527389 ,
       176.18240047, 175.21322179]), 'score_time': array([1.04086399, 1.04108572, 1.11758637, 0.98042202, 0.97506738,
       0.9072175 , 1.19641209, 0.99444723, 1.25192237, 1.17646861]), 'test_score': array([0.75307526, 0.74825046, 0.74281329, 0.75077893, 0.74319238,
       0.74241866, 0.75069893, 0.74809326, 0.74876047, 0.74692632]), 'train_score': array([0.8583123 , 0.8588043 , 0.8575741 , 0.85712793, 0.85851475,
       0.85688505, 0.85893629, 0.85781866, 0.85702588, 0.85842494])}


In [32]:
print(np.array(cv_results['test_score']).mean())

0.7475007975624574
