In [1]:
%reset
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFECV
import lightgbm as lgb
from tqdm import tqdm_notebook
from sklearn.metrics import roc_auc_score
import multiprocessing
import gc

In [2]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype holding their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column (strings, int8, unsigned, ...) is left alone.
    Integer columns become the smallest of int8/int16/int32/int64 whose range
    contains the column's min and max (limits inclusive). Float columns become
    float16 or float32 when their range fits, otherwise float64.

    NOTE: the float downcast is decided on range only, not on precision, so
    values may be rounded when a column is stored as float16/float32.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final size in Mb and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are ordered narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [3]:
# Load each raw CSV and downcast it straight away, so the full-width
# (un-reduced) frames never have to sit in memory at the same time.
# reduce_mem_usage prints one "Mem. usage decreased ..." line per frame, in the
# order of the calls below.
train_transaction = reduce_mem_usage(pd.read_csv('train_transaction.csv'))
test_transaction = reduce_mem_usage(pd.read_csv('test_transaction.csv'))
train_identity = reduce_mem_usage(pd.read_csv('train_identity.csv'))
test_identity = reduce_mem_usage(pd.read_csv('test_identity.csv'))
# The sample submission is loaded without downcasting.
sample_submission = pd.read_csv('sample_submission.csv')

Mem. usage decreased to 542.35 Mb (69.4% reduction)
Mem. usage decreased to 472.59 Mb (68.9% reduction)
Mem. usage decreased to 25.86 Mb (42.7% reduction)
Mem. usage decreased to 25.44 Mb (42.7% reduction)


In [4]:
# Left-join the identity table onto the transaction table by TransactionID,
# keeping every transaction row whether or not it has an identity record.
train = train_transaction.merge(train_identity, how='left', on='TransactionID')
test = test_transaction.merge(test_identity, how='left', on='TransactionID')

# The four source frames are now redundant; drop the references and collect.
del train_transaction, train_identity
del test_transaction, test_identity
gc.collect()

28

# Delete columns such that: <br>
(1) more than 90% of the column's entries are null <br>
(2) the column has at most one unique value <br>
(3) the column's most frequent value (NaN included) accounts for more than 90% of its entries

In [5]:
# Flag columns that carry (almost) no information. Train and test are inspected
# separately, and a column flagged in either frame is dropped from both.
NULL_FRACTION_LIMIT = 0.9       # drop when more than this share of entries is null
TOP_VALUE_FRACTION_LIMIT = 0.9  # drop when one value exceeds this share of entries

# Columns with at most one distinct non-null value.
one_value_cols = [col for col in train.columns if train[col].nunique() <= 1]
one_value_cols_test = [col for col in test.columns if test[col].nunique() <= 1]

# Columns that are mostly null.
many_null_cols = [col for col in train.columns
                  if train[col].isnull().mean() > NULL_FRACTION_LIMIT]
many_null_cols_test = [col for col in test.columns
                       if test[col].isnull().mean() > NULL_FRACTION_LIMIT]

# Columns dominated by a single value; dropna=False counts NaN as a value, and
# value_counts sorts by frequency so element 0 is the most common one.
big_top_value_cols = [
    col for col in train.columns
    if train[col].value_counts(dropna=False, normalize=True).values[0] > TOP_VALUE_FRACTION_LIMIT
]
big_top_value_cols_test = [
    col for col in test.columns
    if test[col].value_counts(dropna=False, normalize=True).values[0] > TOP_VALUE_FRACTION_LIMIT
]

# Union of all flagged columns, minus the target (which the rules above can flag
# but which must never be dropped). Sorted so the list is reproducible.
cols_to_drop = sorted(
    set(one_value_cols + one_value_cols_test
        + many_null_cols + many_null_cols_test
        + big_top_value_cols + big_top_value_cols_test)
    - {'isFraud'}
)
print("{} features are going to be dropped for being useless".format(len(cols_to_drop)))

# errors='ignore' skips only the labels a frame does not have; every flagged
# column that is present is still removed, and other errors are not swallowed.
train = train.drop(columns=cols_to_drop, errors='ignore')
test = test.drop(columns=cols_to_drop, errors='ignore')


82 features are going to be dropped for being useless


In [None]:
# Integer-encode every object-dtype column of train. Each encoder is fitted on
# the string values of train and test together, so both frames share one
# vocabulary; only train is transformed here.
for column_name in tqdm_notebook(train.columns):
    if train[column_name].dtype != 'object':
        continue
    train_labels = list(train[column_name].astype(str).values)
    test_labels = list(test[column_name].astype(str).values)
    encoder = LabelEncoder()
    encoder.fit(train_labels + test_labels)
    train[column_name] = encoder.transform(train_labels)

HBox(children=(IntProgress(value=0, max=352), HTML(value='')))

In [33]:
# test is not referenced again in the cells below, so free its memory now.
del test
gc.collect()

21

In [34]:
# Downcast again: the label-encoding step replaced the object columns with
# integer codes, which reduce_mem_usage can narrow to the smallest fitting dtype.
train = reduce_mem_usage(train)

Mem. usage decreased to 426.33 Mb (73.2% reduction)


In [35]:
# Inspect the columns that remain after dropping and encoding.
train.columns

Index(['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt',
       'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5',
       ...
       'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
       'DeviceType', 'DeviceInfo'],
      dtype='object', length=352)

In [36]:
# Sort by TransactionDT once and derive both the feature matrix and the target
# from the same sorted frame, so their rows stay aligned and the full frame is
# not sorted twice.
train_sorted = train.sort_values('TransactionDT')
# The target, the sort key and the row id are not used as features.
X = train_sorted.drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)
y = train_sorted['isFraud']

del train, train_sorted
gc.collect()

7

In [37]:
# RFECV does not support NaNs, so every missing value in X is replaced with the
# sentinel -999 (X is modified in place).
X.fillna(-999, inplace=True)

In [38]:
# LightGBM hyper-parameters for the estimator wrapped by RFECV.
# NOTE(review): the fitted estimator's repr reports subsample_freq=0; LightGBM
# only applies bagging_fraction when the bagging frequency is > 0 — confirm
# whether bagging was meant to be active.
params = dict(
    num_leaves=491,
    min_child_weight=0.03454472573214212,
    feature_fraction=0.3797454081646243,
    bagging_fraction=0.4181193142567742,
    min_data_in_leaf=106,
    objective='binary',
    max_depth=-1,  # -1 = no depth limit
    learning_rate=0.006883242363721497,
    boosting_type="gbdt",
    bagging_seed=11,
    metric='auc',
    verbosity=-1,
    reg_alpha=0.3899927210061127,
    reg_lambda=0.6485237330340494,
    random_state=47,
)

In [39]:
# Five contiguous, unshuffled folds over the rows of X (which were sorted by
# TransactionDT when X was built).
cv_splitter = KFold(n_splits=5, shuffle=False)
clf = lgb.LGBMClassifier(**params)
# Recursive feature elimination: remove 10 features per round, scoring by ROC AUC.
rfe = RFECV(
    estimator=clf,
    step=10,
    cv=cv_splitter,
    scoring='roc_auc',
    verbose=2,
)

In [40]:
# Run the cross-validated recursive feature elimination. With verbose=2 each
# elimination round logs "Fitting estimator with N features.".
rfe.fit(X, y)

Fitting estimator with 349 features.
Fitting estimator with 339 features.
Fitting estimator with 329 features.
Fitting estimator with 319 features.
Fitting estimator with 309 features.
Fitting estimator with 299 features.
Fitting estimator with 289 features.
Fitting estimator with 279 features.
Fitting estimator with 269 features.
Fitting estimator with 259 features.
Fitting estimator with 249 features.
Fitting estimator with 239 features.
Fitting estimator with 229 features.
Fitting estimator with 219 features.
Fitting estimator with 209 features.
Fitting estimator with 199 features.
Fitting estimator with 189 features.
Fitting estimator with 179 features.
Fitting estimator with 169 features.
Fitting estimator with 159 features.
Fitting estimator with 149 features.
Fitting estimator with 139 features.
Fitting estimator with 129 features.
Fitting estimator with 119 features.
Fitting estimator with 109 features.
Fitting estimator with 99 features.
Fitting estimator with 89 features.
Fit

RFECV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
   estimator=LGBMClassifier(bagging_fraction=0.4181193142567742, bagging_seed=11,
        boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        feature_fraction=0.3797454081646243, importance_type='split',
        learning_rate=0.006883242363721497, max_depth=-1, metric='auc',
        ...       silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0, verbosity=-1),
   n_jobs=1, scoring='roc_auc', step=10, verbose=2)

In [43]:
# List the features RFECV kept: selected features are the ones ranked 1.
selected_mask = rfe.ranking_ == 1
for feature_name in X.columns[selected_mask]:
    print(feature_name)

TransactionAmt
ProductCD
card1
card2
card3
card4
card5
card6
addr1
dist1
P_emaildomain
R_emaildomain
C1
C2
C4
C5
C6
C7
C8
C9
C10
C11
C12
C13
C14
D1
D2
D3
D4
D5
D6
D8
D9
D10
D11
D12
D13
D14
D15
M2
M3
M4
M5
M6
M8
M9
V4
V5
V12
V13
V19
V20
V30
V34
V35
V36
V37
V38
V44
V45
V47
V53
V54
V56
V57
V58
V61
V62
V70
V74
V75
V76
V78
V82
V83
V87
V91
V94
V96
V97
V99
V126
V127
V128
V130
V131
V139
V143
V149
V152
V160
V165
V170
V187
V189
V201
V203
V204
V207
V208
V209
V210
V212
V217
V221
V222
V234
V257
V258
V261
V264
V265
V266
V267
V268
V271
V274
V275
V277
V278
V279
V280
V282
V283
V285
V287
V289
V291
V292
V294
V306
V307
V308
V310
V312
V313
V314
V315
V317
V323
V324
V332
V333
id_01
id_02
id_05
id_06
id_09
id_13
id_14
id_17
id_19
id_20
id_30
id_31
id_33
id_38
DeviceType
DeviceInfo


In [44]:
# Spot-check: name of the second column of X.
X.columns[1]

'ProductCD'

Unnamed: 0,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,68.5,4,13926,-999.0,150.0,1,142.0,1,315.0,87.0,...,136,-999.0,461,4,2,2,2,2,2,2740
1,29.0,4,2755,404.0,150.0,2,102.0,1,325.0,87.0,...,136,-999.0,461,4,2,2,2,2,2,2740
2,59.0,4,4663,490.0,150.0,4,166.0,2,330.0,87.0,...,136,-999.0,461,4,2,2,2,2,2,2740
3,50.0,4,18132,567.0,150.0,2,117.0,2,476.0,87.0,...,136,-999.0,461,4,2,2,2,2,2,2740
4,50.0,1,4497,514.0,150.0,2,102.0,1,420.0,87.0,...,162,32.0,268,3,1,0,1,1,1,1565


In [46]:
# Gather the names of the rank-1 (selected) features into a plain Python list
# so they can be serialized with pickle.
import pickle
selected_features = X.columns[rfe.ranking_ == 1].tolist()

In [51]:
# Persist the selected feature names to disk.
# NOTE(review): the file name is spelled 'slected_features.pkl' (missing "e").
# It is left unchanged because any code that loads the file must use this exact
# name — confirm there are no consumers before renaming.
with open('slected_features.pkl','wb') as f:
    pickle.dump(selected_features,f)