# IEEE-CIS Fraud Detection -- Sampling

In [1]:
import pandas as pd
import numpy as np
import gc
import warnings
warnings.filterwarnings('ignore')

In [2]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to the smallest dtype that holds their values.

    The frame is modified in place and also returned.

    * ``object`` columns are converted to ``category``.
    * Signed-integer columns are cast to the narrowest of int8/16/32/64 whose
      range contains the column's min and max (bounds are inclusive).
    * Float columns are cast to float16, then float32, falling back to float64.
      NOTE: float16 keeps only ~3 significant decimal digits, so values are
      rounded; this matches the original behaviour of this helper.
    * Every other dtype (bool, unsigned int, datetime, timedelta, category,
      pandas extension dtypes, ...) is left untouched. Previously such columns
      were either silently turned into floats or raised a TypeError.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed-int / float columns are safe to downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # All-NaN / empty / infinite columns fail every comparison and
            # therefore stay at float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the percentage is never 0/0.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [3]:
%%time
# Load the pre-built feature matrices and the training labels from ./Data.
train_features = pd.read_csv('./Data/train_features.csv' )
test_features = pd.read_csv('./Data/test_features.csv')
# header=None: the first line of the target file is read as data, so the
# single label column is named 0 until it is renamed further down.
train_target = pd.read_csv('./Data/train_target.csv', header=None)

CPU times: user 31.6 s, sys: 4.14 s, total: 35.8 s
Wall time: 39 s


In [7]:
# Sanity check: (rows, columns) of the training feature matrix.
train_features.shape

(590540, 151)

In [8]:
# Sanity check: the label frame should have one row per training row.
train_target.shape

(590540, 1)

In [49]:
# Number of rows labelled 0. Counted with a vectorised NumPy reduction: the
# builtin sum() would iterate over every row of the (n, 1) array in Python.
int((train_target.values == 0).sum())

569877

## Oversampling using KMeansSMOTE

In [55]:
from imblearn.over_sampling import KMeansSMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [52]:
# Class balance before resampling. A vectorised count replaces the builtin
# sum(), which walks the (n, 1) array row by row in Python.
n_label_1 = int((train_target.values == 1).sum())
n_label_0 = int((train_target.values == 0).sum())
print("Before OverSampling, counts of label '1': {}".format(n_label_1))
print("Before OverSampling, counts of label '0': {} \n".format(n_label_0))

# NOTE(review): newer imbalanced-learn releases may have deprecated `n_jobs`
# on the SMOTE family -- confirm against the installed version.
sm = KMeansSMOTE(random_state=0,
                 sampling_strategy=0.15,
                 k_neighbors=10,
                 cluster_balance_threshold=0.02,
                 n_jobs=4)
# `fit_resample` is the supported API; `fit_sample` was only a legacy alias.
# Labels are passed as a 1-D array rather than an (n, 1) frame.
X_train, y_train = sm.fit_resample(train_features, train_target.values.ravel())

# Normalise the outputs to DataFrames whether the sampler returned arrays or
# pandas objects, and restore the original feature names.
X_train = pd.DataFrame(X_train)
X_train.columns = train_features.columns
y_train = pd.DataFrame(y_train)

print('After OverSampling, the shape of X_train: {}'.format(X_train.shape))
print('After OverSampling, the shape of y_train: {} \n'.format(y_train.shape))

Before OverSampling, counts of label '1': 20663
Before OverSampling, counts of label '0': 569877 

After OverSampling, the shape of X_train: (655362, 151)
After OverSampling, the shape of y_train: (655362, 1) 



In [54]:
# Submission template; its row count sizes the test-prediction arrays below.
sample_submission = pd.read_csv('./Data/sample_submission.csv')

In [59]:
# Hyperparameters passed as keyword arguments to lgb.LGBMClassifier in the
# cross-validation cells below.
# NOTE(review): 'colsample_bytree' and 'feature_fraction' appear to be aliases
#   of the same LightGBM setting, so only one of the two values can take
#   effect -- confirm which one and drop the other.
# NOTE(review): 'gamma' looks like an XGBoost parameter name; LightGBM
#   presumably ignores it (its equivalent is 'min_split_gain') -- verify.
# NOTE(review): 'bagging_fraction' presumably has no effect unless
#   'bagging_freq' is also set to a positive value -- verify.
params = {
    'bagging_fraction': 0.6, 
    'bagging_seed': 11, 
    'boosting_type': 'gbdt', 
    'colsample_bytree': 0.4077306238013061, 
    'feature_fraction': 0.9, 
    'gamma': 0.3094147774766888, 
    'learning_rate': 0.11890674333969257, 
    'max_depth': 18, 
    'metric': 'auc', 
    'min_child_samples': 97, 
    'num_leaves': 280, 
    'objective': 'binary', 
    'random_state': 0, 
    'reg_alpha': 0.5478265344202043, 
    'reg_lambda': 0.5895122262747918
}

In [60]:
%%time
# Stratified K-fold training on the oversampled data; test predictions are
# averaged over the folds.
#
# WARNING: X_train / y_train were oversampled *before* this split, so every
# validation fold contains synthetic minority rows (and rows interpolated from
# samples that sit in the training folds). The AUCs printed here are therefore
# optimistic and must not be compared directly with scores measured on
# untouched data.
score_mean = 0
EPOCHS = 5  # number of CV folds (historical name, not training epochs)

y_preds = np.zeros(sample_submission.shape[0])

kf = StratifiedKFold(n_splits=EPOCHS, random_state=0, shuffle=True)
y_oof = np.zeros(X_train.shape[0])
# Use a flat label vector: fitting on an (n, 1) frame relies on an implicit
# column-vector conversion whose warning is hidden by filterwarnings('ignore').
y_all = np.asarray(y_train).ravel()
gc.collect()

for tr_idx, val_idx in kf.split(X_train, y_all):
    clf = lgb.LGBMClassifier(**params)

    X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
    y_tr, y_vl = y_all[tr_idx], y_all[val_idx]
    clf.fit(X_tr, y_tr)
    y_pred_train = clf.predict_proba(X_vl)[:, 1]  # validation-fold probabilities
    y_oof[val_idx] = y_pred_train
    score = roc_auc_score(y_vl, y_pred_train)
    score_mean += score
    print(f'AUC: {score}')
    # Each fold contributes 1/EPOCHS of the final test prediction.
    y_preds += clf.predict_proba(test_features)[:, 1] / EPOCHS
    del clf
    gc.collect()
print(f'Mean AUC: {score_mean / EPOCHS} \n')
# Score the pooled out-of-fold predictions too, as the undersampling cell does.
print('ROC AUC oof {}'.format(roc_auc_score(y_all, y_oof)))
gc.collect();  # trailing ';' keeps the collector's return value out of the cell output

AUC: 0.9907136060253378
AUC: 0.9914512627195954
AUC: 0.9900456334572294
AUC: 0.9904544472212236
AUC: 0.9906513993808189
Mean AUC: 0.9906632697608411 

CPU times: user 19min 34s, sys: 1min 47s, total: 21min 22s
Wall time: 8min 27s


0

In [236]:
# Write the fold-averaged predictions of the oversampling model.
# NOTE(review): `y_preds` is also reset and refilled by the undersampling cell
# further down; run this cell right after the oversampling CV cell, otherwise
# the file will contain the wrong predictions.
sample_submission1a = sample_submission.copy()
sample_submission1a['isFraud'] = y_preds
sample_submission1a.to_csv('./Output/lgb_oversample.csv', index=False)

## Undersampling

In [209]:
# Name the single label column so it can be referenced as 'isFraud' below.
# This mutates train_target in place.
train_target.columns = ['isFraud']

In [210]:
# Put label and features side by side (label first) so rows can be filtered
# by class on a single frame.
train = pd.concat([train_target, train_features], axis=1)

In [234]:
%%time
# Repeated random undersampling: each round keeps every fraud row plus a
# growing random fraction of the non-fraud rows, trains a stratified K-fold
# ensemble on that subset and adds its test predictions to `y_preds`.
# `y_preds` ends up as the SUM over rounds of per-round fold averages; divide
# by TIMES to obtain the final average.
# NOTE: this cell overwrites X_train / y_train / y_preds from the oversampling
# section.
EPOCHS = 4   # CV folds per round (historical name, not training epochs)
TIMES = 24   # number of undersampling rounds
frac_inc = np.arange(0, TIMES, 1) / 400 + 0.1  # majority fraction kept per round
print(frac_inc)

y_preds = np.zeros(sample_submission.shape[0])

# Loop-invariant selections, computed once.
legit_mask = train['isFraud'] == 0
fraud_rows = train[train['isFraud'] == 1]

for i in range(TIMES):
    kf = StratifiedKFold(n_splits=EPOCHS, random_state=i, shuffle=True)

    # Seed the draw so every round is reproducible, and use pd.concat because
    # DataFrame.append no longer exists in pandas 2.x.
    legit_sample = train[legit_mask].sample(frac=frac_inc[i], random_state=i)
    round_rows = pd.concat([legit_sample, fraud_rows])
    # Take the labels from the sampled rows themselves instead of looking them
    # up positionally in train_target via index labels.
    y_train = round_rows['isFraud']
    X_train = round_rows.drop('isFraud', axis=1)

    y_oof = np.zeros(X_train.shape[0])
    gc.collect()
    print(f"Start training: [fraction: {frac_inc[i]}]")
    for tr_idx, val_idx in kf.split(X_train, y_train):
        clf = lgb.LGBMClassifier(**params)

        X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        clf.fit(X_tr, y_tr)
        y_pred_train = clf.predict_proba(X_vl)[:, 1]  # validation-fold probabilities
        print('ROC AUC {}'.format(roc_auc_score(y_vl, y_pred_train)))
        y_oof[val_idx] = y_pred_train
        y_preds += clf.predict_proba(test_features)[:, 1] / EPOCHS
        # Release the fitted model before the next fold, as the oversampling loop does.
        del clf
        gc.collect()
    print('ROC AUC oof {}'.format(roc_auc_score(y_train, y_oof)))

[0.1    0.1025 0.105  0.1075 0.11   0.1125 0.115  0.1175 0.12   0.1225
 0.125  0.1275 0.13   0.1325 0.135  0.1375 0.14   0.1425 0.145  0.1475
 0.15   0.1525 0.155  0.1575]
Start training: [fraction: 0.1]
ROC AUC 0.9526943355246104
ROC AUC 0.954508547431833
ROC AUC 0.9549319169855457
ROC AUC 0.9522231170965088
ROC AUC oof 0.9535918586768876
Start training: [fraction: 0.10250000000000001]
ROC AUC 0.9549133394993667
ROC AUC 0.9535543359757561
ROC AUC 0.9558618529611794
ROC AUC 0.9538485077029685
ROC AUC oof 0.9545289026903727
Start training: [fraction: 0.10500000000000001]
ROC AUC 0.9519505492514818
ROC AUC 0.9563891982258281
ROC AUC 0.9536050363702857
ROC AUC 0.9556324686637829
ROC AUC oof 0.9543769219742348
Start training: [fraction: 0.10750000000000001]
ROC AUC 0.953870188256037
ROC AUC 0.95301231549233
ROC AUC 0.952770323654918
ROC AUC 0.9568381446860208
ROC AUC oof 0.9541003640265109
Start training: [fraction: 0.11]
ROC AUC 0.950360956519213
ROC AUC 0.955104232685658
ROC AUC 0.951130

In [237]:
# Write the undersampling ensemble's predictions.
# y_preds holds the sum of the per-round predictions, so dividing by TIMES
# gives the average over all rounds (assumes the loop ran all TIMES rounds).
sample_submission1 = sample_submission.copy()
sample_submission1['isFraud'] = y_preds/TIMES
sample_submission1.to_csv('./Output/lgb_undersample.csv', index=False)

## Reference
- https://www.kaggle.com/smerllo/identify-unique-cards-id