In [1]:
import gc
import os
import pickle
import warnings

import lightgbm as lgb
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy import stats
from scipy.stats import norm
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

In [2]:
#Reduce_memory
def reduce_memory(df, use_float16=True):
    """
    Downcast the columns of ``df`` to the narrowest dtype that holds their values.

    The frame is modified in place and is also returned.

    * Signed / unsigned integer columns become the narrowest signed / unsigned
      integer type whose range contains the column's min and max (bounds are
      inclusive, so e.g. a column spanning -128..127 becomes ``int8``).
    * Float columns become ``float16`` or ``float32`` when their min and max
      fit, otherwise ``float64``.
    * Object (string) columns become ``category``.
    * Every other dtype (bool, category, datetime, extension dtypes, ...) is
      left untouched, which also makes a second call on the same frame safe.

    Parameters:
    -----------
    df : pd.DataFrame
        df to manipulate.
    use_float16 : bool (default=True)
        Allow ``float16`` as a target dtype. ``float16`` only keeps about three
        significant decimal digits, so values are rounded; pass ``False`` to
        use ``float32`` as the smallest float type instead.

    Returns:
    --------
    pd.DataFrame
        The same ``df`` object with downcast columns.
    """
    print("Reduce_memory...")
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Strings are stored once per distinct value as a categorical.
        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer / unsigned / float columns are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # An empty column has NaN min/max, matches nothing, and stays as is.
        else:
            target = np.float64  # fallback, also used for all-NaN / infinite columns
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
    return df

In [3]:
def make_day_feature(df, offset=0, tname='TransactionDT'):
    """
    Build a day-of-week code in the range 0-6 from a time column.

    The time column is divided by the number of seconds in a day, shifted by
    ``offset - 1`` days, floored, and wrapped modulo 7.

    Parameters:
    -----------
    df : pd.DataFrame
        Frame holding the time column.
    offset : float (default=0)
        Shift, in days, applied to where a day starts/ends.
    tname : str
        Name of the time column in df.

    Returns:
    --------
    The encoded days, one value per row of ``df``.
    """
    seconds_per_day = 3600 * 24
    # Note: 0.58 was found to be a good offset.
    elapsed_days = df[tname] / seconds_per_day
    shifted_days = elapsed_days - 1 + offset
    return np.floor(shifted_days) % 7

def make_hour_feature(df, tname='TransactionDT'):
    """
    Build an hour-of-day code in the range 0-23 from a time column.

    The time column is divided by 3600, floored, and wrapped modulo 24.

    Parameters:
    -----------
    df : pd.DataFrame
        Frame holding the time column.
    tname : str
        Name of the time column in df.

    Returns:
    --------
    The encoded hours, one value per row of ``df``.
    """
    seconds_per_hour = 3600
    elapsed_hours = df[tname] / seconds_per_hour
    return np.floor(elapsed_hours) % 24

In [None]:
# Load Data
# Each dataset is the transaction table left-joined with its identity table
# on the TransactionID index, so every transaction row is kept.
train = pd.read_csv('train_transaction.csv', index_col='TransactionID').merge(
    pd.read_csv('train_identity.csv', index_col='TransactionID'),
    how='left', left_index=True, right_index=True)
test = pd.read_csv('test_transaction.csv', index_col='TransactionID').merge(
    pd.read_csv('test_identity.csv', index_col='TransactionID'),
    how='left', left_index=True, right_index=True)

# Split the target off the training features
y = train.pop('isFraud').astype('uint8')

# "TransactionDT" is essentially a measure of time, and the hour of day was
# found to have some correlation with fraud. Only the hour feature is derived
# here; the raw time column is removed afterwards.
for frame in (train, test):
    frame['hours'] = make_hour_feature(frame)
    frame.drop(columns=['TransactionDT'], inplace=True)

In [None]:
# Derive provider and country features from the raw e-mail domain columns.
# `emails` maps a raw domain to its provider group.
emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 
          'scranton.edu': 'other', 'optonline.net': 'other', 
          'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 
          'yahoo.fr': 'yahoo', 'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 
          'live.com': 'microsoft', 'aim.com': 'aol', 'hotmail.de': 'microsoft', 
          'centurylink.net': 'centurylink', 'gmail.com': 'google', 'me.com': 'apple', 
          'earthlink.net': 'other', 'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other', 
          'hotmail.com': 'microsoft', 'protonmail.com': 'other', 'hotmail.fr': 'microsoft', 
          'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 
          'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other', 
          'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft',
          'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 'prodigy.net.mx': 'att', 
          'frontier.com': 'yahoo', 'anonymous.com': 'other', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 
          'frontiernet.net': 'yahoo', 'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 
          'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 'cableone.net': 'other',
          'hotmail.es': 'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other',
          'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other', 
          'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}
# Suffixes that are collapsed into the single country code 'us'
us_emails = ['gmail', 'net', 'edu']

for c in ['P_emaildomain', 'R_emaildomain']:
    for frame in (train, test):
        # Provider group; domains absent from `emails` map to NaN
        frame[c + '_bin'] = frame[c].map(emails)

        # Country: text after the last '.', with the US-style suffixes merged.
        # Missing domains are stringified first, so their suffix is 'nan'.
        suffix = frame[c].map(lambda x: str(x).split('.')[-1])
        frame[c + '_suffix'] = suffix.map(lambda x: 'us' if str(x) in us_emails else x)

In [None]:
# Integer-encode every object (string) column of train/test with ONE code table
# shared by all columns. Missing values (NaN, or the literal string 'nan')
# always get code 0.
labels = {np.nan: 0, 'nan': 0}
object_cols = [col for col in train.columns if train[col].dtype == object]

for col in object_cols:
    # Keys are stringified because the lookup below is done on str(x); keying
    # by the raw value would raise KeyError for any non-string entry.
    seen = {str(v) for v in train[col].unique()} | {str(v) for v in test[col].unique()}
    # Sorted so the codes are identical on every run. Plain set iteration order
    # changes between kernel restarts (string hash randomisation), which would
    # make previously saved models see differently encoded features.
    for value in sorted(seen):
        if value not in labels:
            # "- 1" because the two initial keys (NaN and 'nan') share code 0
            labels[value] = len(labels) - 1

for col in object_cols:
    train[col] = train[col].map(lambda x: labels[str(x)])
    test[col] = test[col].map(lambda x: labels[str(x)])

In [None]:
# Kaggle kernels recommend dropping near-duplicate columns.
# A column counts as a duplicate when it equals an earlier column on more
# than 95% of the training rows.
duplicates = []
cols = train.columns
n_rows = len(train)
for i, c1 in enumerate(cols, start=1):
    left_values = train[c1].values
    # Only compare against the columns that come after c1
    for c2 in cols[i:]:
        if c1 == c2:
            continue
        share_equal = np.sum((left_values == train[c2].values).astype(int)) / n_rows
        if share_equal > 0.95:
            duplicates.append(c2)
            print(c1, c2, share_equal)

# A column can match several earlier ones; keep each name once
duplicates = list(set(duplicates))
print(duplicates)
drop_col = duplicates


# The search above is slow; the list can be hard-coded instead:
# drop_col = ['V300', 'V309', 'V111', 'C3', 'V124', 'V106', 
#             'V125', 'V315', 'V134', 'V102', 'V123', 'V316', 'V113', 'V136', 
#             'V305', 'V110', 'V299', 'V289', 'V286', 'V318', 'V103', 'V304',
#             'V116', 'V298', 'V284', 'V293', 'V137', 'V295', 'V301', 'V104', 
#             'V311', 'V115', 'V109', 'V119', 'V321', 'V114', 'V133', 'V122',
#             'V319', 'V105', 'V112', 'V118', 'V117', 'V121', 'V108', 'V135',
#             'V320', 'V303', 'V297', 'V120']

In [None]:
# Remove the near-duplicate columns from both datasets
train = train.drop(columns=drop_col)
test = test.drop(columns=drop_col)

train_size, test_size = len(train), len(test)

# Largest per-column count of missing values before filling
print('Max NA counts in train dataset is',train.isnull().sum().max())
print('Max NA counts in test dataset is',test.isnull().sum().max())

# Tree-based models do not require feature scaling.
# No sklearn LabelEncoder pass is done here: the qualitative columns were
# already integer-encoded with the shared `labels` table.

# Fill missing values after label encoding.
# The values in the original datasets are all positive, so a large negative
# number is used as the missing-value marker.
train, test = train.fillna(-999), test.fillna(-999)

# After filling, both counts should be zero
print('NA counts in train dataset now becomes',train.isnull().sum().max())
print('NA counts in test dataset now becomes',test.isnull().sum().max())

In [None]:
# Downcast the column dtypes of both datasets to cut memory usage
train, test = reduce_memory(train), reduce_memory(test)

In [None]:
# Directories where the per-fold models are written and later read back
xgb_path = './xgb_models_stack/'
lgb_path = './lgb_models_stack/'

# Create the model directories. exist_ok=True keeps the cell re-runnable,
# and without this step saving a model fails on a fresh checkout.
for model_dir in (xgb_path, lgb_path):
    os.makedirs(model_dir, exist_ok=True)

#XGBoost Model
def fit_xgb(X_fit, y_fit, X_val, y_val, counter, xgb_path, name):
    """
    Train one XGBoost fold model, pickle it, and return its validation scores.

    Parameters:
    -----------
    X_fit, y_fit :
        Features and target used for training.
    X_val, y_val :
        Features and target used as the early-stopping evaluation set and
        scored for the return value.
    counter : int
        Zero-based fold index; the file name uses ``counter + 1``.
    xgb_path : str
        Directory prefix for the saved model; it is concatenated directly with
        the file name, so it must end with a path separator.
    name : str
        File-name prefix of the saved model.

    Returns:
    --------
    Predicted probability of the positive class for every row of ``X_val``.
    """
    # -999 is treated as the missing-value marker by the model.
    model = xgb.XGBClassifier(n_estimators=1000, max_depth=9, learning_rate=0.02, subsample=0.7,
                              colsample_bytree=0.7, missing=-999, tree_method='hist')
    # NOTE(review): newer xgboost releases expect eval_metric and
    # early_stopping_rounds on the constructor rather than fit() -- confirm
    # against the installed version before upgrading.
    model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=0, eval_metric="auc", early_stopping_rounds=100)
    cv_val = model.predict_proba(X_val)[:, 1]
    #Save XGBoost Model
    save_to = '{}{}_fold{}.dat'.format(xgb_path, name, counter+1)
    # Context manager so the file handle is flushed and closed deterministically
    with open(save_to, "wb") as model_file:
        pickle.dump(model, model_file)
    return cv_val

#LightGBM Model
def fit_lgb(X_fit, y_fit, X_val, y_val, counter, lgb_path, name):
    """
    Train one LightGBM fold model, save its booster, and return its validation scores.

    Parameters:
    -----------
    X_fit, y_fit :
        Features and target used for training.
    X_val, y_val :
        Features and target used as the early-stopping evaluation set and
        scored for the return value.
    counter : int
        Zero-based fold index; the file name uses ``counter + 1``.
    lgb_path : str
        Directory prefix for the saved model; it is concatenated directly with
        the file name.
    name : str
        File-name prefix of the saved model.

    Returns:
    --------
    Predicted probability of the positive class for every row of ``X_val``.
    """
    params = dict(
        learning_rate=0.02,
        max_depth=9,
        boosting_type='gbdt',
        objective='binary',
        metric='auc',
        seed=4,
        num_iterations=2000,
        num_leaves=64,
        feature_fraction=0.4,
        bagging_fraction=0.4,
        bagging_freq=5,
    )
    fit_options = dict(eval_set=[(X_val, y_val)], verbose=200, early_stopping_rounds=100)

    model = lgb.LGBMClassifier(**params)
    model.fit(X_fit, y_fit, **fit_options)
    cv_val = model.predict_proba(X_val)[:, 1]

    # Persist the underlying booster as a text model file
    save_to = '{}{}_fold{}.txt'.format(lgb_path, name, counter+1)
    model.booster_.save_model(save_to)
    return cv_val

In [None]:
# Hold out 10% of the original training data for validation; the remaining
# 90% is what the K-fold cross-validation below is run on.
NumFold=5
X_train_, X_val_, y_train_, y_val_ = train_test_split(
    train, y,
    random_state=42,
    test_size=0.1,
)
# Shuffled, stratified folds with a fixed seed
skf = StratifiedKFold(random_state=42, shuffle=True, n_splits=NumFold)
# del train,y

In [None]:
%%time
xgb_cv_result = np.zeros(X_train_.shape[0])
print('\nModel Fitting...')
for counter, (tr_idx, val_idx) in enumerate(skf.split(X_train_, y_train_)):
    print('\nFold {}'.format(counter+1))
    X_fit, y_fit = X_train_.iloc[tr_idx,:], y_train_.iloc[tr_idx]
    X_val, y_val = X_train_.iloc[val_idx,:], y_train_.iloc[val_idx]

    print('XGBoost')
    xgb_cv_result[val_idx] = fit_xgb(X_fit, y_fit, X_val, y_val, counter, lgb_path , name='xgb')

    del X_fit, X_val, y_fit, y_val
    # Free meomory by running garbarge collector
    gc.collect()

from sklearn.metrics import roc_auc_score
auc_xgb  = round(roc_auc_score(y_train_, xgb_cv_result),4)
print('\nXGBoost  VAL AUC: {}'.format(auc_xgb))

In [None]:
%%time
lgb_cv_result = np.zeros(X_train_.shape[0])
for counter, (tr_idx, val_idx) in enumerate(skf.split(X_train_, y_train_)):
    print('\nFold {}'.format(counter+1))
    X_fit, y_fit = X_train_.iloc[tr_idx,:], y_train_.iloc[tr_idx]
    X_val, y_val = X_train_.iloc[val_idx,:], y_train_.iloc[val_idx]

    print('LigthGBM')
    lgb_cv_result[val_idx] = fit_lgb(X_fit, y_fit, X_val, y_val, counter, lgb_path, name='lgb')

    del X_fit, X_val, y_fit, y_val
    # Free meomory by running garbarge collector
    gc.collect()

from sklearn.metrics import roc_auc_score
auc_lgb  = round(roc_auc_score(y_train_, lgb_cv_result),4)
print('\nLGBoost  TRAIN AUC: {}'.format(auc_lgb))

In [None]:
%%time
xgb_models = sorted(os.listdir(xgb_path))
xgb_result_val = np.zeros(X_val_.shape[0])
xgb_result_test = np.zeros(test.shape[0])

print('With XGBoost...')    
for m_name in xgb_models:
    #Load Xgboost Model
    model = pickle.load(open('{}{}'.format(xgb_path, m_name), "rb"))
    xgb_result_val += model.predict_proba(X_val_)[:,1]
    xgb_result_test += model.predict_proba(test)[:,1]
del model
xgb_result_val /= len(xgb_models)
xgb_result_test /= len(xgb_models)

auc_xgb  = round(roc_auc_score(y_val_, xgb_result_val),4)
print('\nXGBoost  VAL AUC: {}'.format(auc_xgb))

In [None]:
%%time
from sklearn.metrics import roc_auc_score
lgb_models = sorted(os.listdir(lgb_path))
lgb_result_val = np.zeros(X_val_.shape[0])
lgb_result_test = np.zeros(test.shape[0])

print('With LightGBM...')   
for m_name in lgb_models:
    #Load LightGBM Model
    model = lgb.Booster(model_file='{}{}'.format(lgb_path, m_name))
    lgb_result_val += model.predict(X_val_)
    lgb_result_test += model.predict(test)
del model
lgb_result_val /= len(lgb_models)
lgb_result_test /= len(lgb_models)

auc_lgb  = round(roc_auc_score(y_val_, lgb_result_val),4)
print('\nLGBoost  VAL AUC: {}'.format(auc_lgb))

In [None]:
# Submitting results
# Only the averaged LightGBM test predictions are written to the submission file.
submission = pd.read_csv('sample_submission.csv', index_col='TransactionID')

# Align the predictions with the submission rows by TransactionID instead of
# by position, so a different row order in sample_submission.csv cannot
# silently attach scores to the wrong transactions.
# NOTE(review): assumes the TransactionID index of `test` is unique -- confirm.
if not submission.index.isin(test.index).all():
    raise ValueError('sample_submission.csv contains TransactionIDs that are not in test')
submission['isFraud'] = pd.Series(lgb_result_test, index=test.index).reindex(submission.index)
submission.to_csv('lgb_finer_submission.csv')