## Importing necessary libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Standard plotly imports
#import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode
#import cufflinks
#import cufflinks as cf
import plotly.figure_factory as ff

# Using plotly + cufflinks in offline mode
init_notebook_mode(connected=True)
#cufflinks.go_offline(connected=True)

# Preprocessing, modelling and evaluating
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from xgboost import XGBClassifier
import xgboost as xgb

## Hyperopt modules
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from functools import partial

import os
import gc
# print(os.listdir("../input"))

# Importing train datasets

In [2]:
## Reducing memory
# df_trans = reduce_mem_usage(df_trans)
# df_id = reduce_mem_usage(df_id)

### Modelling

In [None]:
# Load the raw transaction and identity tables for train and test.
df_trans = pd.read_csv('data/train_transaction.csv')
df_test_trans = pd.read_csv('data/test_transaction.csv')

df_id = pd.read_csv('data/train_identity.csv')
df_test_id = pd.read_csv('data/test_identity.csv')

# The submission template is indexed by TransactionID.
sample_submission = pd.read_csv('data/sample_submission.csv', index_col='TransactionID')

# Left-join identity onto transactions using the TransactionID key only.
# Passing left_index/right_index together with `on` is contradictory: recent
# pandas rejects it with a MergeError, and older versions joined on the row
# position instead of the key, pairing identity rows with the wrong transactions.
df_train = df_trans.merge(df_id, how='left', on='TransactionID')
df_test = df_test_trans.merge(df_test_id, how='left', on='TransactionID')

print(df_train.shape)
print(df_test.shape)

# y_train = df_train['isFraud'].copy()
# Free the un-merged inputs; only the merged frames are used below.
del df_trans, df_id, df_test_trans, df_test_id


# reducing memory usage

In [None]:
# df_train = reduce_mem_usage(df_train)
# df_test = reduce_mem_usage(df_test)

# Mapping emails

In [None]:

# emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 
#           'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft',
#           'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo',
#           'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 
#           'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink',
#           'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 'gmx.de': 'other',
#           'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 
#           'protonmail.com': 'other', 'hotmail.fr': 'microsoft', 'windstream.net': 'other', 
#           'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo',
#           'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other',
#           'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft',
#           'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 
#           'prodigy.net.mx': 'att', 'frontier.com': 'yahoo', 'anonymous.com': 'other', 
#           'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo', 
#           'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 
#           'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 'cableone.net': 'other', 
#           'hotmail.es': 'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 
#           'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other',
#           'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}

# us_emails = ['gmail', 'net', 'edu']

# # https://www.kaggle.com/c/ieee-fraud-detection/discussion/100499#latest-579654
# for c in ['P_emaildomain', 'R_emaildomain']:
#     df_train[c + '_bin'] = df_train[c].map(emails)
#     df_test[c + '_bin'] = df_test[c].map(emails)
    
#     df_train[c + '_suffix'] = df_train[c].map(lambda x: str(x).split('.')[-1]) # find .com, ,mx ..
#     df_test[c + '_suffix'] = df_test[c].map(lambda x: str(x).split('.')[-1])
    
#     df_train[c + '_suffix'] = df_train[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')
#     df_test[c + '_suffix'] = df_test[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us') 
#     # if not US,keep it as it is; if US, turn into suffix 

# Hours and Days 

In [7]:
def day_encoder(df, offset = 0, colname = 'TransactionDT'):
    """Bucket a time-delta column into a repeating 0-6 day index.

    Parameters
    ----------
    df : DataFrame (or any mapping) from which ``df[colname]`` is read.
    offset : number added, in days, before flooring; shifts where each
        day boundary falls.
    colname : name of the column to encode. The values are divided by
        86400, i.e. they are treated as seconds.

    Returns
    -------
    ``floor(df[colname] / 86400 - 1 + offset) % 7``, element-wise.
    """
    seconds_per_day = 3600 * 24
    elapsed_days = df[colname] / seconds_per_day
    # Keep the "- 1 + offset" evaluation order so float rounding is unchanged.
    return np.floor(elapsed_days - 1 + offset) % 7

def hour_encoder(df, colname = 'TransactionDT'):
    """Bucket a time-delta column into a repeating 0-23 hour index.

    Parameters
    ----------
    df : DataFrame (or any mapping) from which ``df[colname]`` is read.
    colname : name of the column to encode. The values are divided by
        3600, i.e. they are treated as seconds.

    Returns
    -------
    ``floor(df[colname] / 3600) % 24``, element-wise.
    """
    # Read from the frame that was passed in. The previous version always
    # used the global df_train, so every caller got training-set hours.
    hours = df[colname] / 3600
    encode_hours = np.floor(hours) % 24
    return encode_hours


In [8]:
# Add the day-of-week bucket to both frames, using the same 0.58-day offset.
for frame in (df_train, df_test):
    frame['weekday'] = day_encoder(frame, offset=0.58)

In [9]:
# Add the hour-of-day bucket to both frames.
for frame in (df_train, df_test):
    frame['hours'] = hour_encoder(frame)

# Encoding categorical features

In [10]:
# Label-encode every non-target column that is text-typed in either frame.
# The encoder is fitted on train and test values together so both frames
# share one integer mapping.
feature_cols = df_train.drop('isFraud', axis=1).columns
for f in feature_cols:
    is_text = df_train[f].dtype == 'object' or df_test[f].dtype == 'object'
    if not is_text:
        continue
    train_vals = list(df_train[f].values)
    test_vals = list(df_test[f].values)
    lbl = preprocessing.LabelEncoder()
    lbl.fit(train_vals + test_vals)
    df_train[f] = lbl.transform(train_vals)
    df_test[f] = lbl.transform(test_vals)

# Some feature engineering

In [11]:
# Centre and scale TransactionAmt.
# The mean and standard deviation come from the training frame only and are
# applied to both frames, so a given amount maps to the same feature value in
# train and test. Previously the test frame used its own statistics, which
# shifted the feature relative to what the model was trained on.
amt_mean = df_train['TransactionAmt'].mean()
amt_std = df_train['TransactionAmt'].std()
for frame in (df_train, df_test):
    frame['Trans_min_mean'] = frame['TransactionAmt'] - amt_mean
    frame['Trans_min_std'] = frame['Trans_min_mean'] / amt_std

In [None]:
# df_train['TransactionAmt_to_mean_card1'] = df_train['TransactionAmt'] / df_train.groupby(['card1'])['TransactionAmt'].transform('mean')
# df_train['TransactionAmt_to_mean_card4'] = df_train['TransactionAmt'] / df_train.groupby(['card4'])['TransactionAmt'].transform('mean')
# df_train['TransactionAmt_to_std_card1'] = df_train['TransactionAmt'] / df_train.groupby(['card1'])['TransactionAmt'].transform('std')
# df_train['TransactionAmt_to_std_card4'] = df_train['TransactionAmt'] / df_train.groupby(['card4'])['TransactionAmt'].transform('std')

# df_test['TransactionAmt_to_mean_card1'] = df_test['TransactionAmt'] / df_test.groupby(['card1'])['TransactionAmt'].transform('mean')
# df_test['TransactionAmt_to_mean_card4'] = df_test['TransactionAmt'] / df_test.groupby(['card4'])['TransactionAmt'].transform('mean')
# df_test['TransactionAmt_to_std_card1'] = df_test['TransactionAmt'] / df_test.groupby(['card1'])['TransactionAmt'].transform('std')
# df_test['TransactionAmt_to_std_card4'] = df_test['TransactionAmt'] / df_test.groupby(['card4'])['TransactionAmt'].transform('std')

In [12]:
# Replace TransactionAmt with its natural logarithm in both frames.
# Not idempotent: re-running this cell applies the log a second time.
for frame in (df_train, df_test):
    frame['TransactionAmt'] = np.log(frame['TransactionAmt'])

# Concatenating dfs to get PCA of V features

In [13]:
# Give test rows the placeholder label 'test' so they can be told apart from
# training rows once both frames are stacked.
df_test['isFraud'] = 'test'
# Stack train on top of test, keep the existing column order, and renumber
# the rows 0..n-1.
df = pd.concat([df_train, df_test], axis=0, sort=False).reset_index(drop=True)

In [14]:
def PCA_change(df, cols, n_components, prefix='PCA_', rand_seed=4):
    """Replace the columns ``cols`` of ``df`` with their principal components.

    Parameters
    ----------
    df : DataFrame holding the columns to compress. NOTE: ``cols`` are
        dropped from this frame in place, so the caller's object is modified.
    cols : labels of the columns fed to PCA and then removed.
    n_components : forwarded to ``PCA(n_components=...)``.
    prefix : the new columns are named ``prefix + '0'``, ``prefix + '1'``, ...
    rand_seed : forwarded to ``PCA(random_state=...)``.

    Returns
    -------
    A new DataFrame: the remaining columns of ``df`` followed by the
    component columns.
    """
    pca = PCA(n_components=n_components, random_state=rand_seed)

    principalComponents = pca.fit_transform(df[cols])

    # Build the component frame on df's own index. With a default 0..n-1
    # index the axis=1 concat below aligns on labels, so any df whose index
    # is not exactly 0..n-1 would get misaligned rows padded with NaN.
    principalDf = pd.DataFrame(
        principalComponents,
        index=df.index,
        columns=[str(prefix) + str(i) for i in range(principalComponents.shape[1])],
    )

    df.drop(cols, axis=1, inplace=True)

    return pd.concat([df, principalDf], axis=1)

In [15]:
# Columns to compress with PCA: the "V" features, i.e. every column named
# "V" followed only by digits. Selecting by name replaces the positional slice
# [55:394], which silently picks different columns if the layout ever changes.
# NOTE(review): assumes that slice covered exactly the V<number> columns - confirm.
mas_v = df_train.columns[
    [str(c).startswith('V') and str(c)[1:].isdigit() for c in df_train.columns]
]

# Getting PCA 

In [16]:
from sklearn.preprocessing import minmax_scale
from sklearn.decomposition import PCA
# from sklearn.cluster import KMeans

# Prepare each V column for PCA: fill gaps with a value below the observed
# minimum (min - 2), then rescale the column to the [0, 1] range.
for col in mas_v:
    below_min = df[col].min() - 2
    filled = df[col].fillna(below_min)
    df[col] = minmax_scale(filled, feature_range=(0, 1))

# Swap the V columns for 30 principal components named PCA_V_0 .. PCA_V_29.
df = PCA_change(df, mas_v, n_components=30, prefix='PCA_V_')

In [None]:
# df = reduce_mem_usage(df)

# Setting train and test back

In [17]:
# Undo the stacking: rows labelled 'test' go back to the test frame (without
# the placeholder label column), everything else is training data.
is_test_row = df['isFraud'] == 'test'
df_train = df[~is_test_row]
df_test = df[is_test_row].drop('isFraud', axis=1)

In [18]:
# Display (rows, columns) of the training frame as a quick sanity check.
df_train.shape

(590540, 129)

# Setting X and y

In [19]:
# Order the training rows chronologically once and derive both the feature
# matrix and the labels from that single sorted frame. The sort is stable
# (mergesort), so rows that share a TransactionDT keep their existing relative
# order instead of being shuffled by the default unstable quicksort.
train_sorted = df_train.sort_values('TransactionDT', kind='mergesort')
X_train = train_sorted.drop(['isFraud',
                             'TransactionDT',
                             #'Card_ID'
                            ],
                            axis=1)
y_train = train_sorted['isFraud'].astype(bool)

# Same stable chronological ordering for the test features.
X_test = df_test.sort_values('TransactionDT', kind='mergesort').drop(['TransactionDT',
                                                                      #'Card_ID'
                                                                     ],
                                                                     axis=1)
# Release the full training frames; only X_train / y_train are needed from here.
del df_train, train_sorted
# Keep just the timestamp column of the test frame.
df_test = df_test[["TransactionDT"]]

# JC trying LightGBM grid search 

In [None]:
# from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
# import lightgbm as lgb

In [None]:
# search = {
#     "verbosity": [-5],
#     "num_boost_round": [250, 750,1500],
#     "max_depth": [250, 500, 750],
#     "learning_rate": np.linspace(1e-3, 1,5),
#     "num_leaves": [5,10,20],
#     "min_child_samples": [10,30,50],
#     "min_child_weight": [1e-3],
#     "subsample": np.linspace(0.1,1,5),  # subsample = bagging
# #     "subsample_freq": [],
#     "colsample_bytree": [0.5, 0.7,1.0]  # feature fraction: "mtry"
#     #"max_delta_step": -1,
# #     "reg_alpha": [0.05],      # L1
# #     "reg_lambda": [2]  # ,   # L2
#     # "min_split_gain": 0.0,
#     # "drop_rate": 0.1, # dart only
#     # "max_drop": 50, # dart only
#     # "skip_drop": 0.5, # dart only
#     # "uniform_drop": False, # dart only
#     # "top_rate": 0.2, # goss only
#     # "other_rate": 0.1, # goss only
#     # "min_data_per_group": 100,
#     # "max_cat_threshold": 32,
#     # "cat_l2": 10.0,
#     # "cat_smooth": 10.0,
#     # "max_cat_to_onehot": 4,
#     # "topk": 20, # larger -> more accurate but slow
    
# }

# ## n_jobs sets number of cores to parallelize on, set to 4 if you have 8 cores if you are getting bottlenecked
# ## verbose defines how much information is printed to console during search

# tss = TimeSeriesSplit(n_splits=7)
# model_grid = GridSearchCV(lgb.LGBMRegressor(
#     objective="binary", metric="auc", boosting_type="gbdt", device_type="cpu", tree_learner="feature"),
#     search, cv=tss)

# model_grid.fit(X_train,y_train)

# params = model_grid.best_params_

# print(f'Best grid search parameters:\n      {params}')
# print(f'Best grid search loss:\n         {model_grid.best_score_}')


# Defining the HyperOpt function with parameters space and model

In [None]:
# from sklearn.model_selection import TimeSeriesSplit
# from sklearn.metrics import roc_auc_score
# from xgboost import plot_importance
# from sklearn.metrics import make_scorer

# import time
# def objective(params, FOLDS = 7):
#     time1 = time.time()
#     params = {
#         'max_depth': int(params['max_depth']),
#         'gamma': "{:.3f}".format(params['gamma']),
#         'subsample': "{:.2f}".format(params['subsample']),
#         'reg_alpha': "{:.3f}".format(params['reg_alpha']),
#         'reg_lambda': "{:.3f}".format(params['reg_lambda']),
#         'learning_rate': "{:.3f}".format(params['learning_rate']),
#         'num_leaves': '{:.3f}'.format(params['num_leaves']),
#         'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
#         'min_child_samples': '{:.3f}'.format(params['min_child_samples']),
#         'feature_fraction': '{:.3f}'.format(params['feature_fraction']),
#         'bagging_fraction': '{:.3f}'.format(params['bagging_fraction'])
#     }

#     print("\n############## New Run ################")
#     print(f"params = {params}")
    
#     count=1

#     tss = TimeSeriesSplit(n_splits=FOLDS)
#     y_preds = np.zeros(sample_submission.shape[0])
#     y_oof = np.zeros(X_train.shape[0])
#     score_mean = 0
#     for tr_idx, val_idx in tss.split(X_train, y_train):
#         clf = xgb.XGBClassifier(
#             n_estimators=600, random_state=4, verbose=True,  
#             **params
#         )

#         X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
#         y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        
#         clf.fit(X_tr, y_tr)
#         #y_pred_train = clf.predict_proba(X_vl)[:,1]
#         #print(y_pred_train)
#         score = make_scorer(roc_auc_score, needs_proba=True)(clf, X_vl, y_vl)
#         # plt.show()
#         score_mean += score
#         print(f'{count} CV - score: {round(score, 4)}')
#         count += 1
#     time2 = time.time() - time1
#     print(f"Total Time Run: {round(time2 / 60,2)}")
#     gc.collect()
#     print(f'Mean ROC_AUC: {score_mean / FOLDS}')
#     del X_tr, X_vl, y_tr, y_vl, clf, score
#     return -(score_mean / FOLDS)


# space = {
#     # The maximum depth of a tree, same as GBM.
#     # Used to control over-fitting as higher depth will allow model 
#     # to learn relations very specific to a particular sample.
#     # Should be tuned using CV.
#     # Typical values: 3-10
#     'max_depth': hp.quniform('max_depth', 7, 23, 1),
    
#     # reg_alpha: L1 regularization term. L1 regularization encourages sparsity 
#     # (meaning pulling weights to 0). It can be more useful when the objective
#     # is logistic regression since you might need help with feature selection.
#     'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    
#     # reg_lambda: L2 regularization term. L2 encourages smaller weights, this
#     # approach can be more useful in tree-models where zeroing 
#     # features might not make much sense.
#     'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    
#     # eta: Analogous to learning rate in GBM
#     # Makes the model more robust by shrinking the weights on each step
#     # Typical final values to be used: 0.01-0.2
#     'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    
#     # colsample_bytree: Similar to max_features in GBM. Denotes the 
#     # fraction of columns to be randomly samples for each tree.
#     # Typical values: 0.5-1
#     'colsample_bytree': hp.uniform('colsample_bytree', 0.3, .9),
    
#     # A node is split only when the resulting split gives a positive
#     # reduction in the loss function. Gamma specifies the 
#     # minimum loss reduction required to make a split.
#     # Makes the algorithm conservative. The values can vary depending on the loss function and should be tuned.
#     'gamma': hp.uniform('gamma', 0.01, .7),
    
#     # more increases accuracy, but may lead to overfitting.
#     # num_leaves: the number of leaf nodes to use. Having a large number 
#     # of leaves will improve accuracy, but will also lead to overfitting.
#     'num_leaves': hp.choice('num_leaves', list(range(20, 250, 10))),
    
#     # specifies the minimum samples per leaf node.
#     # the minimum number of samples (data) to group into a leaf. 
#     # The parameter can greatly assist with overfitting: larger sample
#     # sizes per leaf will reduce overfitting (but may lead to under-fitting).
#     'min_child_samples': hp.choice('min_child_samples', list(range(100, 250, 10))),
    
#     # subsample: represents a fraction of the rows (observations) to be 
#     # considered when building each subtree. Tianqi Chen and Carlos Guestrin
#     # in their paper A Scalable Tree Boosting System recommend 
#     'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    
#     # randomly select a fraction of the features.
#     # feature_fraction: controls the subsampling of features used
#     # for training (as opposed to subsampling the actual training data in 
#     # the case of bagging). Smaller fractions reduce overfitting.
#     'feature_fraction': hp.uniform('feature_fraction', 0.4, .8),
    
#     # randomly bag or subsample training data.
#     'bagging_fraction': hp.uniform('bagging_fraction', 0.4, .9)
    
#     # bagging_fraction and bagging_freq: enables bagging (subsampling) 
#     # of the training data. Both values need to be set for bagging to be used.
#     # The frequency controls how often (iteration) bagging is used. Smaller
#     # fractions and frequencies reduce overfitting.
# }


# Running the optimizer

In [None]:
# # Set algoritm parameters
# best = fmin(fn=objective,
#             space=space,
#             algo=tpe.suggest,
#             max_evals=27)

# # Print best parameters
# best_params = space_eval(space, best)

# Best parameters

In [None]:
# print("BEST PARAMS: ", best_params)

# best_params['max_depth'] = int(best_params['max_depth'])

# Training and Predicting with best Parameters

## Predicting X test

In [None]:
# Encode remaining NaNs with the sentinel -999, which the classifier is told
# to treat as "missing" via the `missing` argument below.
X_train = X_train.fillna(-999)
X_test = X_test.fillna(-999)
clf = xgb.XGBClassifier(n_estimators = 500, 
                       n_jobs = 4, 
                       max_depth = 9, 
                       learning_rate = 0.05, 
                       # Row subsampling per tree. This was spelt `sub_sample`,
                       # which is not an XGBoost parameter, so the setting was
                       # never applied.
                       subsample = 0.9, 
                       colsample_bytree = 0.9, 
                       missing = -999)


clf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class.
y_preds = clf.predict_proba(X_test)[:,1] 

# Top 20 Feature importance

In [None]:
# Per-feature importance scores from the fitted booster, using the "weight"
# importance type.
feature_important = clf.get_booster().get_score(importance_type="weight")
keys = [*feature_important.keys()]
values = [*feature_important.values()]

# One row per feature, highest score first.
data = pd.DataFrame({"score": values}, index=keys).sort_values(by="score", ascending=False)

# Show the 20 highest-scoring features
data.head(20)

## Writing y_pred to csv

In [None]:
# X_test was re-sorted by TransactionDT, so its row order is not guaranteed to
# match the submission template. Attach each prediction to its TransactionID
# and let pandas align on the template's TransactionID index, instead of
# assigning by row position.
preds_by_id = pd.Series(y_preds, index=X_test['TransactionID'].to_numpy())
sample_submission['isFraud'] = preds_by_id

# A NaN here means some template TransactionID received no prediction;
# stop rather than write an incomplete submission.
if sample_submission['isFraud'].isna().any():
    raise ValueError("Some TransactionIDs in sample_submission have no prediction")

sample_submission.to_csv('XGB_hypopt_model_wocardnorm.csv')