In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

import xgboost as xgb
from xgboost import XGBClassifier



In [2]:
pwd

'/home/ubuntu/nbs/Brainwaves'

In [4]:
# Load both competition files up front so that the inspection cells that
# follow (train.shape, test.shape, ...) work on a freshly restarted kernel;
# previously `test` was only defined much further down, so `test.shape`
# raised NameError when the notebook was run top to bottom.
# The test path is the same one the later test-preprocessing cell reads.
train = pd.read_csv('./FraudSubmission/train.csv')
test = pd.read_csv('./FraudSubmission/test.csv')

In [None]:
train.shape

In [None]:
test.shape

In [None]:
train.info()

In [None]:
# Histogram of the target column (kde=False: bin counts only, no density curve).
sns.distplot(train['target'], kde=False)

In [None]:
train.head()

In [16]:
train.columns

Index(['transaction_id', 'num_var_1', 'num_var_2', 'num_var_3', 'num_var_4',
       'num_var_5', 'num_var_6', 'num_var_7', 'cat_var_1', 'cat_var_2',
       'cat_var_3', 'cat_var_4', 'cat_var_5', 'cat_var_6', 'cat_var_7',
       'cat_var_8', 'cat_var_9', 'cat_var_10', 'cat_var_11', 'cat_var_12',
       'cat_var_13', 'cat_var_14', 'cat_var_15', 'cat_var_16', 'cat_var_17',
       'cat_var_18', 'cat_var_19', 'cat_var_20', 'cat_var_21', 'cat_var_22',
       'cat_var_23', 'cat_var_24', 'cat_var_25', 'cat_var_26', 'cat_var_27',
       'cat_var_28', 'cat_var_29', 'cat_var_30', 'cat_var_31', 'cat_var_32',
       'cat_var_33', 'cat_var_34', 'cat_var_35', 'cat_var_36', 'cat_var_37',
       'cat_var_38', 'cat_var_39', 'cat_var_40', 'cat_var_41', 'cat_var_42',
       'target'],
      dtype='object')

In [19]:
# Missing-value summary for train: per column, the number of nulls ('Total')
# and the fraction of rows that are null ('Percent'), each sorted from the
# largest value down. Only the first five rows are displayed.
null_mask = train.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], keys=['Total', 'Percent'], axis=1)
missing_data.head()

Unnamed: 0,Total,Percent
target,0,0.0
cat_var_5,0,0.0
cat_var_15,0,0.0
cat_var_14,0,0.0
cat_var_13,0,0.0


In [5]:
# Replace missing categorical values with a fixed category per column:
#   cat_var_1 -> 'gf'
#   cat_var_3 -> 'qt'
#   cat_var_8 -> 'dn'
# The filled frame is assigned back to `train` instead of calling
# fillna(inplace=True) on a column selection: in-place mutation of a selected
# column is not guaranteed by pandas to write through to the parent frame
# (it warns under recent versions and is a no-op under copy-on-write).
train = train.fillna(value={'cat_var_1': 'gf', 'cat_var_3': 'qt', 'cat_var_8': 'dn'})

In [6]:
# Class balance of the target column.
# value_counts() orders its result by frequency, not by label, so unpacking
# it positionally only yields (class 0, class 1) while class 0 is the more
# frequent one. Looking the counts up by label keeps the names correct
# regardless of the balance (e.g. if this cell is re-run after resampling).
class_counts = train['target'].value_counts()
count_class_0 = class_counts.loc[0]
count_class_1 = class_counts.loc[1]
print(count_class_0, count_class_1)
# Ratio of class-1 rows to class-0 rows.
print(count_class_1/count_class_0)

311610 37368
0.119919129681


In [14]:
# Two fifths of the class-0 count; the resampling cell truncates this same
# expression to an int and uses it as the number of class-1 rows to draw.
# NOTE(review): the stored output is not 2/5 of the 311610 printed by the
# previous cell, so it presumably comes from a different kernel state --
# re-run to refresh it.
(count_class_0*2)/5

35726.800000000003

In [7]:
# Rebalance the training data by oversampling class 1.
#
# Every class-0 row is kept. The class-1 rows are replaced by a sample drawn
# WITH replacement whose size is 2/5 of the class-0 count. The two parts are
# concatenated, shuffled, and the result is bound to `train`.
#
# Both random steps are seeded (0, the seed the models in this notebook use)
# so that a re-run reproduces the same frame; previously neither the sample
# nor the shuffle was seeded.
#
# NOTE(review): this cell rebinds `train`, so running it twice resamples the
# already-resampled frame. Re-run the loading cell first if it must be redone.
# NOTE(review): cross_val_score is later run on this oversampled frame, so
# copies of the same class-1 row can fall into both the training and the
# validation part of a fold, which makes the CV scores optimistic.

# Class counts, looked up by label (value_counts() is ordered by frequency,
# so positional unpacking would depend on which class is larger).
class_counts = train['target'].value_counts()
count_class_0 = class_counts.loc[0]
count_class_1 = class_counts.loc[1]

# Divide by class
df_class_0 = train[train['target'] == 0]
df_class_1 = train[train['target'] == 1]

df_class_1_over = df_class_1.sample(int((count_class_0*2)/5), replace=True, random_state=0)

train_over = pd.concat([df_class_0, df_class_1_over], axis=0)

# Shuffle the rows (frac=1 returns every row in a random order).
train_over = train_over.sample(frac=1, random_state=0)

train = train_over

print(train.shape)

(436254, 51)


In [8]:
# Columns whose name starts with 'cat'.
cat_cols = [name for name in train.columns if name.startswith('cat')]

In [21]:
#cat_cols

In [9]:
# Model inputs: every column of train except 'transaction_id' and 'target',
# in their original column order.
train_features = [name for name in train.columns
                  if name not in ('transaction_id', 'target')]

In [10]:
# Integer-encode every object-dtype (string) feature column of train in place.
# Each fitted encoder is kept in `label_encoders`, keyed by column name, so the
# exact value -> code mapping learned here stays available for any other frame
# that has to be scored by a model trained on these codes.
label_encoders = {}
for column in train_features:
    # Compare against `object` directly. The previous `type(object)` is the
    # builtin `type`, which appears to match object columns only because
    # NumPy coerces arbitrary Python classes to the object dtype.
    if train[column].dtype == object:
        le = LabelEncoder()
        train[column] = le.fit_transform(train[column])
        label_encoders[column] = le

In [11]:
# Copy of train with each column in cat_cols expanded into indicator columns.
# NOTE(review): df_with_dummies is not referenced by any later cell shown
# here; the models below are built from `train` directly.
df_with_dummies = pd.get_dummies(data=train, columns=cat_cols)

In [27]:
#df_with_dummies['cat_var_1']

In [12]:
# Label vector and feature matrix as plain NumPy arrays.
# (The matching test matrix T is built further down, after test is preprocessed.)
y = train['target'].values
X = train.loc[:, train_features].values

In [15]:
train.head()

Unnamed: 0,transaction_id,num_var_1,num_var_2,num_var_3,num_var_4,num_var_5,num_var_6,num_var_7,cat_var_1,cat_var_2,...,cat_var_34,cat_var_35,cat_var_36,cat_var_37,cat_var_38,cat_var_39,cat_var_40,cat_var_41,cat_var_42,target
336287,id_819275,1.795395e-06,0.08914,0.0,3.55e-07,4.671053e-08,1.795395e-06,8.937571e-07,127,3,...,0,0,0,0,0,0,0,0,0,0
91239,id_570764,5.526316e-08,0.089669,0.0,3.55e-07,4.671053e-08,4.407895e-08,2.389725e-08,127,3,...,0,0,0,0,0,0,0,0,0,0
116852,id_731214,6.578947e-09,0.083007,0.0,3.55e-07,4.671053e-08,4.407895e-08,2.23041e-09,127,3,...,0,0,0,0,0,0,0,0,0,0
212858,id_305238,1.644737e-07,0.085439,0.0,3.55e-07,4.671053e-08,4.407895e-08,3.026985e-08,127,3,...,0,0,0,0,0,0,0,0,0,0
4681,id_29605,4.013158e-08,0.348419,0.0,1.086e-05,1.176316e-06,4.539474e-08,1.943643e-08,139,3,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# 5-fold cross-validated ROC AUC of a 200-tree random forest on X / y.
# Prints the per-fold scores followed by their mean.
stacker = RandomForestClassifier(random_state=0, n_estimators=200)
results = cross_val_score(estimator=stacker, X=X, y=y, scoring='roc_auc', cv=5)
print(results)
print("Stacker score: {} ".format(np.mean(results)))

[ 0.90779097  0.90668958  0.90655885  0.90466818  0.90613218]
Stacker score: 0.9063679517998906 


In [27]:
# Fit the forest on the whole of X / y; this fitted model is the one used
# further down to score the test matrix T.
stacker.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [17]:
# Load the test file. The cells that follow repeat on `test` the steps applied
# to `train` above (missing-value summary, fillna, label encoding).
test = pd.read_csv('./FraudSubmission/test.csv')

In [33]:
test.shape

(523466, 50)

In [18]:
# Missing-value summary for test: per column, the number of nulls ('Total')
# and the fraction of rows that are null ('Percent'), each sorted from the
# largest value down. Only the first five rows are displayed.
null_mask = test.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], keys=['Total', 'Percent'], axis=1)
missing_data.head()

Unnamed: 0,Total,Percent
cat_var_3,53362,0.10194
cat_var_6,21943,0.041919
cat_var_1,18692,0.035708
cat_var_8,8138,0.015546
cat_var_42,0,0.0


In [23]:
#test['cat_var_6'].value_counts()

In [24]:
# Preprocess the test frame and build the test matrix T.

# Fill missing categorical values with a fixed category per column. The
# filled frame is assigned back to `test` instead of calling
# fillna(inplace=True) on a column selection, which pandas does not guarantee
# will write through to the parent frame.
test = test.fillna(value={
    'cat_var_1': 'gf',
    'cat_var_3': 'qt',
    'cat_var_6': 'zs',
    'cat_var_8': 'dn',
})

# Integer-encode every object-dtype (string) feature column.
# NOTE(review): a fresh LabelEncoder is fitted on the TEST values of each
# column here. LabelEncoder numbers the sorted distinct values it is fitted
# on, so whenever the set of categories in test differs from the set seen in
# train, the same category receives a different code than the one the model
# was trained on. The encoders fitted on train should be reused here instead;
# that needs them to be kept when train is encoded -- TODO.
for column in train_features:
    # `object`, not `type(object)` (which is the builtin `type`).
    if test[column].dtype == object:
        le = LabelEncoder()
        test[column] = le.fit_transform(test[column])

# Same column order as the training matrix X.
T = test[train_features].values

In [25]:
# Transaction ids of the test rows; written to the submission file next to the predictions.
id_test = test['transaction_id'].values

In [31]:
# Second predict_proba column for each row of T, i.e. the predicted
# probability of the second class in stacker.classes_ (class 1 for 0/1 labels).
y_pred = stacker.predict_proba(T)[:,1]

In [32]:
# Submission file: one row per test transaction with its predicted score,
# columns in the order transaction_id, target; the index is not written.
sub = pd.DataFrame(
    {'transaction_id': id_test, 'target': y_pred},
    columns=['transaction_id', 'target'],
)
sub.to_csv('./FraudSubmission/sub_rf_200_no_dummies.csv', index=False)

print('completed')

completed


In [None]:
# 5-fold cross-validated ROC AUC of a 550-round XGBoost classifier on X / y.
# Note that this rebinds `stacker`, replacing whatever model it held before.
stacker = XGBClassifier(n_estimators=550,seed=0)
results = cross_val_score(stacker, X, y, cv=5, scoring='roc_auc')
print(results)
# The message used to end in a dangling "for num: " with nothing substituted
# after it; it now uses the same wording as the random-forest CV cell.
print("Stacker score: {} ".format(results.mean()))

In [None]:
# Feature importance from a natively trained XGBoost booster.
#
# X / y are split 75/25; a booster is trained on the 75% part while the AUC on
# both parts is logged every 5 rounds, and the importance of up to 50 features
# is plotted.
features = train_features

# Booster parameters for xgb.train.
# `n_estimators` has been removed from this dict: it is a parameter of the
# scikit-learn wrapper (XGBClassifier), not of the native booster, so it had
# no effect here and contradicted `num_rounds` below, which is what actually
# sets the number of boosting rounds.
# NOTE(review): newer XGBoost releases replace `silent` with `verbosity` --
# confirm against the installed version.
xgb_params = {
    'eta': 0.05,
    'max_depth': 4,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'binary:logistic',
    'min_child_weight':1,
    'silent': 1,
    'seed':0,
    'eval_metric':'auc'
}

x_Train, x_Test, y_Train, y_Test = train_test_split(X, y, test_size = 0.25, random_state = 0)

xgtrain = xgb.DMatrix(x_Train,y_Train, feature_names=features)
xgtest = xgb.DMatrix(x_Test,y_Test, feature_names=features)
# Early stopping monitors the LAST entry of the watchlist, i.e. the held-out part.
watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
num_rounds = 100 # Increase the number of rounds while running in local
model = xgb.train(xgb_params, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=5)

# plot the important features #
fig, ax = plt.subplots(figsize=(12,18))
xgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
plt.show()

In [None]:
# Shallow random forest (depth 3, 200 trees) fitted on the feature columns of
# train; its feature_importances_ are read in the next cell. `features` holds
# the column names in the same order as the columns the forest was fitted on.
features = train[train_features].columns.values
gb = RandomForestClassifier(
    n_estimators=200,
    max_depth=3,
    min_samples_leaf=4,
    max_features=0.2,
    random_state=0,
)
gb.fit(train[train_features], train['target'])
print("----- Training Done -----")

In [None]:
# Table pairing each feature name with the importance assigned to it by the
# fitted forest `gb` (rows in the forest's own feature order, unsorted).
#
# The previous version first unpacked a sorted (importance, name) zip into
# `x, y`. Neither result was used, and the assignment overwrote the label
# vector `y` with a list of feature names, so that line has been dropped.
feature_imp = pd.DataFrame(
    {'feature': features, 'imp': gb.feature_importances_},
    columns=['feature', 'imp'],
)

In [None]:
feature_imp.sort_values(by='imp',ascending=False)

In [None]:
list(feature_imp.sort_values(by='imp',ascending=False)['feature'])

In [None]:
# Hard-coded subset of 27 feature columns used for the reduced models below.
# NOTE(review): presumably the head of the importance ranking printed by the
# previous cell -- confirm, and regenerate if the ranking changes.
features = ['num_var_4', 'cat_var_14', 'cat_var_2', 'cat_var_13', 'num_var_7', 'num_var_2', 'cat_var_15', 'cat_var_17',
             'num_var_6', 'cat_var_18', 'cat_var_6', 'cat_var_12', 'num_var_1', 'cat_var_4', 'cat_var_20', 'cat_var_19',
             'num_var_5', 'cat_var_24', 'cat_var_21', 'cat_var_5', 'cat_var_7', 'cat_var_22', 'cat_var_16', 'cat_var_26',
            'cat_var_9', 'cat_var_30', 'cat_var_29']

In [None]:
# Rebuild the label vector and the feature matrix, the latter restricted to
# the columns listed in `features`.
y = train['target'].values
X = train.loc[:, features].values

In [None]:
# 5-fold cross-validated ROC AUC of a 150-tree random forest on the current X / y.
stacker = RandomForestClassifier(n_estimators=150,random_state=0)
results = cross_val_score(stacker, X, y, cv=5, scoring='roc_auc')
print(results)
# The message used to end in a dangling "for num: " with nothing substituted
# after it; it now uses the same wording as the first random-forest CV cell.
print("Stacker score: {} ".format(results.mean()))

In [None]:
# 5-fold cross-validated ROC AUC of a 350-round XGBoost classifier on the current X / y.
stacker = XGBClassifier(n_estimators=350,seed=0)
results = cross_val_score(stacker, X, y, cv=5, scoring='roc_auc')
print(results)
# The message used to end in a dangling "for num: " with nothing substituted
# after it; it now uses the same wording as the first random-forest CV cell.
print("Stacker score: {} ".format(results.mean()))

In [None]:
train.columns

In [None]:
train.nunique()

In [None]:
# Histogram of cat_var_2 (bin counts only).
# NOTE(review): if the label-encoding cell has already run, the values plotted
# are the integer codes, not the original category strings.
sns.distplot(train['cat_var_2'], kde=False)

In [None]:
# Joint plot of num_var_4 (x axis) against target (y axis) over the rows of train.
sns.jointplot(x='num_var_4',y='target',data=train)

In [None]:
train['cat_var_1'].sort_values().unique()

In [None]:
# Fill values used for missing entries by the fillna cell near the top:
#cat_var_1 -- gf
#cat_var_3 -- qt
#cat_var_8 -- dn
# Frequency of each distinct cat_var_8 value, most frequent first.
train['cat_var_8'].value_counts()

In [None]:
train.sum()

In [None]:
train['cat_var_37'].value_counts()

In [None]:
# Two-component PCA of the standardised feature matrix, plus the share of
# variance each component explains.
pca = PCA(n_components=2)

from sklearn import preprocessing
# Standardise every column of X (zero mean, unit variance); the columns are
# labelled with `features`, which must be the list X was built from.
data_scaled = pd.DataFrame(preprocessing.scale(X),columns = features)

# Fit on the standardised data. The previous code computed data_scaled but
# then fitted PCA on the raw X, so the scaling was never used and the
# components followed whichever columns have the largest raw variance.
pca.fit(data_scaled)

#The amount of variance that each PC explains
var = pca.explained_variance_ratio_

#Cumulative variance explained, in percent (ratios rounded to 4 decimals first)
var1=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)