In [108]:
import pandas as pd
import numpy as np
import matplotlib
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import os
import collections
from matplotlib import pyplot as plt
import seaborn as sns
import imblearn

In [161]:
# Read the raw banking sample and discard the customer identifier column
# straight away, then preview the first five rows.
df = pd.read_csv('Banking Prediction Sample 1 - Dataset.csv').drop(columns=['Customer_ID'])
df.head()

Unnamed: 0,Gender,Age,Tenure,Saving_Amount,Current_Amount,Time_Deposits_Amount,Funds_Amount,Stocks_Amount,Bank_Assurance_Amount,Life_Assurance_Amount,Business_Loan_Amount,Home_Loan_Amount,Consumer_Loan_Amount,Branch_Transactions,ATM_Transactions,Phone_Transactions,Internet_Transactions,Standing_Orders,New_Credit_Card_Flag
0,M,52,49,0.0,0.0,0.0,0.0,0.0,139.194286,0.0,0.0,20044.611429,0.0,0,0,0,0,0,0
1,F,59,49,0.0,0.0,0.0,0.0,0.0,0.0,32.205714,0.0,0.0,2430.224286,4,3,0,0,0,0
2,M,52,49,18.825714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0
3,M,54,49,0.0,603.791429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,7,0,0,12,0
4,F,61,49,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.72,0,0,0,0,0,0


In [156]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104733 entries, 0 to 104732
Data columns (total 20 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Customer_ID            104733 non-null  int64  
 1   Gender                 104733 non-null  object 
 2   Age                    104733 non-null  int64  
 3   Tenure                 104733 non-null  int64  
 4   Saving_Amount          104733 non-null  float64
 5   Current_Amount         104733 non-null  float64
 6   Time_Deposits_Amount   104733 non-null  float64
 7   Funds_Amount           104733 non-null  float64
 8   Stocks_Amount          104733 non-null  float64
 9   Bank_Assurance_Amount  104733 non-null  float64
 10  Life_Assurance_Amount  104733 non-null  float64
 11  Business_Loan_Amount   104733 non-null  float64
 12  Home_Loan_Amount       104733 non-null  float64
 13  Consumer_Loan_Amount   104733 non-null  float64
 14  Branch_Transactions    104733 non-nu

In [111]:
# (rows, columns) of the frame.
df.shape

(104733, 20)

In [112]:
# Print every column name next to its number of distinct values.
for name, distinct_count in df.nunique().items():
    print(name, distinct_count)

Customer_ID 104733
Gender 2
Age 73
Tenure 174
Saving_Amount 58918
Current_Amount 22366
Time_Deposits_Amount 3444
Funds_Amount 5064
Stocks_Amount 7173
Bank_Assurance_Amount 3667
Life_Assurance_Amount 814
Business_Loan_Amount 2462
Home_Loan_Amount 4072
Consumer_Loan_Amount 23907
Branch_Transactions 144
ATM_Transactions 95
Phone_Transactions 21
Internet_Transactions 16
Standing_Orders 33
New_Credit_Card_Flag 2


In [157]:
# Number of distinct values in each column, returned as a Series.
df.nunique()

Customer_ID              104733
Gender                        2
Age                          73
Tenure                      174
Saving_Amount             58918
Current_Amount            22366
Time_Deposits_Amount       3444
Funds_Amount               5064
Stocks_Amount              7173
Bank_Assurance_Amount      3667
Life_Assurance_Amount       814
Business_Loan_Amount       2462
Home_Loan_Amount           4072
Consumer_Loan_Amount      23907
Branch_Transactions         144
ATM_Transactions             95
Phone_Transactions           21
Internet_Transactions        16
Standing_Orders              33
New_Credit_Card_Flag          2
dtype: int64

In [113]:
# Re-type the target column as bool (re-running this cell is harmless:
# casting a bool column to bool leaves it unchanged).
df['New_Credit_Card_Flag'] = df['New_Credit_Card_Flag'].astype(bool)

In [114]:
# Per-column dtypes; used here to confirm the target flag is now bool.
df.dtypes

Customer_ID                int64
Gender                    object
Age                        int64
Tenure                     int64
Saving_Amount            float64
Current_Amount           float64
Time_Deposits_Amount     float64
Funds_Amount             float64
Stocks_Amount            float64
Bank_Assurance_Amount    float64
Life_Assurance_Amount    float64
Business_Loan_Amount     float64
Home_Loan_Amount         float64
Consumer_Loan_Amount     float64
Branch_Transactions        int64
ATM_Transactions           int64
Phone_Transactions         int64
Internet_Transactions      int64
Standing_Orders            int64
New_Credit_Card_Flag        bool
dtype: object

In [115]:
# Class balance of the target flag (count of rows per class).
df['New_Credit_Card_Flag'].value_counts()

False    98933
True      5800
Name: New_Credit_Card_Flag, dtype: int64

In [116]:
# Pearson correlation of every numeric/bool column with the target flag.
# The string column Gender is excluded explicitly: pandas >= 2.0 no longer
# drops non-numeric columns silently and would raise ValueError on it.
df.select_dtypes(include=['number', 'bool']).corrwith(df['New_Credit_Card_Flag'])

Customer_ID             -0.070590
Age                     -0.043864
Tenure                   0.028626
Saving_Amount           -0.021669
Current_Amount          -0.006670
Time_Deposits_Amount    -0.010456
Funds_Amount            -0.010447
Stocks_Amount           -0.002930
Bank_Assurance_Amount    0.007735
Life_Assurance_Amount    0.012289
Business_Loan_Amount     0.018786
Home_Loan_Amount         0.002184
Consumer_Loan_Amount     0.087272
Branch_Transactions      0.086831
ATM_Transactions         0.091312
Phone_Transactions       0.059876
Internet_Transactions    0.243737
Standing_Orders          0.015878
New_Credit_Card_Flag     1.000000
dtype: float64

In [117]:
# Collect the object-dtype (string) columns, skipping any name listed in
# `excluded_names`, and display the resulting list.
excluded_names = ['Time_Deposits_Flag']
categorical_features = [
    name
    for name, dtype in df.dtypes.items()
    if dtype == 'O' and name not in excluded_names
]
categorical_features

['Gender']

In [118]:
# The loading cell already drops Customer_ID, so on a fresh
# Restart & Run All the column is gone by the time this cell runs.
# errors='ignore' makes the drop a no-op in that case instead of raising
# KeyError, and also keeps the cell safe to re-run.
df = df.drop(columns=['Customer_ID'], errors='ignore')

In [119]:
# Separate the target flag (y) from the predictor columns (X).
target_column = 'New_Credit_Card_Flag'
y = df[target_column]
X = df.drop(columns=[target_column])


In [120]:
# Display the feature matrix (the notebook rendering is truncated to the
# first and last rows).
X

Unnamed: 0,Gender,Age,Tenure,Saving_Amount,Current_Amount,Time_Deposits_Amount,Funds_Amount,Stocks_Amount,Bank_Assurance_Amount,Life_Assurance_Amount,Business_Loan_Amount,Home_Loan_Amount,Consumer_Loan_Amount,Branch_Transactions,ATM_Transactions,Phone_Transactions,Internet_Transactions,Standing_Orders
0,M,52,49,0.000000,0.000000,0.0,0.0,0.0,139.194286,0.000000,0.0,20044.611429,0.000000,0,0,0,0,0
1,F,59,49,0.000000,0.000000,0.0,0.0,0.0,0.000000,32.205714,0.0,0.000000,2430.224286,4,3,0,0,0
2,M,52,49,18.825714,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0,0,0,0,0
3,M,54,49,0.000000,603.791429,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,7,7,0,0,12
4,F,61,49,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,2.720000,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104728,M,56,0,112.900000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,1,0,0,0,0
104729,F,73,0,16.130000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,1,0,0,0,0
104730,M,24,0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,129.030000,0,0,0,0,0
104731,M,35,0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,807.740000,0,0,0,0,0


In [121]:
from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate rows of every class except the majority one until the
# classes are balanced. A fixed random_state makes the resampled rows (and
# every statistic computed from them) reproducible across runs.
ros = RandomOverSampler(sampling_strategy="not majority", random_state=42)

X_res, y_res = ros.fit_resample(X, y)


In [122]:
# Class counts of the target after random oversampling.
y_res.value_counts()

False    98933
True     98933
Name: New_Credit_Card_Flag, dtype: int64

In [123]:
# Pairwise Pearson correlation between the numeric features.
# Gender (string) is excluded explicitly because pandas >= 2.0 raises
# ValueError when corr() meets a non-numeric column.
X.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,Age,Tenure,Saving_Amount,Current_Amount,Time_Deposits_Amount,Funds_Amount,Stocks_Amount,Bank_Assurance_Amount,Life_Assurance_Amount,Business_Loan_Amount,Home_Loan_Amount,Consumer_Loan_Amount,Branch_Transactions,ATM_Transactions,Phone_Transactions,Internet_Transactions,Standing_Orders
Age,1.0,0.27698,0.085108,0.019228,0.060043,0.102992,0.025087,0.002291,-0.014322,0.000236,-0.026403,-0.060986,0.064116,-0.199715,-0.006125,-0.001176,0.042738
Tenure,0.27698,1.0,0.066578,0.075896,0.025367,0.054677,0.027641,0.026805,0.000981,0.013352,-0.000991,-0.151886,0.070726,-0.008736,0.024493,0.023568,0.068998
Saving_Amount,0.085108,0.066578,1.0,0.023134,0.173275,0.091285,0.02765,0.01655,0.004775,0.026416,0.04311,-0.056647,0.293369,-0.036067,0.020314,0.011242,0.084533
Current_Amount,0.019228,0.075896,0.023134,1.0,0.015273,0.023893,0.010968,-0.004637,-0.005297,0.021965,-0.003259,-0.039307,0.098902,0.055112,0.012987,0.012969,0.056435
Time_Deposits_Amount,0.060043,0.025367,0.173275,0.015273,1.0,0.105389,0.010712,-0.006778,-0.003425,0.007184,0.004609,-0.023617,0.294641,-0.020141,0.002383,0.001826,0.020699
Funds_Amount,0.102992,0.054677,0.091285,0.023893,0.105389,1.0,0.023752,-0.004618,-0.006917,0.008219,-0.004171,-0.035899,0.080417,-0.029068,0.022036,0.000141,0.042468
Stocks_Amount,0.025087,0.027641,0.02765,0.010968,0.010712,0.023752,1.0,-0.002637,-0.002515,-0.000293,-0.001206,-0.012388,0.035367,-0.008813,0.004454,0.000749,0.014627
Bank_Assurance_Amount,0.002291,0.026805,0.01655,-0.004637,-0.006778,-0.004618,-0.002637,1.0,0.236936,0.11568,0.428798,-0.007568,0.071857,-0.019077,0.026039,0.016221,0.057064
Life_Assurance_Amount,-0.014322,0.000981,0.004775,-0.005297,-0.003425,-0.006917,-0.002515,0.236936,1.0,0.011268,0.283897,0.039593,0.043788,0.001141,0.017075,0.000113,0.065564
Business_Loan_Amount,0.000236,0.013352,0.026416,0.021965,0.007184,0.008219,-0.000293,0.11568,0.011268,1.0,0.020348,0.004032,0.132933,-0.019714,0.011317,0.002742,0.041146


In [124]:

# Re-attach the oversampled target to its features; X_res / y_res are
# already pandas objects, so no extra DataFrame wrapping is needed.
oversampled_train = pd.concat([X_res, y_res], axis=1)

# Correlation of each numeric/bool column with the target on the
# oversampled data. Non-numeric columns (Gender) are excluded explicitly so
# the call also works on pandas >= 2.0.
oversampled_train.select_dtypes(include=['number', 'bool']).corrwith(
    oversampled_train['New_Credit_Card_Flag']
)


Age                     -0.109019
Tenure                   0.065389
Saving_Amount           -0.055970
Current_Amount          -0.010704
Time_Deposits_Amount    -0.027922
Funds_Amount            -0.024028
Stocks_Amount           -0.008625
Bank_Assurance_Amount    0.016462
Life_Assurance_Amount    0.023456
Business_Loan_Amount     0.030486
Home_Loan_Amount         0.003072
Consumer_Loan_Amount     0.167937
Branch_Transactions      0.176794
ATM_Transactions         0.170608
Phone_Transactions       0.062943
Internet_Transactions    0.195864
Standing_Orders          0.030234
New_Credit_Card_Flag     1.000000
dtype: float64

In [125]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly discard majority-class rows until the classes are balanced.
# A fixed random_state makes the retained subset reproducible across runs.
rus = RandomUnderSampler(sampling_strategy="majority", random_state=42)

X_rus, y_rus = rus.fit_resample(X, y)

In [126]:
# Class counts of the target after random undersampling.
y_rus.value_counts()

False    5800
True     5800
Name: New_Credit_Card_Flag, dtype: int64

In [127]:
# Re-attach the undersampled target to its features; X_rus / y_rus are
# already pandas objects, so no extra DataFrame wrapping is needed.
und_df = pd.concat([X_rus, y_rus], axis=1)

# Correlation of each numeric/bool column with the target on the
# undersampled data. Non-numeric columns (Gender) are excluded explicitly so
# the call also works on pandas >= 2.0.
und_df.select_dtypes(include=['number', 'bool']).corrwith(und_df['New_Credit_Card_Flag'])

Age                     -0.105282
Tenure                   0.063296
Saving_Amount           -0.054441
Current_Amount          -0.013029
Time_Deposits_Amount    -0.033168
Funds_Amount            -0.031469
Stocks_Amount           -0.004393
Bank_Assurance_Amount    0.017470
Life_Assurance_Amount    0.015385
Business_Loan_Amount     0.033750
Home_Loan_Amount         0.001523
Consumer_Loan_Amount     0.165870
Branch_Transactions      0.167852
ATM_Transactions         0.171840
Phone_Transactions       0.062442
Internet_Transactions    0.197255
Standing_Orders          0.032612
New_Credit_Card_Flag     1.000000
dtype: float64

In [128]:
# Remove Stocks_Amount from the undersampled frame, then display the result.
und_df = und_df.drop(columns=['Stocks_Amount'])
und_df

Unnamed: 0,Gender,Age,Tenure,Saving_Amount,Current_Amount,Time_Deposits_Amount,Funds_Amount,Bank_Assurance_Amount,Life_Assurance_Amount,Business_Loan_Amount,Home_Loan_Amount,Consumer_Loan_Amount,Branch_Transactions,ATM_Transactions,Phone_Transactions,Internet_Transactions,Standing_Orders,New_Credit_Card_Flag
0,F,28,83,1502.221429,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,1,22,0,0,0,False
1,F,40,43,0.000000,25.748571,0.0,0.0,0.000000,0.0,0.0,0.0,1439.130000,1,0,0,0,0,False
2,F,27,16,56.504286,0.000000,0.0,0.0,6.587143,0.0,0.0,0.0,2206.418571,24,0,0,0,1,False
3,M,50,11,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,7409.128571,8,0,0,0,0,False
4,F,44,105,0.000000,250.195714,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,9,0,0,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11595,M,24,1,0.000000,42.340000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0,11,0,0,0,True
11596,F,28,1,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0,0,0,0,0,True
11597,M,68,1,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,3274.190000,0,0,0,0,0,True
11598,M,29,1,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0,2,0,0,0,True


In [129]:
# Correlation with the target after Stocks_Amount was removed.
# Non-numeric columns (Gender) are excluded explicitly so the call also
# works on pandas >= 2.0, which raises ValueError on string columns.
und_df.select_dtypes(include=['number', 'bool']).corrwith(und_df['New_Credit_Card_Flag'])

Age                     -0.105282
Tenure                   0.063296
Saving_Amount           -0.054441
Current_Amount          -0.013029
Time_Deposits_Amount    -0.033168
Funds_Amount            -0.031469
Bank_Assurance_Amount    0.017470
Life_Assurance_Amount    0.015385
Business_Loan_Amount     0.033750
Home_Loan_Amount         0.001523
Consumer_Loan_Amount     0.165870
Branch_Transactions      0.167852
ATM_Transactions         0.171840
Phone_Transactions       0.062442
Internet_Transactions    0.197255
Standing_Orders          0.032612
New_Credit_Card_Flag     1.000000
dtype: float64

In [130]:
# Add the five amount columns row-wise (left to right, in the listed order),
# store the total as a new column, and preview the frame.
amount_columns = [
    'Saving_Amount',
    'Current_Amount',
    'Stocks_Amount',
    'Time_Deposits_Amount',
    'Funds_Amount',
]
Amounts_sum = df[amount_columns[0]]
for column_name in amount_columns[1:]:
    Amounts_sum = Amounts_sum + df[column_name]
df['Amounts_sum'] = Amounts_sum
df.head()

Unnamed: 0,Gender,Age,Tenure,Saving_Amount,Current_Amount,Time_Deposits_Amount,Funds_Amount,Stocks_Amount,Bank_Assurance_Amount,Life_Assurance_Amount,Business_Loan_Amount,Home_Loan_Amount,Consumer_Loan_Amount,Branch_Transactions,ATM_Transactions,Phone_Transactions,Internet_Transactions,Standing_Orders,New_Credit_Card_Flag,Amounts_sum
0,M,52,49,0.0,0.0,0.0,0.0,0.0,139.194286,0.0,0.0,20044.611429,0.0,0,0,0,0,0,False,0.0
1,F,59,49,0.0,0.0,0.0,0.0,0.0,0.0,32.205714,0.0,0.0,2430.224286,4,3,0,0,0,False,0.0
2,M,52,49,18.825714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,False,18.825714
3,M,54,49,0.0,603.791429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,7,0,0,12,False,603.791429
4,F,61,49,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.72,0,0,0,0,0,False,0.0


In [131]:
# The individual amount columns are now represented by Amounts_sum; remove them.
summed_amount_columns = [
    'Saving_Amount',
    'Current_Amount',
    'Time_Deposits_Amount',
    'Stocks_Amount',
    'Funds_Amount',
]
df = df.drop(columns=summed_amount_columns)

In [132]:
# Preview the first five rows of the frame in its current state.
df.head(5)

Unnamed: 0,Gender,Age,Tenure,Bank_Assurance_Amount,Life_Assurance_Amount,Business_Loan_Amount,Home_Loan_Amount,Consumer_Loan_Amount,Branch_Transactions,ATM_Transactions,Phone_Transactions,Internet_Transactions,Standing_Orders,New_Credit_Card_Flag,Amounts_sum
0,M,52,49,139.194286,0.0,0.0,20044.611429,0.0,0,0,0,0,0,False,0.0
1,F,59,49,0.0,32.205714,0.0,0.0,2430.224286,4,3,0,0,0,False,0.0
2,M,52,49,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,False,18.825714
3,M,54,49,0.0,0.0,0.0,0.0,0.0,7,7,0,0,12,False,603.791429
4,F,61,49,0.0,0.0,0.0,0.0,2.72,0,0,0,0,0,False,0.0


In [133]:
# Absolute correlation of each numeric/bool column with the target.
# Non-numeric columns (Gender) are excluded explicitly so the call also
# works on pandas >= 2.0, which raises ValueError on string columns.
abs(df.select_dtypes(include=['number', 'bool']).corrwith(df['New_Credit_Card_Flag']))

Age                      0.043864
Tenure                   0.028626
Bank_Assurance_Amount    0.007735
Life_Assurance_Amount    0.012289
Business_Loan_Amount     0.018786
Home_Loan_Amount         0.002184
Consumer_Loan_Amount     0.087272
Branch_Transactions      0.086831
ATM_Transactions         0.091312
Phone_Transactions       0.059876
Internet_Transactions    0.243737
Standing_Orders          0.015878
New_Credit_Card_Flag     1.000000
Amounts_sum              0.017291
dtype: float64

In [137]:
# Combine the two assurance amounts into one Series, then remove the
# source columns from the frame.
assurance_columns = ['Bank_Assurance_Amount', 'Life_Assurance_Amount']
assurance_sum = df[assurance_columns[0]] + df[assurance_columns[1]]
df = df.drop(columns=assurance_columns)

In [139]:
# Attach the previously computed assurance_sum Series as a new column.
# NOTE: relies on `assurance_sum` already existing in the kernel namespace.
df['assurance_sum'] = assurance_sum

In [140]:
# Absolute correlation with the target after the assurance columns were merged.
# Non-numeric columns (Gender) are excluded explicitly so the call also
# works on pandas >= 2.0, which raises ValueError on string columns.
abs(df.select_dtypes(include=['number', 'bool']).corrwith(df['New_Credit_Card_Flag']))

Age                      0.043864
Tenure                   0.028626
Business_Loan_Amount     0.018786
Home_Loan_Amount         0.002184
Consumer_Loan_Amount     0.087272
Branch_Transactions      0.086831
ATM_Transactions         0.091312
Phone_Transactions       0.059876
Internet_Transactions    0.243737
Standing_Orders          0.015878
New_Credit_Card_Flag     1.000000
Amounts_sum              0.017291
assurance_sum            0.011337
dtype: float64

In [143]:
# Total the three loan amounts row-wise, store the total as a new column,
# remove the source columns, and preview the frame.
loan_columns = ['Business_Loan_Amount', 'Home_Loan_Amount', 'Consumer_Loan_Amount']
loan_sum = (
    df['Business_Loan_Amount']
    .add(df['Home_Loan_Amount'])
    .add(df['Consumer_Loan_Amount'])
)
df['loan_sum'] = loan_sum
df = df.drop(columns=loan_columns)
df.head()

Unnamed: 0,Gender,Age,Tenure,Branch_Transactions,ATM_Transactions,Phone_Transactions,Internet_Transactions,Standing_Orders,New_Credit_Card_Flag,Amounts_sum,assurance_sum,loan_sum
0,M,52,49,0,0,0,0,0,False,0.0,139.194286,20044.611429
1,F,59,49,4,3,0,0,0,False,0.0,32.205714,2430.224286
2,M,52,49,0,0,0,0,0,False,18.825714,0.0,0.0
3,M,54,49,7,7,0,0,12,False,603.791429,0.0,0.0
4,F,61,49,0,0,0,0,0,False,0.0,0.0,2.72


In [147]:
# Total the four transaction-count columns row-wise, store the total as a
# new column, remove the source columns, and preview the frame.
transaction_columns = [
    'Branch_Transactions',
    'ATM_Transactions',
    'Phone_Transactions',
    'Internet_Transactions',
]
trans_sum = df[transaction_columns[0]]
for column_name in transaction_columns[1:]:
    trans_sum = trans_sum + df[column_name]
df['trans_sum'] = trans_sum
df = df.drop(columns=transaction_columns)
df.head()

Unnamed: 0,Gender,Age,Tenure,Standing_Orders,New_Credit_Card_Flag,Amounts_sum,assurance_sum,loan_sum,trans_sum
0,M,52,49,0,False,0.0,139.194286,20044.611429,0
1,F,59,49,0,False,0.0,32.205714,2430.224286,7
2,M,52,49,0,False,18.825714,0.0,0.0,0
3,M,54,49,12,False,603.791429,0.0,0.0,14
4,F,61,49,0,False,0.0,0.0,2.72,0


In [148]:
# Correlation of the aggregated features with the target.
# Non-numeric columns (Gender) are excluded explicitly so the call also
# works on pandas >= 2.0, which raises ValueError on string columns.
df.select_dtypes(include=['number', 'bool']).corrwith(df['New_Credit_Card_Flag'])

Age                    -0.043864
Tenure                  0.028626
Standing_Orders         0.015878
New_Credit_Card_Flag    1.000000
Amounts_sum            -0.017291
assurance_sum           0.011337
loan_sum                0.026516
trans_sum               0.134822
dtype: float64

In [149]:
# Rebuild the feature matrix and target vector from the aggregated frame.
X, y = df.drop(columns=['New_Credit_Card_Flag']), df['New_Credit_Card_Flag']

In [151]:
from imblearn.over_sampling import RandomOverSampler

# Oversample the non-majority class of the aggregated feature set until the
# classes are balanced. A fixed random_state makes the duplicated rows (and
# the correlations computed from them) reproducible across runs.
ros = RandomOverSampler(sampling_strategy="not majority", random_state=42)

X_res, y_res = ros.fit_resample(X, y)

In [153]:
# Re-attach the oversampled target to the aggregated features; X_res / y_res
# are already pandas objects, so no extra DataFrame wrapping is needed.
overS = pd.concat([X_res, y_res], axis=1)

# Correlate against overS's OWN target column. The previous version used the
# target of `oversampled_train`, a frame from an earlier, unrelated resampling
# run, which made this cell depend on stale kernel state.
# Non-numeric columns (Gender) are excluded explicitly so the call also works
# on pandas >= 2.0.
overS.select_dtypes(include=['number', 'bool']).corrwith(overS['New_Credit_Card_Flag'])

Age                    -0.104686
Tenure                  0.065749
Standing_Orders         0.029047
Amounts_sum            -0.044875
assurance_sum           0.019685
loan_sum                0.049778
trans_sum               0.252995
New_Credit_Card_Flag    1.000000
dtype: float64