In [1]:
import math
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file

# For visualization
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
pd.set_option('display.max_rows', None)
df_ts = pd.read_csv('../data/BankChurners.csv')
df_ts.head()

Unnamed: 0,CustomerId,Geography,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditLevel
0,15762418,Spain,3,121681.82,1,1,0,128643.35,1,8
1,15749905,Spain,6,0.0,1,1,0,50213.81,1,7
2,15600911,France,2,182888.08,1,1,0,3061.0,0,7
3,15572762,Germany,2,102278.79,2,1,0,89822.48,0,2
4,15627848,France,7,109346.13,2,1,0,102665.92,0,7


In [3]:
drop_list = ['CustomerId']
df_ts = df_ts.drop(drop_list, axis = 1)
df_ts.head()

Unnamed: 0,Geography,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditLevel
0,Spain,3,121681.82,1,1,0,128643.35,1,8
1,Spain,6,0.0,1,1,0,50213.81,1,7
2,France,2,182888.08,1,1,0,3061.0,0,7
3,Germany,2,102278.79,2,1,0,89822.48,0,2
4,France,7,109346.13,2,1,0,102665.92,0,7


In [4]:
columns = list(df_ts.columns)
print(columns)

['Geography', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited', 'CreditLevel']


In [5]:
non_normalization_list = ['Geography', 'CreditLevel']

for column in columns:
    if column not in non_normalization_list:
        print(str(column), " xmin: ", df_ts[column].min() , " xmax: ", df_ts[column].max() )

Tenure  xmin:  0  xmax:  10
Balance  xmin:  0.0  xmax:  250898.09
NumOfProducts  xmin:  1  xmax:  4
HasCrCard  xmin:  0  xmax:  1
IsActiveMember  xmin:  0  xmax:  1
EstimatedSalary  xmin:  11.58  xmax:  199970.74
Exited  xmin:  0  xmax:  1


In [6]:
def normalization(x):
    result = (x-x.min())/(x.max()-x.min())
    return result

In [7]:
for column in columns:
    if column not in non_normalization_list:
        df_ts[column] = normalization(df_ts[column])
        
# df_ts['CreditLevel'] = normalization(df_ts['CreditLevel'], 0, 10)

In [8]:
df_ts.head()

Unnamed: 0,Geography,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditLevel
0,Spain,0.3,0.484985,0.0,1.0,0.0,0.64329,1.0,8
1,Spain,0.6,0.0,0.0,1.0,0.0,0.251062,1.0,7
2,France,0.2,0.728934,0.0,1.0,0.0,0.01525,0.0,7
3,Germany,0.2,0.407651,0.333333,1.0,0.0,0.449146,0.0,2
4,France,0.7,0.435819,0.333333,1.0,0.0,0.513377,0.0,7


In [9]:
countries = df_ts["Geography"].unique()

In [10]:
print(countries)

['Spain' 'France' 'Germany']


In [11]:
for country in countries:
    count = df_ts [ df_ts["Geography"]== country ].shape[0]
    print(country , " : ", count)

Spain  :  2253
France  :  4510
Germany  :  2237


In [12]:
credit_lvs = [1,2,3,4,5,6,7,8,9]

for lv in credit_lvs:
    count = df_ts [ df_ts["CreditLevel"]== lv ].shape[0]
    print(lv , " : ", count)

1  :  12
2  :  98
3  :  383
4  :  895
5  :  1425
6  :  1883
7  :  1899
8  :  1309
9  :  717


In [13]:
geo = pd.get_dummies(df_ts["Geography"])
df_ts.drop(["Geography"],axis=1,inplace=True)
df_ts = pd.concat([df_ts,geo], axis=1)#, join="inner"
df_creditlevel = df_ts.pop('CreditLevel') # remove column of label and store it in df1
df_ts['CreditLevel']= df_creditlevel

In [14]:
df_ts.head()

Unnamed: 0,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Germany,Spain,CreditLevel
0,0.3,0.484985,0.0,1.0,0.0,0.64329,1.0,0,0,1,8
1,0.6,0.0,0.0,1.0,0.0,0.251062,1.0,0,0,1,7
2,0.2,0.728934,0.0,1.0,0.0,0.01525,0.0,1,0,0,7
3,0.2,0.407651,0.333333,1.0,0.0,0.449146,0.0,0,1,0,2
4,0.7,0.435819,0.333333,1.0,0.0,0.513377,0.0,1,0,0,7


In [15]:
df_ts.to_csv('../data/BankChurners_normalized.csv', index=False)

In [16]:
def standardization(x):
    result = (x-x.mean())/x.var()
    return result

In [17]:
for column in columns:
    if column not in non_normalization_list:
        df_ts[column] = standardization(df_ts[column])

In [18]:
df_ts.head()

Unnamed: 0,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Germany,Spain,CreditLevel
0,-2.396864,2.930924,-4.730419,1.414715,-2.055505,1.734639,4.864324,0,0,1,8
1,1.169701,-4.905018,-4.730419,1.414715,-2.055505,-2.993378,4.864324,0,0,1,7
2,-3.58572,6.872423,-4.730419,1.414715,-2.055505,-5.83592,-1.258601,1,0,0,7
3,-3.58572,1.681428,4.165062,1.414715,-2.055505,-0.605623,-1.258601,0,1,0,2
4,2.358557,2.136543,4.165062,1.414715,-2.055505,0.168626,-1.258601,1,0,0,7


In [19]:
df_ts.to_csv('../data/BankChurners_normalized_standardized.csv', index=False)