In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler


In [2]:
# Load the raw CSV from the local datasets folder (path is relative to the
# notebook's working directory).
dataset = pd.read_csv('datasets/Bank_Personal_Loan_Modelling.csv')

# Preview the first rows to see which columns were read.
dataset.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [3]:
# Discard the ID column; the name is rebound to the reduced frame rather
# than mutating the loaded frame in place.
dataset = dataset.drop(columns=['ID'])

In [4]:
# Bucket ages into decades (e.g. 39 -> 30) with vectorised integer arithmetic
# instead of a per-row Python lambda.
# Assumes Age holds non-negative integers, as in the preview above; for
# negative or fractional values floor division would differ from the
# original int(value / 10) * 10.
dataset['Age'] = (dataset['Age'] // 10) * 10

In [5]:
def one_hot_encode(df, column):
    """Return a new frame with ``column`` swapped for indicator columns.

    One indicator column is produced per distinct value of ``column``,
    named ``<column>_<value>``, and appended after the existing columns.
    The original ``column`` is removed from the result; the input frame
    itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame containing ``column``.
    column : str
        Name of the column to expand.

    Returns
    -------
    pandas.DataFrame
        The widened frame without ``column``.
    """
    indicators = pd.get_dummies(df[column], prefix=column)
    widened = df.join(indicators)
    return widened.drop(columns=[column])

# Expand each flag column into a pair of indicator columns, in this order.
# NOTE(review): in the preview these columns already hold 0/1 values, so each
# resulting pair is complementary (X_0 == 1 - X_1) - confirm both halves are
# wanted downstream.
for flag_column in ['Securities Account', 'CD Account', 'Online', 'CreditCard']:
    dataset = one_hot_encode(dataset, flag_column)


In [6]:
# Inspect the frame after the flag columns were one-hot encoded.
dataset.head()

Unnamed: 0,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account_0,Securities Account_1,CD Account_0,CD Account_1,Online_0,Online_1,CreditCard_0,CreditCard_1
0,20,1,49,91107,4,1.6,1,0,0,0,1,1,0,1,0,1,0
1,40,19,34,90089,3,1.5,1,0,0,0,1,1,0,1,0,1,0
2,30,15,11,94720,1,1.0,1,0,0,1,0,1,0,1,0,1,0
3,30,9,100,94112,1,2.7,2,0,0,1,0,1,0,1,0,1,0
4,30,8,45,91330,4,1.0,2,0,0,1,0,1,0,1,0,0,1


In [7]:
def label_encode(df, column):
    """Replace ``column`` of ``df`` with integer codes from a ``LabelEncoder``.

    A fresh ``LabelEncoder`` is fitted on the column's own values and the
    column is overwritten with the encoded result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``column``. It is modified in place.
    column : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    raw_values = list(df[column].values)
    encoder = preprocessing.LabelEncoder()
    # Fit and encode in one step; the encoder is not kept afterwards.
    df[column] = encoder.fit_transform(raw_values)
    return df

# Replace each ZIP code with its label-encoder class index and preview the result.
# NOTE(review): 'ZIP Code' is later passed to scale_normalize like a numeric
# feature, which treats the codes as ordered - confirm this is intended.
dataset = label_encode(dataset, 'ZIP Code')
dataset.head()

Unnamed: 0,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account_0,Securities Account_1,CD Account_0,CD Account_1,Online_0,Online_1,CreditCard_0,CreditCard_1
0,20,1,49,83,4,1.6,1,0,0,0,1,1,0,1,0,1,0
1,40,19,34,34,3,1.5,1,0,0,0,1,1,0,1,0,1,0
2,30,15,11,367,1,1.0,1,0,0,1,0,1,0,1,0,1,0
3,30,9,100,298,1,2.7,2,0,0,1,0,1,0,1,0,1,0
4,30,8,45,96,4,1.0,2,0,0,1,0,1,0,1,0,0,1


In [8]:
def scale_normalize(df, columns):
    """Min-max scale ``columns`` of ``df`` and then apply ``log(1 + x)``.

    The listed columns are rescaled by a ``MinMaxScaler`` fitted on ``df``
    itself (default range, i.e. 0 to 1) and the scaled values are passed
    through ``log(1 + x)``, so the results lie between 0 and ln(2).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is modified in place.
    columns : list of str
        Names of the numeric columns to transform.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    scaled = MinMaxScaler().fit_transform(df[columns])
    # One vectorised log1p over the whole scaled array replaces the former
    # per-element lambda applied column by column; log1p is also the
    # numerically preferred way to evaluate log(1 + x) for small x.
    df[columns] = np.log1p(scaled)
    return df

# Separate the target column from the predictors.
labels = dataset['Personal Loan']
features = dataset.drop(columns=['Personal Loan'])

# Columns that are min-max scaled and log-transformed; the one-hot indicator
# columns are left as they are.
columns_to_scale = [
    'Age', 'Experience', 'Income', 'ZIP Code',
    'Family', 'CCAvg', 'Education', 'Mortgage',
]
features = scale_normalize(features, columns_to_scale)
print(features.shape)
features.head()

(5000, 16)


Unnamed: 0,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Securities Account_0,Securities Account_1,CD Account_0,CD Account_1,Online_0,Online_1,CreditCard_0,CreditCard_1
0,0.0,0.083382,0.173798,0.163913,0.693147,0.14842,0.0,0.0,0,1,1,0,1,0,1,0
1,0.405465,0.390866,0.113659,0.070422,0.510826,0.139762,0.0,0.0,0,1,1,0,1,0,1,0
2,0.223144,0.330242,0.013793,0.580848,0.0,0.09531,0.0,0.0,1,0,1,0,1,0,1,0
3,0.223144,0.231802,0.354821,0.494382,0.0,0.239017,0.405465,0.0,1,0,1,0,1,0,1,0
4,0.223144,0.21441,0.158111,0.187316,0.693147,0.09531,0.405465,0.0,1,0,1,0,1,0,0,1


In [9]:
# Summary statistics of the transformed features; the scaled columns are
# expected to span 0 to ln(2) ~ 0.693 (see the min and max rows).
features.describe()

Unnamed: 0,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Securities Account_0,Securities Account_1,CD Account_0,CD Account_1,Online_0,Online_1,CreditCard_0,CreditCard_1
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,0.400328,0.392798,0.253269,0.396892,0.347159,0.167607,0.321856,0.07602,0.8956,0.1044,0.9396,0.0604,0.4032,0.5968,0.706,0.294
std,0.204485,0.169849,0.156327,0.192652,0.266607,0.134345,0.2946,0.130668,0.305809,0.305809,0.23825,0.23825,0.490589,0.490589,0.455637,0.455637
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.223144,0.248896,0.13411,0.247731,0.0,0.067659,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,0.405465,0.405465,0.230524,0.435066,0.287682,0.139762,0.405465,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
75%,0.559616,0.540806,0.348307,0.562677,0.510826,0.223144,0.693147,0.147605,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
max,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Reassemble the processed frame: transformed features plus the target as the
# last column. Work on a copy - a plain `dataset = features` would make both
# names refer to the same DataFrame, so adding the target here would also
# add it to `features`.
dataset = features.copy()
dataset['Personal Loan'] = labels
dataset.head()

Unnamed: 0,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Securities Account_0,Securities Account_1,CD Account_0,CD Account_1,Online_0,Online_1,CreditCard_0,CreditCard_1,Personal Loan
0,0.0,0.083382,0.173798,0.163913,0.693147,0.14842,0.0,0.0,0,1,1,0,1,0,1,0,0
1,0.405465,0.390866,0.113659,0.070422,0.510826,0.139762,0.0,0.0,0,1,1,0,1,0,1,0,0
2,0.223144,0.330242,0.013793,0.580848,0.0,0.09531,0.0,0.0,1,0,1,0,1,0,1,0,0
3,0.223144,0.231802,0.354821,0.494382,0.0,0.239017,0.405465,0.0,1,0,1,0,1,0,1,0,0
4,0.223144,0.21441,0.158111,0.187316,0.693147,0.09531,0.405465,0.0,1,0,1,0,1,0,0,1,0


In [11]:
# Write the processed frame to disk; index = False keeps the row index out of the file.
dataset.to_csv('datasets/dataset_processed.csv', index = False)