In [20]:
import pandas as pd
from path import Path

# Load the raw loans data from CSV and preview the first five rows.
loans_df = pd.read_csv('Resources/loans_data.csv')
loans_df.head()
# In the `bad` column: 1 = bad (denial), 0 = good (approval).

Unnamed: 0,amount,term,month,age,education,gender,bad
0,1000,30,June,45,High School or Below,male,0
1,1000,30,July,50,Bachelor,female,0
2,1000,30,August,33,Bachelor,female,0
3,1000,15,September,27,college,male,0
4,1000,30,October,28,college,female,0


In [21]:
# The month, education and gender columns hold text, so they have to be
# converted into numbers. This conversion is called encoding; the wider set of
# steps that make data usable for building machine learning models is called
# preprocessing.

# Encode the gender column: pd.get_dummies() replaces it with one indicator
# column per value (gender_female, gender_male).
# dtype=int keeps the indicators as 0/1 on every pandas version; since
# pandas 2.0 the default indicator dtype is bool, which shows True/False.
loans_binary_encoded = pd.get_dummies(loans_df, columns=['gender'], dtype=int)
loans_binary_encoded

Unnamed: 0,amount,term,month,age,education,bad,gender_female,gender_male
0,1000,30,June,45,High School or Below,0,0,1
1,1000,30,July,50,Bachelor,0,1,0
2,1000,30,August,33,Bachelor,0,1,0
3,1000,15,September,27,college,0,0,1
4,1000,30,October,28,college,0,1,0
...,...,...,...,...,...,...,...,...
495,1000,30,December,28,High School or Below,1,0,1
496,1000,15,July,26,High School or Below,1,0,1
497,800,15,June,30,college,1,0,1
498,1000,30,March,38,college,1,1,0


In [22]:
# Encode education and gender in one call: each listed column is replaced by
# one indicator column per distinct value (education_*, gender_*).
# dtype=int keeps the indicators as 0/1 on every pandas version; since
# pandas 2.0 the default indicator dtype is bool, which shows True/False.
loans_binary_encoded = pd.get_dummies(
    loans_df,
    columns=['education', 'gender'],
    dtype=int,
)
loans_binary_encoded

Unnamed: 0,amount,term,month,age,bad,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,June,45,0,0,1,0,0,0,1
1,1000,30,July,50,0,1,0,0,0,1,0
2,1000,30,August,33,0,1,0,0,0,1,0
3,1000,15,September,27,0,0,0,0,1,0,1
4,1000,30,October,28,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
495,1000,30,December,28,1,0,1,0,0,0,1
496,1000,15,July,26,1,0,1,0,0,0,1
497,800,15,June,30,1,0,0,0,1,0,1
498,1000,30,March,38,1,0,0,0,1,1,0


In [23]:
# scikit-learn's LabelEncoder is another way to turn text labels into integers
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# fit_transform() first trains the encoder on the education values, then
# converts the text into integer codes; assign() returns a copy of loans_df
# whose education column holds those codes, so loans_df itself is untouched
df2 = loans_df.assign(education=le.fit_transform(loans_df['education']))
df2.head()

Unnamed: 0,amount,term,month,age,education,gender,bad
0,1000,30,June,45,1,male,0
1,1000,30,July,50,0,female,0
2,1000,30,August,33,0,female,0
3,1000,15,September,27,3,male,0
4,1000,30,October,28,3,female,0


# Create custom encoding


In [24]:
# Transform the month column into numbers with a fresh LabelEncoder and store
# the codes in a new month_le column of loans_df
label_encoder = LabelEncoder()
loans_df['month_le'] = label_encoder.fit_transform(loans_df['month'])
loans_df.head()
# month_le does not follow calendar order: in the output June is 6, July is 5
# and August is 1 - the codes track the alphabetical order of the month names,
# so they are not usable as month numbers.

Unnamed: 0,amount,term,month,age,education,gender,bad,month_le
0,1000,30,June,45,High School or Below,male,0,6
1,1000,30,July,50,Bachelor,female,0,5
2,1000,30,August,33,Bachelor,female,0,1
3,1000,15,September,27,college,male,0,11
4,1000,30,October,28,college,female,0,10


In [25]:
# Month name -> calendar month number (January = 1 ... December = 12).
months_num = {
    month_name: month_number
    for month_number, month_name in enumerate(
        [
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ],
        start=1,
    )
}


In [26]:
# A lambda function is applied to the month column
loans_df['month_num'] = loans_df['month'].apply(lambda x: months_num[x])
loans_df.head()
# The transformed values are placed in the month_num column.
# The apply() method runs the function inside its parentheses on each element of the month column.
# The lambda function takes an argument (x) and returns months_num[x].
# For example, if the value in the month column is June, the function returns months_num['June'], which is 6.
# Because months_num[x] is a plain dict lookup, a month name that is not a key of months_num raises KeyError.

Unnamed: 0,amount,term,month,age,education,gender,bad,month_le,month_num
0,1000,30,June,45,High School or Below,male,0,6,6
1,1000,30,July,50,Bachelor,female,0,5,7
2,1000,30,August,33,Bachelor,female,0,1,8
3,1000,15,September,27,college,male,0,11,9
4,1000,30,October,28,college,female,0,10,10


In [27]:
# Remove the month columns that are no longer needed: the original text
# column and the month_le codes (month_num stays)
loans_df = loans_df.drop(columns=['month', 'month_le'])
# Replace the education text with integer codes using the existing `le` encoder
loans_df['education'] = le.fit_transform(loans_df['education'])
loans_df.head()

Unnamed: 0,amount,term,age,education,gender,bad,month_num
0,1000,30,45,1,male,0,6
1,1000,30,50,0,female,0,7
2,1000,30,33,0,female,0,8
3,1000,15,27,3,male,0,9
4,1000,30,28,3,female,0,10


# Scale and Normalize Data

In [28]:
import pandas as pd
from path import Path

# Load the encoded loans data from CSV and preview the first five rows.
# NOTE(review): this file name starts with a capital 'L', unlike
# 'Resources/loans_data.csv' loaded earlier - confirm the exact name, since it
# matters on case-sensitive file systems.
encoded_df = pd.read_csv('Resources/Loans_data_encoded.csv')
encoded_df.head()

Unnamed: 0,amount,term,age,bad,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,0,6,0,1,0,0,0,1
1,1000,30,50,0,7,1,0,0,0,1,0
2,1000,30,33,0,8,1,0,0,0,1,0
3,1000,15,27,0,9,0,0,0,1,0,1
4,1000,30,28,0,10,0,0,0,1,1,0


In [29]:
# Create the StandardScaler instance that will be used to scale the data in
# this DataFrame
from sklearn.preprocessing import StandardScaler
data_scaler = StandardScaler()

In [30]:
# Train the scaler and transform the data (fit_transform() combines the two steps).
# NOTE(review): encoded_df is passed whole, so the `bad` label column and the
# 0/1 indicator columns are scaled along with amount, term and age - confirm
# this is intended before using the scaled array to train a model.
loans_data_scaled = data_scaler.fit_transform(encoded_df)

In [31]:
# Show the first five rows of the scaled array
loans_data_scaled[:5]

array([[ 0.49337687,  0.89789115,  2.28404253, -0.81649658, -0.16890147,
        -0.39336295,  1.17997648, -0.08980265, -0.88640526, -0.42665337,
         0.42665337],
       [ 0.49337687,  0.89789115,  3.10658738, -0.81649658,  0.12951102,
         2.54218146, -0.84747452, -0.08980265, -0.88640526,  2.34382305,
        -2.34382305],
       [ 0.49337687,  0.89789115,  0.3099349 , -0.81649658,  0.42792352,
         2.54218146, -0.84747452, -0.08980265, -0.88640526,  2.34382305,
        -2.34382305],
       [ 0.49337687, -0.97897162, -0.67711892, -0.81649658,  0.72633602,
        -0.39336295, -0.84747452, -0.08980265,  1.12815215, -0.42665337,
         0.42665337],
       [ 0.49337687,  0.89789115, -0.51260995, -0.81649658,  1.02474851,
        -0.39336295, -0.84747452, -0.08980265,  1.12815215,  2.34382305,
        -2.34382305]])

In [33]:
import numpy as np
# [:, 0] selects all rows of the first column of the scaled array;
# print that column's mean and then its standard deviation, one per line
print(np.mean(loans_data_scaled[:,0]), np.std(loans_data_scaled[:,0]), sep='\n')

-3.552713678800501e-16
0.9999999999999999
