In [1]:
import pandas as pd

In [2]:
# Load the raw loan dataset and take a first look at its columns and rows.
data_path = "data.csv"
# pd.read_csv already returns a fresh in-memory DataFrame, so no extra
# .copy() is needed: nothing done to `df` can modify the CSV file on disk.
# on_bad_lines='warn' still skips malformed rows (same resulting frame as
# 'skip') but emits a warning for each one instead of dropping it silently.
df = pd.read_csv(data_path, on_bad_lines='warn')
print(df.columns)
df.sample(n=10)

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')


Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
1968,1969,2,Not Graduate,No,1700000,5300000,20,761,2500000,500000,3700000,1200000,Approved
927,928,3,Graduate,Yes,7200000,20400000,10,558,0,13200000,27200000,10600000,Approved
4223,4224,0,Not Graduate,No,2700000,8400000,14,610,900000,1900000,6100000,2800000,Approved
675,676,4,Not Graduate,Yes,4600000,9500000,20,613,5200000,5900000,13700000,3000000,Approved
2493,2494,3,Graduate,No,2000000,4200000,6,831,1300000,1700000,6900000,1200000,Approved
3602,3603,0,Not Graduate,Yes,1900000,6400000,20,704,900000,3400000,5300000,1900000,Approved
1870,1871,3,Not Graduate,Yes,8300000,20900000,4,615,4400000,200000,19000000,9600000,Approved
2312,2313,2,Graduate,No,2200000,7000000,6,777,6100000,400000,4400000,3300000,Approved
718,719,4,Graduate,No,700000,2600000,14,736,2100000,600000,1900000,400000,Approved
785,786,2,Not Graduate,Yes,600000,2100000,18,572,400000,500000,1400000,300000,Approved


In [3]:
# The CSV header carries stray whitespace around the column labels
# (e.g. ' education'), so every label is trimmed before further use.
# NOTE(review): only the labels are cleaned here; the dummy column names
# produced later ('education_ Graduate') suggest the cell values carry the
# same leading space - confirm whether they should be stripped as well.
df.columns = [label.strip() for label in df.columns]
print(df.columns)

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')


In [4]:
# Expand the categorical text columns into boolean indicator columns,
# one indicator per category; the original text columns are replaced.
categorical_cols = ['education', 'self_employed']
df = pd.get_dummies(df, columns=categorical_cols)
df.sample(n=10)

Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,education_ Graduate,education_ Not Graduate,self_employed_ No,self_employed_ Yes
1410,1411,2,9500000,25400000,16,451,19000000,2900000,33800000,11400000,Rejected,False,True,True,False
647,648,0,3000000,9600000,12,620,7900000,0,8600000,3600000,Approved,True,False,True,False
1938,1939,4,1200000,4800000,2,752,2800000,900000,3000000,600000,Approved,True,False,False,True
3520,3521,3,6900000,22700000,4,650,900000,13400000,15500000,6400000,Approved,False,True,True,False
1477,1478,4,1200000,3300000,12,876,400000,100000,3800000,1000000,Approved,False,True,True,False
2503,2504,2,5500000,19600000,16,328,2400000,4800000,10900000,3700000,Rejected,False,True,True,False
1022,1023,5,8800000,18800000,14,836,17300000,10100000,32700000,5400000,Approved,True,False,True,False
2303,2304,2,8400000,32600000,10,384,13900000,1600000,21600000,7000000,Rejected,True,False,False,True
765,766,0,7900000,31500000,14,849,19400000,10300000,26600000,10600000,Approved,False,True,True,False
1901,1902,0,4000000,12500000,18,868,11000000,800000,14400000,2300000,Approved,False,True,True,False


In [5]:
# Encode the target column (loan_status) as integer class labels.
# LabelEncoder belongs to sklearn.preprocessing; importing it from
# sklearn.calibration only worked because that module happens to import it
# for its own internal use, which is not a supported public path.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
# Codes follow the sorted order of the distinct labels, so with the two
# statuses seen in the data "Approved" becomes 0 and "Rejected" becomes 1.
# The fitted encoder is kept so the mapping can be inspected or inverted
# via label_encoder.classes_ / label_encoder.inverse_transform.
df['loan_status'] = label_encoder.fit_transform(df['loan_status'])

df.sample(n=10)

Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,education_ Graduate,education_ Not Graduate,self_employed_ No,self_employed_ Yes
3895,3896,0,1800000,5200000,16,317,4900000,1500000,4000000,1300000,1,True,False,False,True
4035,4036,1,7300000,19200000,14,599,18400000,5400000,15900000,3700000,0,False,True,True,False
257,258,2,8000000,20000000,2,502,4100000,2200000,31600000,9900000,1,False,True,True,False
3310,3311,5,8500000,23400000,16,604,25100000,16500000,24500000,4800000,0,False,True,True,False
2937,2938,1,2200000,5400000,16,641,5900000,0,4800000,1200000,0,False,True,False,True
2457,2458,1,2100000,4300000,20,896,100000,2300000,7500000,2300000,0,True,False,False,True
87,88,1,8700000,28300000,8,402,20400000,13600000,27900000,10200000,1,False,True,False,True
3457,3458,4,8600000,20900000,10,867,24300000,7200000,23700000,9900000,0,False,True,True,False
1618,1619,1,5300000,14100000,14,510,13300000,10400000,18900000,7800000,1,False,True,False,True
2582,2583,0,6800000,21700000,20,730,17500000,1300000,18600000,3600000,0,True,False,True,False


In [6]:
# Standardize the feature columns (zero mean, unit variance per column)
# while leaving the encoded target untouched.
from sklearn.preprocessing import StandardScaler

target_col = 'loan_status'
# Fit the scaler on the features only: the target must keep its encoded
# class labels, so there is no point scaling it and then throwing it away.
# NOTE(review): loan_id (an identifier) and the one-hot indicator columns
# are still standardized here even though the intent is "numeric features";
# kept as-is so the saved file does not change - confirm whether they
# should be excluded.
feature_cols = df.columns.drop(target_col)

standard_scaler = StandardScaler()
scaled_data = standard_scaler.fit_transform(df[feature_cols])

# Carry over the original row index: the column assignment below aligns on
# index labels, so without it a non-default index would silently produce
# NaN / mismatched targets.
new_df = pd.DataFrame(scaled_data, columns=feature_cols, index=df.index)

# The target column is appended last for better visualization.
new_df[target_col] = df[target_col]
df = new_df.copy()
df.sample(n=10)

Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,education_ Graduate,education_ Not Graduate,self_employed_ No,self_employed_ Yes,loan_status
3253,0.908018,0.885344,-1.695744,-1.574096,-1.208798,1.142995,-1.102992,-1.042088,-1.595826,-1.408298,0.995559,-0.995559,-0.992765,0.992765,0
2907,0.627255,1.475067,0.940982,0.571375,0.192617,1.195196,2.126349,1.897445,0.414569,1.422642,-1.004461,1.004461,1.007288,-1.007288,0
2448,0.254797,1.475067,0.370879,-0.081113,0.893324,-1.159653,0.265634,-0.495199,0.952872,0.191798,0.995559,-0.995559,-0.992765,0.992765,1
697,-1.166061,-1.473548,0.335248,0.969504,-1.208798,-1.194454,-0.118812,0.188414,0.744142,-0.146683,0.995559,-0.995559,-0.992765,0.992765,0
307,-1.482528,0.295621,-1.303798,-1.220203,-0.508091,-0.840647,-0.687791,-0.814218,-1.343153,-0.885189,-1.004461,1.004461,1.007288,-1.007288,1
1606,-0.428448,0.295621,-1.481955,-1.330795,-0.157737,-1.339457,-0.933836,-1.11045,-1.474983,-1.377527,0.995559,-0.995559,1.007288,-1.007288,1
2592,0.371646,-0.294102,-0.626801,-0.37971,0.54297,-0.933449,-0.534013,0.302349,-0.749922,-1.008274,-1.004461,1.004461,1.007288,-1.007288,1
2794,0.53556,0.295621,0.263985,0.272779,-0.858444,0.696385,-0.395612,-0.17618,0.403583,0.499509,0.995559,-0.995559,-0.992765,0.992765,0
2568,0.352172,0.295621,-0.234855,0.12901,1.594031,-0.225834,-0.303345,0.689729,-0.343449,-0.300539,-1.004461,1.004461,-0.992765,0.992765,0
3590,1.181479,0.295621,-0.377381,-0.390769,-0.858444,-0.353436,-0.241834,-0.540773,-0.332463,-0.085141,-1.004461,1.004461,-0.992765,0.992765,1


In [7]:
# Persist the processed frame as a new CSV file; the row index is not
# written out, only the columns.
output_path = "processed_data.csv"
df.to_csv(output_path, index=False)