In [41]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import MinMaxScaler

In [42]:
# Load the training file (it includes the Status target column) and preview it.
df = pd.read_csv("../datasets/loan_dataset/loan_train.csv")
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History,Area,Status
0,Male,No,0,Graduate,No,584900,0.0,15000000,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,458300,150800.0,12800000,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,300000,0.0,6600000,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,258300,235800.0,12000000,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,600000,0.0,14100000,360.0,1.0,Urban,Y


Review the data

In [43]:
# (rows, columns) of the training frame.
df.shape

(614, 12)

In [44]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History
count,614.0,614.0,614.0,600.0,564.0
mean,540345.9,162124.6,14141040.0,342.0,0.842199
std,610904.2,292624.8,8815682.0,65.12041,0.364878
min,15000.0,0.0,0.0,12.0,0.0
25%,287750.0,0.0,9800000.0,360.0,1.0
50%,381250.0,118850.0,12500000.0,360.0,1.0
75%,579500.0,229725.0,16475000.0,360.0,1.0
max,8100000.0,4166700.0,70000000.0,480.0,1.0


In [45]:
# Column dtypes and non-null counts; reveals which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Gender              601 non-null    object 
 1   Married             611 non-null    object 
 2   Dependents          599 non-null    object 
 3   Education           614 non-null    object 
 4   Self_Employed       582 non-null    object 
 5   Applicant_Income    614 non-null    int64  
 6   Coapplicant_Income  614 non-null    float64
 7   Loan_Amount         614 non-null    int64  
 8   Term                600 non-null    float64
 9   Credit_History      564 non-null    float64
 10  Area                614 non-null    object 
 11  Status              614 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 57.7+ KB


In [46]:
# Pairwise correlations of the numeric columns only. The frame still holds
# object-typed categorical columns at this point, and recent pandas versions
# raise instead of silently skipping them.
df.select_dtypes(include="number").corr()

Unnamed: 0,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History
Applicant_Income,1.0,-0.116605,0.539615,-0.045306,-0.014715
Coapplicant_Income,-0.116605,1.0,0.189237,-0.059878,-0.002056
Loan_Amount,0.539615,0.189237,1.0,0.041403,-0.000863
Term,-0.045306,-0.059878,0.041403,1.0,0.00147
Credit_History,-0.014715,-0.002056,-0.000863,0.00147,1.0


In [48]:
# Number of missing values per column.
df.isna().sum()

Gender                13
Married                3
Dependents            15
Education              0
Self_Employed         32
Applicant_Income       0
Coapplicant_Income     0
Loan_Amount            0
Term                  14
Credit_History        50
Area                   0
Status                 0
dtype: int64

We can either drop or fill the missing data. Since this dataset is so small, we will opt to fill.

In [49]:
# Impute every column with its most frequent value (first mode on ties).
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated and leaves the
# frame untouched when pandas copy-on-write is active.
df = df.fillna(df.mode().iloc[0])

In [50]:
# Re-check after imputation: every column should now report zero.
df.isna().sum()

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
Applicant_Income      0
Coapplicant_Income    0
Loan_Amount           0
Term                  0
Credit_History        0
Area                  0
Status                0
dtype: int64

In [51]:
# One-hot encode the categorical columns with get_dummies (quicker than
# mapping each column through a dict). The target is encoded too, yielding
# Status_N / Status_Y. dtype=int keeps the indicators as 0/1 on every pandas
# version (newer releases default to bool).
df = pd.get_dummies(
    df,
    columns=['Dependents', 'Gender', 'Married', 'Education', 'Self_Employed', 'Area', 'Status'],
    dtype=int,
)

In [52]:
# Correlation matrix again, now including the one-hot encoded columns.
df.corr()

Unnamed: 0,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Gender_Female,...,Married_Yes,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Area_Rural,Area_Semiurban,Area_Urban,Status_N,Status_Y
Applicant_Income,1.0,-0.116605,0.539615,-0.046531,-0.018615,-0.092599,0.040861,-0.03465,0.156687,-0.058809,...,0.051708,0.14076,-0.14076,-0.12718,0.12718,0.015829,-0.014246,-0.000598,0.00471,-0.00471
Coapplicant_Income,-0.116605,1.0,0.189237,-0.059383,0.011134,-0.008292,-0.029769,0.010016,0.041491,-0.082912,...,0.075948,0.06229,-0.06229,0.0161,-0.0161,0.005329,-0.027044,0.022776,0.059187,-0.059187
Loan_Amount,0.539615,0.189237,1.0,0.03944,0.006015,-0.139013,0.055696,0.021922,0.143513,-0.097095,...,0.132866,0.17507,-0.17507,-0.108293,0.108293,0.041845,0.008625,-0.049384,0.008782,-0.008782
Term,-0.046531,-0.059383,0.03944,1.0,-0.004705,0.118163,-0.088492,-0.010609,-0.077273,0.07403,...,-0.100912,0.073928,-0.073928,0.033739,-0.033739,0.034321,0.059141,-0.094279,0.022549,-0.022549
Credit_History,-0.018615,0.011134,0.006015,-0.004705,1.0,0.0205,0.009757,0.007987,-0.060473,-0.00917,...,0.010938,0.073658,-0.073658,0.00155,-0.00155,-0.020906,0.035976,-0.016934,-0.540556,0.540556
Dependents_0,-0.092599,-0.008292,-0.139013,0.118163,0.0205,1.0,-0.531373,-0.528246,-0.358315,0.148421,...,-0.348175,0.036563,-0.036563,0.088254,-0.088254,0.044015,-0.004173,-0.038264,0.003044,-0.003044
Dependents_1,0.040861,-0.029769,0.055696,-0.088492,0.009757,-0.531373,1.0,-0.198046,-0.134337,0.004466,...,0.113853,0.013355,-0.013355,-0.082044,0.082044,-0.084117,0.011661,0.06932,0.03874,-0.03874
Dependents_2,-0.03465,0.010016,0.021922,-0.010609,0.007987,-0.528246,-0.198046,1.0,-0.133547,-0.129953,...,0.249547,-0.020822,0.020822,-0.032434,0.032434,-0.004298,-0.012017,0.016569,-0.062384,0.062384
Dependents_3+,0.156687,0.041491,0.143513,-0.077273,-0.060473,-0.358315,-0.134337,-0.133547,1.0,-0.096319,...,0.132566,-0.055288,0.055288,-0.003278,0.003278,0.04067,0.007863,-0.04746,0.026123,-0.026123
Gender_Female,-0.058809,-0.082912,-0.097095,0.07403,-0.00917,0.148421,0.004466,-0.129953,-0.096319,1.0,...,-0.364569,0.045364,-0.045364,-0.000525,0.000525,-0.080283,0.108623,-0.03453,0.017987,-0.017987


Interesting note: Applicant_Income has almost no correlation with loan approval (Status), but a noticeably higher correlation with Loan_Amount (about 0.54). This suggests it plays very little role in whether or not the loan is approved and instead influences the size of the loan.

In [53]:
# Features: every column except the two one-hot target columns.
# Target: Status_Y (1 = approved). Selecting by name instead of by position
# keeps this correct even if the column order of the encoded frame changes.
X = df.drop(columns=['Status_N', 'Status_Y']).to_numpy()
y = df['Status_Y'].to_numpy()

In [54]:
# Load the separate test file (it has no Status column) and preview it.
df_test = pd.read_csv("../datasets/loan_dataset/loan_test.csv")
df_test.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History,Area
0,Male,Yes,0,Graduate,No,572000,0,11000000,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,307600,150000,12600000,360.0,1.0,Urban
2,Male,Yes,2,Graduate,No,500000,180000,20800000,360.0,1.0,Urban
3,Male,Yes,2,Graduate,No,234000,254600,10000000,360.0,,Urban
4,Male,No,0,Not Graduate,No,327600,0,7800000,360.0,1.0,Urban


In [55]:
# Hold out 20% of the labelled data for evaluation; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1)

In [56]:
# Learn the per-feature min/max from the training split only, then apply the
# same scaling to both splits. Lower-case x_* are the scaled versions of X_*.
minmax = MinMaxScaler().fit(X_train)
x_train, x_test = minmax.transform(X_train), minmax.transform(X_test)

In [57]:
# Fit a logistic-regression classifier on the scaled training features;
# fit() returns the estimator itself, so logR is the fitted model.
logR = LogisticRegression(solver='saga', max_iter=500, random_state=1).fit(x_train, y_train)
logR

LogisticRegression(max_iter=500, random_state=1, solver='saga')

In [58]:
# Predicted class for each row of the scaled hold-out split.
preds = logR.predict(x_test)

In [59]:
# Per-class precision, recall and F1 for the hold-out predictions.
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.94      0.41      0.57        39
           1       0.78      0.99      0.87        84

    accuracy                           0.80       123
   macro avg       0.86      0.70      0.72       123
weighted avg       0.83      0.80      0.78       123



In [60]:
# Hold-out accuracy as a percentage. accuracy_score expects (y_true, y_pred);
# the value is the same either way, but this keeps the call consistent with
# classification_report and safe to swap for an asymmetric metric.
print(accuracy_score(y_test, preds) * 100)

80.48780487804879


In [61]:
# Prediction for the first row of the hold-out split.
preds[0]

1

In [62]:
# First row of the separate test file. This is not the row behind preds[0],
# which comes from the hold-out split of the training data.
print(df_test.iloc[0])

Gender                    Male
Married                    Yes
Dependents                   0
Education             Graduate
Self_Employed               No
Applicant_Income        572000
Coapplicant_Income           0
Loan_Amount           11000000
Term                     360.0
Credit_History             1.0
Area                     Urban
Name: 0, dtype: object


In [63]:
# Number of missing values per column in the test file.
df_test.isna().sum()

Gender                11
Married                0
Dependents            10
Education              0
Self_Employed         23
Applicant_Income       0
Coapplicant_Income     0
Loan_Amount            0
Term                   6
Credit_History        29
Area                   0
dtype: int64

In [64]:
# Impute every test column with its own most frequent value (first mode on
# ties). The result is assigned back instead of calling fillna(inplace=True)
# on a column selection: that chained in-place form is deprecated and leaves
# the frame untouched when pandas copy-on-write is active.
df_test = df_test.fillna(df_test.mode().iloc[0])

In [65]:
# One-hot encode the same categorical columns as the training frame
# (dtype=int keeps the indicators as 0/1 on every pandas version).
df_test = pd.get_dummies(
    df_test,
    columns=['Dependents', 'Gender', 'Married', 'Education', 'Self_Employed', 'Area'],
    dtype=int,
)
# Align with the training feature columns (everything in df except the two
# Status dummies): a category absent from the test file becomes an all-zero
# column, a category unseen in training is dropped, and the column order
# matches what the scaler and model were fitted on.
df_test = df_test.reindex(columns=df.columns.drop(['Status_N', 'Status_Y']), fill_value=0)

In [66]:
# Scale the test file with the scaler already fitted on the training split.
# Calling fit_transform here would re-learn min/max from the test data, so
# the model would receive features on a different scale than it was trained
# on. to_numpy() matches the array input the scaler was fitted with.
test_scaled = minmax.transform(df_test.to_numpy(dtype=float))

In [67]:
# Predicted class for each row of the scaled test file.
test_preds = logR.predict(test_scaled)

In [68]:
# Prediction for the first row of the test file.
test_preds[0]

1

### Accuracy with a single hold-out split is about 80%. What about cross-validation?

In [77]:
# 5-fold cross-validation on the scaled training split. The model uses the
# same hyper-parameters as the hold-out classifier (logR), so the two accuracy
# figures compare evaluation methods rather than different models.
model = LogisticRegression(solver='saga', max_iter=500, random_state=1)
predictions = cross_validate(model, x_train, y_train, cv=5)

# cross_validate returns a dict of per-fold arrays; average the fold scores.
print(np.mean(predictions['test_score']))

0.8106575963718822


In [None]:
# The cross-validated mean accuracy is only slightly higher than the single hold-out score.