In [1]:
# Run some setup code for this notebook.

import matplotlib.pyplot as plt


# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

## general description of this dataset

|Field    |description                      |   
|:----------|:----------                     |   
|Loan_ID   |ID of the Loan |
|Gender |Female or Male|
|Dependents	|the number of relatives|
|Education	|received education or not|
|Self_Employed	|be employed or not|
|ApplicantIncome	|the income of the applicant|
|CoapplicantIncome	|the income of co-applicant|
|LoanAmount|the amount of the loan|
|Loan_Amount_Term|loan amount item|
|Credit_History	|have a good credit or not|
|Property_Area|Urban or Rural or Semiurban|
|Loan_Status|have a loan or not|

## Load Dataset

In [2]:
import random

In [3]:
import pandas as pd
df = pd.read_csv('loan.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Dataset Info

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [5]:
print('the number of records:')
df['Loan_ID'].count()

the number of records:


614

## Data Cleaning

In [6]:
df.drop("Loan_ID", axis=1, inplace=True)
# Checking the Missing Values
print('Part of missing values for every column')
print(df.isnull().sum() / len(df))

Part of missing values for every column
Gender               0.021173
Married              0.004886
Dependents           0.024430
Education            0.000000
Self_Employed        0.052117
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.035831
Loan_Amount_Term     0.022801
Credit_History       0.081433
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64


In [7]:
##the porpotion of NAN is extremely small ,we choose to drop all of data that includes NAN
df=df.dropna()
## reset index
df.index=range(480)

## Encode 

In [8]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y


In [9]:
map_Married_Self_Employed={
    'Yes': 1,
    'No' : 0 
}
map_Gender={
    'Male': 0,
    'Female': 1
}
map_Education={
    'Graduate': 1,
    'Not Graduate': 0
}
map_Property_Area={
    'Rural': -1,
    'Semiurban': 0,
    'Urban': 1
}
map_Loan_Status={
    'Y': 1,
    'N': 0
}
map_Dependents={
    '0': 0,
    '1': 1,
    '2': 2,
    '3+': 3,
}##3+ can change to other values
df['Married']=df['Married'].map(map_Married_Self_Employed)
df['Self_Employed']=df['Self_Employed'].map(map_Married_Self_Employed)
df['Education']=df['Education'].map(map_Education)
df['Property_Area']=df['Property_Area'].map(map_Property_Area)
df['Loan_Status']=df['Loan_Status'].map(map_Loan_Status)
df['Dependents']=df['Dependents'].map(map_Dependents)
df['Gender']=df['Gender'].map(map_Gender)

In [10]:
#normaliz
import numpy as np
def z_score_normalize(data):    
    mean = np.mean(data, axis=0)    
    std_dev = np.std(data, axis=0)    
    normalized_data = (data - mean) / std_dev    
    return normalized_data

In [11]:
df.iloc[:,:-1]=z_score_normalize(df.iloc[:,:-1]) #数据z_score normalize

## Data process

In [12]:
print(len(df[df['Loan_Status']==1]),len(df[df['Loan_Status']==0])) #正负样本个数

332 148


In [13]:
df_Y=df[df['Loan_Status']==1].copy(deep=True)
df_N=df[df['Loan_Status']==0].copy(deep=True)

In [14]:
df_Y.index=range(len(df_Y))
df_N.index=range(len(df_N))

In [15]:
def split_dataset(df,ratio):
    train_size = int(len(df)*ratio)
    random_list=random.sample(range(len(df)),train_size)
    random_list_U=list(range(len(df)))
    for i in range(len(random_list)) :
        random_list_U.remove(random_list[i])
    test_tar_df=df.iloc[random_list].copy(deep=True)
    train_tar_df=df.iloc[random_list_U].copy(deep=True)
    return [train_tar_df,test_tar_df]

In [16]:
list_Y=split_dataset(df_Y,0.3)
list_N=split_dataset(df_N,0.3)
train_Y_df=list_Y[0]
test_Y_df=list_Y[1]
train_N_df=list_N[0]
test_N_df=list_N[1]

In [17]:
train_df=pd.concat([train_N_df,train_Y_df]).copy(deep=True)
test_df=pd.concat([test_N_df,test_Y_df]).copy(deep=True)

In [18]:
train_df.index=range(len(train_df))
test_df.index=range(len(test_df))

## Train

In [43]:
def loss(df,beta) :
    len_=len(df)
    loss=0
    for i in range(len_) :
        temp_arr=np.array(df.iloc[i,:-1])
        temp_arr_1=np.append(temp_arr,1)
        loss=loss-df['Loan_Status'][i]*np.dot(beta,temp_arr_1)+np.log(1+np.exp(np.dot(beta,temp_arr_1)))
    return loss

In [25]:
def sigmod_s(z) :
    if z>0 :
        return 1/(1+np.exp(-z))
    else :
        return np.exp(z)/(1+np.exp(z))

In [20]:
def p_1(beta,x) :
    x=np.append(x,1)
    z=np.dot(beta,x)
    return sigmod_s(z)

In [21]:
def L_beta_1(df,beta) :
    len_=len(df.columns)
    len_index=len(df)
    sum_=np.array(len_*[0])
    for i in range(len_index) :
        temp_arr=np.array(df.iloc[i,:-1])
        temp_arr_1=np.append(temp_arr,1)
        sum_=sum_-(df['Loan_Status'][i]-p_1(beta,temp_arr))*temp_arr_1
    return [sum_]

In [22]:
def L_beta_2(df,beta) :
    len_=len(df.columns)
    len_index=len(df)
    sum_=np.array(len_*[[0]*len_])
    for i in range(len_index) :
        temp_arr=np.array(df.iloc[i,:-1])
        temp_arr_1=np.append(temp_arr,1)
        sum_=sum_+p_1(beta,temp_arr)*(1-p_1(beta,temp_arr))*np.dot(np.array([temp_arr_1]).T,np.array([temp_arr_1]))
    return sum_

In [23]:
def LogisticRegression_train(df,alph) :
    beta=np.array([1]*(len(df.columns)))
    beta_new=np.array([0]*(len(df.columns)))
    while True :
        beta_new=(np.array([beta])-alph*np.dot(L_beta_1(df,beta),np.linalg.inv(np.array(L_beta_2(df,beta).T))))[0]
        if np.linalg.norm(beta_new-beta)<1e-6 :
            break
        print(loss(beta))
        beta=beta_new.copy()
    return beta

In [26]:
beta=LogisticRegression_train(train_df,0.2)

3.281570301527995
0.5198883691134591
0.25193727424548673
0.15382240249178844
0.11521782502143778
0.09243807018145203
0.07599486291492824
0.0629781979927029
0.052239404172817805
0.043235109169369436
0.03565669930337368
0.029291922867160952
0.023971217535537936
0.019547668986693755
0.015890210360526884
0.012881667786545129
0.010418262914412616
0.00840928134599956
0.006776493799932385
0.005453276608973278
0.0043835100895661065
0.0035203635876535624
0.00282506371058846
0.002265716133308928
0.001816224794103557
0.001455330573860585
0.0011657759114174675
0.000933591653680119
0.0007474965641850987
0.0005983970763538233
0.0004789740373022373
0.0003833435612061706
0.00030678014959097967
0.00024549157799659224
0.00019643646952013916
0.00015717684549540863
0.0001257591939947489
0.00010061869900584987
8.050222157239193e-05
6.44064263664189e-05
5.152811713371016e-05
4.1224398685801024e-05
3.298073829544435e-05
2.6385371100358846e-05
2.1108796419198346e-05
1.6887356861792323e-05
1.351009012550608e-0

In [27]:
def cal_test(df,beta) :
    len_=len(df)
    predict=[]
    for i in range(len_) :
        temp_arr=np.array(df.iloc[i,:-1])
        flag=p_1(beta,temp_arr)
        if flag>=0.5 :
            if df['Loan_Status'][i] == 1 :
                predict.append([1,1])
            else :
                predict.append([1,0])
        else :
            if df['Loan_Status'][i] == 1 :
                predict.append([0,1])
            else :
                predict.append([0,0])
    return predict

In [33]:
def get_accurate(predict) :
    accurate=0
    predict=cal_test(test_df,beta)
    for i in range(len(predict)) :
        if predict[i][0] == predict[i][1] :
            accurate+=1
    return accurate/len(predict)

In [36]:
def get_P_score(predict) :
    TP,FP,FN=0,0,0
    for i in range(len(predict)) :
        if predict[i] == [1,1] :
            TP+=1
        elif predict[i] == [1,0] :
            FP+=1
        elif predict[i] == [0,1] :
            FN+=1
    return TP/(TP+FP)

In [35]:
def get_R_score(predict) :
    TP,FP,FN=0,0,0
    for i in range(len(predict)) :
        if predict[i] == [1,1] :
            TP+=1
        elif predict[i] == [1,0] :
            FP+=1
        elif predict[i] == [0,1] :
            FN+=1
    return TP/(TP+FN)

In [40]:
def f1_score(predict) :
    R=get_R_score(predict)
    P=get_P_score(predict)
    return 2*P*R/(P+R)

In [37]:
get_accurate(predict)

0.8391608391608392

In [38]:
get_P_score(predict)

0.8166666666666667

In [39]:
get_R_score(predict)

0.98989898989899

In [41]:
f1_score(predict)

0.8949771689497718

In [171]:
from Logistic import LogisticRegression
import matplotlib.pyplot as plt

################################################################################
# TODO:                                                                        #
# Task4 train your model and plot the loss curve of training
# You need to complete the Logistic.py file
# model = LogisticRegression()
################################################################################


################################################################################
#                                 END OF YOUR CODE                             #
################################################################################

## Test

In [None]:
################################################################################
# TODO:                                                                        #
# Task5 compare the accuracy(or other metrics you want) of test data with different parameters you train with
################################################################################




################################################################################
#                                 END OF YOUR CODE                             #
################################################################################