## Load Dataset

In [1704]:
import pandas as pd
import random as rd
import numpy as np
df = pd.read_csv('loan.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Dataset Info

In [1705]:
# df.info()
df['Gender'].describe()

count      601
unique       2
top       Male
freq       489
Name: Gender, dtype: object

## PART1: Data Cleaning

In [1706]:
# Task1 deal with NULL rows, you can either choose to drop them or replace them with mean or other value 

# Checking the Missing Values

# A function used to fill missing data from the same column randomly
def random_FillMissingData(_df: pd.DataFrame) -> pd.DataFrame:
    df = _df.copy()
    for column in df.columns:
        data = df[column].copy()
        empty_rows = data.isnull()
        data[empty_rows] = rd.choices(data[~empty_rows].values, k = empty_rows.sum())
        df[column] = data
    return df

df = random_FillMissingData(df)

# DEBUG ONLY
# df.to_csv("./temp/temp1.csv", index=False) 
# print(df['Self_Employed'].value_counts())
# print(df.isnull().sum())

## PART2: Encode 




In [1707]:
# Task2 deal with categorical features

df['Gender'] = df['Gender'].map({'Female': 0, 'Male': 1})
df['Married'] = df['Married'].map({'Yes': 1, 'No': 0})
df['Dependents'] = df['Dependents'].map({'0': 0, '1': 1, '2': 2, '3+': 3})
df['Education'] = df['Education'].map({'Graduate': 1, 'Not Graduate': 0})
df['Property_Area'] = df['Property_Area'].map({'Rural': 0, 'Semiurban': 1, 'Urban': 2})
df['Self_Employed'] = df['Self_Employed'].map({'Yes': 1, 'No': 0})
df['Loan_Status'] = df['Loan_Status'].map({'Y': 1, 'N': 0})
# df.drop(df.columns[[0,1,2,3,4,5,6,7,8,9,11]], axis=1, inplace=True)
df.drop(df.columns[[0]], axis=1, inplace=True)

df = df.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))).round(4)

# DEBUG ONLY
# df.to_csv("./temp/temp2.csv", index=False) 


## PART3: Data Process

In [1708]:
# Task3 split the dataset into X_train, X_test, y_train, y_test
# Optional: you can also use normalization

def random_Split_data(data: pd.DataFrame, rate = 0.75):
    m, n = data.shape
    data.reindex(np.random.permutation(data.index))

    row_split = int(m * rate)
    X_train = df.iloc[0: row_split, 0: n - 1].values
    y_train = df.iloc[0: row_split, n - 1: ].values
    X_test = df.iloc[row_split: m, 0: n - 1].values
    y_test = df.iloc[row_split: m, n - 1: ].values
    
    return X_train, y_train, X_test, y_test



X_train, y_train, X_test, y_test = random_Split_data(df, rate=0.8)


## Train

In [1709]:
# Task4 train your model and plot the loss curve of training
from Logistic import LogisticRegression
import matplotlib.pyplot as plt


lr = LogisticRegression()
loss = lr.fit(X_train, y_train, lr=0.005)
pred = lr.predict(X_test)

# DEBUG ONLY
# print(lr.w)
# print(loss)
# print(pred)
# print(lr.w)

## Test

In [1710]:
# Task5 compare the accuracy(or other metrics you want) of test data with different parameters you train with
sum = 0
count = 0
for i in range(len(y_test)):
    if y_test[i] == pred[i]:
        count += 1
    sum += 1

print(count / sum)


0.7723577235772358
