In [165]:
print("Hello World!")

Hello World!


In [166]:
# Import all the tools we need

# Regular EDA (exploratory data analysis) and plotting libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler


# we want our plots to appear inside the notebook
%matplotlib inline 


In [167]:
url = "https://raw.githubusercontent.com/tseth4/loan-approval-capstone/refs/heads/main/loan_sanction_train.csv"
df = pd.read_csv(url)
df.head().T

Unnamed: 0,0,1,2,3,4
Loan_ID,LP001002,LP001003,LP001005,LP001006,LP001008
Gender,Male,Male,Male,Male,Male
Married,No,Yes,Yes,Yes,No
Dependents,0,1,0,0,0
Education,Graduate,Graduate,Graduate,Not Graduate,Graduate
Self_Employed,No,No,Yes,No,No
ApplicantIncome,5849,4583,3000,2583,6000
CoapplicantIncome,0.0,1508.0,0.0,2358.0,0.0
LoanAmount,,128.0,66.0,120.0,141.0
Loan_Amount_Term,360.0,360.0,360.0,360.0,360.0


In [193]:
def preprocess_data(df, training=True):

    
    # 1. Add missing value indicators (if useful for modeling)
    df['Dependents_is_missing'] = df['Dependents'].isnull().astype(int)
    df['Loan_Amount_Term_is_missing'] = df['Loan_Amount_Term'].isnull().astype(int)
    df['Credit_History_is_missing'] = df['Credit_History'].isnull().astype(int)
    df['LoanAmount_is_missing'] = df['LoanAmount'].isnull().astype(int)

    # 2. Handle missing values
    df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])
    df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].median())
    df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])
    df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
    df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())  # Fill LoanAmount with median


    # 3. Convert "3+" in Dependents to 3 and make column numeric
    # if 'Dependents' in df.columns:
    #     df['Dependents'] = df['Dependents'].replace('3+', 3).astype(int)

    # 4. Encode categorical variables
    # Turning value into number codes
    categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed']
    for col in categorical_columns:
        if col in df.columns:
            df[col] = pd.Categorical(df[col]).codes  # Convert categories to numeric codes

    # 5. One-hot encode 'Property_Area' (if needed)
    # One hot encoding splits properta area values into their own columns so they can have binary values
    if 'Property_Area' in df.columns:
        df = pd.get_dummies(df, columns=['Property_Area'], drop_first=True)

    # 6. Convert target column 'Loan_Status' to binary (if training)
    # if training and 'Loan_Status' in df.columns:
    #     df['Loan_Status'] = df['Loan_Status'].map({'Y': 1, 'N': 0})

    # 7. Remove the target column for test data
    if not training and 'Loan_Status' in df.columns:
        df = df.drop(columns=['Loan_Status'])

    return df

In [194]:
# INSPECT MISSING VALUES

df_train = preprocess_data(df)
df_train
# print(df.isnull().sum())


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Dependents_is_missing,Loan_Amount_Term_is_missing,Credit_History_is_missing,LoanAmount_is_missing,Property_Area_Semiurban,Property_Area_Urban
0,LP001002,2,1,0,0,1,5849,0.0,128.0,360.0,1.0,1,0,0,0,0,False,True
1,LP001003,2,2,1,0,1,4583,1508.0,128.0,360.0,1.0,0,0,0,0,0,False,False
2,LP001005,2,2,0,0,2,3000,0.0,66.0,360.0,1.0,1,0,0,0,0,False,True
3,LP001006,2,2,0,1,1,2583,2358.0,120.0,360.0,1.0,1,0,0,0,0,False,True
4,LP001008,2,1,0,0,1,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,1,1,0,0,1,2900,0.0,71.0,360.0,1.0,1,0,0,0,0,False,False
610,LP002979,2,2,3,0,1,4106,0.0,40.0,180.0,1.0,1,0,0,0,0,False,False
611,LP002983,2,2,1,0,1,8072,240.0,253.0,360.0,1.0,1,0,0,0,0,False,True
612,LP002984,2,2,2,0,1,7583,0.0,187.0,360.0,1.0,1,0,0,0,0,False,True


In [195]:
print(df.isnull().sum())


Loan_ID                        0
Gender                         0
Married                        0
Dependents                     0
Education                      0
Self_Employed                  0
ApplicantIncome                0
CoapplicantIncome              0
LoanAmount                     0
Loan_Amount_Term               0
Credit_History                 0
Property_Area                  0
Loan_Status                    0
Dependents_is_missing          0
Loan_Amount_Term_is_missing    0
Credit_History_is_missing      0
LoanAmount_is_missing          0
dtype: int64


In [196]:
# X = df.drop(columns=['Loan_Status'])
# y = df['Loan_Status']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [198]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Loan_ID                      614 non-null    object 
 1   Gender                       614 non-null    int8   
 2   Married                      614 non-null    int8   
 3   Dependents                   614 non-null    int32  
 4   Education                    614 non-null    int8   
 5   Self_Employed                614 non-null    int8   
 6   ApplicantIncome              614 non-null    int64  
 7   CoapplicantIncome            614 non-null    float64
 8   LoanAmount                   614 non-null    float64
 9   Loan_Amount_Term             614 non-null    float64
 10  Credit_History               614 non-null    float64
 11  Property_Area                614 non-null    object 
 12  Loan_Status                  614 non-null    object 
 13  Dependents_is_missin