## Dataset Link: https://www.kaggle.com/datasets/altruistdelhite04/loan-prediction-problem-dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
#This dataset already comes up with train and test folder so no need to split later

train_df = pd.read_csv('/content/train_u6lujuX_CVtuZ9i.csv')
test_df = pd.read_csv('/content/test_Y3wMUE5_7gLdaTN.csv')
train_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [None]:
test_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [None]:
train_df = train_df.drop("Loan_ID", axis=1)
test_df = test_df.drop("Loan_ID", axis=1)

In [None]:
from IPython.display import display #allows multiple output in a single cell

display(train_df.isnull().sum())
print("="*37)
display(test_df.isnull().sum())

Unnamed: 0,0
Gender,13
Married,3
Dependents,15
Education,0
Self_Employed,32
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,22
Loan_Amount_Term,14
Credit_History,50




Unnamed: 0,0
Gender,11
Married,0
Dependents,10
Education,0
Self_Employed,23
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,5
Loan_Amount_Term,6
Credit_History,29


## Now, we will handle missing values..use mode for categorical columns, and median for numerical ones

In [44]:
cat_cols = train_df.select_dtypes(include="object").columns.tolist()
cat_cols.remove("Loan_Status")  # exclude target
for col in cat_cols:
    train_df[col] = train_df[col].fillna(train_df[col].mode()[0])
    test_df[col] = test_df[col].fillna(test_df[col].mode()[0])


num_cols = train_df.select_dtypes(include=["int64", "float64"]).columns.tolist()
for col in num_cols:
    train_df[col] = train_df[col].fillna(train_df[col].median())
    test_df[col] = test_df[col].fillna(test_df[col].median())

In [45]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in cat_cols + ["Loan_Status"]:  # include target
    train_df[col] = le.fit_transform(train_df[col])
    if col in test_df.columns:
        test_df[col] = le.transform(test_df[col])

In [46]:
train_df.head(3)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,5849,0.0,128.0,360.0,1.0,2,1
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1


In [48]:
test_df.head(2)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,0,0,0,5720,0,110.0,360.0,1.0,2
1,1,1,1,0,0,3076,1500,126.0,360.0,1.0,2


### Splitting

In [82]:
X_train = train_df.drop("Loan_Status", axis=1)
y_train = train_df["Loan_Status"]

X_test = test_df.copy()

In [83]:
train_df['Loan_Status'].value_counts()

Unnamed: 0_level_0,count
Loan_Status,Unnamed: 1_level_1
1,422
0,192


## Now, time to apply SMOTE due to imbalanced dataset as seen just above

In [84]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print("Before SMOTE:\n", y_train.value_counts())
print('='*29)
print("After SMOTE:\n", pd.Series(y_resampled).value_counts())

Before SMOTE:
 Loan_Status
1    422
0    192
Name: count, dtype: int64
After SMOTE:
 Loan_Status
1    422
0    422
Name: count, dtype: int64


In [85]:
X_resampled.info() #see that dataset count has reached 844 from 422...smote added (422-192) synthetic data for 0 class.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 844 entries, 0 to 843
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             844 non-null    int64  
 1   Married            844 non-null    int64  
 2   Dependents         844 non-null    int64  
 3   Education          844 non-null    int64  
 4   Self_Employed      844 non-null    int64  
 5   ApplicantIncome    844 non-null    int64  
 6   CoapplicantIncome  844 non-null    float64
 7   LoanAmount         844 non-null    float64
 8   Loan_Amount_Term   844 non-null    float64
 9   Credit_History     844 non-null    float64
 10  Property_Area      844 non-null    int64  
dtypes: float64(4), int64(7)
memory usage: 72.7 KB


In [86]:
X_resampled.head(3)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,0,0,0,5849,0.0,128.0,360.0,1.0,2
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2


In [87]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

num_cols = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"]

X_resampled[num_cols] = scaler.fit_transform(X_resampled[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [88]:
X_resampled = pd.DataFrame(X_resampled, columns=X_train.columns)
X_resampled.head(2)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,0,0,0,0.111702,-0.58461,-0.208439,0.287946,1.0,2
1,1,1,1,0,0,-0.119063,-0.024552,-0.208439,0.287946,1.0,0


## Time to Train

In [89]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [90]:
# Logistic Regression
log_reg = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)
log_reg.fit(X_resampled, y_resampled)

# Decision Tree
dt_clf = DecisionTreeClassifier(max_depth=5, class_weight="balanced", random_state=42)
dt_clf.fit(X_resampled, y_resampled)

## Now, as we don't have target label in test data, we will test on train data and later try to predict on test data

In [95]:
print("\nLogistic Regression Results:")
y_pred_log = log_reg.predict(X_train)
print(classification_report(y_train, y_pred_log))

print('+='*33)

print("\nDecision Tree Results:")
y_pred_dt = dt_clf.predict(X_train)
print(classification_report(y_train, y_pred_dt))


Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.43      0.10      0.16       192
           1       0.70      0.94      0.80       422

    accuracy                           0.68       614
   macro avg       0.56      0.52      0.48       614
weighted avg       0.61      0.68      0.60       614

+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=

Decision Tree Results:
              precision    recall  f1-score   support

           0       0.22      0.49      0.31       192
           1       0.50      0.23      0.31       422

    accuracy                           0.31       614
   macro avg       0.36      0.36      0.31       614
weighted avg       0.41      0.31      0.31       614



In [96]:
test_predictions_log = log_reg.predict(X_test)
test_predictions_dt = dt_clf.predict(X_test)

# Save predictions
pd.DataFrame({
    "Loan_Status_LogReg": test_predictions_log,
    "Loan_Status_DT": test_predictions_dt
}).to_csv("loan_predictions.csv", index=False)

In [97]:
result = pd.read_csv('/content/loan_predictions.csv')
result.head()

Unnamed: 0,Loan_Status_LogReg,Loan_Status_DT
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1


In [101]:
numeric_features = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term", "Credit_History"]
categorical_features = ["Gender", "Married", "Dependents", "Education", "Self_Employed", "Property_Area"]

label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    # Fit on combined data to handle all possible categories
    le.fit(pd.concat([train_df[col], test_df[col]]).astype(str))
    label_encoders[col] = le

le_loan_status = LabelEncoder()
le_loan_status.fit(train_df["Loan_Status"].astype(str))
label_encoders["Loan_Status"] = le_loan_status

scaler = StandardScaler()

X_resampled[numeric_features] = scaler.fit_transform(X_resampled[numeric_features])

from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)
log_reg.fit(X_resampled, y_resampled)

training_columns = X_resampled.columns

def predict_loan_status():
    user_input = {}

    for col in numeric_features:
        val = float(input(f"Enter value for {col}: "))
        user_input[col] = val

    for col in categorical_features:
        val = input(f"Enter value for {col} ({list(label_encoders[col].classes_)}): ")
        user_input[col] = val

    user_df = pd.DataFrame([user_input])

    for col in categorical_features:
        le = label_encoders[col]
        user_df[col] = le.transform(user_df[col].astype(str))

    user_df[numeric_features] = scaler.transform(user_df[numeric_features])

    user_df = user_df.reindex(columns=training_columns, fill_value=0)

    pred = log_reg.predict(user_df)[0]
    pred_label = label_encoders["Loan_Status"].inverse_transform([pred])[0]

    print(f"\nPredicted Loan Status: {pred_label}")

predict_loan_status()

Enter value for ApplicantIncome: 1160
Enter value for CoapplicantIncome: 0
Enter value for LoanAmount: 167
Enter value for Loan_Amount_Term: 360
Enter value for Credit_History: 1
Enter value for Gender (['0', '1']): 0
Enter value for Married (['0', '1']): 1
Enter value for Dependents (['0', '1', '2', '3']): 3
Enter value for Education (['0', '1']): 1
Enter value for Self_Employed (['0', '1']): 0
Enter value for Property_Area (['0', '1', '2']): 2

Predicted Loan Status: 1
