In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import warnings

# Silence all warnings for cleaner notebook output.
# NOTE: this also hides pandas / scikit-learn deprecation warnings.
warnings.filterwarnings('ignore')

# --- Data Loading and Preprocessing ---
df = pd.read_csv("loan_prediction.csv")

# Derived features. The logs are taken before imputation, so a row with a
# missing LoanAmount gets NaN in loanAmount_log and is filled below with the
# mean of the log values (not the log of the mean).
df['loanAmount_log'] = np.log(df['LoanAmount'])
df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df['TotalIncome_log'] = np.log(df['TotalIncome'])

# Impute missing values: most frequent value for the categorical / discrete
# columns, mean for the continuous loan-amount columns.
#
# The result is assigned back to `df` on purpose. The previous form,
# `df[col].fillna(..., inplace=True)`, operates on a temporary column
# selection; with pandas Copy-on-Write it leaves `df` untouched, and because
# warnings are silenced above that would go unnoticed.
mode_imputed = ['Gender', 'Married', 'Self_Employed', 'Dependents',
                'Loan_Amount_Term', 'Credit_History']
mean_imputed = ['LoanAmount', 'loanAmount_log']

fill_values = {name: df[name].mode()[0] for name in mode_imputed}
fill_values.update({name: df[name].mean() for name in mean_imputed})
df = df.fillna(value=fill_values)

# --- Feature / target selection ---
# Columns are picked by name instead of by position (this replaces the
# earlier `df.iloc[:, np.r_[1:5, 9:11, 13:15]]` indexing), so the selection
# does not depend on where the derived columns were appended to `df`.
#
# The order of `feature_cols` is the column order of `x`. Anything passed to
# the fitted encoders, scaler or models later on must follow the same order.

# Named subset of the frame: the model inputs plus Property_Area and the
# target. It is not used to build `x` / `y` below.
df_processed = df.loc[:, [
    'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
    'LoanAmount', 'Loan_Amount_Term', 'Credit_History',
    'loanAmount_log', 'TotalIncome_log', 'Property_Area', 'Loan_Status',
]].copy()

# Property_Area is intentionally absent from the model features.
feature_cols = [
    # categorical (label-encoded later)
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    # numeric
    'LoanAmount',
    'Credit_History',
    'Loan_Amount_Term',
    'loanAmount_log',
    'TotalIncome_log',
]

x = df.loc[:, feature_cols].values
y = df.loc[:, 'Loan_Status'].values

# --- Data Splitting ---
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# --- Label Encoding ---
# One encoder per categorical feature plus one for the target. Each encoder
# is fitted on the distinct values of the whole column in `df`, not just the
# training rows, so a category that only occurs in the test split (or in
# later custom input) still has a code.
le_gender = LabelEncoder().fit(df['Gender'].unique())
le_married = LabelEncoder().fit(df['Married'].unique())
le_dependents = LabelEncoder().fit(df['Dependents'].unique())
le_education = LabelEncoder().fit(df['Education'].unique())
le_self_employed = LabelEncoder().fit(df['Self_Employed'].unique())
le_loan_status = LabelEncoder().fit(df['Loan_Status'].unique())  # target

# Encode into copies so x_train / x_test keep their raw values.
x_train_copy = np.copy(x_train)
x_test_copy = np.copy(x_test)

# Columns 0-4 of the feature matrix are the categorical ones, in this order:
# Gender, Married, Dependents, Education, Self_Employed.
column_encoders = (le_gender, le_married, le_dependents,
                   le_education, le_self_employed)
for col_idx, encoder in enumerate(column_encoders):
    x_train_copy[:, col_idx] = encoder.transform(x_train_copy[:, col_idx])
    x_test_copy[:, col_idx] = encoder.transform(x_test_copy[:, col_idx])

y_train = le_loan_status.transform(y_train)
y_test = le_loan_status.transform(y_test)

# Every column is numeric now; cast to float so the arrays can be fed to
# StandardScaler.
x_train_copy = x_train_copy.astype(float)
x_test_copy = x_test_copy.astype(float)


# --- Feature Scaling ---
# The scaler is fitted on the training split only; the test split is
# transformed with those same statistics.
ss = StandardScaler()
x_train_scaled = ss.fit_transform(x_train_copy)
x_test_scaled = ss.transform(x_test_copy)

# --- Model Training and Evaluation ---
# Each model is trained on the scaled training split and scored on the
# held-out split. accuracy_score is called as (y_true, y_pred), the order
# the scikit-learn API documents; accuracy itself is symmetric, but other
# metrics swapped in here are not.
print("--- Model Performance ---")

# Random Forest Classifier (seeded for reproducibility)
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(x_train_scaled, y_train)
y_pred_rf = rf_clf.predict(x_test_scaled)
print("Accuracy of Random Forest Classifier:", metrics.accuracy_score(y_test, y_pred_rf))

# Gaussian Naive Bayes
nb_clf = GaussianNB()
nb_clf.fit(x_train_scaled, y_train)
y_pred_nb = nb_clf.predict(x_test_scaled)
print("Accuracy of Gaussian Naive Bayes:", metrics.accuracy_score(y_test, y_pred_nb))

# Decision Tree Classifier (seeded for reproducibility)
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(x_train_scaled, y_train)
y_pred_dt = dt_clf.predict(x_test_scaled)
print("Accuracy of Decision Tree Classifier:", metrics.accuracy_score(y_test, y_pred_dt))

# K-Neighbors Classifier
kn_clf = KNeighborsClassifier()
kn_clf.fit(x_train_scaled, y_train)
y_pred_kn = kn_clf.predict(x_test_scaled)
print("Accuracy of K-Neighbors Classifier:", metrics.accuracy_score(y_test, y_pred_kn))

print("\n--- End of Model Training and Evaluation ---\n")

# --- Interactive Prediction Section ---


def prompt_choice(prompt, choices, error_message, read=input):
    """Prompt until the reply matches one of *choices*, ignoring case.

    Matching is done on the stripped, case-folded reply, so multi-word
    options such as 'Not Graduate' can be entered in any capitalisation.

    Args:
        prompt: Text shown to the user on every attempt.
        choices: Accepted answers in their canonical spelling.
        error_message: Printed after each reply that matches no choice.
        read: Callable that takes the prompt and returns the raw reply;
            defaults to the built-in ``input``.

    Returns:
        The matching entry of *choices*, spelled exactly as given there.
    """
    by_folded = {choice.casefold(): choice for choice in choices}
    while True:
        reply = read(prompt).strip().casefold()
        if reply in by_folded:
            return by_folded[reply]
        print(error_message)


def prompt_float(prompt, read=input):
    """Read a single finite float.

    Args:
        prompt: Text shown to the user.
        read: Callable that takes the prompt and returns the raw reply;
            defaults to the built-in ``input``.

    Returns:
        The parsed value.

    Raises:
        ValueError: If the reply is not a number, or is nan / infinite.
    """
    value = float(read(prompt))
    if not np.isfinite(value):
        raise ValueError(f"expected a finite number, got {value}")
    return value


print("--- Loan Approval Prediction for Custom Values ---")
print("Please enter the details for the new loan applicant:")

# The values collected below are assembled in the order of `feature_cols`:
# 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
# 'LoanAmount', 'Credit_History', 'Loan_Amount_Term', 'loanAmount_log',
# 'TotalIncome_log'
try:
    gender = prompt_choice(
        "Gender (Male/Female): ",
        ['Male', 'Female'],
        "Invalid input. Please enter 'Male' or 'Female'.",
    )
    married = prompt_choice(
        "Married (Yes/No): ",
        ['Yes', 'No'],
        "Invalid input. Please enter 'Yes' or 'No'.",
    )
    dependents = prompt_choice(
        "Number of Dependents (0, 1, 2, 3+): ",
        ['0', '1', '2', '3+'],
        "Invalid input. Please enter '0', '1', '2', or '3+'.",
    )
    education = prompt_choice(
        "Education (Graduate/Not Graduate): ",
        ['Graduate', 'Not Graduate'],
        "Invalid input. Please enter 'Graduate' or 'Not Graduate'.",
    )
    self_employed = prompt_choice(
        "Self Employed (Yes/No): ",
        ['Yes', 'No'],
        "Invalid input. Please enter 'Yes' or 'No'.",
    )

    applicant_income = prompt_float("Applicant Income: ")
    coapplicant_income = prompt_float("Coapplicant Income: ")
    loan_amount = prompt_float("Loan Amount: ")
    loan_amount_term = prompt_float("Loan Amount Term (e.g., 360, 180): ")

    # Re-prompt on any number other than 0 or 1; a non-numeric reply raises
    # ValueError and is reported by the handler below.
    credit_history = float(input("Credit History (1.0 for met, 0.0 for not met): "))
    while credit_history not in [0.0, 1.0]:
        print("Invalid input. Please enter 1.0 or 0.0.")
        credit_history = float(input("Credit History (1.0 for met, 0.0 for not met): "))

    # Derived features. Both go through a logarithm, so reject non-positive
    # values up front instead of letting -inf / nan reach the scaler.
    total_income = applicant_income + coapplicant_income
    if loan_amount <= 0:
        raise ValueError("Loan Amount must be greater than zero.")
    if total_income <= 0:
        raise ValueError(
            "Applicant Income plus Coapplicant Income must be greater than zero."
        )
    loan_amount_log = np.log(loan_amount)
    total_income_log = np.log(total_income)

    # Single-row frame, columns in `feature_cols` order.
    new_data = pd.DataFrame([[gender, married, dependents, education, self_employed,
                              loan_amount, credit_history, loan_amount_term,
                              loan_amount_log, total_income_log]],
                            columns=feature_cols)

    # Encode with the encoders that were already fitted (transform only).
    new_data['Gender'] = le_gender.transform(new_data['Gender'])
    new_data['Married'] = le_married.transform(new_data['Married'])
    new_data['Dependents'] = le_dependents.transform(new_data['Dependents'])
    new_data['Education'] = le_education.transform(new_data['Education'])
    new_data['Self_Employed'] = le_self_employed.transform(new_data['Self_Employed'])

    # Scale with the already-fitted scaler; it expects a float array.
    new_data_scaled = ss.transform(new_data.values.astype(float))

    # Predict with the Random Forest model.
    prediction_encoded = rf_clf.predict(new_data_scaled)

    # Map the encoded class back to the original Loan_Status label.
    prediction_label = le_loan_status.inverse_transform(prediction_encoded)

    print(f"\nBased on the input, the loan is predicted to be: {prediction_label[0]}")

except ValueError as e:
    print(f"Error: Invalid input. Please ensure you enter numerical values for income, loan amount, etc. Details: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

--- Model Performance ---
Accuracy of Random Forest Classifier: 0.7967479674796748
Accuracy of Gaussian Naive Bayes: 0.8292682926829268
Accuracy of Decision Tree Classifier: 0.6422764227642277
Accuracy of K-Neighbors Classifier: 0.8211382113821138

--- End of Model Training and Evaluation ---

--- Loan Approval Prediction for Custom Values ---
Please enter the details for the new loan applicant:


KeyboardInterrupt: Interrupted by user