In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, PowerTransformer, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score,mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers import Dense



In [2]:
df = pd.read_csv(r'Loan_approval_data_2025.csv')
df

Unnamed: 0,customer_id,age,occupation_status,years_employed,annual_income,credit_score,credit_history_years,savings_assets,current_debt,defaults_on_file,delinquencies_last_2yrs,derogatory_marks,product_type,loan_intent,loan_amount,interest_rate,debt_to_income_ratio,loan_to_income_ratio,payment_to_income_ratio,loan_status
0,CUST100000,40,Employed,17.2,25579,692,5.3,895,10820,0,0,0,Credit Card,Business,600,17.02,0.423,0.023,0.008,1
1,CUST100001,33,Employed,7.3,43087,627,3.5,169,16550,0,1,0,Personal Loan,Home Improvement,53300,14.10,0.384,1.237,0.412,0
2,CUST100002,42,Student,1.1,20840,689,8.4,17,7852,0,0,0,Credit Card,Debt Consolidation,2100,18.33,0.377,0.101,0.034,1
3,CUST100003,53,Student,0.5,29147,692,9.8,1480,11603,0,1,0,Credit Card,Business,2900,18.74,0.398,0.099,0.033,1
4,CUST100004,32,Employed,12.5,63657,630,7.2,209,12424,0,0,0,Personal Loan,Education,99600,13.92,0.195,1.565,0.522,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,CUST149995,35,Employed,4.3,39449,570,16.3,1127,7576,0,0,0,Credit Card,Education,42800,21.31,0.192,1.085,0.362,0
49996,CUST149996,34,Employed,4.4,20496,672,12.6,1478,6276,1,0,0,Credit Card,Personal,3800,18.07,0.306,0.185,0.062,0
49997,CUST149997,41,Self-Employed,4.8,18743,719,10.1,17,10331,0,0,0,Credit Card,Personal,18000,17.45,0.551,0.960,0.320,0
49998,CUST149998,38,Student,0.4,17250,633,1.3,5,7779,0,0,1,Personal Loan,Personal,1400,14.71,0.451,0.081,0.027,0


In [3]:
df.drop(columns = ['customer_id'],inplace = True)


In [4]:
objcols = df.select_dtypes(include='object').columns
numcols = [col for col in df.columns if col not in objcols]

In [5]:
print(objcols)
print(numcols)

Index(['occupation_status', 'product_type', 'loan_intent'], dtype='object')
['age', 'years_employed', 'annual_income', 'credit_score', 'credit_history_years', 'savings_assets', 'current_debt', 'defaults_on_file', 'delinquencies_last_2yrs', 'derogatory_marks', 'loan_amount', 'interest_rate', 'debt_to_income_ratio', 'loan_to_income_ratio', 'payment_to_income_ratio', 'loan_status']


In [6]:
for col in objcols:
    print(f"Col: {col} Count: {df[col].unique()}")

Col: occupation_status Count: ['Employed' 'Student' 'Self-Employed']
Col: product_type Count: ['Credit Card' 'Personal Loan' 'Line of Credit']
Col: loan_intent Count: ['Business' 'Home Improvement' 'Debt Consolidation' 'Education' 'Personal'
 'Medical']


In [7]:
for col in numcols:
    print(f"Col: {col} Count: {df[col].skew()}")

Col: age Count: 0.3358600793243518
Col: years_employed Count: 1.2936126434825892
Col: annual_income Count: 1.8878688896751907
Col: credit_score Count: 0.012996167920064387
Col: credit_history_years Count: 0.9537551630953544
Col: savings_assets Count: 12.054945856751614
Col: current_debt Count: 2.437836894846607
Col: defaults_on_file Count: 3.9693860954550284
Col: delinquencies_last_2yrs Count: 1.8169443293741545
Col: derogatory_marks Count: 3.1175467460373585
Col: loan_amount Count: 0.9314923184493302
Col: interest_rate Count: 0.019608003856324267
Col: debt_to_income_ratio Count: 0.5914931983742732
Col: loan_to_income_ratio Count: 0.6575265420549187
Col: payment_to_income_ratio Count: 0.6573849299881692
Col: loan_status Count: -0.20288185984903798


In [8]:
df.duplicated().sum()

np.int64(0)

In [9]:
df = pd.get_dummies(df)

In [10]:
X = df.drop(columns=['loan_status'])
Y = df['loan_status']


In [11]:
st = StandardScaler()
X_Scaled = st.fit_transform(X)


In [12]:
x_train,x_test,y_train,ytest = train_test_split(X_Scaled,Y,test_size=0.2,random_state =42,stratify = Y)

In [13]:
pca = PCA(n_components =0.80)
xpca_train = pca.fit_transform(x_train)
xpca_test = pca.transform(x_test)


In [14]:
LR = LogisticRegression()
SVM = SVC()
NB = GaussianNB()
KNN = KNeighborsClassifier()
DT = DecisionTreeClassifier()
ADB = AdaBoostClassifier()
VC = VotingClassifier(estimators = [('lr',LR),('nb',NB),('knn',KNN),('dt',DT)],voting = 'soft')

In [15]:
LR.fit(xpca_train,y_train)
SVM.fit(xpca_train,y_train)
NB.fit(xpca_train,y_train)
KNN.fit(xpca_train,y_train)
DT.fit(xpca_train,y_train)
ADB.fit(xpca_train,y_train)


0,1,2
,estimator,
,n_estimators,50
,learning_rate,1.0
,algorithm,'deprecated'
,random_state,


In [17]:
models = {
    "Logistic Regression": LR,
    "SVM": SVM,
    "Naive Bayes": NB,
    "KNN": KNN,
    "Decision Tree": DT,
    "AdaBoost": ADB
}

for name, model in models.items():
      
    # Predict
    y_pred_train = model.predict(xpca_train)
    y_pred_test  = model.predict(xpca_test)
    
    # Accuracy
    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc  = accuracy_score(ytest, y_pred_test)
    
    print("====================================")
    print(f"Model: {name}")
    print(f"Training Accuracy: {train_acc:.4f}")
    print(f"Testing Accuracy : {test_acc:.4f}")


Model: Logistic Regression
Training Accuracy: 0.8543
Testing Accuracy : 0.8555
Model: SVM
Training Accuracy: 0.8965
Testing Accuracy : 0.8932
Model: Naive Bayes
Training Accuracy: 0.8030
Testing Accuracy : 0.8018
Model: KNN
Training Accuracy: 0.9103
Testing Accuracy : 0.8633
Model: Decision Tree
Training Accuracy: 1.0000
Testing Accuracy : 0.8171
Model: AdaBoost
Training Accuracy: 0.8418
Testing Accuracy : 0.8373


<!-- model = Sequential()
model.add(Dense(units=64, activation = 'relu')) -->


In [29]:
model = Sequential()
model.add(Dense(units=64, input_dim=xpca_train.shape[1], activation='relu'))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=1, activation = 'sigmoid'))
model.compile(loss= 'binary_crossentropy',optimizer = 'adam', metrics = ['accuracy'])


In [30]:
model.fit(xpca_train,y_train,epochs=50,batch_size=32,validation_data= (xpca_test,ytest))

Epoch 1/50
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8660 - loss: 0.2974 - val_accuracy: 0.8798 - val_loss: 0.2721
Epoch 2/50
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8809 - loss: 0.2645 - val_accuracy: 0.8846 - val_loss: 0.2650
Epoch 3/50
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8838 - loss: 0.2588 - val_accuracy: 0.8864 - val_loss: 0.2603
Epoch 4/50
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8860 - loss: 0.2542 - val_accuracy: 0.8851 - val_loss: 0.2592
Epoch 5/50
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.8876 - loss: 0.2487 - val_accuracy: 0.8879 - val_loss: 0.2571
Epoch 6/50
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8914 - loss: 0.2440 - val_accuracy: 0.8877 - val_loss: 0.2531
Epoch 7/50
[1m1

<keras.src.callbacks.history.History at 0x21fa61ca210>

In [34]:
# y_pred_train = model.predict(xpca_train)
# y_pred_trclasses = np.argmax(y_pred_train, axis=1)
# y_pred_test  = model.predict(xpca_test)
# y_pred_teclasses = np.argmax(y_pred_test, axis=1)
# train_acc = accuracy_score(y_train, y_pred_trclasses)
# test_acc  = accuracy_score(ytest, y_pred_teclasses)
loss,acc = model.evaluate(xpca_test,ytest)
print("====================================")
print(f"Model: {name}")
print(f"Training Accuracy: {loss:.4f}")
print(f"Testing Accuracy : {acc:.4f}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8914 - loss: 0.2572
Model: AdaBoost
Training Accuracy: 0.2572
Testing Accuracy : 0.8914
