In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn. metrics import accuracy_score

In [2]:
# Read the loan-application table; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer="loan_approved.csv")

In [3]:
# Preview the first five rows to sanity-check the column layout.
df.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status (Approved)
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# Columns that must not reach the model:
#   * the target itself,
#   * "Loan_ID", a per-row identifier,
#   * any "Unnamed: N" column -- the head() preview shows "Unnamed: 0" counting
#     0, 1, 2, ..., which looks like a row index written out by an earlier
#     to_csv(); left in, it would be imputed and scaled as a numeric feature.
target_col = "Loan_Status (Approved)"
index_artifact_cols = [col for col in df.columns if str(col).startswith("Unnamed:")]

X = df.drop(columns=[target_col, "Loan_ID", *index_artifact_cols])
y = df[target_col]

In [5]:
# Keep 75% of the rows for training and hold the rest out for evaluation;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=24,
)

In [6]:
# Group the feature names by dtype so each group can be preprocessed differently:
# int64/float64 columns are treated as numeric, object columns as categorical.
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X_train.select_dtypes(include=["object"]).columns

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric branch: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)


In [8]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode into a dense array. Categories unseen during fit are ignored
# rather than raising at transform time.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)


In [9]:
from sklearn.compose import ColumnTransformer

# Send each column group through its own pipeline; any column that belongs to
# neither group is discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ],
    remainder="drop",
)


In [10]:
# Learn the imputation, scaling and encoding statistics from the training rows
# only, then apply them unchanged to the held-out rows.
X_train_prep = preprocessor.fit_transform(X=X_train)
X_test_prep = preprocessor.transform(X=X_test)


In [11]:
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the 15 preprocessed columns that score highest under f_classif.
# Scores are computed on the training rows; the same column mask is then
# reused for the held-out rows.
selector = SelectKBest(f_classif, k=15)

X_train_selected = selector.fit_transform(X=X_train_prep, y=y_train)
X_test_selected = selector.transform(X=X_test_prep)


In [12]:
# Baseline classifier on the selected features; max_iter is raised to 1000
# to give the solver more iterations to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train_selected, y=y_train)


In [13]:
# Predicted labels for the held-out rows from the baseline model.
y_pred = model.predict(X=X_test_selected)


In [14]:
from sklearn.metrics import accuracy_score

# Share of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Model Accuracy:", accuracy)


Model Accuracy: 0.8506493506493507


In [15]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression

# End-to-end pipeline: preprocessing -> univariate feature selection -> classifier.
# The step names are what the "feature_selection__k" / "model__*" keys in
# param_grid address, so they must stay in sync with it.
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("feature_selection", SelectKBest(score_func=f_classif)),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# One sub-grid per candidate estimator. The "model" key swaps the pipeline's
# final step; the "model__*" keys tune that estimator's hyper-parameters and
# "feature_selection__k" tunes how many features SelectKBest keeps.
param_grid = [
    {
        "model": [LogisticRegression(max_iter=1000)],
        "feature_selection__k": [10, 15],
        "model__C": [0.1, 1, 10],
    },
    {
        # Seed the forest: unseeded, its random sampling differs on every run,
        # so the CV scores and the selected model would not be repeatable.
        "model": [RandomForestClassifier(random_state=24)],
        "feature_selection__k": [10, 15],
        "model__n_estimators": [100, 200],
        "model__max_depth": [5, 10],
    },
]


In [17]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid with 5-fold cross-validation, ranked by
# accuracy; n_jobs=-1 runs the candidate fits in parallel on all cores.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

grid.fit(X_train, y_train)


In [18]:
# Winning configuration and its mean cross-validated accuracy.
best_params = grid.best_params_
best_cv_accuracy = grid.best_score_

print("Best Parameters:", best_params)
print("Best CV Accuracy:", best_cv_accuracy)


Best Parameters: {'feature_selection__k': 10, 'model': LogisticRegression(max_iter=1000), 'model__C': 0.1}
Best CV Accuracy: 0.7978260869565217


In [19]:
from sklearn.metrics import accuracy_score

# Take the pipeline the grid search selected and score it on the held-out rows.
best_model = grid.best_estimator_
y_pred = best_model.predict(X=X_test)

print("Test Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))


Test Accuracy: 0.8441558441558441


In [20]:
from sklearn.metrics import accuracy_score

# Compare accuracy on the rows the tuned pipeline was fitted on against
# accuracy on the held-out rows, to see the gap between the two.
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

train_acc = accuracy_score(y_true=y_train, y_pred=y_train_pred)
test_acc = accuracy_score(y_true=y_test, y_pred=y_test_pred)

print("Train Accuracy:", train_acc)
print("Test Accuracy :", test_acc)


Train Accuracy: 0.7978260869565217
Test Accuracy : 0.8441558441558441


In [21]:
from sklearn.base import clone
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import numpy as np

# Stability check: refit the tuned configuration on five shuffled folds of the
# full dataset and record train / validation accuracy for each fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

train_accuracies = []
test_accuracies = []

for train_index, test_index in kf.split(X):
    X_train_k, X_test_k = X.iloc[train_index], X.iloc[test_index]
    y_train_k, y_test_k = y.iloc[train_index], y.iloc[test_index]

    # Fit an unfitted copy per fold. best_model is the same object as
    # grid.best_estimator_, so calling best_model.fit() here would overwrite
    # the grid-searched model with whatever the last fold trained -- on rows
    # that include the hold-out set -- and that is what would get pickled later.
    fold_model = clone(best_model)
    fold_model.fit(X_train_k, y_train_k)

    y_train_pred = fold_model.predict(X_train_k)
    y_test_pred = fold_model.predict(X_test_k)

    train_accuracies.append(accuracy_score(y_train_k, y_train_pred))
    test_accuracies.append(accuracy_score(y_test_k, y_test_pred))


In [22]:
# Per-fold scores first, then their mean and variance across folds.
train_scores = np.asarray(train_accuracies)
test_scores = np.asarray(test_accuracies)

print("Train Accuracies:", train_accuracies)
print("Test Accuracies :", test_accuracies)

print("\nMean Train Accuracy:", train_scores.mean())
print("Mean Test Accuracy :", test_scores.mean())

print("\nTrain Accuracy Variance:", train_scores.var())
print("Test Accuracy Variance :", test_scores.var())


Train Accuracies: [0.8187372708757638, 0.8105906313645621, 0.8044806517311609, 0.8167006109979633, 0.8109756097560976]
Test Accuracies : [0.7886178861788617, 0.8048780487804879, 0.8455284552845529, 0.7886178861788617, 0.819672131147541]

Mean Train Accuracy: 0.8122969549451096
Mean Test Accuracy : 0.809462881514061

Train Accuracy Variance: 2.5324389024183036e-05
Test Accuracy Variance : 0.0004590005483434611


In [23]:
import pickle

# Serialise the tuned pipeline (preprocessing + selection + classifier) to disk.
# NOTE: only unpickle this file from a trusted location -- pickle.load can
# execute arbitrary code.
with open("model.pkl", mode="wb") as model_file:
    pickle.dump(best_model, model_file)

print("Model saved successfully as model.pkl")


Model saved successfully as model.pkl


In [24]:
import sklearn

# Print the scikit-learn version in use when model.pkl was written.
print(sklearn.__version__)


1.6.1
