In [1]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import resample

In [2]:
# Read the CSV file
# The location is resolved in this order so the notebook is not tied to one machine:
#   1. the LOAN_DEFAULT_CSV environment variable, when set;
#   2. the original absolute path, when that file exists;
#   3. a copy named Loan_default.csv in the working directory.
# If none is found, the original path is used so the failure is the same
# FileNotFoundError as before.
_DEFAULT_CSV_LOCATIONS = ("C:/Users/HP/Downloads/Loan_default.csv", "Loan_default.csv")
DATA_PATH = os.environ.get("LOAN_DEFAULT_CSV") or next(
    (path for path in _DEFAULT_CSV_LOCATIONS if os.path.isfile(path)),
    _DEFAULT_CSV_LOCATIONS[0],
)
df = pd.read_csv(DATA_PATH)
# Preview the first rows instead of displaying the whole frame.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/HP/Downloads/Loan_default.csv'

In [None]:
# Encoding categorical features
# Integer code assigned to each text label, keyed by column name.
category_codes = {
    'Education': {'High School': 0, "Bachelor's": 1, "Master's": 2, 'PhD': 3},
    'EmploymentType': {'Unemployed': 0, 'Part-time': 1, 'Self-employed': 2, 'Full-time': 3},
    'MaritalStatus': {'Single': 0, 'Divorced': 1, 'Married': 2},
    'HasMortgage': {'No': 0, 'Yes': 1},
    'HasDependents': {'No': 0, 'Yes': 1},
    'LoanPurpose': {'Education': 0, 'Home': 1, 'Auto': 2, 'Business': 3, 'Other': 4},
    'HasCoSigner': {'No': 0, 'Yes': 1},
}
for column_name, codes in category_codes.items():
    df[column_name] = df[column_name].replace(codes)

# Columns removed before modelling.
# NOTE(review): every column encoded above is also listed here, so none of the
# encoded values survive this cell - confirm that this is intended.
dropped_columns = [
    "LoanID",
    "Income",
    "LoanAmount",
    "CreditScore",
    "MonthsEmployed",
    "NumCreditLines",
    "InterestRate",
    "LoanTerm",
    "DTIRatio",
    "Education",
    "EmploymentType",
    "MaritalStatus",
    "HasMortgage",
    "HasDependents",
    "LoanPurpose",
    "HasCoSigner",
]
df = df.drop(columns=dropped_columns)

In [None]:
# Apply IQR function
def iqr_(df, ft, factor=1.5):
    """Cap one column of ``df`` at its interquartile-range fences, in place.

    Values above ``Q3 + factor * IQR`` are replaced by that upper fence and
    values below ``Q1 - factor * IQR`` by that lower fence; everything else is
    left as it is. The column is overwritten with the result of ``np.where``.

    Args:
        df: DataFrame holding the column; it is modified in place.
        ft: Name of the column to cap.
        factor: Multiplier applied to the IQR to position the fences.
            Defaults to 1.5, the value previously hard-coded here.

    Returns:
        None.

    Raises:
        ValueError: If ``factor`` is negative, which would put the lower
            fence above the upper one.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    q1 = df[ft].quantile(0.25)
    q3 = df[ft].quantile(0.75)
    spread = factor * (q3 - q1)
    ul = q3 + spread
    ll = q1 - spread
    # Comparisons against NaN are False, so missing values pass through unchanged.
    df[ft] = np.where(df[ft] > ul, ul, np.where(df[ft] < ll, ll, df[ft]))

# Cap outliers in every column except the 'Default' target.
for col in df.columns:
    if col == 'Default':
        continue
    iqr_(df, col)

In [None]:
# Split features and target variable
# X keeps every column except 'Default'; y is the 'Default' column itself.
X = df.drop(columns=['Default'])
y = df['Default']

In [None]:
# Split the dataset into training and testing sets
# 10% of the rows are held out for testing. Stratifying on y keeps the class
# proportions the same in both splits, so the test metrics are computed on a
# class mix that matches the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

In [None]:
# Define sampling methods
# Display label -> resampler instance, each seeded with random_state=42.
# 'Upsampling' / 'Random Over Sampling' and 'Downsampling' / 'Random Under Sampling'
# are configured identically; both labels are kept as separate entries.
sampling_methods = dict([
    ('Upsampling', RandomOverSampler(random_state=42)),
    ('Downsampling', RandomUnderSampler(random_state=42)),
    ('SMOTE', SMOTE(random_state=42)),
    ('ADASYN', ADASYN(random_state=42)),
    ('SMOTEENN', SMOTEENN(random_state=42)),
    ('Random Over Sampling', RandomOverSampler(random_state=42)),
    ('Random Under Sampling', RandomUnderSampler(random_state=42)),
])

In [None]:
# Define models
# Display label -> estimator instance, registered in evaluation order.
models = {}
models['Linear Regression'] = LinearRegression()
models['Logistic Regression'] = LogisticRegression(max_iter=1000)
models['KNN'] = KNeighborsClassifier()
models['Naive Bayes'] = GaussianNB()
models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Perform sampling, train models and evaluate them
# Empty accumulator for the metric records collected during evaluation.
results = list()

In [None]:
# Start from an empty list so that re-running this cell replaces the previous
# scores instead of appending a second copy of every row.
results = []

for sampling_name, sampler in sampling_methods.items():
    # Only the training split is resampled; the test split is left as it is.
    X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)

    # Scale the features: statistics come from the resampled training data and
    # are then applied to the test features.
    scaler = StandardScaler()
    X_train_res = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    for model_name, model in models.items():
        model.fit(X_train_res, y_train_res)
        y_pred = model.predict(X_test_scaled)

        # Convert Linear Regression predictions to binary
        if model_name == 'Linear Regression':
            y_pred = (y_pred > 0.5).astype(int)

        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # One record per (sampling method, model) pair.
        results.append({
            'sampling_method': sampling_name,
            'model': model_name,
            'accuracy': acc,
            'precision': prec,
            'recall': recall,
            'f1_score': f1
        })

In [None]:
# Convert results to DataFrame and display
results_df = pd.DataFrame(results)
# Leaving the frame as the cell's last expression lets Jupyter render it as a
# table rather than as printed plain text.
results_df

In [None]:
# Ask user for preferred sampling method and model
def _prompt_choice(prompt, options):
    """Ask with ``input`` until the reply matches one of ``options``.

    Args:
        prompt: Text shown by ``input``.
        options: Iterable of accepted answers.

    Returns:
        The accepted answer with surrounding whitespace removed.
    """
    valid = [str(option) for option in options]
    while True:
        answer = input(prompt).strip()
        if answer in valid:
            return answer
        # An unrecognised name would otherwise only fail later, as a KeyError
        # when it is used to look up the sampler or the model.
        print(f"'{answer}' is not a valid choice. Options: {valid}")


print("Choose a sampling method:")
print(results_df['sampling_method'].unique())
sampling_choice = _prompt_choice(
    "Enter the preferred sampling method: ",
    results_df['sampling_method'].unique(),
)

print("Choose a machine learning algorithm:")
print(results_df['model'].unique())
model_choice = _prompt_choice(
    "Enter the preferred machine learning algorithm: ",
    results_df['model'].unique(),
)

In [None]:
# Train and evaluate the selected combination
# Look up the sampler chosen by the user and resample the training split with it.
selected_sampler = sampling_methods[sampling_choice]
X_train_res, y_train_res = selected_sampler.fit_resample(
    X=X_train,
    y=y_train,
)

In [None]:
# Scale the features
# X_train_res is overwritten with its scaled version here, so re-running this
# cell on its own would fit the scaler on already-scaled data; re-run the
# resampling cell first.
scaler = StandardScaler()
X_train_res = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

selected_model = models[model_choice]
selected_model.fit(X_train_res, y_train_res)
y_pred = selected_model.predict(X_test_scaled)

# Linear Regression predicts continuous values, while the metrics computed
# afterwards expect class labels. Threshold at 0.5, the same rule the
# model-comparison loop applies.
if model_choice == 'Linear Regression':
    y_pred = (y_pred > 0.5).astype(int)

In [None]:
# Print evaluation metrics
# Compute the four scores against the held-out labels, in the order they are reported.
acc, prec, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Selected Sampling Method: {sampling_choice}")
print(f"Selected Model: {model_choice}")
print("Accuracy score:", acc)
print("Precision score:", prec)
print("Recall score:", recall)
print("F1 score:", f1)

In [None]:
# Per-class precision/recall/F1 table, with readable labels for the two classes.
target_names = ['class 0', 'class 1']
report_text = classification_report(y_test, y_pred, target_names=target_names)
print(report_text)

In [None]:
# Confusion matrix of the held-out labels against the selected model's predictions.
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", cf_matrix)