In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Perceptron
from scipy.stats import uniform, randint

# Load the dataset
df = pd.read_csv(r"C:\Users\DELL\OneDrive\Desktop\ML\archive\loan-train.csv")

# Data preprocessing
# Replace missing categorical values with the mode
categorical_features = ['Sentiment', 'User', 'Platform']
for feature in categorical_features:
    df[feature].fillna(df[feature].mode()[0], inplace=True)

# Convert categorical variables into numerical format using one-hot encoding
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(df[categorical_features]).toarray()

# Combine encoded data with numerical features
X = np.concatenate([encoded_data, df[numerical_features]], axis=1)
y = df['Loan_Status']

# Define perceptron model
perceptron = Perceptron()

# Define hyperparameters for RandomizedSearchCV
param_dist_perceptron = {
    'alpha': uniform(0.0001, 0.1),
    'max_iter': randint(100, 1000),
}

# Perform RandomizedSearchCV
random_search_perceptron = RandomizedSearchCV(perceptron, param_distributions=param_dist_perceptron, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)
random_search_perceptron.fit(X, y)

# Print best parameters
print("Best Parameters for Perceptron:")
print(random_search_perceptron.best_params_)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters for Perceptron:
{'alpha': 0.037554011884736255, 'max_iter': 960}


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB

# Load the dataset
df = pd.read_csv(r"C:\Users\DELL\OneDrive\Desktop\ML\archive\loan-train.csv")

# Data preprocessing
# Replace missing numerical values with the median
numerical_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
for feature in numerical_features:
    df[feature].fillna(df[feature].median(), inplace=True)

# Replace missing categorical values with the mode
categorical_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Credit_History']
for feature in categorical_features:
    df[feature].fillna(df[feature].mode()[0], inplace=True)

# Encode target variable 'Loan_Status' to numeric
label_encoder = LabelEncoder()
df['Loan_Status'] = label_encoder.fit_transform(df['Loan_Status'])

# Convert categorical variables into numerical format using one-hot encoding
encoded_data = pd.get_dummies(df[categorical_features], drop_first=True)

# Combine encoded data with numerical features
X = pd.concat([encoded_data, df[numerical_features]], axis=1)
y = df['Loan_Status']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define classifiers
classifiers = {
    'Support Vector Machine': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'CatBoost': CatBoostClassifier(logging_level='Silent'),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'Naïve Bayes': GaussianNB()
}

# Function to evaluate classifier using cross-validation
def evaluate_classifier(classifier, X, y):
    accuracy = cross_val_score(classifier, X, y, cv=5, scoring='accuracy').mean()
    precision = cross_val_score(classifier, X, y, cv=5, scoring='precision').mean()
    recall = cross_val_score(classifier, X, y, cv=5, scoring='recall').mean()
    f1 = cross_val_score(classifier, X, y, cv=5, scoring='f1').mean()
    return accuracy, precision, recall, f1

# Evaluate each classifier
results = {}
for clf_name, clf in classifiers.items():
    accuracy, precision, recall, f1 = evaluate_classifier(clf, X_train, y_train)
    results[clf_name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1}

# Display results in a tabular format
results_df = pd.DataFrame.from_dict(results, orient='index')
print(results_df)


                        Accuracy  Precision    Recall  F1 Score
Support Vector Machine  0.692476   0.696895  0.988321  0.817391
Decision Tree           0.688250   0.778146  0.777579  0.774801
Random Forest           0.794249   0.808002  0.932694  0.865827
AdaBoost                0.773882   0.798799  0.903495  0.847257
CatBoost                0.810513   0.804835  0.961935  0.876144
XGBoost                 0.771779   0.804233  0.888704  0.844163
Naïve Bayes             0.794228   0.802529  0.935635  0.863739
