In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Train and evaluate a loan-approval classifier on the public loan
# prediction dataset: load -> clean -> encode -> split -> fit -> score.

# Load the dataset
url = "https://raw.githubusercontent.com/dsrscientist/DSData/master/loan_prediction.csv"
data = pd.read_csv(url)

# Data preprocessing
# Drop the loan ID column as it's not needed for prediction
data = data.drop(['Loan_ID'], axis=1)

# Handle missing values by propagating the previous row's value forward.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')`
# spelling and avoids in-place mutation; the resulting frame is the same.
# NOTE(review): forward fill cannot fill a gap in the very first row(s),
# because there is no earlier value to propagate, so such cells stay NaN.
data = data.ffill()

# Encode every string-typed column (features and the target) as integer
# codes with a per-column LabelEncoder. The fitted encoders are kept in
# `label_encoders` so the codes can be mapped back to the original labels.
categorical_cols = data.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Split the data into features and target
X = data.drop(['Loan_Status'], axis=1)
y_loan_status = data['Loan_Status']

# Split the data into training and testing sets (80% train / 20% test);
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_loan_status, test_size=0.2, random_state=42
)

# Build and train the model for loan status prediction using HistGradientBoostingClassifier
loan_status_model = HistGradientBoostingClassifier(random_state=42)
loan_status_model.fit(X_train, y_train)

# Score the fitted model on the held-out test split
loan_status_predictions = loan_status_model.predict(X_test)
loan_status_accuracy = accuracy_score(y_test, loan_status_predictions)

print("Loan Status Prediction Accuracy:", loan_status_accuracy)


Loan Status Prediction Accuracy: 0.8048780487804879
