# Baseline Model - Logistic Regression
This notebook creates a baseline logistic regression model for loan approval prediction.

In [10]:

import os
import sys

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [11]:
# Load the cleaned data
# The project root is taken to be the parent of the current working directory,
# i.e. this cell expects the kernel to be running from the notebooks folder.
project_root = os.path.dirname(os.getcwd())
data_path = os.path.join(project_root, 'data', 'processed', 'cleaned.csv')

# Fail early with an actionable message: the path above depends on where the
# kernel was started, which is easy to get wrong.
if not os.path.isfile(data_path):
    raise FileNotFoundError(
        f"Cleaned dataset not found at {data_path}; "
        "start the kernel from the project's notebooks directory."
    )

df = pd.read_csv(data_path)

# Separate the feature matrix from the target column.
X, y = df.drop('loan_approved', axis=1), df['loan_approved']

print(f"Dataset shape: {df.shape}")
print(f"Features: {X.shape}")
print(f"Target: {y.shape}")
print("Target distribution:")
print(y.value_counts())

Dataset shape: (2000, 7)
Features: (2000, 6)
Target: (2000,)
Target distribution:
loan_approved
False    1121
True      879
Name: count, dtype: int64


In [12]:
# Inspect the feature dtypes and list which columns are treated as
# categorical (object dtype) versus numerical (int64 / float64).
object_columns = X.select_dtypes(include=['object']).columns.tolist()
number_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

print("Feature data types:")
print(X.dtypes)
print(f"\nCategorical columns: {object_columns}")
print(f"Numerical columns: {number_columns}")

Feature data types:
city               object
income              int64
credit_score        int64
loan_amount         int64
years_employed      int64
points            float64
dtype: object

Categorical columns: ['city']
Numerical columns: ['income', 'credit_score', 'loan_amount', 'years_employed', 'points']


In [13]:
# Baseline model - Logistic Regression with preprocessing
#
# The model is scored on a stratified held-out split instead of on the rows it
# was fitted on: metrics computed on the training rows overstate how well the
# model generalises.
# NOTE(review): a perfect score was recorded for this cell when it was evaluated
# on the training rows. If held-out scores are still ~1.0, check the features
# (e.g. `points`) for target leakage - TODO confirm.
TEST_SIZE = 0.2     # fraction of rows held out for evaluation
RANDOM_STATE = 42   # fixed seed so the split is reproducible across runs

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Create preprocessor: numerical columns pass through unchanged, categorical
# columns are one-hot encoded with the first level dropped.
# NOTE(review): the encoder has no handle_unknown setting, so a category that
# was not seen during fit will raise at predict time - confirm this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(drop='first'), categorical_cols)
    ])

# Create pipeline with preprocessing and classifier
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(max_iter=1000))
])

# Stratify on the target so both splits keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

# Fit on the training split only; `pred` holds predictions for the held-out rows.
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)

# Both metrics below are computed on the held-out split.
print(f"Accuracy: {accuracy_score(y_test, pred)}")
print(f"F1: {f1_score(y_test, pred)}")

Accuracy: 1.0
F1: 1.0


In [14]:
# Save the model
model_path = os.path.join(project_root, 'models', 'baseline.pkl')

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists (e.g. on a fresh clone) before writing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(pipe, model_path)
print(f"Baseline model saved to {model_path}")

Baseline model saved to m:\projects\repos\loan-approval-project-mlfinal\models\baseline.pkl
