# 02_Logistic_Regression_PD
Logistic regression baseline for PD (Probability of Default).

In [None]:
# Common imports for the project
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'whitegrid' style for the plots drawn in this notebook.
sns.set(style='whitegrid')
# IPython magic (notebook-only, not plain Python): show matplotlib figures inline.
%matplotlib inline


In [None]:
from src.preprocessing import basic_cleaning, build_preprocessing_pipeline
from src.feature_engineering import create_features
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import joblib

In [None]:
# Load the merged dataset and run the project's cleaning and
# feature-engineering helpers on it.
data_path = '../data/merged_data.csv'
df = pd.read_csv(data_path)
df = basic_cleaning(df)
df = create_features(df)

# Validate the target explicitly. A bare `assert` is stripped when Python runs
# with -O, which would let a missing column surface later as an unrelated error.
target = 'default'
if target not in df.columns:
    raise ValueError("Target 'default' not found in dataset.")
# `astype(int)` below cannot cast missing values; fail early with a clear message
# instead of an opaque casting error.
n_missing_target = int(df[target].isna().sum())
if n_missing_target:
    raise ValueError(
        f"Target '{target}' has {n_missing_target} missing value(s); "
        "drop or impute them before training."
    )

# Model on numeric columns only, leaving out the target and identifier-like
# columns that must not be used as predictors.
exclude = [target, 'id', 'index', 'source']
features = [c for c in df.select_dtypes(include=[np.number]).columns if c not in exclude]

# Missing numeric values are replaced with the constant 0.
X = df[features].fillna(0)
y = df[target].astype(int)

# Stratified 80/20 split keeps the class ratio equal in both partitions;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print('Train shape:', X_train.shape)

In [None]:
from sklearn.pipeline import Pipeline
# Preprocessing for the numeric feature set only (no categorical columns),
# with scaling switched on.
preproc = build_preprocessing_pipeline(
    numeric_features=features,
    categorical_features=[],
    scaler=True,
)

# Class-weighted logistic regression with a raised iteration cap.
clf = LogisticRegression(class_weight='balanced', max_iter=1000)

# Chain preprocessing and classifier so both are fitted on the training split.
pipeline_steps = [
    ('preproc', preproc),
    ('clf', clf),
]
pipe = Pipeline(pipeline_steps)
pipe.fit(X_train, y_train)

# Column 1 of predict_proba holds the probability of the positive class.
test_probabilities = pipe.predict_proba(X_test)
probs = test_probabilities[:, 1]
auc = roc_auc_score(y_test, probs)
print(f'Logistic Regression AUC: {auc:.4f}')

# Per-class precision / recall / F1 from the pipeline's hard predictions.
test_predictions = pipe.predict(X_test)
report = classification_report(y_test, test_predictions)
print(report)

In [None]:
# Persist the fitted pipeline (preprocessing + classifier) to disk.
# The location is defined once so the directory creation, the dump target and
# the log message cannot drift apart.
model_path = '../models/logistic.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipe, model_path)
print(f'Saved logistic model to {model_path}')