In [2]:
import pandas as pd
from src.preprocessing import train_test_split_data, build_preprocessor

# Read the cleaned credit dataset from disk.
df = pd.read_csv("data/credit_clean.csv")

# Partition the frame into train/test features and targets using the project helper.
X_train, X_test, y_train, y_test = train_test_split_data(df)
print(
    "Train shape:", X_train.shape,
    "Test shape:", X_test.shape,
)

# The preprocessor is built from, and fitted on, the training features only;
# the test features are merely transformed with the already-fitted object.
# NOTE(review): the "_scaled" suffix presumably reflects what build_preprocessor
# does -- its implementation is not visible here; confirm.
pre = build_preprocessor(X_train)
X_train_scaled, X_test_scaled = pre.fit_transform(X_train), pre.transform(X_test)

print("Scaled train shape:", X_train_scaled.shape)


Train shape: (24000, 23) Test shape: (6000, 23)
Scaled train shape: (24000, 23)


In [3]:
import pandas as pd
from src.preprocessing import train_test_split_data, build_preprocessor
from src.feature_selection import make_selector

# Read the cleaned dataset and split it into train/test parts.
# NOTE(review): the data is re-read and re-split inside this cell; the split only
# matches other cells if train_test_split_data is deterministic -- confirm.
df = pd.read_csv("data/credit_clean.csv")
X_train, X_test, y_train, y_test = train_test_split_data(df)

# Fit the preprocessor on the training features; the test features are only transformed.
pre = build_preprocessor(X_train)
X_train_scaled, X_test_scaled = pre.fit_transform(X_train), pre.transform(X_test)

# Fit the feature selector on the preprocessed training data and its targets.
selector = make_selector()
selector.fit(X_train_scaled, y_train)

# Report the column count before preprocessing and after selection.
print("Original features:", X_train.shape[1])
print(
    "Selected features:",
    selector.transform(X_train_scaled).shape[1],
)


Original features: 23
Selected features: 12


In [4]:
import pandas as pd
from sklearn.pipeline import Pipeline

from src.preprocessing import train_test_split_data, build_preprocessor
from src.feature_selection import make_selector
from src.model_selection import candidate_models, evaluate_models

from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Read the cleaned dataset and split it into train/test parts.
df = pd.read_csv("data/credit_clean.csv")
X_train, X_test, y_train, y_test = train_test_split_data(df)

# Unfitted pipeline stages reused by every candidate pipeline below.
pre = build_preprocessor(X_train)
selector = make_selector()

# Cross-validation setup: 5 shuffled stratified folds with a fixed seed, scored
# with f1_score at its default settings.
models = candidate_models()
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scorer = make_scorer(f1_score)

# Each candidate is wrapped in a preprocess -> select -> classify pipeline and
# scored on the *unprocessed* training features, so preprocessing and selection
# are re-fitted inside every fold rather than on the full training set.
cv_results = {}
for name, clf in models.items():
    pipe = Pipeline(steps=[("pre", pre), ("sel", selector), ("clf", clf)])
    scores = cross_val_score(
        pipe,
        X_train,
        y_train,
        cv=cv,
        scoring=scorer,
        n_jobs=-1,
    )
    # Store (mean, std) of the per-fold scores under the model's name.
    cv_results[name] = (scores.mean(), scores.std())

cv_results


{'logreg': (np.float64(0.3614032923647821), np.float64(0.012768982537802153)),
 'rf': (np.float64(0.4722572800038461), np.float64(0.00857807580988941))}