In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the training set; the data is expected to have been saved as a CSV
# at this relative path.
data = pd.read_csv('./data/train.csv')

# ---------------------------------------------------------------------------
# Data preprocessing
# ---------------------------------------------------------------------------

# 1. Categorical columns -- these are one-hot encoded by the preprocessor.
categorical_features = [
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'cb_person_default_on_file',
]

# 2. Numerical columns -- these are standardized by the preprocessor.
numerical_features = [
    'person_age',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
]

# 3. Column-wise preprocessing: scale the numerical columns and one-hot encode
#    the categorical ones. handle_unknown='ignore' keeps transform() from
#    raising on categories that were not present when the encoder was fitted.
#    Columns named in neither list are not forwarded to the model
#    (ColumnTransformer's default is remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
)

# 4. Separate the target column from the feature columns.
y = data['loan_status']
X = data.drop(columns=['loan_status'])

# 5. Hold out 20% of the rows for evaluation; the fixed seed makes the
#    shuffle (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 6. Full pipeline: preprocessing followed by a classifier (logistic
#    regression with default hyperparameters, used here as an example).
from sklearn.linear_model import LogisticRegression

model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression()),
    ],
)

# 7. Fit the preprocessing steps and the classifier on the training split only.
model_pipeline.fit(X_train, y_train)

# 8. Report the pipeline's score (mean accuracy for a classifier) on the
#    held-out split.
print("Model Accuracy:", model_pipeline.score(X_test, y_test))

Model Accuracy: 0.9115866655298832
