In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import joblib
import dill
from datetime import datetime

In [13]:
# Load the processed CSV into a DataFrame and print its schema summary
# (column names, non-null counts, dtypes).
path = '../data/processed/2024-12-03_processed_data.csv'
df = pd.read_csv(filepath_or_buffer=path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227170 entries, 0 to 227169
Data columns (total 14 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   loan status           227170 non-null  int64  
 1   annual income         227170 non-null  float64
 2   employment length     227170 non-null  object 
 3   loan amount           227170 non-null  int64  
 4   income verification   227170 non-null  object 
 5   intrest rate          227170 non-null  float64
 6   term                  227170 non-null  object 
 7   purpose               227170 non-null  object 
 8   housing type          227170 non-null  object 
 9   debt to income ratio  227170 non-null  float64
 10  credit availability   227170 non-null  int64  
 11  delayed payments      227170 non-null  object 
 12  pending registration  227170 non-null  object 
 13  credit application    227170 non-null  object 
dtypes: float64(3), int64(3), object(8)
memory usage: 24.3+ MB

In [14]:
# Partition the columns of `df` by dtype.
# NOTE(review): the selection runs over the whole frame, so the int64 target
# column 'loan status' is part of `numerical_features` as well -- exclude it
# before using this list to pick model inputs.
numerical_features = list(
    df.select_dtypes(include=['int64', 'float64']).columns
)

# Object-dtype columns are the ones treated as categoricals downstream.
categorical_features = list(
    df.select_dtypes(include=['object']).columns
)

print("Numerical Features:", numerical_features)
print("Categorical Features:", categorical_features)


Numerical Features: ['loan status', 'annual income', 'loan amount', 'intrest rate', 'debt to income ratio', 'credit availability']
Categorical Features: ['employment length', 'income verification', 'term', 'purpose', 'housing type', 'delayed payments', 'pending registration', 'credit application']


In [15]:
# Two-step preprocessing for categorical columns:
#   1. 'imputer' -- fill missing entries with each column's most frequent value
#   2. 'onehot'  -- one-hot encode into a dense (non-sparse) array; categories
#                   not seen during fit are ignored instead of raising
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ]
)

In [16]:
# Send the columns listed in `categorical_features` through
# `categorical_transformer`; all remaining columns are forwarded unchanged
# because remainder='passthrough'.
preprocessor = ColumnTransformer(
    [('cat', categorical_transformer, categorical_features)],
    remainder='passthrough',
)

In [17]:
# Wrap the column preprocessor in a single-step pipeline named 'preprocessor'.
pipeline = Pipeline([('preprocessor', preprocessor)])

In [18]:
# Separate features and target
X = df.drop(columns=['loan status'])
y = df['loan status']

y.head()

0    1
1    0
2    1
3    1
4    1
Name: loan status, dtype: int64

In [20]:
# Fit the preprocessing pipeline on the feature matrix, then encode that same
# matrix (Pipeline.fit returns the fitted pipeline, so the calls chain).
# NOTE(review): the fit uses all of X -- no train/test split is visible before
# this point; confirm that is intended.
X_transformed = pipeline.fit(X).transform(X)