# Import libraries

In [1]:
import os

import numpy as np
import pandas as pd

# pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# modeling
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import joblib

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Manual Preprocessing

In [2]:
RANDOM_STATE = 567  # seed reused by the train/test split and the model

# Load the raw loan table, drop the two columns that are not used as features,
# and store Credit_History as object dtype so that dtype-based column selection
# treats it as a categorical feature rather than a numeric one.
loan_data = (
    pd.read_csv("https://raw.githubusercontent.com/dphi-official/Datasets/master/Loan_Data/loan_train.csv")
    .drop(columns=['Unnamed: 0', 'Loan_ID'])
    .astype({'Credit_History': 'object'})
)

# Separate the target from the predictors, then hold out 20% of the rows.
y = loan_data['Loan_Status']
X = loan_data.drop(columns='Loan_Status')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Pipeline

## ColumnTransformer

In [3]:
# Split the feature columns into two disjoint groups that together cover every
# column of X: numeric columns, and everything else (handled as categorical).
# Selecting by the generic 'number' kind instead of a fixed list of dtype names
# matters because ColumnTransformer drops any column not assigned to a
# transformer (remainder='drop' is the default); a column with a dtype outside
# a hard-coded list (e.g. int32, category, or a dedicated string dtype) would
# otherwise vanish from the model silently. For int64/float64/object data the
# resulting lists are the same as with the explicit dtype names.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(exclude='number').columns.tolist()

# Numeric branch: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(steps=[('num_impute', SimpleImputer(strategy='median')),
                               ('num_scale', StandardScaler())])

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode. drop='first' omits one level per feature so the encoded
# columns are not perfectly collinear for the logistic regression.
cat_pipeline = Pipeline(steps=[('cat_impute', SimpleImputer(strategy='most_frequent')),
                               ('cat_encode', OneHotEncoder(drop='first'))])

# Output column order is the numeric columns followed by the encoded
# categorical columns, matching the order of the transformers listed here.
preprocess = ColumnTransformer(transformers=[('num', num_pipeline, num_cols),
                                             ('cat', cat_pipeline, cat_cols)])

## Full Pipeline

In [4]:
# Logistic regression that chooses its regularization strength C by 10-fold
# cross-validation over a grid of 100 candidate values, using F1 as the
# selection metric.
model = LogisticRegressionCV(
    Cs=100,
    cv=10,
    scoring='f1',
    solver='liblinear',
    random_state=RANDOM_STATE,
)

# Chain preprocessing and the classifier so they are fitted, applied, and
# later persisted as one object.
pipeline = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', model),
])
pipeline.fit(X_train, y_train)

# Report F1 on the data the model was fitted on and on the held-out split.
train_f1 = f1_score(y_train, pipeline.predict(X_train))
print(f'F1 Train: {train_f1}')
test_f1 = f1_score(y_test, pipeline.predict(X_test))
print(f'F1 Test: {test_f1}')

F1 Train: 0.8770764119601329
F1 Test: 0.8848484848484849


In [5]:
# Inspect what the model actually sees: the training features after the
# fitted preprocessing step has been applied.
fitted_preprocess = pipeline['preprocess']

# Look the encoder up by transformer name rather than by position so that
# reordering the ColumnTransformer branches cannot silently pick the wrong one.
cat_encoder = fitted_preprocess.named_transformers_['cat']['cat_encode']
transformed_cat_cols = cat_encoder.get_feature_names_out(cat_cols).tolist()

# Use transform(), not fit_transform(): Pipeline does not clone its steps, so
# `preprocess` is the same object as pipeline['preprocess'], and re-fitting it
# here would modify the already-trained pipeline just for a preview.
transformed_values = fitted_preprocess.transform(X_train)

# ColumnTransformer may return a scipy sparse matrix when the stacked output
# is sparse enough; densify so it can be labeled with column names.
if hasattr(transformed_values, 'toarray'):
    transformed_values = transformed_values.toarray()

transformed_data = pd.DataFrame(transformed_values,
                                columns=num_cols + transformed_cat_cols)
transformed_data.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Credit_History_1.0,Property_Area_Semiurban,Property_Area_Urban
0,0.217423,-0.528093,0.196239,0.282064,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,-0.215676,0.197619,0.454656,0.282064,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,-0.667552,0.392422,-0.355832,0.282064,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,2.389388,0.179995,-0.672979,0.282064,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
4,-0.361503,-0.528093,0.137508,-4.525287,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


## Save Pipeline

In [6]:
# Persist the fitted pipeline (preprocessing + model) as a single artifact.
# On Google Colab, `from google.colab import files` followed by
# `files.download(filename)` can be used to download the saved file.
filename = '../models/loan_model_logreg.pkl'

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists; otherwise a fresh checkout without ../models fails here.
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(pipeline, filename)

['../models/loan_model_logreg.pkl']

# Load Pipeline and Predict New Data

In [7]:
# Inputs describing one hypothetical applicant. String values must match the
# levels listed next to each variable.
gender = 'Male'              # one of: 'Male', 'Female'
married = 'No'               # one of: 'No', 'Yes'
dependents = '3+'            # one of: '0', '1', '2', '3+'
education = 'Graduate'       # one of: 'Not Graduate', 'Graduate'
self_employed = 'No'         # one of: 'No', 'Yes'
applicant_income = 1000      # unit given by the author: dollar
coapplicant_income = 500     # unit given by the author: dollar
loan_amount = 400            # unit given by the author: dollar -- TODO confirm against the dataset description
loan_amount_term = 60        # unit given by the author: days -- TODO confirm against the dataset description
credit_history = 1.0         # 0.0 -> 'Not paid', 1.0 -> 'All debts paid' (float, as in the training data)
property_area = 'Urban'      # one of: 'Rural', 'Semiurban', 'Urban'

# Map each training column name to the applicant's value, then wrap every
# value in a one-element list to obtain a single-row DataFrame.
applicant_record = {
    'Gender': gender,
    'Married': married,
    'Dependents': dependents,
    'Education': education,
    'Self_Employed': self_employed,
    'ApplicantIncome': applicant_income,
    'CoapplicantIncome': coapplicant_income,
    'LoanAmount': loan_amount,
    'Loan_Amount_Term': loan_amount_term,
    'Credit_History': credit_history,
    'Property_Area': property_area,
}
new_data = pd.DataFrame(
    {column: [value] for column, value in applicant_record.items()}
)

new_data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,3+,Graduate,No,1000,500,400,60,1.0,Urban


In [8]:
# Restore the saved pipeline from disk, as a separate process would.
# Note: joblib.load unpickles the file, which can execute arbitrary code,
# so only load artifacts from a trusted source.
pipeline_load = joblib.load(filename)

# Class probabilities for the single applicant; columns follow the order of
# the fitted classifier's classes_.
new_data_proba = pipeline_load.predict_proba(new_data)
new_data_proba

array([[0.35667516, 0.64332484]])