In [1]:
import pandas as pd

# Read the training CSV (path is relative to the notebook's working directory)
# into a DataFrame, then preview the first five rows as the cell output.
TRAIN_CSV = 'cityu10c_train_dataset.csv'
dataset = pd.read_csv(TRAIN_CSV)
dataset.head(5)

Unnamed: 0,ID,ApplicationDate,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,...,TotalLiabilities,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved
0,1,2018-01-01,45,39948,617,Employed,Master,22,13152,48,...,19183,3329.0,0.724972,11,126928,0.199652,0.22759,419.805992,0.181077,0
1,2,2018-01-02,38,39709,628,Employed,Associate,15,26045,48,...,9595,3309.083333,0.935132,3,43609,0.207045,0.201077,794.054238,0.389852,0
2,3,2018-01-03,47,40724,570,Employed,Bachelor,26,17627,36,...,128874,3393.666667,0.872241,6,5205,0.217627,0.212548,666.406688,0.462157,0
3,4,2018-01-04,58,69084,545,Employed,High School,34,37898,96,...,5370,5757.0,0.896155,5,99452,0.300398,0.300911,1047.50698,0.313098,0
4,5,2018-01-05,37,103264,594,Employed,Associate,17,9184,36,...,17286,8605.333333,0.941369,5,227019,0.197184,0.17599,330.17914,0.07021,1


In [2]:
# Columns of `dataset` used as model inputs.
features = [
    'Age',
    'AnnualIncome',
    'CreditScore',
    'EmploymentStatus',
    'EducationLevel',
    'LoanAmount',
    'LoanDuration',
]
# Kept as a one-element list, so selecting with it yields a single-column
# DataFrame rather than a Series.
target = ['LoanApproved']

# Split the loaded frame into the input matrix and the label column.
X, y = dataset[features], dataset[target]

# Preview the selected inputs.
X.head()

Unnamed: 0,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,LoanAmount,LoanDuration
0,45,39948,617,Employed,Master,13152,48
1,38,39709,628,Employed,Associate,26045,48
2,47,40724,570,Employed,Bachelor,17627,36
3,58,69084,545,Employed,High School,37898,96
4,37,103264,594,Employed,Associate,9184,36


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
import pickle

# Build, fit and persist the loan-approval classifier.
# Uses `X` (feature DataFrame) and `y` (single-column target DataFrame)
# defined in the previous cell.

# Fixed seed for the tree. scikit-learn randomly permutes the features at each
# split, so without a seed repeated runs can fit (and pickle) different trees.
RANDOM_STATE = 42

# Column groups: each group is routed to its own preprocessing step below.
categorical_features = ['EmploymentStatus', 'EducationLevel']
numerical_features = ['Age', 'AnnualIncome', 'CreditScore', 'LoanAmount', 'LoanDuration']

# Numeric columns: min-max scaling (MinMaxScaler defaults).
numerical_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())
])

# Categorical columns: dense one-hot encoding. handle_unknown='ignore' means a
# category not seen during fit is encoded as all zeros at predict time instead
# of raising an error.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

# Apply each transformer to its own column group. Columns not listed in either
# group would be dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Full model: preprocessing followed by a seeded decision tree, so the saved
# artifact is reproducible across Restart & Run All.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=RANDOM_STATE))
])

# Fit on the full training frame. ravel() flattens the (n, 1) target to 1-D,
# which avoids scikit-learn's DataConversionWarning.
pipeline.fit(X, y.values.ravel())

# Persist the fitted pipeline (preprocessing + model) as a single pickle file.
# NOTE: only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code.
with open('decision_tree_pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

print("Pipeline trained and saved to decision_tree_pipeline.pkl")

Pipeline trained and saved to decision_tree_pipeline.pkl
