In [11]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
!pip install graphviz
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
from sklearn.tree import export_graphviz
from sklearn.metrics import roc_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from graphviz import Source
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict
!pip install pydot
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder



In [2]:
# Base directory that every path below is built from.
DATA_DIR = "/home/jovyan/work"

# Training CSV and the path used for its "_modified" variant.
TRAIN_FILE = f"{DATA_DIR}/german_credit_data.csv"
MODIFIED_TRAIN_FILE = f"{DATA_DIR}/german_credit_data_modified.csv"

# Test CSV and the path used for its "_modified" variant.
TEST_FILE = f"{DATA_DIR}/german_test_data.csv"
MODIFIED_TEST_FILE = f"{DATA_DIR}/german_test_data_modified.csv"

# ".dot" file paths, one per model variant
# (presumably written via export_graphviz -- confirm against later cells).
DOT_FILE = f"{DATA_DIR}/loan_risk.dot"
DOT_FILE_MODIFIED = f"{DATA_DIR}/loan_risk_modified.dot"
DOT_FILE_MODIFIED_INSTALLMENT = f"{DATA_DIR}/loan_risk_modified_installment.dot"

In [3]:
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv`` with all other options left at their defaults.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(filepath_or_buffer=file_path)
    return frame

def save_data(df, file_path):
    """Write a DataFrame to a CSV file, leaving out the index.

    Args:
        df: DataFrame to serialize.
        file_path: Destination path handed to ``DataFrame.to_csv``.

    Returns:
        None. The only effect is the file written at ``file_path``.
    """
    # index=False keeps the row labels out of the file.
    df.to_csv(path_or_buf=file_path, index=False)

In [35]:
# Read the two CSVs named by TRAIN_FILE and TEST_FILE into DataFrames.
train_df = load_data(file_path=TRAIN_FILE)
test_df = load_data(file_path=TEST_FILE)

In [36]:
 # Hold out 25% of train_df rows; the fixed random_state keeps the
 # partition identical across re-runs.
 X_train, x_test = train_test_split(
     train_df,
     random_state=42,
     test_size=0.25,
 )

In [37]:
# Display the larger partition returned by train_test_split.
X_train

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
82,24,female,1,rent,moderate,,1568,18,business,good
991,34,male,1,own,moderate,,1569,15,radio/TV,good
789,27,male,2,own,little,little,5998,40,education,bad
894,29,male,2,own,,,1169,18,radio/TV,good
398,46,male,2,rent,little,moderate,1223,12,car,bad
...,...,...,...,...,...,...,...,...,...,...
106,39,male,3,own,little,,6458,18,car,bad
270,32,male,2,own,,,2662,18,car,good
860,27,male,2,own,rich,,5804,24,car,good
435,25,male,2,own,,moderate,1484,12,radio/TV,bad


In [38]:
# Display the held-out partition returned by train_test_split.
x_test

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
521,24,female,2,own,little,little,3190,18,radio/TV,bad
737,35,male,1,own,moderate,little,4380,18,car,good
740,32,male,2,own,moderate,little,2325,24,car,good
660,23,male,2,rent,little,rich,1297,12,radio/TV,good
411,35,male,3,own,little,,7253,33,car,good
...,...,...,...,...,...,...,...,...,...,...
109,35,male,2,own,quite rich,moderate,1410,14,business,good
430,74,male,1,own,little,,3448,5,business,good
77,51,male,2,own,little,moderate,4771,11,radio/TV,good
84,52,male,1,own,little,little,2315,10,radio/TV,good


In [41]:
# Encoder configured to return a dense array instead of a sparse matrix.
ohenc = OneHotEncoder(sparse_output=False)

# Names of every column in X_train with a categorical or object dtype.
# NOTE(review): the displayed selection includes the "Risk" label column;
# confirm it is meant to be encoded together with the features.
cat_vars = list(X_train.select_dtypes(include=['object', 'category']).columns)

# Show only the selected columns.
X_train[cat_vars]

Unnamed: 0,Sex,Housing,Saving accounts,Checking account,Purpose,Risk
82,female,rent,moderate,,business,good
991,male,own,moderate,,radio/TV,good
789,male,own,little,little,education,bad
894,male,own,,,radio/TV,good
398,male,rent,little,moderate,car,bad
...,...,...,...,...,...,...
106,male,own,little,,car,bad
270,male,own,,,car,good
860,male,own,rich,,car,good
435,male,own,,moderate,radio/TV,bad


In [42]:
# Fit the encoder on the selected columns and encode them in one step.
Loans_train_prepared = ohenc.fit_transform(X_train[cat_vars])

# Preview the first five encoded rows, labelled with the generated feature
# names. The preview frame gets a fresh default index, not X_train's index.
pd.DataFrame(
    data=Loans_train_prepared,
    columns=ohenc.get_feature_names_out(input_features=cat_vars),
).head()

Unnamed: 0,Sex_female,Sex_male,Housing_free,Housing_own,Housing_rent,Saving accounts_little,Saving accounts_moderate,Saving accounts_quite rich,Saving accounts_rich,Saving accounts_nan,...,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Risk_bad,Risk_good
0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
