In [None]:
import pandas as pd

from sklearn import tree
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder

import graphviz 

from IPython.core.display import HTML
import util



# Inject a CSS rule so <pre> blocks in rendered output keep their whitespace
# and do not wrap.
display(HTML("<style>pre { white-space: pre !important; }</style>"))
# Project-local helper; presumably configures pandas display options
# (its definition lives in the util module, not in this notebook).
util.set_default_pandas_options()

In [None]:
# Load the loan-risk dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    '../data/loan-risk.csv',
    index_col=0,
)
df

In [None]:
# One-hot (one-of-K) encoding of CreditRating, built by hand.
# Each indicator column is 1 where the row's rating equals that level and 0
# everywhere else (including rows whose rating matches none of the levels).
# Comparing the whole column at once replaces the row-by-row loop and does
# not rely on the index labels being exactly 0..len(df)-1, which matters
# because the frame is loaded with index_col=0.
for level in ('Low', 'Moderate', 'High'):
    df['CreditRating' + level] = (df['CreditRating'] == level).astype('int64')

df

In [None]:
# Same one-hot encoding, this time delegated to pandas: one indicator column
# per distinct rating, each named with the 'CreditRating' prefix.
df['CreditRating'].pipe(pd.get_dummies, prefix='CreditRating')

In [None]:
# Same one-hot encoding, this time with scikit-learn's OneHotEncoder.
# The ratings are reshaped into a single-column 2-D array before fitting,
# and the encoder's result is converted to a dense array for display.
ohe = OneHotEncoder()
(
    ohe
    .fit_transform(df['CreditRating'].values.reshape(-1, 1))
    .toarray()
)

In [None]:
# Remove the original text column; the CreditRating* indicator columns
# created earlier in the notebook remain in the frame.
df = df.drop(columns=['CreditRating'])
df

In [None]:
# Split the frame into the feature matrix (every column except the target)
# and the target vector.
independent_variables = df.drop('LoanRisk', axis=1)

x = independent_variables.values
y = df['LoanRisk'].values

# A fixed random_state keeps the fitted tree the same on every run:
# scikit-learn permutes the features at each split, so ties between equally
# good splits can otherwise be resolved differently from run to run.
clf = tree.DecisionTreeClassifier(criterion='gini', splitter='best', random_state=0)
clf = clf.fit(x, y)
y_pred = clf.predict(x)

# NOTE: the model is scored on the same rows it was fitted on, so this is
# training accuracy, not an estimate of performance on unseen data.
print('Accuracy = {}'.format(metrics.accuracy_score(y, y_pred)))

In [None]:
# Confusion matrix of true labels (rows) against predictions (columns),
# with both axes ordered Low, High.
print('Confusion = \n{}'.format(
    metrics.confusion_matrix(y_true=y, y_pred=y_pred, labels=['Low', 'High'])
))

In [None]:
# Export the fitted tree as Graphviz DOT text. Feature and class labels are
# read from the objects the model was fitted with, so they cannot drift out
# of step with the training columns or with the order of clf.classes_
# (export_graphviz expects class names in ascending class order).
dot_data = tree.export_graphviz(
    clf,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=list(independent_variables.columns),
    class_names=[str(label) for label in clf.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
# Saves the DOT source under ../data/loan-risk and renders it next to that file.
graph.render("../data/loan-risk")