# Week 6 
This is a practise notebook created during Week-6 of ML Zoomcamp.

***

# Download dataset

In [1]:
PATH_DATASET="https://github.com/gastonstat/CreditScoring/raw/master/CreditScoring.csv"

In [2]:
!wget $PATH_DATASET -O ../data/06_data_creditscoring.csv

--2022-10-24 14:57:34--  https://github.com/gastonstat/CreditScoring/raw/master/CreditScoring.csv
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/gastonstat/CreditScoring/master/CreditScoring.csv [following]
--2022-10-24 14:57:34--  https://raw.githubusercontent.com/gastonstat/CreditScoring/master/CreditScoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 182489 (178K) [text/plain]
Saving to: ‘../data/06_data_creditscoring.csv’


2022-10-24 14:57:35 (5.78 MB/s) - ‘../data/06_data_creditscoring.csv’ saved [182489/182489]



In [3]:
!head ../data/06_data_creditscoring.csv

"Status","Seniority","Home","Time","Age","Marital","Records","Job","Expenses","Income","Assets","Debt","Amount","Price"
1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
1,0,1,36,26,1,1,1,46,107,0,0,310,910
1,1,2,60,36,2,1,1,75,214,3500,0,650,1645
1,29,2,60,44,2,1,1,75,125,10000,0,1600,1800
1,9,5,12,27,1,1,1,35,80,0,0,200,1093
1,0,2,60,32,2,1,3,90,107,15000,0,1200,1957


***

# Import modules

In [4]:
import pandas as pd
import numpy as np 

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [17]:
import sys 
sys.path.append('../src/')

import utilities as utils

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text, export_graphviz
from sklearn.ensemble import RandomForestClassifier


from IPython.display import display, Image
import graphviz

***

# Read using Pandas

In [7]:
df = pd.read_csv('../data/06_data_creditscoring.csv')
df.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


***

# Data Prep and Cleaning

## Standardize column names

In [8]:
df.columns = df.columns.str.lower()
df.columns

Index(['status', 'seniority', 'home', 'time', 'age', 'marital', 'records',
       'job', 'expenses', 'income', 'assets', 'debt', 'amount', 'price'],
      dtype='object')

## Map column data 

In [9]:
status_values = {
    1: 'ok',
    2: 'default',
    0: 'unk'
}

df.status = df.status.map(status_values)

In [10]:
home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk'
}

df.home = df.home.map(home_values)

marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk'
}

df.marital = df.marital.map(marital_values)

records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk'
}

df.records = df.records.map(records_values)

job_values = {
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unk'
}

df.job = df.job.map(job_values)

In [11]:
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


In [12]:
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


## Replace 99999999 with NaNs

In [13]:
# Replace 99999999.0 as NaNs
for c in ['income', 'assets', 'debt']:
    df[c] = df[c].replace(to_replace=99999999, value=np.nan)

In [14]:
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4421.0,4408.0,4437.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,131.0,5403.0,343.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,86.0,11573.0,1246.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3000.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,165.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,959.0,300000.0,30000.0,5000.0,11140.0


## Drop Status='Unk' rows

In [15]:
# Drop 'Unk' status rows
df = df[df['status'] != 'unk'].reset_index(drop=True)
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4454.0,4454.0,4454.0,4454.0,4420.0,4407.0,4436.0,4454.0,4454.0
mean,8.0,46.0,37.0,56.0,131.0,5404.0,343.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,86.0,11574.0,1246.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1117.0
50%,5.0,48.0,36.0,51.0,120.0,3000.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,165.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,959.0,300000.0,30000.0,5000.0,11140.0


***

## Prepare dataset

In [16]:
# Test-Train split
df_train_full, df_test = uitls.train_test_split(df, test_size=0.2, random_state=11)
df_train, df_val = utils.train_test_split(df_train_full, test_size=0.25, random_state=11)

NameError: name 'uitls' is not defined

In [None]:
# Convert status - categorical into numerical
df_train_full.status = (df_train_full.status == 'default').astype('int').values
df_train.status = (df_train.status == 'default').astype('int').values
df_val.status   = (df_val.status == 'default').astype('int').values
df_test.status  = (df_test.status == 'default').astype('int').values

In [None]:
# Prepare dataset for modeling- features and target
df_train_full, y_train_full = utils.dataset_into_features_and_target(df_train_full, 'status')
df_train, y_train = utils.dataset_into_features_and_target(df_train, 'status')
df_val, y_val     = utils.dataset_into_features_and_target(df_val, 'status')
df_test, y_test   = utils.dataset_into_features_and_target(df_test, 'status')

In [None]:
df_train

***

# Decision trees

In [None]:
train_dicts = df_train.fillna(0.0).to_dict(orient='records')
val_dicts   = df_val.fillna(0.0).to_dict(orient='records')

## Dict Vectorizer

In [None]:
dv = DictVectorizer(sparse=False)

In [None]:
# Apply dv on train dataset
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

## Train Decision Tree

In [None]:
# Train DT
dt = DecisionTreeClassifier(max_depth=3)
dt.fit(X_train, y_train)

## On Validation Dataset

In [None]:
y_pred = dt.predict_proba(X_train)[:,1]
auc = roc_auc_score(y_train, y_pred)
print(f'Test={auc}')

y_pred = dt.predict_proba(X_val)[:,1]
auc = roc_auc_score(y_val, y_pred)
print(f'Val={auc}')

## View Dt trained

In [None]:
print(export_text(dt, feature_names=dv.get_feature_names()))

In [None]:
export_graphviz(dt, out_file='dt.txt', feature_names=dv.get_feature_names())

***

# Decision tree learning algorithm

In [None]:
depths = [1, 2, 3, 4, 5, 6, 10, 15, 20, 50]

for depth in depths:
    dt = DecisionTreeClassifier(max_depth=depth)
    dt.fit(X_train, y_train)
    
    y_pred = dt.predict_proba(X_val)[:,1]
    auc = utils.roc_auc_score(y_val, y_pred)
    
    print(f"Depth={depth:.1f} AUC={auc:.4f}")


In [None]:

list_auc = []
scores = [1, 5, 10, 15, 20, 100, 200, 500]
depths = [4, 5, 6]

for score in scores:
    for depth in depths:
        dt = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=score)
        dt.fit(X_train, y_train)

        y_pred = dt.predict_proba(X_val)[:,1]
        auc = utils.roc_auc_score(y_val, y_pred)
        
        list_auc.append((depth, score, auc))

        print(f"Depth={depth:.1f} Score={score:.1f} AUC={auc:.4f}")


In [None]:
columns = ['max_depth','min_samples_leaf','auc']

df_aucs = pd.DataFrame(list_auc, columns=columns)
df_aucs

In [None]:
# Pivot 
df_aucs_pivot = df_aucs.pivot(index='min_samples_leaf',
                              columns=['max_depth'],
                              values=['auc'])

df_aucs_pivot

## Heatmap of paramters-auc of DT

In [None]:
# Heatmap 
sns_plot = sns.heatmap(df_aucs_pivot, annot=True, fmt='.3f')
plt.savefig('../images/6_dt_heatmap.png')

In [None]:
# Training DT with selected parametsr
dt = DecisionTreeClassifier(max_depth=6,
                            min_samples_leaf=15)

dt.fit(X_train, y_train)

## Export DT as text 

In [None]:
print(export_text(dt, feature_names=dv.get_feature_names()))

## Export DT as graphviz

In [None]:
# Visualize using GraphViz for dT
export_graphviz(dt, out_file="6_dt.dot",
                    feature_names=dv.get_feature_names(),
                    class_names='status',
                    filled=True, rounded=True, 
                    special_characters=True)


In [None]:
Image("../images/6_dt.png", width=400, height=800)

***

# Random Forest

In [None]:
scores = []

for n in range(10, 201,10):

    rf = RandomForestClassifier(n_estimators=n, random_state=1)
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict_proba(X_val)[:,1]
    
    auc = roc_auc_score(y_val, y_pred)

    scores.append((n, auc))
    
    print(f"n_estimators={n} auc={auc}")

In [None]:
df_scores = pd.DataFrame(scores, columns=['n_estimators', 'auc'])
df_scores.head()

In [None]:
sns.lineplot(df_scores, x='n_estimators',
                        y='auc')

## Other parameters

In [None]:
scores = []

for n1 in [5, 10, 15]: 
    for n in range(10, 201,10):

        rf = RandomForestClassifier(n_estimators=n, max_depth=n1, random_state=1)
        rf.fit(X_train, y_train)

        y_pred = rf.predict_proba(X_val)[:,1]

        auc = roc_auc_score(y_val, y_pred)

        scores.append((n1, n, auc))

        print(f"n_estimators={n} auc={auc}")

In [None]:
df_scores = pd.DataFrame(scores, columns=['max_depth', 'n_estimators', 'auc'])
df_scores.head()

In [None]:
for d in [5,10,15]:
    
    df_subset = df_scores[df_scores.max_depth == d]
    
    plt.plot(df_subset.n_estimators,
             df_subset.auc,
             label=f'max_depth={d}')
    
plt.legend()

In [None]:
max_depth = 10

In [None]:
scores = []

for d in [1, 3, 5, 10, 15, 50]: 
    for n in range(10, 201,10):

        rf = RandomForestClassifier(n_estimators=n, 
                                    max_depth=max_depth,
                                    min_samples_leaf=d,
                                    random_state=1)
        rf.fit(X_train, y_train)

        y_pred = rf.predict_proba(X_val)[:,1]

        auc = roc_auc_score(y_val, y_pred)

        scores.append((d, n, auc))

        print(f"n_estimators={n} auc={auc}")

In [None]:
df_scores = pd.DataFrame(scores, columns=['min_samples_leaf', 'n_estimators', 'auc'])
df_scores.head()

In [None]:
for d in [1, 3, 5, 10, 15, 50]:
    
    df_subset = df_scores[df_scores.min_samples_leaf == d]
    
    plt.plot(df_subset.n_estimators,
             df_subset.auc,
             label=f'min_samples_leaf={d}')
    
plt.legend()

## Final RF model

In [None]:
rf = RandomForestClassifier(n_estimators=n,
                           max_depth=max_depth,
                           min_samples_leaf=3)

rf.fit(X_train, y_train)

y_pred = rf.predict_proba(X_val)[:,1]

auc = roc_auc_score(y_val, y_pred)

auc

***

# XGBoost

In [None]:
!pip install xgboost

In [None]:
import xgboost as xgb

In [None]:
features = dv.get_feature_names()
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

In [None]:
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'binary:logistic',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}


model = xgb.train(xgb_params, dtrain, num_boost_round=10)

In [None]:
y_pred = model.predict(dval)

In [None]:
auc = roc_auc_score(y_val, y_pred)
auc

In [None]:
watchlist = [(dtrain, 'train'),(dval, 'val')]

In [None]:
def parse_xgb_ouput(output):
    
    results = []

    for line in output.stdout.strip().split('\n'):

        # split tabs
        num_iter, train_auc, val_auc = line.split('\t')

        # Format 3 values
        num_iter = int(num_iter.strip('[]'))
        train_auc = float(train_auc.strip('train-auc:'))
        val_auc = float(val_auc.strip('val-auc:'))

        results.append((num_iter, train_auc, val_auc))

    df_results = pd.DataFrame(results, columns=['n_iter','train_auc', 'val_auc'])

    return df_results

## For Plotting XGB

In [None]:
scores = {}

In [None]:
%%capture output
xgb_params = {
    'eta': 0.1,
    'max_depth': 3,
    'min_child_weight': 0.75,
    
    'objective': 'binary:logistic',
    'nthread': 8,
    'eval_metric': 'auc',
    
    'seed': 1,
    'verbosity': 1,
}


model = xgb.train(xgb_params, dtrain, num_boost_round=175,
                  verbose_eval=5,
                  evals=watchlist)

In [None]:
# df_output = parse_xgb_ouput(output)
# df_output.head()

In [None]:
#sns.lineplot(df_output, x='n_iter', y='train_auc', label='train')
#sns.lineplot(df_output, x='n_iter', y='val_auc', label='val')

# Runing XGBoost parametes

Tuning parameters:  
- `eta` = Learning Rate = Size of Step
- `max_depth` = Depth of tree
- `min_child_weight` = How much weight 2nd model has when correcting 1st model

In [None]:
#key = f"eta:{xgb_params['eta']}"
#key = f"max_depth:{xgb_params['max_depth']}"
key = f"min_child_weight:{xgb_params['min_child_weight']}"
scores[key] = parse_xgb_ouput(output)

for key, df_scores in scores.items():
    plt.plot(df_scores.n_iter, df_scores.val_auc, label=f'auc-{key}')
    
plt.legend()
plt.ylim([0.8, 0.85])

***

# Selecting the final model 

In [None]:
# Best DT model
dt = DecisionTreeClassifier(max_depth=6,
                            min_samples_leaf=15)

dt.fit(X_train, y_train)

In [None]:
# BEst RF model
rf = RandomForestClassifier(n_estimators=200,
                           max_depth=10,
                           min_samples_leaf=3,
                           random_state=1)

rf.fit(X_train, y_train)

In [None]:
# Best XGBoost model
xgb_params = {
    'eta': 0.1,
    'max_depth': 3,
    'min_child_weight': 0.75,
    
    'objective': 'binary:logistic',
    'nthread': 8,
    'eval_metric': 'auc',
    
    'seed': 1,
    'verbosity': 1,
}


model = xgb.train(xgb_params, dtrain, num_boost_round=175,
                  verbose_eval=5,
                  evals=watchlist)

In [None]:
y_pred = dt.predict_proba(X_val)[:,1]
auc_dt = utils.roc_auc_score(y_val, y_pred)

y_pred = rf.predict_proba(X_val)[:,1]
auc_rf = utils.roc_auc_score(y_val, y_pred)

y_pred = model.predict(dval)
auc_xgb = utils.roc_auc_score(y_val, y_pred)

auc_dt, auc_rf, auc_xgb

## Full Train using XGBoost

In [None]:
train_full_dicts = df_train_full.fillna(0.0).to_dict(orient='records')
test_dicts = df_test.fillna(0.0).to_dict(orient='records')

# Apply dv on train dataset
X_train_full = dv.fit_transform(train_full_dicts)
X_test = dv.transform(test_dicts)

features = dv.get_feature_names()
dtrain_full = xgb.DMatrix(X_train_full, label=y_train_full, feature_names=features)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=features)

In [None]:
xgb_params = {
    'eta': 0.1,
    'max_depth': 3,
    'min_child_weight': 0.75,
    
    'objective': 'binary:logistic',
    'nthread': 8,
    'eval_metric': 'auc',
    
    'seed': 1,
    'verbosity': 1,
}


model_final = xgb.train(xgb_params, dtrain, num_boost_round=175,
                  verbose_eval=5,
                  evals=watchlist)

In [None]:
y_pred = model.predict(dtest)
auc_gb_final = utils.roc_auc_score(y_test, y_pred)
auc_gb_final