In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# for images and graphs
import pydotplus
from IPython.display import Image



## Loading and Preprocessing Loans Encoded Data

Load the `sba_loans_encoded.csv` in a pandas DataFrame called `df_loans`.

In [2]:
# Read the encoded SBA loans dataset and preview its first rows
loans_csv = Path("../Resources/sba_loans_encoded.csv")
df_loans = pd.read_csv(loans_csv)
df_loans.head()



Unnamed: 0,Year,Month,Amount,Term,Zip,CreateJob,NoEmp,RealEstate,RevLineCr,UrbanRural,...,City_WILLITS,City_WILMINGTON,City_WINDSOR,City_WINNETKA,City_WOODLAND,City_WOODLAND HILLS,City_WRIGHTWOOD,City_Watsonville,City_YORBA LINDA,City_YUBA CITY
0,2001,11,32812,36,92801,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2001,4,30000,56,90505,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2001,4,30000,36,92103,0,10,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2003,10,50000,36,92108,0,6,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2006,7,343000,240,91345,3,65,1,0,2,...,0,0,0,0,0,0,0,0,0,0


Define the features set, by copying the `df_loans` DataFrame and dropping the `Default` column.

In [3]:
# Features set: every column of df_loans except the `Default` label.
# `drop` returns a new DataFrame, so df_loans itself is left untouched.
X = df_loans.drop(columns=["Default"])
X.head()



Unnamed: 0,Year,Month,Amount,Term,Zip,CreateJob,NoEmp,RealEstate,RevLineCr,UrbanRural,...,City_WILLITS,City_WILMINGTON,City_WINDSOR,City_WINNETKA,City_WOODLAND,City_WOODLAND HILLS,City_WRIGHTWOOD,City_Watsonville,City_YORBA LINDA,City_YUBA CITY
0,2001,11,32812,36,92801,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2001,4,30000,56,90505,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2001,4,30000,36,92103,0,10,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2003,10,50000,36,92108,0,6,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2006,7,343000,240,91345,3,65,1,0,2,...,0,0,0,0,0,0,0,0,0,0


Create the target vector by assigning the values of the `Default` column from the `df_loans` DataFrame.

In [4]:
# Target vector: the `Default` labels arranged as a single-column 2-D array
default_labels = df_loans["Default"].values
target = default_labels.reshape((-1, 1))



Split the data into training and testing sets.

In [5]:
# Hold out part of the data for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    target,
    random_state=78,
)



Use the `StandardScaler` to scale the features data. Remember that only the `X_train` and `X_test` DataFrames should be scaled.

In [6]:
# Instantiate the scaler (library defaults spelled out: centre to zero mean, scale to unit variance)
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)



In [7]:
# Learn the scaling statistics from the training features only
X_scaler = scaler.fit(X=X_train)

In [8]:
# Apply the fitted scaler to both the training and the testing features
x_train_scaled, x_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

## Fitting the Decision Tree Model

Once the data is scaled, create a decision tree instance and train it with the training data (`x_train_scaled` and `y_train`).

In [9]:
# Create the decision tree classifier instance.
# A fixed random_state makes the fitted tree reproducible: scikit-learn randomly
# permutes the candidate features at every split, so without a seed equally good
# splits can be chosen differently from one run to the next.
model = tree.DecisionTreeClassifier(random_state=78)

In [10]:
# Train the classifier on the scaled training features and their labels
model = model.fit(X=x_train_scaled, y=y_train)

## Making Predictions Using the Tree Model

Validate the trained model by predicting loan defaults using the testing data (`x_test_scaled`).

In [11]:
# Predict the class of every loan in the scaled testing set
predictions = model.predict(X=x_test_scaled)

## Model Evaluation

Evaluate the model's results by using `sklearn` to calculate the confusion matrix and the accuracy score, and to generate the classification report.

In [12]:
# Confusion matrix: rows are the actual classes, columns are the predicted classes
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(data=cm, index=actual_labels, columns=predicted_labels)

# Accuracy: fraction of testing samples whose predicted class matches the actual one
acc_score = accuracy_score(y_test, predictions)


## Analysis Question

Finally, analyze the model's evaluation results and answer the following question.

* Would you trust this model to deploy a loan application approval solution in a bank?

 * **Your answer here**
 
* Review either the pdf or png visualization of the tree in the resources folder. Are there any branches that are surprising to you? Are there any that seem more like common sense?

 * **Your answer here**

In [13]:
# Show the evaluation summary: confusion matrix, accuracy and per-class metrics
print("Confusion Matrix")
display(cm_df)

print(f"Accuracy Score : {acc_score}")

print("Classification Report")
report_text = classification_report(y_test, predictions)
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,322,22
Actual 1,29,152


Accuracy Score : 0.9028571428571428
Classification Report
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       344
           1       0.87      0.84      0.86       181

    accuracy                           0.90       525
   macro avg       0.90      0.89      0.89       525
weighted avg       0.90      0.90      0.90       525



In [16]:
%brew install graphviz


2. Add GraphViz to the system's PATH environment variable:
     - For Windows:
         - Open the Control Panel and go to System > Advanced system settings.
         - Click on the "Environment Variables" button.
         - In the "System variables" section, find the "Path" variable and click on the "Edit" button.
         - Add the path to the GraphViz bin folder (e.g., `C:\Program Files\Graphviz\bin`) to the list of paths.
         - Click "OK" to save the changes.
     - For macOS and Linux, the GraphViz executable should be automatically added to the system's PATH during installation.

Once you have installed GraphViz and added it to the system's PATH, you can run the code again and it should be able to find the GraphViz executables and generate the visualization of the decision tree.

Here's the updated code:


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 3)