# Assignment 5: Bankruptcy in Taiwan

## Import Libraries

In [None]:
from sklearn.base import ClassifierMixin
from sklearn.pipeline import Pipeline

In [None]:
import gzip
import json
import pickle

import ipywidgets as widgets
import matplotlib.pyplot as plt
import pandas as pd
import wqet_grader
from imblearn.over_sampling import RandomOverSampler
from ipywidgets import interact
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from teaching_tools.widgets import ConfusionMatrixWidget

## Prepare Data

### Import

In [None]:
# Decompress the raw JSON file and parse it into a Python object
with gzip.open("data/taiwan-bankruptcy-data.json.gz", "r") as f:
    taiwan_data = json.loads(f.read())
# Show which container type the parser produced
print(type(taiwan_data))

In [None]:
# Display the top-level keys of the parsed data
taiwan_data.keys()

In [None]:
# Display how many entries are stored under the "observations" key
len(taiwan_data["observations"])

In [None]:
# Display a one-element slice holding the first observation, to inspect its structure
taiwan_data["observations"][:1]

In [None]:
# Extract the top-level key names from `taiwan_data` and assign them to
# `taiwan_data_keys`. `.keys()` returns a view, not a copied list.
taiwan_data_keys = taiwan_data.keys()
print(taiwan_data_keys)

In [None]:
# Calculate how many companies are in `taiwan_data` (one entry per company under
# the "observations" key) and assign the result to `n_companies`.
n_companies = len(taiwan_data["observations"])
print(n_companies)

In [None]:
# Calculate the number of features associated with each company and assign the
# result to `n_features`. Only the first observation is inspected.
# NOTE(review): this counts every key in that record, which presumably includes
# "id" and the "bankrupt" target — confirm whether those should count as features.
n_features = len(taiwan_data["observations"][0])
print(n_features)

In [None]:
# Create wrangle function
def wrangle(filename):
    """Load a gzip-compressed JSON file into a DataFrame indexed by ``id``.

    The file must decode to a mapping with an ``"observations"`` entry whose
    records each carry an ``"id"`` field.

    Parameters
    ----------
    filename : str or path-like
        Location of the ``.json.gz`` file to read.

    Returns
    -------
    pandas.DataFrame
        One row per observation, with ``id`` moved from a column to the index.

    Raises
    ------
    KeyError
        If the payload has no ``"observations"`` entry or the records have no
        ``"id"`` field.
    """
    # Open compressed file, load into dictionary
    with gzip.open(filename, "r") as f:
        data = json.load(f)

    # `from_dict` is a classmethod: call it on the class rather than on a
    # throwaway empty DataFrame instance, then promote `id` to the index
    df = pd.DataFrame.from_dict(data["observations"]).set_index("id")

    return df

In [None]:
# Build the working DataFrame from the compressed data file
df = wrangle("data/taiwan-bankruptcy-data.json.gz")
print("df shape:", df.shape)
# Preview the first rows (last expression, so the notebook renders it)
df.head()

### Explore

In [None]:
# Count missing values per column. `df.isna().sum()` already returns a
# Series indexed by column name, so no extra `pd.Series(...)` wrapper is needed.
nans_by_col = df.isna().sum()
print("nans_by_col shape:", nans_by_col.shape)
nans_by_col.head()

In [None]:
# Check whether the target classes are imbalanced

# Bar chart of each class's share of the rows
df["bankrupt"].value_counts(normalize=True).plot(
    kind="bar",
    xlabel="Bankrupt",
    ylabel="Frequency",
    title="Class Balance",
)

### Split

In [None]:
# Separate the label column from the feature matrix
target = "bankrupt"
y = df[target]
X = df.drop(columns=[target])
print("X shape:", X.shape)
print("y shape:", y.shape)

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)
# Report the shape of each partition from its own variable
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

### Resample

In [None]:
# Randomly oversample the training split only (the test split is not resampled);
# the fixed seed makes the resampling reproducible
over_sampler = RandomOverSampler(random_state=42)
X_train_over, y_train_over = over_sampler.fit_resample(X_train,y_train)
print("X_train_over shape:", X_train_over.shape)
# Preview the resampled features (last expression, so the notebook renders it)
X_train_over.head()

## Build Model

### Iterate

In [None]:
# Build a one-step pipeline around a seeded random forest. The step is addressed
# as "randomforestclassifier" by the hyperparameter grid and the feature-importance cell.
clf = make_pipeline(
    RandomForestClassifier(random_state=42)
)
print(clf)

In [None]:
# Per-fold scores from 5-fold cross-validation on the oversampled training data
# (scored with the classifier's default scorer), using all available cores
cv_scores = cross_val_score(clf, X_train_over, y_train_over, cv=5, n_jobs=-1)
# Print the variable assigned above; `cv_acc_scores` is never defined in this notebook
print(cv_scores)

In [None]:
# Hyperparameter grid for the search. Keys are prefixed with the pipeline step
# name ("randomforestclassifier__") so they reach the forest inside the pipeline.
params = {
    # Number of trees: 25, 50, 75
    "randomforestclassifier__n_estimators": range(25, 100, 25),
    # Maximum tree depth: 10, 20, 30, 40
    "randomforestclassifier__max_depth": range(10, 50, 10),
}
params

<p><b>Tip:</b> If the classifier you built is a predictor only (not a pipeline with multiple steps), you don't need to include the step name in the keys of your <code>params</code> dictionary. For example, if your classifier was only a random forest (not a pipeline containing a random forest), you would access the number of estimators using <code>"n_estimators"</code>, not <code>"randomforestclassifier__n_estimators"</code>.</p>

In [None]:
# Wrap the pipeline in a grid search over `params`:
# 5-fold cross-validation, all available cores, progress messages enabled
model = GridSearchCV(clf, param_grid=params, cv=5, n_jobs=-1, verbose=1)
model

In [None]:
# Run the grid search on the oversampled training data
model.fit(X_train_over, y_train_over)

In [None]:
# Tabulate the grid-search results so they can be inspected as a DataFrame
cv_results = pd.DataFrame(model.cv_results_)
# Preview the first five rows
cv_results.head(5)

In [None]:
# Retrieve the hyperparameter combination the grid search selected as best
best_params = model.best_params_
print(best_params)

### Evaluate 

In [None]:
# Score the fitted grid search on both sets. Note that the training score is
# computed on the oversampled training data, not on the original X_train.
acc_train = model.score(X_train_over,y_train_over)
acc_test = model.score(X_test, y_test)

# Round to four decimal places for display
print("Training Accuracy:", round(acc_train, 4))
print("Test Accuracy:", round(acc_test, 4))

In [None]:
# Plot confusion matrix for the held-out test set; the trailing semicolon
# keeps the notebook from echoing the returned display object
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test);

In [None]:
# Build a text classification report comparing the test labels with the
# model's predictions on the test features
class_report = classification_report(y_test, model.predict(X_test))
print(class_report)

## Communicate

In [None]:
# Feature labels come from the oversampled training frame's columns
features = X_train_over.columns

# Pull the importances out of the forest step of the best pipeline found by the search
importances = model.best_estimator_.named_steps[
    "randomforestclassifier"
].feature_importances_

# Pair each importance with its feature name, then order ascending
feat_imp = pd.Series(importances, index=features)
feat_imp = feat_imp.sort_values()

# Horizontal bars for the ten largest importances (the tail of the ascending series)
feat_imp.tail(10).plot(kind="barh")
plt.xlabel("Gini Importance")
plt.ylabel("Feature")
plt.title("Feature Importance");

Open the file <code>my_predictor_assignment.py</code>. Add your `wrangle` function, and then create a `make_predictions` function that takes two arguments: `data_filepath` and <code>model_filepath</code>. Use the cell below to test your module. 

In [None]:
# Import your module
from my_predictor_assignment import make_predictions

# Generate predictions
# NOTE(review): no visible cell writes "model-5-5.pkl" — presumably the fitted
# model is pickled elsewhere; confirm the file exists before running this cell.
y_test_pred = make_predictions(
    data_filepath="data/taiwan-bankruptcy-data-test-features.json.gz",
    model_filepath="model-5-5.pkl",
)

# `.shape` and `.head()` assume the function returns a pandas object — TODO confirm
print("predictions shape:", y_test_pred.shape)
y_test_pred.head()