# MSCA 31008 - Data Mining Assignment 4 Part 2 (Group 4)
<b>Qingwei Zhang, Jake Brewer, Prinu Mathew</b><br>
<b>Winter 2023</b>

### Import Libraries  

In [30]:
import sys, os, json, subprocess

## for data
import pandas as pd
import numpy as np

import warnings

warnings.filterwarnings("ignore")


In [31]:
## for machine learning
# Best-effort bootstrap: if any import below fails, the packages this cell
# needs are (re)installed into the running kernel's environment and the
# imports are retried once.
try:
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.tree import DecisionTreeClassifier, export_graphviz
    from sklearn.metrics import (
        confusion_matrix,
        classification_report,
        f1_score,
        roc_auc_score,
        roc_curve,
        accuracy_score,
    )

    import graphviz

    print("~~~ Already installed required packages for machine learning ~~~~")
except Exception as e:
    print(e)

    print("~~~ Installing required packages for machine learning ~~~~")
    # Install exactly what this cell imports. The previous fallback installed
    # "kneed" (not imported anywhere in this notebook) and skipped "graphviz",
    # so the retry of `import graphviz` below could still fail.
    for package in ("scikit-learn", "graphviz"):
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "--upgrade", package]
        )
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.tree import DecisionTreeClassifier, export_graphviz
    from sklearn.metrics import (
        confusion_matrix,
        classification_report,
        f1_score,
        roc_auc_score,
        roc_curve,
        accuracy_score,
    )

    import graphviz


~~~ Already installed required packages for machine learning ~~~~


In [None]:
## for interactive visualization
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    print("~~~ Already installed required packages for interactive visualizations ~~~~")
except Exception as e:
    print(e)
    
    print("~~~ Installing required packages for interactive visualizations ~~~~")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "matplotlib"])
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "seaborn"])
    import matplotlib.pyplot as plt
    import seaborn as sns
    
%matplotlib inline

## 0. Load and Explore Data (from Part 1)

In [None]:
# load the preprocessed data produced in part 1
# (the path is relative, so the CSV is looked up in the kernel's working directory)

df = pd.read_csv("diabetes_data_preprocess.csv")
df.head()


In [None]:
# view data types and number of non-null values in each column
# (df.info() prints its summary itself, so no trailing expression is needed)

df.info()


## 1. Split Data into Training (70%) and Testing (30%)

In [None]:
# Hold out 30% of the rows for testing. The fixed random_state makes the split
# reproducible (the original note states it is the same split as in part 1).

y = df["readmitted"]
X = df.drop("readmitted", axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

# class balance of the training target
y_train.value_counts()


## 2. Perform Cross Validation to find best Hyper-Parameters

In [None]:
# run grid search cross validation to find best regularization hyper-parameters

param_map = {
    "max_depth": [5, 7, 9],
    # scikit-learn documents "log_loss" and "entropy" as the same Shannon
    # information gain criterion, so those two options score identically.
    "criterion": ["gini", "entropy", "log_loss"],
    # "auto" is deliberately not listed: for DecisionTreeClassifier it was a
    # deprecated alias of "sqrt" and has been removed from recent scikit-learn
    # releases, where every candidate using it fails to fit (and the failure
    # is hidden by the global warnings filter). Dropping it does not change
    # which model is selected.
    "max_features": ["sqrt", "log2"],
    "max_leaf_nodes": [8, 10, 12],
}

# specify class_weight='balanced' to account for imbalance in re-admitted and non re-admitted patients
clf = DecisionTreeClassifier(class_weight="balanced", random_state=42)

# 5-fold CV over every combination in param_map, using all available cores.
# No `scoring` is passed, so candidates are ranked by the estimator's default
# score (accuracy for classifiers).
# NOTE(review): with an imbalanced target a different scoring metric may be
# more appropriate — confirm with the assignment requirements.
clf_gs = GridSearchCV(clf, param_grid=param_map, cv=5, n_jobs=-1, verbose=2)
clf_gs.fit(X_train, y_train)


In [None]:
# Tabulate the cross-validation results, best-ranked candidates first.
# Only keys that exist in cv_results_ are requested: hyper-parameter values are
# stored there under a "param_" prefix, and asking pandas for a missing key
# (such as a bare "max_depth") just produces an all-NaN column.
pd.DataFrame(
    clf_gs.cv_results_,
    columns=[
        "rank_test_score",
        "param_criterion",
        "param_max_depth",
        "param_max_features",
        "param_max_leaf_nodes",
        "mean_test_score",
        "std_test_score",
    ],
).sort_values(by=["rank_test_score"])


In [None]:
# pull the winning hyper-parameter values out of the grid search

best_criterion, best_depth, best_feat, best_nodes = (
    clf_gs.best_params_[name]
    for name in ("criterion", "max_depth", "max_features", "max_leaf_nodes")
)

# show the full best-parameter mapping as the cell output
clf_gs.best_params_


In [None]:
# rank the features by their importance in the best estimator from the grid
# search, keeping only those with non-zero importance

best_model = clf_gs.best_estimator_
imp_top_features = pd.DataFrame(
    {"Feature": X_train.columns, "Importance": best_model.feature_importances_}
).query("Importance > 0")
imp_top_features_sorted = imp_top_features.sort_values(
    "Importance", ascending=False
)
imp_top_features_sorted


## 3. Build Best Model on Training Data

In [None]:
# Refit a tree with the best hyper-parameters, restricted to the features that
# had non-zero importance in the grid-search winner. Every row of
# imp_top_features_sorted is kept: no top-N cut-off is applied.

imp_top_features = imp_top_features_sorted["Feature"]

# narrow both splits to the selected feature columns (most important first)
X_train = X_train[imp_top_features]
X_test = X_test[imp_top_features]

clf_trim = DecisionTreeClassifier(
    criterion=best_criterion,
    max_depth=best_depth,
    max_features=best_feat,
    max_leaf_nodes=best_nodes,
    class_weight="balanced",
    random_state=42,
)

clf_trim.fit(X_train, y_train)


In [None]:
# score the trimmed tree on the held-out test set
y_pred = clf_trim.predict(X_test)

# fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy after cross validation:", accuracy)


## 4. Generate Confusion Matrix on Training Data

In [None]:
# predict on the same rows the model was fitted on

y_pred = clf_trim.predict(X_train)

# confusion matrix as an annotated heatmap, followed by the per-class report
mat = confusion_matrix(y_train, y_pred)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    mat,
    ax=ax,  # draw on the axes created above
    annot=True,
    fmt="d",
    square=True,
    cmap="coolwarm",
    xticklabels=[0, 1],
    yticklabels=[0, 1],
)
ax.set(xlabel="Predicted Label", ylabel="True Label")
plt.show()
print(classification_report(y_train, y_pred))


## 5. Plot decision tree from Training Data

In [None]:
from sklearn.tree import export_graphviz
from IPython.display import Image

import pydotplus

# export_graphviz expects class_names in ascending order of the fitted class
# values, so the labels are derived from clf_trim.classes_ rather than
# hard-coded in a fixed order (the previous ["Readmitted", "Not Readmitted"]
# list labelled the lowest class value as "Readmitted").
# NOTE(review): assumes the target is encoded 0 = not readmitted,
# 1 = readmitted — confirm against the part 1 preprocessing. Any other class
# value falls back to its raw string form.
label_by_class = {0: "Not Readmitted", 1: "Readmitted"}
response = [label_by_class.get(cls, str(cls)) for cls in clf_trim.classes_]

# out_file=None makes export_graphviz return the DOT source as a string, so no
# StringIO buffer (and no `six` dependency) is needed. feature_names is passed
# as a plain list for compatibility across scikit-learn versions.
tree_dot = export_graphviz(
    clf_trim,
    out_file=None,
    feature_names=list(X_train.columns),
    class_names=response,
    rounded=True,
    filled=True,
)

# render the DOT source to PNG and display it as the cell output
tree_graph = pydotplus.graph_from_dot_data(tree_dot)
Image(tree_graph.create_png())


In [None]:
# display the time at which this cell is executed, in the US/Central time zone
import datetime
import pytz

# formatted like "Sun, 01 January 2023 13:45:00"; shown as the cell output
datetime.datetime.now(pytz.timezone("US/Central")).strftime("%a, %d %B %Y %H:%M:%S")
