In [2]:
import datetime
import time
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from statsmodels.discrete.discrete_model import Logit

In [None]:
# Base directory for the experiment's artefacts. The image and figure folders
# are derived from it, so moving the experiment is a single edit.
# For Google Colab use instead:
# DIR = '/content/drive/MyDrive/DentistDataAnalysis/Experiments/'
DIR = ''
IMG_DIR = DIR + 'images/'
FIG_DIR = DIR + 'figures/'

In [1]:
def get_answers_counts(data_subset):
    """Print how often each answer occurs in every column of ``data_subset``.

    For each column the question text (``questions[col]['text']``) is printed,
    then a header line, then one ``count<TAB>answer`` line per distinct value
    in the order produced by ``value_counts()``. Columns listed in
    ``numeric_col`` show the raw value; every other column shows the label
    looked up in ``questions[col]``.

    Relies on the notebook-level names ``questions`` and ``numeric_col`` being
    defined before the function is called.
    """
    for column in data_subset.columns:
        print(questions[column]['text'])
        print('Count:' + '\t' + 'Answer:')
        for answer, occurrences in data_subset[column].value_counts().items():
            if column in numeric_col:
                # Numeric answers have no label table; show the value itself.
                shown = str(answer)
            else:
                shown = questions[column][answer]
            print(str(occurrences) + '\t' + shown)

In [None]:
def barplot(col, data, title, numeric_cols, questions):
    """Plot the answer distribution of one column and save it as a PNG.

    Columns not listed in ``numeric_cols`` are drawn as a horizontal bar chart
    of ``value_counts()``, labelled through ``questions[col]`` and annotated
    with the count of each bar. Columns in ``numeric_cols`` are drawn as a
    histogram. The figure is written to
    ``FIG_DIR + col + '_barplot.png'`` or ``FIG_DIR + col + '_histplot.png'``.

    Parameters
    ----------
    col : str
        Column of ``data`` to plot; also used in the title and file name.
    data : mapping of column name -> pandas Series (e.g. a DataFrame)
    title : str
        Text appended to ``col`` in the plot title.
    numeric_cols : container
        Columns to draw as histograms instead of bar charts.
    questions : mapping
        ``questions[col][value]`` gives the label shown for ``value``.

    Notes
    -----
    Relies on the notebook-level names ``plt``, ``FIG_DIR`` and ``sns``
    (presumably seaborn; it is not imported in this part of the notebook).
    """
    import os  # local import: only needed to prepare the output folder

    fig, ax = plt.subplots()
    if col not in numeric_cols:
        plot_str = 'barplot'
        val_cnts = data[col].value_counts()
        text = [questions[col][val] for val in val_cnts.index]
        # Draw on the axes created above instead of relying on pyplot's
        # implicit "current axes". Swap x and y for a vertical plot.
        sns.barplot(y=text, x=val_cnts.values, ax=ax)
        # Write each count at the end of its bar.
        for ind, val in enumerate(val_cnts):
            ax.text(val, ind, val, color='black', ha="center")
        # For a vertical plot also rotate the tick labels:
        # ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    else:
        plot_str = 'histplot'
        sns.histplot(x=data[col], ax=ax)
        # NOTE(review): the label is hard-coded, so every numeric column is
        # labelled 'Age' - confirm age is the only numeric column in use.
        ax.set_xlabel('Age')
    ax.set_title(col+': '+title)

    out_path = FIG_DIR+col+'_'+plot_str+'.png'
    out_dir = os.path.dirname(out_path)
    if out_dir:
        # savefig does not create missing folders; do it here so a fresh
        # checkout does not fail on the first plot.
        os.makedirs(out_dir, exist_ok=True)
    fig.savefig(out_path, bbox_inches='tight', dpi=fig.dpi)

In [None]:
def perform_logit(X_train, X_test, y_train, y_test, X_cols, y_col, balance=None):
    """Fit a statsmodels ``Logit`` on the training data and print test metrics.

    The training set is optionally re-balanced first: ``balance='under'`` uses
    ``RandomUnderSampler``, ``'over'`` uses ``RandomOverSampler`` and
    ``'smoteenn'`` uses ``SMOTEENN`` (all with ``random_state=0``). Any other
    value leaves the training data untouched. The test set is never resampled.

    The (resampled) training data is wrapped in DataFrames with columns
    ``X_cols`` / ``y_col``, the fit summary is printed, the predictions for
    ``X_test`` are rounded to class labels and handed to ``get_metrics``.
    """
    # Sampler factories are tried in order; building is deferred so only the
    # selected sampler is ever instantiated.
    sampler_factories = (
        ('under', lambda: RandomUnderSampler(random_state=0)),
        ('over', lambda: RandomOverSampler(random_state=0)),
        ('smoteenn', lambda: SMOTEENN(random_state=0)),
    )
    for strategy, build_sampler in sampler_factories:
        if balance == strategy:
            X_fit, y_fit = build_sampler().fit_resample(X_train, y_train)
            break
    else:
        # No (or an unrecognised) strategy: train on the data as given.
        X_fit, y_fit = X_train, y_train

    features_df = pd.DataFrame(data=X_fit, columns=X_cols)
    target_df = pd.DataFrame(data=y_fit, columns=y_col)

    fitted = Logit(target_df, features_df).fit()
    print(fitted.summary())

    # predict() yields probabilities; round() turns them into 0/1 labels.
    probabilities = fitted.predict(X_test)
    get_metrics(y_test, [round(p) for p in probabilities])

In [None]:
def perform_random_forest(X_train, X_test, y_train, y_test, balance=None, X_cols=None):
    """Grid-search a random forest, print test metrics and plot feature importances.

    Parameters
    ----------
    X_train, y_train
        Training features / labels. Optionally re-balanced, see ``balance``.
    X_test, y_test
        Hold-out data; never resampled. Predictions of the best estimator are
        passed to ``get_metrics``.
    balance : {'under', 'over', 'smoteenn'} or None
        Resampling applied to the training set (``RandomUnderSampler``,
        ``RandomOverSampler`` or ``SMOTEENN``, each with ``random_state=0``).
        Any other value leaves the training data as given.
    X_cols : sequence of str, optional
        Feature names used to label the importance plot. When omitted, the
        notebook-level ``X_cols`` is used if it exists (the behaviour before
        this parameter was added), then the training data's ``columns``
        attribute, then generic ``feature_<i>`` names.

    Side effects: prints the search duration, the best parameters and the
    test metrics, and draws a bar chart of the impurity-based importances
    with the per-tree standard deviation as error bars.
    """
    if balance=='under':
        rus = RandomUnderSampler(random_state=0)
        X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)
    elif balance=='over':
        ros = RandomOverSampler(random_state=0)
        X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
    elif balance=='smoteenn':
        smote_enn = SMOTEENN(random_state=0)
        X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train, y_train)
    else:
        X_train_resampled, y_train_resampled = X_train, y_train

    # Resolve the feature labels up front so a missing name cannot surface
    # only after the (long) grid search has finished.
    if X_cols is None:
        X_cols = globals().get('X_cols')
    if X_cols is None:
        X_cols = getattr(X_train_resampled, 'columns', None)

    max_depth = [int(x) for x in np.linspace(10, 100, num = 6)]
    max_depth.append(None)  # None lets the trees grow without a depth limit
    param_grid = {'n_estimators': [int(x) for x in np.linspace(start = 5, stop = 150, num = 15)],
                  'criterion': ['gini', 'entropy'],
                  'max_features': ['sqrt'],
                  'max_depth': max_depth,
                  'min_samples_split': [2, 5, 10],
                  'min_samples_leaf': [1, 2, 4],
                  'bootstrap': [True]}
    # Seeded like the samplers above so repeated runs select the same model.
    rf = RandomForestClassifier(random_state=0)
    rf_grid = GridSearchCV(estimator=rf,
                           param_grid=param_grid,
                           scoring='accuracy', cv=5,
                           verbose=1, n_jobs=-1,
                           return_train_score=True)
    # NOTE(review): resampling happens before the cross-validation split, so
    # with over-sampling copies of one row can end up in both the training
    # and the validation fold; the CV scores may therefore be optimistic.
    start_time = time.time()
    rf_grid.fit(X_train_resampled, y_train_resampled)
    end_time = time.time()
    conv_time = datetime.timedelta(seconds=end_time-start_time)
    print(f"Grid search time: {conv_time}")
    pprint(rf_grid.best_params_)

    best_rf = rf_grid.best_estimator_
    prediction = best_rf.predict(X_test)
    get_metrics(y_test, prediction)

    start_time = time.time()
    importances = best_rf.feature_importances_
    # Spread of the importances across the individual trees. This must read
    # each tree's own importances; repeating the forest-level vector would
    # always give a standard deviation of zero.
    std = np.std([tree.feature_importances_ for tree in best_rf.estimators_], axis=0)
    elapsed_time = time.time() - start_time
    print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")

    if X_cols is None:
        X_cols = [f'feature_{i}' for i in range(len(importances))]
    forest_importances = pd.Series(importances, index=X_cols)
    fig, ax = plt.subplots()
    forest_importances.plot.bar(yerr=std, ax=ax)
    ax.set_title("Feature importances using MDI")
    ax.set_ylabel("Mean decrease in impurity")
    fig.tight_layout()


In [None]:
def perform_decision_tree(X_train, X_test, y_train, y_test, balance=None):
    """Grid-search a decision tree classifier and print its test metrics.

    Parameters
    ----------
    X_train, y_train
        Training features / labels. Optionally re-balanced, see ``balance``.
    X_test, y_test
        Hold-out data; never resampled. Predictions of the best estimator are
        passed to ``get_metrics``.
    balance : {'under', 'over', 'smoteenn'} or None
        Resampling applied to the training set (``RandomUnderSampler``,
        ``RandomOverSampler`` or ``SMOTEENN``, each with ``random_state=0``).
        Any other value leaves the training data as given.

    Side effects: prints the search duration, the best parameters and the
    test metrics.
    """
    if balance=='under':
        rus = RandomUnderSampler(random_state=0)
        X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)
    elif balance=='over':
        ros = RandomOverSampler(random_state=0)
        X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
    elif balance=='smoteenn':
        smote_enn = SMOTEENN(random_state=0)
        X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train, y_train)
    else:
        X_train_resampled, y_train_resampled = X_train, y_train

    max_depth = [int(x) for x in np.linspace(10, 50, num = 6)]
    max_depth.append(None)  # None lets the tree grow without a depth limit
    param_grid = {'criterion': ['gini', 'entropy'],
                  'splitter': ['best', 'random'],
                  # 'auto' used to be listed here as well. For classifiers it
                  # was only an alias of 'sqrt', and current scikit-learn
                  # rejects it, which made half of the grid fits fail.
                  'max_features': ['sqrt'],
                  'max_depth': max_depth,
                  'min_samples_split': [2, 5, 10],
                  'min_samples_leaf': [1, 2, 4]}
    # Seeded like the samplers above: feature sub-sampling and the 'random'
    # splitter are otherwise non-deterministic between runs.
    dtc = DecisionTreeClassifier(random_state=0)
    dtc_grid = GridSearchCV(estimator=dtc,
                            param_grid=param_grid,
                            scoring='accuracy', cv=5,
                            verbose=1, n_jobs=-1,
                            return_train_score=True)
    # NOTE(review): resampling happens before the cross-validation split, so
    # with over-sampling copies of one row can end up in both the training
    # and the validation fold; the CV scores may therefore be optimistic.
    start_time = time.time()
    dtc_grid.fit(X_train_resampled, y_train_resampled)
    end_time = time.time()
    conv_time = datetime.timedelta(seconds=end_time-start_time)
    print(f"Grid search time: {conv_time}")
    pprint(dtc_grid.best_params_)
    prediction = dtc_grid.best_estimator_.predict(X_test)
    get_metrics(y_test, prediction)

In [None]:
def get_metrics(y_true, prediction):
    """Print the class counts of ``y_true`` and a set of test-set metrics.

    Output, in order: the ``np.unique(..., return_counts=True)`` result for
    ``y_true``, then ROC AUC, accuracy, precision, recall and F1 computed from
    ``(y_true, prediction)``, and finally the confusion matrix.
    """
    print(np.unique(y_true, return_counts=True))

    # (label, scorer) pairs, evaluated and printed one after the other.
    scalar_metrics = (
        ('Test ROC AUC: ', roc_auc_score),
        ('Test accuracy: ', accuracy_score),
        ('Test precision: ', precision_score),
        ('Test recall: ', recall_score),
        ('Test F1 score: ', f1_score),
    )
    for label, scorer in scalar_metrics:
        print(label, scorer(y_true, prediction))

    print('Test confusion matrix: ')
    print(confusion_matrix(y_true, prediction))