In [2]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from statsmodels.discrete.discrete_model import Logit
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

In [1]:
def get_answers_counts(data_subset):
    """Print an answer-frequency table for every column of ``data_subset``.

    For each column the question text (``questions[column]['text']``) is
    printed, followed by a ``Count:``/``Answer:`` header and one
    tab-separated line per distinct value, in ``value_counts()`` order.
    Columns listed in ``numeric_col`` show the raw value; every other column
    shows the label stored under that value in ``questions[column]``.

    Relies on the notebook-level names ``questions`` and ``numeric_col``.
    """
    for column in data_subset.columns:
        print(questions[column]['text'])
        print('Count:' + '\t' + 'Answer:')
        frequencies = data_subset[column].value_counts()
        # Walk the distinct answers and their counts side by side.
        for answer, n_rows in zip(frequencies.index, frequencies.values):
            if column in numeric_col:
                shown = str(answer)
            else:
                shown = questions[column][answer]
            print(str(n_rows) + '\t' + shown)

In [None]:
def barplot(data, idx, col, title, numeric_cols, questions, xlabel='Age'):
    """Plot the distribution of one column on matplotlib figure ``idx``.

    Columns not listed in ``numeric_cols`` are drawn as a horizontal bar
    chart of their value counts, labelled with the answer text found in
    ``questions[col]`` and annotated with the count of each bar. Columns
    listed in ``numeric_cols`` are drawn as a histogram of the raw values.

    Parameters
    ----------
    data : DataFrame-like
        Object supporting ``data[col]``; the column must offer
        ``value_counts()`` for the bar-chart branch.
    idx : int or str
        Identifier passed to ``plt.figure`` to select/create the figure.
    col : hashable
        Column to plot.
    title : str
        Title set on the resulting axes.
    numeric_cols : container
        Columns that should be plotted as histograms.
    questions : mapping
        ``questions[col][value]`` gives the label shown for ``value``.
    xlabel : str, optional
        x-axis label used for the histogram branch. Defaults to ``'Age'``,
        the value that used to be hard-coded, so existing calls are
        unaffected while other numeric columns can pass their own label.
    """
    plt.figure(idx)
    if col not in numeric_cols:
        val_cnts = data[col].value_counts()
        text = [questions[col][val] for val in val_cnts.index]
        # Labels on y and counts on x give horizontal bars. For a vertical
        # chart swap x and y and rotate the x tick labels by 90 degrees.
        ax = sns.barplot(y=text, x=val_cnts.values)
        ax.set_title(title)
        # Write each count at the end of its bar (x = count, y = bar position).
        for ind, val in enumerate(val_cnts):
            ax.text(val, ind, val, color='black', ha="center")
    else:
        ax = sns.histplot(x=data[col])
        ax.set_title(title)
        ax.set_xlabel(xlabel)

In [None]:
def perform_mapping(model_data, questions_map):
    """Recode every cell of ``model_data`` through ``questions_map``.

    A cell is left untouched when it is NaN or equal to 0; any other value
    ``v`` in column ``col`` is replaced by ``questions_map[col][int(v)]``.

    The previous implementation assigned into the ``row`` Series yielded by
    ``DataFrame.iterrows()``. That Series is a copy for mixed-dtype frames
    (and under copy-on-write), so the mapped values were silently discarded.
    Each column is now rebuilt and written back into ``model_data``.

    Parameters
    ----------
    model_data : pandas.DataFrame
        Frame of numeric answer codes. It is modified in place.
    questions_map : mapping
        ``questions_map[col]`` maps an integer code to its replacement.

    Returns
    -------
    pandas.DataFrame
        The same ``model_data`` object, with its columns recoded.

    Raises
    ------
    KeyError
        If a non-zero, non-NaN code is missing from ``questions_map[col]``.
    """
    def _recode(col, value):
        # NaN and 0 are passed through unchanged rather than looked up.
        if np.isnan(value) or value == 0:
            return value
        return questions_map[col][int(value)]

    for col in model_data.columns:
        # Assigning the whole column lets pandas pick a dtype that fits the
        # mapped values (e.g. object when codes become strings).
        model_data[col] = [_recode(col, value) for value in model_data[col]]
    return model_data

In [None]:
def perform_logit(X_train, X_test, y_train, y_test, X_cols, y_col, balance=None):
    """Fit a statsmodels ``Logit`` model and print its test-set metrics.

    The training data is optionally rebalanced first: ``balance='under'``
    uses ``RandomUnderSampler``, ``'over'`` uses ``RandomOverSampler`` and
    ``'smoteenn'`` uses ``SMOTEENN`` (all with ``random_state=0``). Any
    other value leaves the training data as it is.

    The (resampled) features and target are wrapped in DataFrames with
    columns ``X_cols`` / ``y_col``, the fitted model summary is printed, and
    the predictions for ``X_test`` are rounded with the built-in ``round``
    before being handed to ``get_metrics`` together with ``y_test``.
    """
    resamplers = {
        'under': RandomUnderSampler,
        'over': RandomOverSampler,
        'smoteenn': SMOTEENN,
    }
    sampler_cls = resamplers.get(balance)
    if sampler_cls is None:
        X_fit, y_fit = X_train, y_train
    else:
        X_fit, y_fit = sampler_cls(random_state=0).fit_resample(X_train, y_train)

    train_features = pd.DataFrame(data=X_fit, columns=X_cols)
    train_target = pd.DataFrame(data=y_fit, columns=y_col)

    fitted = Logit(train_target, train_features).fit()
    print(fitted.summary())

    # Turn the predicted values into hard labels by rounding each one.
    predicted_values = fitted.predict(X_test)
    prediction = [round(value) for value in predicted_values]
    get_metrics(y_test, prediction)

In [None]:
def perform_random_forest(X_train, X_test, y_train, y_test, balance=None):
    """Tune a ``RandomForestClassifier`` by randomized search and print test metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_test, y_test
        Held-out features and target used for the final metrics.
    balance : {'under', 'over', 'smoteenn'} or other, optional
        Rebalancing applied to the training data only:
        ``'under'`` -> ``RandomUnderSampler``, ``'over'`` ->
        ``RandomOverSampler``, ``'smoteenn'`` -> ``SMOTEENN`` (each with
        ``random_state=0``). Any other value leaves the data unchanged.

    The search draws 100 candidates, scores them by accuracy with 7-fold
    cross-validation, prints the best parameters, and passes the best
    estimator's predictions on ``X_test`` to ``get_metrics``.
    """
    if balance == 'under':
        rus = RandomUnderSampler(random_state=0)
        X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)
    elif balance == 'over':
        ros = RandomOverSampler(random_state=0)
        X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
    elif balance == 'smoteenn':
        smote_enn = SMOTEENN(random_state=0)
        X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train, y_train)
    else:
        X_train_resampled, y_train_resampled = X_train, y_train

    n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=10)]
    criterion = ['gini', 'entropy']
    # 'auto' was an alias of 'sqrt' for classifiers and was removed in
    # scikit-learn 1.3, where it makes the candidate fail to fit. Use two
    # distinct, still-supported options instead.
    max_features = ['sqrt', 'log2']
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)  # None = grow trees without a depth limit
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'criterion': criterion,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    # Seed the forest as well as the search so repeated runs give the same
    # scores and the same best parameters.
    rf = RandomForestClassifier(random_state=42)
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                                   n_iter=100, scoring='accuracy',
                                   cv=7, verbose=2, random_state=42, n_jobs=-3,
                                   return_train_score=True)
    rf_random.fit(X_train_resampled, y_train_resampled)
    pprint(rf_random.best_params_)
    prediction = rf_random.best_estimator_.predict(X_test)
    get_metrics(y_test, prediction)

In [None]:
def perform_decision_tree(X_train, X_test, y_train, y_test, balance=None):
    """Tune a ``DecisionTreeClassifier`` by randomized search and print test metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_test, y_test
        Held-out features and target used for the final metrics.
    balance : {'under', 'over', 'smoteenn'} or other, optional
        Rebalancing applied to the training data only:
        ``'under'`` -> ``RandomUnderSampler``, ``'over'`` ->
        ``RandomOverSampler``, ``'smoteenn'`` -> ``SMOTEENN`` (each with
        ``random_state=0``). Any other value leaves the data unchanged.

    The search draws 100 candidates, scores them by accuracy with 7-fold
    cross-validation, prints the best parameters, and passes the best
    estimator's predictions on ``X_test`` to ``get_metrics``.
    """
    if balance == 'under':
        rus = RandomUnderSampler(random_state=0)
        X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)
    elif balance == 'over':
        ros = RandomOverSampler(random_state=0)
        X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
    elif balance == 'smoteenn':
        smote_enn = SMOTEENN(random_state=0)
        X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train, y_train)
    else:
        X_train_resampled, y_train_resampled = X_train, y_train

    criterion = ['gini', 'entropy']
    splitter = ['best', 'random']
    # 'auto' was an alias of 'sqrt' for classifiers and was removed in
    # scikit-learn 1.3, where it makes the candidate fail to fit. Use two
    # distinct, still-supported options instead.
    max_features = ['sqrt', 'log2']
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)  # None = grow the tree without a depth limit
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    random_grid = {'criterion': criterion,
                   'splitter': splitter,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf}
    # The tree is stochastic (random splitter, feature sub-sampling), so seed
    # it as well as the search to make repeated runs reproducible.
    dtc = DecisionTreeClassifier(random_state=42)
    dtc_random = RandomizedSearchCV(estimator=dtc, param_distributions=random_grid,
                                    n_iter=100, scoring='accuracy',
                                    cv=7, verbose=2, random_state=42, n_jobs=-3,
                                    return_train_score=True)
    dtc_random.fit(X_train_resampled, y_train_resampled)
    pprint(dtc_random.best_params_)
    prediction = dtc_random.best_estimator_.predict(X_test)
    get_metrics(y_test, prediction)

In [None]:
def get_metrics(y_true, prediction):
    """Print the class counts of ``y_true`` and a set of test scores.

    Output order: the ``np.unique(..., return_counts=True)`` tuple for
    ``y_true``, then ROC AUC, accuracy, precision, recall and F1 (each
    computed from ``y_true`` and ``prediction`` with the scorer's default
    arguments), and finally the confusion matrix.
    """
    print(np.unique(y_true, return_counts=True))
    scorers = (
        ('Test ROC AUC: ', roc_auc_score),
        ('Test accuracy: ', accuracy_score),
        ('Test precision: ', precision_score),
        ('Test recall: ', recall_score),
        ('Test F1 score: ', f1_score),
    )
    # Each score is computed right before it is printed, one after another.
    for label, scorer in scorers:
        print(label, scorer(y_true, prediction))
    print('Test confusion matrix: ')
    print(confusion_matrix(y_true, prediction))