In [106]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, auc
import torch

In [114]:
def data_split(df=None, label=None, validation=False, train_size=0.8, random_state=42, tensor=False):
    """Split ``df`` into feature/label partitions for training and evaluation.

    Args:
        df: DataFrame holding both the feature columns and the label column.
        label: name of the label column; every other column is a feature.
        validation: when True, the held-out rows are split again in half
            into a validation set and a test set.
        train_size: fraction of rows assigned to the training set.
        random_state: seed forwarded to every ``train_test_split`` call.
        tensor: when True, every partition is converted with
            ``torch.Tensor``; otherwise the pandas objects are returned.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` when ``validation`` is False,
        ``(x_train, x_val, x_test, y_train, y_val, y_test)`` when it is True.
        Labels keep their single-column 2-D layout.
    """
    features = df.iloc[:, df.columns != label]
    labels = df.iloc[:, df.columns == label]

    # First cut: training rows versus everything that is held out.
    x_train, x_rest, y_train, y_rest = train_test_split(
        features, labels, test_size=(1 - train_size), random_state=random_state)

    if validation:
        # Second cut: the held-out rows are shared equally between
        # validation and test.
        x_val, x_test, y_val, y_test = train_test_split(
            x_rest, y_rest, test_size=0.5, random_state=random_state)
        parts = (x_train, x_val, x_test, y_train, y_val, y_test)
    else:
        parts = (x_train, x_rest, y_train, y_rest)

    if tensor:
        parts = tuple(torch.Tensor(part.values) for part in parts)
    return parts

In [115]:
def data_discretize(df=None, feat_continous=None, num_cat=None, label='default'):
    """Return a copy of ``df`` with continuous features binned to integer codes.

    Each column in ``feat_continous`` is min-max scaled to ``[0, num_cat]``
    and rounded to the nearest integer, so codes range from 0 to ``num_cat``
    inclusive. Afterwards every column except ``label`` whose minimum is
    negative is shifted so that its minimum becomes 0.

    Args:
        df: input DataFrame; it is not modified.
        feat_continous: iterable of column names to discretize (``None`` is
            treated as "no columns").
        num_cat: number of equal-width steps between a column's min and max.
        label: name of the label column, which is never shifted.

    Returns:
        A new DataFrame with the same columns as ``df``.
    """
    df_disct = df.copy()
    if feat_continous is None:
        feat_continous = []

    for col in feat_continous:
        # Bracket access works for any column name; attribute access breaks
        # on names that collide with DataFrame attributes such as "count".
        series = df_disct[col]
        low = series.min()
        gap = (series.max() - low) / num_cat
        if gap == 0:
            # A constant column has a single bin; dividing by a zero gap
            # would yield NaN and make the integer cast fail.
            df_disct[col] = 0
            continue
        df_disct[col] = ((series - low) / gap).round(decimals=0).astype(int)

    for col in df_disct.columns:
        if col == label:
            continue
        low = df_disct[col].min()
        if low < 0:
            df_disct[col] = df_disct[col] - low

    return df_disct

In [116]:
def point_eval_metric(conf_m=None, data=None, model=None, y_true=None, y_score=None):
    """Build a one-row table of point evaluation metrics from a confusion matrix.

    Args:
        conf_m: 2x2 confusion matrix laid out as ``[[tn, fp], [fn, tp]]``.
        data: str describing the feature encoding (e.g. 'continuous' or
            'discrete'); it is appended to the model name.
        model: str, model abbreviation ('lr', 'nb', 'svm', 'rf', 'dt', 'gda',
            case-insensitive) or any other name, which is used as given.
        y_true: true target values, passed to ``roc_auc_score``.
        y_score: predicted scores/probabilities, passed to ``roc_auc_score``.

    Returns:
        DataFrame with one row and the columns:
            Model, Test Size, Prevalence, Total Accuracy, Positive Accuracy,
            Negative Accuracy, Precision, Recall, F1-Score, AUC-ROC.
        All values except Model and Test Size are formatted strings. A ratio
        whose denominator is zero is reported as NaN.
    """
    def _ratio(numerator, denominator):
        # An undefined ratio becomes NaN instead of raising
        # ZeroDivisionError (plain ints) or warning (NumPy ints).
        return numerator / denominator if denominator != 0 else float('nan')

    display_names = {
        'lr': 'LogisticReg',
        'nb': 'NaiveBayes',
        'svm': 'SVM',
        'rf': 'Random Forest',
        'dt': 'Decision Tree',
        'gda': 'Gaussian Analysis',
    }
    model = display_names.get(model.lower(), model)
    model = model + ' - ' + data

    tn, fp, fn, tp = conf_m[0][0], conf_m[0][1], conf_m[1][0], conf_m[1][1]
    total = tn + fn + fp + tp
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    # Harmonic mean of precision and recall; NaN propagates when either
    # of them is undefined.
    f1 = _ratio(2 * (precision * recall), precision + recall)

    row = {'Model': [model],
           'Test Size': [total],
           'Prevalence': [format(_ratio(tp + fn, total), '.2%')],
           'Total Accuracy': [format(_ratio(tp + tn, total), '.2%')],
           'Positive Accuracy': [format(recall, '.2%')],
           'Negative Accuracy': [format(_ratio(tn, tn + fp), '.2%')],
           'Precision': [format(precision, '.2%')],
           'Recall': [format(recall, '.2%')],
           'F1-Score': [format(f1, '.2%')],
           'AUC-ROC': [format(roc_auc_score(y_true, y_score), '.4')]
          }

    return pd.DataFrame.from_dict(row)

In [117]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.tree import DecisionTreeClassifier

def BaseLine(model=None, data=None, x_train=None, y_train=None, x_test=None, y_test=None):
    """Fit a baseline classifier, predict on the test set and score it.

    Args:
        model: str, one of five options (case-insensitive):
            'lr'  -> LogisticRegression
            'gda' -> GaussianNB
            'nb'  -> CategoricalNB with class priors [0.5, 0.5]
            'dt'  -> DecisionTreeClassifier
            'svm' -> SVC with probability estimates enabled
        data: continuous or discrete; forwarded to ``point_eval_metric``.
        x_train: DataFrame, training dataset of features
        y_train: DataFrame, training dataset of labels
        x_test: DataFrame, testing dataset of features
        y_test: DataFrame, testing dataset of labels

    Returns:
        Tuple ``(metrics, y_pred_proba)``: the DataFrame produced by
        ``point_eval_metric`` and the second column of ``predict_proba``
        for the test set.

    Raises:
        ValueError: if ``model`` is not one of the supported abbreviations.
    """
    # Factories rather than instances, so only the requested estimator is built.
    factories = {
        'lr': LogisticRegression,
        'gda': GaussianNB,
        'nb': lambda: CategoricalNB(class_prior=[0.5, 0.5]),
        'dt': DecisionTreeClassifier,
        'svm': lambda: svm.SVC(probability=True),
    }
    key = model.lower()
    if key not in factories:
        # Without this guard `clf` would be unbound and the failure would
        # surface as an UnboundLocalError at the fit call.
        raise ValueError(
            "Unsupported model %r; expected one of %s" % (model, sorted(factories)))
    clf = factories[key]()

    # Labels arrive as a single-column frame; the estimators expect a 1-D array.
    clf.fit(x_train, np.ravel(y_train))
    y_pred = clf.predict(x_test)
    y_pred_proba = clf.predict_proba(x_test)[:, 1]
    conf_m = confusion_matrix(y_test, y_pred)
    return point_eval_metric(conf_m=conf_m, model=model, data=data, y_true=y_test, y_score=y_pred_proba), y_pred_proba

In [118]:
# Load the raw data, drop the row identifier and shorten the label column
# name. Chaining returns a new frame at each step instead of mutating in place.
target = 'default_payment_next_month'
target_rename = 'default'
df = (
    pd.read_csv('default_of_credit_card_clients.csv')
    .drop(columns=['ID'])
    .rename(columns={target: target_rename})
)
target = target_rename

# A feature with at most `feat_disc_threshold` distinct values is treated as
# discrete; every other non-label column is treated as continuous.
feat_disc_threshold = 11
feat_discrete = [
    col for col in df.columns
    if col != target and df[col].nunique() <= feat_disc_threshold
]
feat_continous = [
    col for col in df.columns
    if col != target and col not in feat_discrete
]

In [119]:
# Bin the continuous features, then split with data_split's defaults
# (no validation set, DataFrames rather than tensors).
df_disct = data_discretize(
    df=df,
    feat_continous=feat_continous,
    num_cat=9,
)
(x_train_disct, x_test_disct,
 y_train_disct, y_test_disct) = data_split(df=df_disct, label=target)

In [120]:
# Categorical naive Bayes baseline on the discretized features: keeps both
# the metrics table and the test-set probabilities.
nb_disct, nb_disct_proba = BaseLine(
    model='nb',
    data='discrete',
    x_train=x_train_disct,
    y_train=y_train_disct,
    x_test=x_test_disct,
    y_test=y_test_disct,
)

In [105]:
# Display the metrics table returned by BaseLine for the discrete naive Bayes run.
# NOTE(review): this cell's execution count (105) is lower than that of the
# cell assigning nb_disct (120), so the recorded table may be stale — re-run.
nb_disct

Unnamed: 0,Model,Test Size,Prevalence,Total Accuracy,Positive Accuracy,Negative Accuracy,Precision,Recall,F1-Score,AUC-ROC
0,NaiveBayes - discrete,6000,21.88%,78.08%,54.61%,84.66%,49.93%,54.61%,52.16%,0.7546


In [98]:
from sklearn.naive_bayes import CategoricalNB

In [99]:
# Scratch re-run of the 'nb' branch of BaseLine outside the helper: a
# categorical naive Bayes classifier with class priors [0.5, 0.5], fitted on
# the discretized training split (labels flattened to 1-D with np.ravel).
clf = CategoricalNB(class_prior=[0.5, 0.5])
clf.fit(x_train_disct, np.ravel(y_train_disct))

In [101]:
# Hard class predictions and the second predict_proba column for the
# discretized test split, mirroring what BaseLine computes internally.
y_pred = clf.predict(x_test_disct)
y_pred_proba = clf.predict_proba(x_test_disct)[:, 1]

In [91]:
# NOTE(review): identical to the earlier fit cell; it rebinds `clf` to a
# freshly fitted estimator. Candidate for removal once the notebook is cleaned up.
clf = CategoricalNB(class_prior=[0.5, 0.5])
clf.fit(x_train_disct, np.ravel(y_train_disct))

In [88]:
# Print the class_prior constructor argument stored on the estimator.
# NOTE(review): the recorded output is None although the visible cells build
# clf with class_prior=[0.5, 0.5]; this cell's execution count (88) predates
# those cells, so the output is presumably stale — re-run to confirm.
print(clf.class_prior)

None


In [92]:
# NOTE(review): repeats the prediction from the earlier scratch cell and
# overwrites y_pred with the output of whichever `clf` is currently bound.
y_pred = clf.predict(x_test_disct)

In [121]:
# Number of distinct values per feature column in the training split.
res_train = [x_train_disct[name].nunique() for name in x_train_disct.columns]
res_train

[9, 2, 7, 4, 10, 11, 11, 11, 11, 10, 10, 9, 9, 7, 9, 9, 8, 7, 8, 7, 10, 10, 10]

In [122]:
# Number of distinct values per feature column in the test split.
res_test = [x_test_disct[name].nunique() for name in x_test_disct.columns]
res_test

[8, 2, 7, 4, 9, 10, 9, 11, 10, 9, 9, 7, 7, 4, 7, 6, 6, 4, 3, 4, 5, 6, 8]

In [123]:
# Scratch cell: check the type of a plain list of floats.
test = [0.1, 0.5]
type(test)

list

In [124]:
# Scratch cell: string form of the list defined in the previous cell.
str(test)

'[0.1, 0.5]'

In [125]:
# Scratch cell: string form of None.
str(None)

'None'