<a href="https://colab.research.google.com/github/talktokorea/Anomaly_Detection/blob/main/bin_roc_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Parameters for the synthetic data:
#   scale - standard deviation passed to every np.random.normal call below
#   size  - number of values drawn per feature and per class
scale = 5
size = 500

In [None]:
# Generate class 1 out of a normal distribution:
# feature "a" is centred at 10 and feature "b" at 1, both with std `scale`.
# Note: despite the name, these points are later labelled as class 0.
class1_a = np.random.normal(loc = 10, scale = scale, size = size)
class1_b = np.random.normal(loc = 1, scale = scale, size = size)

In [None]:
#print(class1_a)
#print(class1_b)

In [None]:
# Generate class 2 out of a normal distribution:
# feature "a" is centred at 1 and feature "b" at 5, both with std `scale`.
# Note: despite the name, these points are later labelled as class 1.
class2_a = np.random.normal(loc = 1, scale = scale, size = size)
class2_b = np.random.normal(loc = 5, scale = scale, size = size)

In [None]:
# Plot both classes on the same 5x5 inch figure to show how much they intersect
# (first call draws the class 1 points, second call the class 2 points)
plt.figure(figsize=(5, 5))
sns.scatterplot(x = class1_a, y = class1_b)
sns.scatterplot(x = class2_a, y = class2_b)

In [None]:
# Collect the class-1 features into a frame and tag every row with label 0
df_class1 = pd.DataFrame({'a': class1_a, 'b': class1_b})
df_class1['class'] = 0

In [None]:
# Collect the class-2 features into a frame and tag every row with label 1
df_class2 = pd.DataFrame({'a': class2_a, 'b': class2_b})
df_class2['class'] = 1

In [None]:
# Stack both classes into one frame with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the replacement.
df = pd.concat([df_class1, df_class2], ignore_index = True)

In [None]:
# First rows of the combined frame (the label-0 rows were stacked first)
df.head()

In [None]:
# Last rows of the combined frame (the label-1 rows were stacked last)
df.tail()

In [None]:
# Separate the target (the label column) from the features (every other column)
y = df['class']
X = df.drop(columns = 'class')

In [None]:
# Split train and test sets: 25% of the rows are held out for testing.
# No random_state is passed, so the split is not pinned to a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Create the model object (Gaussian Naive Bayes with its default arguments)
model = GaussianNB()

In [None]:
# Fit the model to the training data (features X_train, labels y_train)
model.fit(X_train, y_train)

In [None]:
# Predict the hard class labels on the test data
y_pred = model.predict(X_test)

In [None]:
# Show the first 10 predicted labels
y_pred[:10]

In [None]:
# Predict on the test data again, this time returning the probabilities for each class
# (one column per class; column 1 is used further down as the class-1 score)
y_proba = model.predict_proba(X_test)

In [None]:
# Show the first 10 rows of class probabilities
y_proba[:10]

In [None]:
# Show the predictions: test points in feature space, coloured by predicted class
sns.scatterplot(x = X_test['a'], y = X_test['b'], hue = y_pred)

In [None]:
# Work on a copy of the test features so columns added to df_aux do not modify X_test
df_aux = X_test.copy()

In [None]:
# Inspect the copied test features
print(df_aux)

In [None]:
# Inspect the true labels of the test set
print(y_test)

In [None]:
# Store the true label of each test row as a plain 0/1 integer
df_aux['class'] = [int(true_label == 1) for true_label in y_test]

In [None]:
# Inspect the test features together with the 'class' column
print(df_aux)

In [None]:
# Attach the predicted probability taken from column 1 of y_proba (the class-1 column);
# column 0 would hold the probability of the other class instead
df_aux['prob']=y_proba[:,1]   # cf. y_proba[:,0]
#print(y_proba)

In [None]:
#print(df_aux)

In [None]:
# Histogram of the predicted probability, split by true class,
# over 20 equal-width bins spanning [0, 1] (edges 0, 0.05, ..., 0.95, 1)
bins = [step/20 for step in range(20)]
bins.append(1)
sns.histplot(data = df_aux, x = "prob", hue = "class", bins = bins)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
def calculate_tpr_fpr(y_real, y_pred):
    '''
    Calculates the True Positive Rate (tpr) and the False Positive Rate (fpr) based on real and predicted observations

    Args:
        y_real: The list or series with the real classes (binary, encoded as 0 / 1)
        y_pred: The list or series with the predicted classes (0 / 1, or False / True)

    Returns:
        tpr: The True Positive Rate of the classifier (nan if y_real has no positive sample)
        fpr: The False Positive Rate of the classifier (nan if y_real has no negative sample)
    '''

    # Calculates the confusion matrix and recover each element.
    # The label order is pinned so the matrix is always 2x2, laid out as
    # [[TN, FP], [FN, TP]], even when one class is missing from both inputs
    # (without `labels` that case yields a 1x1 matrix and the indexing fails).
    cm = confusion_matrix(y_real, y_pred, labels = [0, 1])
    TN, FP, FN, TP = cm.ravel()

    # Calculates tpr and fpr
    tpr = TP/(TP + FN) # sensitivity - true positive rate
    fpr = 1 - TN/(TN+FP) # 1-specificity - false positive rate

    return tpr, fpr

In [None]:
def get_n_roc_coordinates(y_real, y_proba, resolution = 50):
    '''
    Sweeps evenly spaced thresholds and collects the ROC Curve coordinates (tpr and fpr) obtained at each one.

    The thresholds are 0, 1/resolution, ..., (resolution - 1)/resolution; a sample is predicted
    positive when its probability in column 1 of `y_proba` is strictly greater than the threshold.

    Args:
        y_real: The list or series with the real classes.
        y_proba: The array with the probabilities for each class, obtained by using the `.predict_proba()` method.
        resolution: How many thresholds are evaluated (default = 50).

    Returns:
        tpr_list: A leading 0 followed by the TPR of each threshold (resolution + 1 entries).
        fpr_list: A leading 0 followed by the FPR of each threshold (resolution + 1 entries).
    '''
    # One (tpr, fpr) pair per threshold, in increasing threshold order
    points = [
        calculate_tpr_fpr(y_real, y_proba[:, 1] > step/resolution)
        for step in range(resolution)
    ]

    # Both lists start at 0 so the curve includes the (0, 0) corner
    tpr_list = [0] + [tpr for tpr, _ in points]
    fpr_list = [0] + [fpr for _, fpr in points]
    return tpr_list, fpr_list

In [None]:
def plot_roc_curve(tpr, fpr, scatter = True):
    '''
    Draws the ROC Curve on a new 5x5 inch figure from its coordinates (tpr and fpr).

    Args:
        tpr: The list of TPRs representing each coordinate (y axis).
        fpr: The list of FPRs representing each coordinate (x axis).
        scatter: When True, the individual coordinates are also drawn as points (default = True).
    '''
    plt.figure(figsize = (5, 5))

    # Optional markers on the individual coordinates
    if scatter:
        sns.scatterplot(x = fpr, y = tpr)

    # The curve itself, then the green diagonal from (0, 0) to (1, 1) as a reference
    sns.lineplot(x = fpr, y = tpr)
    diagonal = [0, 1]
    sns.lineplot(x = diagonal, y = diagonal, color = 'green')

    # Leave a small margin around the unit square
    lower, upper = -0.05, 1.05
    plt.xlim(lower, upper)
    plt.ylim(lower, upper)

    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")

In [None]:
# Calculates the ROC Curve coordinates for 10 evenly spaced thresholds
# (each returned list holds 11 entries: the initial 0 plus one per threshold)
tpr, fpr = get_n_roc_coordinates(y_test, y_proba, resolution = 10)

In [None]:
# Plots the ROC curve, marking each computed coordinate (scatter defaults to True)
plot_roc_curve(tpr, fpr)

In [None]:
def get_all_roc_coordinates(y_real, y_proba):
    '''
    Calculates all the ROC Curve coordinates (tpr and fpr) by considering each point as a threshold for the prediction of the class.

    A sample is predicted positive when its probability in column 1 of `y_proba` is greater than
    or equal to the threshold, so every row of `y_proba` contributes one coordinate.

    Args:
        y_real: The list or series with the real classes.
        y_proba: The array with the probabilities for each class, obtained by using the `.predict_proba()` method.

    Returns:
        tpr_list: A leading 0 followed by the TPR of each threshold, in row order of `y_proba`.
        fpr_list: A leading 0 followed by the FPR of each threshold, in row order of `y_proba`.
    '''
    tpr_list = [0]
    fpr_list = [0]

    # Nothing to sweep over: keep only the (0, 0) starting point
    if len(y_proba) == 0:
        return tpr_list, fpr_list

    # The class-1 column does not change inside the loop, so slice it once
    positive_proba = y_proba[:, 1]
    for threshold in positive_proba:
        y_pred = positive_proba >= threshold
        tpr, fpr = calculate_tpr_fpr(y_real, y_pred)
        tpr_list.append(tpr)
        fpr_list.append(fpr)
    return tpr_list, fpr_list

In [None]:
# Calculates ALL coordinates of the ROC Curve
# (one threshold per test sample, plus the initial 0 in each list)
tpr, fpr = get_all_roc_coordinates(y_test, y_proba)

In [None]:
# Plots the ROC curve as a line only (the individual coordinates are not marked)
plot_roc_curve(tpr, fpr, scatter = False)

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay

In [None]:
def plot_sklearn_roc_curve(y_real, y_pred):
    '''
    Draws the ROC Curve with sklearn's `roc_curve` and `RocCurveDisplay` on a 5x5 inch figure.

    Args:
        y_real: The list or series with the real classes
        y_pred: The list or series handed to `roc_curve` as the score of each observation
            (this file calls it with the class-1 probabilities, `y_proba[:, 1]`)
    '''
    # roc_curve also returns the thresholds, which are not needed for the plot
    false_positive_rates, true_positive_rates, _ = roc_curve(y_real, y_pred)

    curve = RocCurveDisplay(fpr=false_positive_rates, tpr=true_positive_rates)
    curve = curve.plot()
    curve.figure_.set_size_inches(5,5)

In [None]:
# Plots the ROC curve using the sklearn methods, scoring each test sample by column 1 of y_proba
plot_sklearn_roc_curve(y_test, y_proba[:, 1])

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
def evaluate_classifier(y_real, y_pred, y_score = None):
    '''
    Prints the accuracy, precision, recall and roc auc scores for the classifier.

    Args:
        y_real: The list or series with the real classes
        y_pred: The list or series with the predicted classes
        y_score: Optional scores for the positive class, e.g. `model.predict_proba(X)[:, 1]`.
            When given, the ROC AUC is computed from these scores. When omitted (default),
            the ROC AUC falls back to the hard predictions in `y_pred`, which only
            reflects a single operating point of the classifier.
    '''
    print(f"Accuracy: {accuracy_score(y_real, y_pred):.4f}")
    print(f"Precision: {precision_score(y_real, y_pred):.4f}")
    print(f"Recall: {recall_score(y_real, y_pred):.4f}")

    # ROC AUC ranks samples by score, so prefer continuous scores when the caller has them
    auc_input = y_pred if y_score is None else y_score
    print(f"ROC AUC: {roc_auc_score(y_real, auc_input):.4f}")

In [None]:
# Report the metrics for the hard class predictions on the test set.
# NOTE(review): the ROC AUC printed here is computed from y_pred (class labels),
# not from the probabilities in y_proba - confirm this is intended.
evaluate_classifier(y_test, y_pred)