## Load NumPy, pandas and time ( No other library/package than the Python 3 standard library, NumPy, pandas and time may be used )

In [1]:
import numpy as np
import pandas as pd
import time


## Reused functions from assignment 1: data preprocessing

In [2]:
# Copy and paste functions from Assignment 1 here that you need for this assignment

def create_normalization(df, normalizationtype="minmax"):
    """
    Normalize the numeric feature columns of a training DataFrame.

    Columns named "ID" or "CLASS" and every column that is not of an
    integer or float dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data. It is copied, never modified in place.
    normalizationtype : str
        "minmax" (default): x -> (x - min) / (max - min).
        "zscore": x -> (x - mean) / std, with the sample standard deviation.
        Any other value leaves the data unchanged and yields an empty mapping.

    Returns
    -------
    (pandas.DataFrame, dict)
        The normalized copy and a mapping
        column name -> (normalizationtype, min, max) for "minmax" or
        column name -> (normalizationtype, mean, std) for "zscore".
    """
    df_copy = df.copy()
    normalization = {}

    for column in df_copy.columns:
        # ID and CLASS carry special meaning and are never normalized
        if column in ("ID", "CLASS"):
            continue

        # pandas' dtype helpers also cope with extension dtypes (e.g. category),
        # on which np.issubdtype raises a TypeError
        dtype = df_copy[column].dtype
        if not (pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype)):
            continue

        values = df_copy[column]
        if normalizationtype == "minmax":
            min_value = values.min()
            max_value = values.max()
            offset, scale = min_value, max_value - min_value
            normalization[column] = (normalizationtype, min_value, max_value)
        elif normalizationtype == "zscore":
            mean = values.mean()
            std = values.std()
            offset, scale = mean, std
            normalization[column] = (normalizationtype, mean, std)
        else:
            continue

        centered = values - offset
        if pd.isna(scale) or scale == 0:
            # constant column (or a single row for zscore): there is no spread
            # to scale by, so map every present value to 0 instead of NaN/inf
            df_copy[column] = centered * 0.0
        else:
            df_copy[column] = centered / scale

    return df_copy, normalization
    
def apply_normalization(df, normalization):
    """
    Apply a normalization mapping obtained from training data to another DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to normalize. It is copied, never modified in place.
    normalization : dict
        Mapping column name -> (normalizationtype, number1, number2), where the
        numbers are (min, max) for "minmax" and (mean, std) for "zscore".

    Returns
    -------
    pandas.DataFrame
        Copy of df with the listed numeric columns normalized. "minmax" results
        are clipped to [0, 1] because unseen data may fall outside the training
        range; "zscore" results are not clipped.

    Raises
    ------
    KeyError
        If a column named in the mapping (other than "ID"/"CLASS") is missing.
    """
    df_copy = df.copy()

    for key, val in normalization.items():
        # safety check: ID and CLASS are never normalized
        if key in ("ID", "CLASS"):
            continue

        # only integer and float columns are normalized; the pandas helpers
        # also handle extension dtypes, on which np.issubdtype raises
        dtype = df_copy[key].dtype
        if not (pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype)):
            continue

        normalizationtype = val[0]
        if normalizationtype == "minmax":
            offset, scale = val[1], val[2] - val[1]
        elif normalizationtype == "zscore":
            offset, scale = val[1], val[2]
        else:
            continue

        centered = df_copy[key] - offset
        if pd.isna(scale) or scale == 0:
            # the training column had no spread: map present values to 0
            # instead of producing NaN/inf
            scaled = centered * 0.0
        else:
            scaled = centered / scale

        if normalizationtype == "minmax":
            scaled = scaled.clip(0, 1)

        df_copy[key] = scaled

    return df_copy

def create_imputation(df):
    """
    Fill missing values in a training DataFrame and record the fill values.

    Columns named "ID" or "CLASS" are skipped. For the remaining columns:

    * integer/float columns are filled with the column mean (0 when every
      value is missing);
    * categorical columns are filled with the mode (the first category when
      every value is missing);
    * object/string columns are filled with the mode ("" when every value is
      missing);
    * columns of any other dtype are left untouched and get no entry in the
      mapping.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data. It is copied, never modified in place.

    Returns
    -------
    (pandas.DataFrame, dict)
        The imputed copy and a mapping column name -> fill value.
    """
    df_copy = df.copy()
    imputation = {}

    for column in df_copy.columns:
        # ID and CLASS are never imputed
        if column in ("ID", "CLASS"):
            continue

        values = df_copy[column]
        dtype = values.dtype
        all_missing = bool(values.isnull().all())

        # the pandas dtype helpers are used because np.issubdtype raises a
        # TypeError on extension dtypes such as "category"
        if pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
            # continuous feature -> mean
            criteria = 0 if all_missing else values.mean()
        elif isinstance(dtype, pd.CategoricalDtype):
            # categorical feature -> mode (Series.mode always returns a Series)
            criteria = values.cat.categories[0] if all_missing else values.mode()[0]
        elif dtype == "object" or pd.api.types.is_string_dtype(dtype):
            # object/string feature -> mode
            criteria = "" if all_missing else values.mode()[0]
        else:
            # unsupported dtype: skip it rather than filling it with a value
            # that was computed for a different column
            continue

        df_copy[column] = values.fillna(criteria)
        imputation[column] = criteria

    return df_copy, imputation

def apply_imputation(df, imputation):
    """
    Fill missing values using fill values learned from training data.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to impute. It is copied, never modified in place.
    imputation : dict
        Mapping column name -> fill value. Entries for "ID" and "CLASS" are
        ignored.

    Returns
    -------
    pandas.DataFrame
        Copy of df in which the missing values of every mapped column are
        replaced by that column's fill value.
    """
    filled = df.copy()

    for column, fill_value in imputation.items():
        # safety check when applying: never touch ID or CLASS
        if column in ("ID", "CLASS"):
            continue
        filled[column] = filled[column].fillna(fill_value)

    return filled

def create_one_hot(df):
    """
    One-hot encode the object and categorical columns of a training DataFrame.

    Columns named "ID" or "CLASS" are skipped. Every object column is first
    converted to the "category" dtype; each categorical column is then replaced
    by one float column per category, named "<column>-<category>", holding 1.0
    where the row has that category and 0.0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data. It is copied, never modified in place.

    Returns
    -------
    (pandas.DataFrame, dict)
        The encoded copy and a mapping original column name -> its categories.
    """
    encoded = df.copy()
    mapping = {}

    # iterate over the original column names; the encoded frame grows and
    # shrinks while we go
    for name in list(df.columns):
        if name in ("ID", "CLASS"):
            continue

        column = encoded[name]
        if column.dtype == "object":
            # categories make the one-hot expansion straightforward
            column = column.astype("category")

        # only categorical Series expose the .cat accessor
        if not hasattr(column, 'cat'):
            continue

        categories = column.cat.categories
        for value in categories:
            indicator = column == value
            encoded[name + '-' + value] = indicator.astype("float")

        # the original column is replaced by its indicator columns
        encoded = encoded.drop(name, axis=1)
        mapping[name] = categories

    return encoded, mapping

def apply_one_hot(df, one_hot):
    """
    One-hot encode a DataFrame using the categories learned from training data.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to encode. It is copied, never modified in place.
    one_hot : dict
        Mapping column name -> iterable of categories. Entries for "ID" and
        "CLASS" are ignored.

    Returns
    -------
    pandas.DataFrame
        Copy of df in which every mapped object/categorical column is replaced
        by one float column per listed category, named "<column>-<category>".
        Mapped columns that are neither object nor categorical are left as is.

    Raises
    ------
    KeyError
        If a column named in the mapping is missing from df.
    """
    encoded = df.copy()

    for name, categories in one_hot.items():
        # looked up before the ID/CLASS check, so a missing column always raises
        column = encoded[name]

        if name in ("ID", "CLASS"):
            continue

        if column.dtype == "object":
            # categories make the one-hot expansion straightforward
            column = column.astype("category")

        # only categorical Series expose the .cat accessor
        if hasattr(column, 'cat'):
            for value in categories:
                indicator = column == value
                encoded[name + '-' + value] = indicator.astype("float")
            # the original column is replaced by its indicator columns
            encoded = encoded.drop(name, axis=1)

    return encoded

def create_bins(df, nobins=10, bintype="equal-width"):
    """
    Discretize the numeric feature columns of a training DataFrame.

    Columns named "ID" or "CLASS" and every column that is not of an integer
    or float dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data. It is copied, never modified in place.
    nobins : int
        Requested number of bins (default 10). Duplicate bin edges are
        dropped, so fewer bins may result.
    bintype : str
        "equal-width" (default) uses pandas.cut, "equal-size" uses pandas.qcut.

    Returns
    -------
    (pandas.DataFrame, dict)
        The discretized copy, whose binned columns are categorical with integer
        bin indexes as values, and a mapping column name -> array of bin edges
        whose first and last edges are replaced by -inf and +inf so that unseen
        values outside the training range still fall into a bin.

    Raises
    ------
    ValueError
        If a numeric column has to be binned and bintype is not one of the two
        supported values.
    """
    df_copy = df.copy()
    binning = {}

    for column in df.columns:
        # ID and CLASS are never discretized
        if column in ("ID", "CLASS"):
            continue

        # the pandas dtype helpers also cope with extension dtypes, on which
        # np.issubdtype raises a TypeError
        dtype = df_copy[column].dtype
        if not (pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype)):
            continue

        if bintype == "equal-width":
            codes, bins = pd.cut(df_copy[column], bins=nobins, labels=False,
                                 retbins=True, duplicates="drop")
        elif bintype == "equal-size":
            codes, bins = pd.qcut(df_copy[column], q=nobins, labels=False,
                                  retbins=True, duplicates="drop")
        else:
            raise ValueError(
                'bintype must be "equal-width" or "equal-size", got {0!r}'.format(bintype))

        # NOTE(review): len(bins) counts edges, so this registers one more
        # category than there are intervals; kept as is so that the categories
        # stay identical to those produced by apply_bins - confirm intent
        df_copy[column] = codes.astype("category").cat.set_categories(list(range(len(bins))))

        # open the outermost bins so values outside the training range fit
        bins[0] = -np.inf
        bins[-1] = np.inf
        binning[column] = bins

    return df_copy, binning

def apply_bins(df, binning):
    """
    Discretize a DataFrame with bin edges learned from training data.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to discretize. It is copied, never modified in place.
    binning : dict
        Mapping column name -> sequence of bin edges. Entries for "ID" and
        "CLASS" are ignored, as are columns that are not integer or float.

    Returns
    -------
    pandas.DataFrame
        Copy of df in which every mapped numeric column holds the integer index
        of the bin each value falls into, stored as a categorical column whose
        categories are 0 .. len(edges) - 1.

    Raises
    ------
    KeyError
        If a column named in the mapping is missing from df.
    """
    binned = df.copy()

    for column, edges in binning.items():
        # looked up before the ID/CLASS check, so a missing column always raises
        col_dtype = binned[column].dtype

        # safety check when applying: never touch ID or CLASS
        if column in ("ID", "CLASS"):
            continue

        # only integer and float columns can be cut into bins
        is_numeric = np.issubdtype(col_dtype, np.integer) or np.issubdtype(col_dtype, np.floating)
        if not is_numeric:
            continue

        # labels=False yields the integer bin index instead of an interval
        codes = pd.cut(binned[column], bins=edges, labels=False, duplicates="drop")
        as_category = codes.astype("category")
        binned[column] = as_category.cat.set_categories(list(range(len(edges))))

    return binned

def brier_score(df, correctlabels):
    """
    Compute the (multi-class) Brier score of predicted class probabilities.

    For every row the squared differences between the predicted probabilities
    and the one-hot encoding of the correct label are summed over ALL columns
    of df; the score is the mean of these sums.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per instance, one column per class label, holding predicted
        probabilities. Missing probabilities (NaN) are treated as 0.
    correctlabels : sequence
        The correct label of each row (list, Series, ...). A label that is not
        a column of df is treated as having predicted probability 0.

    Returns
    -------
    float
        The Brier score; 0 is a perfect prediction.

    Raises
    ------
    ValueError
        If df and correctlabels differ in length.
    ZeroDivisionError
        If correctlabels is empty.
    """
    labels = list(correctlabels)
    probs = df.to_numpy(dtype=float)
    if len(probs) != len(labels):
        raise ValueError("df and correctlabels must have the same length")
    probs = np.where(np.isnan(probs), 0.0, probs)

    # column position of each row's correct label (-1 when df has no such column)
    position_of = {label: pos for pos, label in enumerate(df.columns)}
    positions = np.array([position_of.get(label, -1) for label in labels], dtype=int)

    # probability assigned to the correct class of each row
    known = positions >= 0
    rows = np.arange(len(labels))
    p_true = np.zeros(len(labels))
    p_true[known] = probs[rows[known], positions[known]]

    # sum_j (p_j - y_j)^2 with one-hot y equals sum_j p_j^2 - 2 * p_true + 1
    errors = np.square(probs).sum(axis=1) + 1.0 - 2.0 * p_true

    return float(errors.sum()) / len(labels)

import collections

def auc(df, correctlabels):
    """
    Compute the class-frequency-weighted, one-vs-rest area under the ROC curve.

    For every label occurring in correctlabels, the instances are ranked by the
    score in that label's column of df and the area under the resulting ROC
    curve is accumulated; instances sharing a score contribute a trapezoid.
    The per-label areas are averaged, weighted by how often each label occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per instance, one column per class label, holding scores.
    correctlabels : sequence
        The correct label of each row; any iterable (list, Series, ndarray).

    Returns
    -------
    float
        The weighted AUC; NaN when correctlabels is empty.

    Raises
    ------
    ValueError
        If df and correctlabels differ in length.
    KeyError
        If a label in correctlabels is not a column of df.
    """
    # accept lists as well as Series/arrays
    correctlabels = list(correctlabels)
    if len(df) != len(correctlabels):
        raise ValueError("df and correctlabels must have the same length")
    if not correctlabels:
        return float("nan")

    weighted_sum = 0.0

    for label in set(correctlabels):
        # tallies: score -> [positives, negatives] observed with that score
        tallies = {}
        tot_tp = 0
        tot_fp = 0
        # fetch the whole score column once instead of one row at a time
        for score, actual in zip(df[label].tolist(), correctlabels):
            tally = tallies.setdefault(score, [0, 0])
            if actual == label:
                tally[0] += 1
                tot_tp += 1
            else:
                tally[1] += 1
                tot_fp += 1

        area = 0.0
        cov_tp = 0  # positives ranked strictly above the current score
        for _, (tp, fp) in sorted(tallies.items(), reverse=True):
            if fp:
                # rectangle under the positives covered so far, plus the
                # triangle for positives tied with these negatives;
                # fp > 0 implies tot_fp > 0, so the divisions are safe
                area += (cov_tp / tot_tp) * (fp / tot_fp) + ((tp / tot_tp) * (fp / tot_fp)) / 2
            cov_tp += tp

        # weight each label by its number of occurrences
        weighted_sum += area * tot_tp

    return weighted_sum / len(correctlabels)

def accuracy(df, correctlabels):
    """
    Compute the fraction of rows whose highest-scoring class is the correct one.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per instance, one column per class label, holding scores.
        Missing scores (NaN) never win; when several columns share the highest
        score the first of them (in column order) is predicted.
    correctlabels : sequence
        The correct label of each row. Labels may be of any type (numbers,
        strings, ...).

    Returns
    -------
    float
        Number of correct predictions divided by the number of labels.

    Raises
    ------
    ValueError
        If df and correctlabels differ in length.
    """
    actual = np.asarray(list(correctlabels), dtype=object)
    scores = df.to_numpy(dtype=float)
    if len(scores) != len(actual):
        raise ValueError("df and correctlabels must have the same length")

    # NaN must never be selected as the maximum
    scores = np.where(np.isnan(scores), -np.inf, scores)

    # argmax returns the first position of the maximum, which implements the
    # "pick the first one" tie-breaking rule
    best = scores.argmax(axis=1)
    predicted = np.asarray(df.columns, dtype=object)[best]

    numbercorrect = np.sum(predicted == actual)
    return numbercorrect / len(actual)

## 1. Define the class kNN

In [3]:
# Define the class kNN with three functions __init__, fit and predict (after the comments):
#
# Input to __init__: 
# self: the object itself
#
# Output from __init__:
# nothing
# 
# This function does not return anything but just initializes the following attributes of the object (self) to None:
# imputation, normalization, one_hot, labels, training_labels, training_data
#
# Input to fit:
# self: the object itself
# df: a dataframe (where the column names "CLASS" and "ID" have special meaning)
# normalizationtype: "minmax" (default) or "zscore"
#
# Output from fit:
# nothing
#
# The result of applying this function should be:
#
# self.imputation should be an imputation mapping (see Assignment 1) from df
# self.normalization should be a normalization mapping (see Assignment 1), using normalizationtype from the imputed df
# self.one_hot should be a one-hot mapping (see Assignment 1; can be excluded if this function was not completed)
# self.training_labels should be a pandas series corresponding to the "CLASS" column, set to be of type "category" 
# self.labels should be the categories of the previous series
# self.training_data should be the values (an ndarray) of the transformed dataframe, i.e., after employing imputation, 
# normalization, and possibly one-hot encoding, and also after removing the "CLASS" and "ID" columns 
# Note that the function does not return anything but just assigns values to the attributes of the object.
#
# Input to predict:
# self: the object itself
# df: a dataframe
# k: an integer >= 1 (default = 5)
# 
# Output from predict:
# predictions: a dataframe with class labels as column names and the rows corresponding to
#              predictions with estimated class probabilities for each row in df, where the class probabilities
#              are estimated by the relative class frequencies in the set of class labels from the k nearest 
#              (with respect to Euclidean distance) neighbors in training_data
#
# Hint 1: Drop any "CLASS" and "ID" columns first and then apply imputation, normalization and (possibly) one-hot
# Hint 2: Get the numerical values (as an ndarray) from the resulting dataframe and iterate over the rows 
#         calling some sub-function, e.g., get_nearest_neighbor_predictions(x_test,k), which for a test row
#         (numerical input feature values) finds the k nearest neighbors and calculate the class probabilities.
# Hint 3: This sub-function may first find the distances to all training instances, e.g., pairs consisting of
#         training instance index and distance, and then sort them according to distance, and then (using the indexes
#         of the k closest instances) find the corresponding labels and calculate the relative class frequencies
class kNN():
    """
    k-nearest-neighbour classifier using Euclidean distance on preprocessed data.

    Attributes set by fit:
      imputation       - imputation mapping learned from the training data
      normalization    - normalization mapping learned from the imputed data
      one_hot          - one-hot mapping learned from the normalized data
      training_labels  - the "CLASS" column as a categorical Series
      labels           - the categories of training_labels
      training_data    - the transformed training DataFrame without the
                         "ID" and "CLASS" columns
    """

    def __init__(self):
        self.imputation = None
        self.normalization = None
        self.one_hot = None
        self.labels = None
        self.training_labels = None
        self.training_data = None

    def fit(self, df, normalizationtype="minmax"):
        """
        Learn the preprocessing mappings from df and store the training set.

        df must contain a "CLASS" column; normalizationtype is forwarded to
        create_normalization ("minmax" by default). Nothing is returned.
        """
        # work on a copy so that the caller's DataFrame is never modified
        self.training_data = df.copy()
        # 1: imputation of missing values
        self.training_data, self.imputation = create_imputation(self.training_data)
        # 2: normalization of the imputed data
        self.training_data, self.normalization = create_normalization(self.training_data, normalizationtype)
        # 3: one-hot encoding
        self.training_data, self.one_hot = create_one_hot(self.training_data)
        # 4: class labels as a categorical Series
        self.training_labels = df["CLASS"].astype('category')
        # 5: the set of possible labels
        self.labels = self.training_labels.cat.categories
        # 6: drop ID and CLASS; checked separately because a dataset may
        #    contain only one of them
        if "ID" in self.training_data.columns:
            self.training_data.drop(["ID"], inplace=True, axis=1)
        if "CLASS" in self.training_data.columns:
            self.training_data.drop(["CLASS"], inplace=True, axis=1)

    def get_nearest_neighbor_predictions(self, x_test, k):
        """
        Estimate class probabilities for one test row.

        x_test is a Series indexed by feature name; it is aligned with the
        columns of training_data. Returns a Series indexed by class label with
        the relative class frequencies among the (at most) k nearest training
        rows.
        """
        # squared Euclidean distance; the square root would not change the order
        squared_dist = np.square(self.training_data - x_test).sum(axis=1)
        # mergesort is stable, so ties are broken deterministically in favour
        # of the training rows that come first
        nearest_idx = squared_dist.sort_values(kind="mergesort").iloc[:k].index
        neighbour_labels = self.training_labels.loc[nearest_idx]
        # divide by the number of neighbours actually found, which is smaller
        # than k when the training set has fewer than k rows, so that the
        # frequencies always sum to 1
        return neighbour_labels.value_counts() / len(neighbour_labels)

    def predict(self, df, k=5):
        """
        Predict class probabilities for every row of df.

        k is the number of neighbours (an integer >= 1, default 5). Returns a
        DataFrame with one row per row of df and one column per class label.
        Raises ValueError when k < 1.
        """
        if k < 1:
            raise ValueError("k must be an integer >= 1")

        # never modify the caller's DataFrame
        df_copy = df.copy()

        # drop ID and CLASS; checked separately because a dataset may contain
        # only one of them
        if "ID" in df_copy.columns:
            df_copy.drop(["ID"], inplace=True, axis=1)
        if "CLASS" in df_copy.columns:
            df_copy.drop(["CLASS"], inplace=True, axis=1)

        # apply the mappings learned in fit, in the same order
        df_copy = apply_imputation(df_copy, self.imputation)
        df_copy = apply_normalization(df_copy, self.normalization)
        df_copy = apply_one_hot(df_copy, self.one_hot)

        # distances are only defined on numeric columns
        df_numeric = df_copy.select_dtypes(include=[np.number, "bool"])

        # one probability Series per row, assembled into a DataFrame
        nearests = df_numeric.apply(self.get_nearest_neighbor_predictions, axis=1, k=k)

        return nearests

In [4]:
# Test your code (leave this part unchanged, except for if auc is undefined)

# Load the training and test sets from CSV files in the working directory
glass_train_df = pd.read_csv("glass_train.txt")

glass_test_df = pd.read_csv("glass_test.txt")

knn_model = kNN()

# Time the fitting step
t0 = time.perf_counter()
knn_model.fit(glass_train_df)
print("Training time: {0:.2f} s.".format(time.perf_counter()-t0))

# Correct labels used to score the predictions
test_labels = glass_test_df["CLASS"]

# One result row (accuracy, Brier score, AUC) per value of k
k_values = [1,3,5,7,9]
results = np.empty((len(k_values),3))

for i in range(len(k_values)):
    # Time the prediction step for this k
    t0 = time.perf_counter()
    predictions = knn_model.predict(glass_test_df,k=k_values[i])
    print("Testing time (k={0}): {1:.2f} s.".format(k_values[i],time.perf_counter()-t0))
    results[i] = [accuracy(predictions,test_labels),brier_score(predictions,test_labels),
                  auc(predictions,test_labels)] # Assuming that you have defined auc - remove otherwise

# Label the rows by k and the columns by metric
results = pd.DataFrame(results,index=k_values,columns=["Accuracy","Brier score","AUC"])

# Last expression of the cell: displayed as the cell output in a notebook
results


Training time: 0.01 s.
Testing time (k=1): 0.51 s.
Testing time (k=3): 0.44 s.
Testing time (k=5): 0.43 s.
Testing time (k=7): 0.42 s.
Testing time (k=9): 0.42 s.


Unnamed: 0,Accuracy,Brier score,AUC
1,0.747664,0.504673,0.81035
3,0.663551,0.488058,0.815859
5,0.579439,0.471028,0.833843
7,0.598131,0.471867,0.833481
9,0.616822,0.482981,0.827727


In [5]:
# Evaluate the fitted kNN model on its own training data with k=1
train_labels = glass_train_df["CLASS"]
predictions = knn_model.predict(glass_train_df,k=1)
print("Accuracy on training set (k=1): {0:.2f}".format(accuracy(predictions,train_labels)))
print("AUC on training set (k=1): {0:.2f}".format(auc(predictions,train_labels)))
print("Brier score on training set (k=1): {0:.2f}".format(brier_score(predictions,train_labels)))


Accuracy on training set (k=1): 1.00
AUC on training set (k=1): 1.00
Brier score on training set (k=1): 0.00


### Comment on assumptions, things that do not work properly, etc.


Everything works as expected. We followed all the constraints, and our results are the same as the expected ones.

- We modified one thing in the auc function from Assignment 1.
 - The correct labels are now passed as a pandas Series (with a RangeIndex), whereas in Assignment 1 they were a plain Python list.
 - Therefore we added the following line to the auc function so that it keeps working (our implementation was written for a Python list).

```python
#assignment 2 fix
correctlabels = correctlabels.tolist()
```

## 2. Define the class NaiveBayes

In [6]:
# Define the class NaiveBayes with three functions __init__, fit and predict (after the comments):
#
# Input to __init__: 
# self: the object itself
#
# Output from __init__:
# nothing
# 
# This function does not return anything but just initializes the following attributes of the object (self) to None:
# binning, class_priors, feature_class_value_counts, feature_class_counts
#
# Input to fit:
# self: the object itself
# df: a dataframe (where the column names "CLASS" and "ID" have special meaning)
# nobins: no. of bins (default = 10)
# bintype: either "equal-width" (default) or "equal-size" 
#
# Output from fit:
# nothing
#
# The result of applying this function should be:
#
# self.binning should be a discretization mapping (see Assignment 1) from df
# self.class_priors should be a mapping (dictionary) from the labels (categories) of the "CLASS" column of df,
# to the relative frequencies of the labels
# self.feature_class_value_counts should be a mapping from a feature (column name) to another mapping, which
# given a feature value and class label provides the number of training instances with this specific combination
# self.feature_class_counts should me a mapping from the feature (column name) and class label to the number of
# training instances with this specific class label and any (non-missing) value for the feature
# Note that the function does not return anything but just assigns values to the attributes of the object.
#
# Hint 1: feature_class_value_counts can be a dictionary, which given a feature f returns a mapping obtained 
#         by pandas groupby and size (see lecture slides), which given a feature value v and class label c 
#         returns the number of instances, e.g., using get((c,v),0)
#
# Input to predict:
# self: the object itself
# df: a dataframe
# 
# Output from predict:
# predictions: a dataframe with class labels as column names and the rows corresponding to
# predictions with estimated class probabilities for each row in df, where the class probabilities
# are estimated by the naive approximation of Bayes rule (see lecture slides)
#
# Hint 1: First apply discretization
# Hint 2: Iterating over either columns or rows, and for each possible class label, calculate the relative
#         frequency of the observed feature value given the class (using feature_class_value_counts and 
#         feature_class_counts) 
# Hint 3: Calculate the non-normalized estimated class probabilities by multiplying the class priors to the
#         product of the relative frequencies
# Hint 4: Normalize the probabilities by dividing by the sum of the non-normalized probabilities; in case
#         this sum is zero, then set the probabilities to the class priors
class NaiveBayes():
    """
    Naive Bayes classifier on discretized features.

    Attributes set by fit:
      binning                    - discretization mapping learned from the
                                   training data
      class_priors               - Series mapping each class label to its
                                   relative frequency
      feature_class_value_counts - dict: feature -> dict mapping
                                   (class label, feature value) to the number
                                   of training rows with that combination
      feature_class_counts       - dict: feature -> dict mapping class label to
                                   the number of training rows of that class
                                   with a non-missing value for the feature
    """

    def __init__(self):
        self.binning = None
        self.class_priors = None
        self.feature_class_value_counts = None
        self.feature_class_counts = None

    def fit(self, df, nobins=10, bintype="equal-width"):
        """
        Discretize df and count feature values per class.

        df must contain a "CLASS" column; nobins and bintype are forwarded to
        create_bins. Nothing is returned.
        """
        # discretize the numeric features
        df_copy, self.binning = create_bins(df, nobins=nobins, bintype=bintype)

        class_column = df_copy["CLASS"]
        self.class_priors = class_column.value_counts()/len(class_column)

        self.feature_class_value_counts = {}
        self.feature_class_counts = {}

        for f in df_copy.columns:
            if f == "ID" or f == "CLASS":
                continue

            # a single pass over the column collects both kinds of counts;
            # plain dicts are kept (no DataFrame conversion) so that absent
            # combinations stay absent instead of turning into NaN
            value_counts = {}
            class_counts = {c: 0 for c in self.class_priors.index}
            for c, v in zip(class_column, df_copy[f]):
                # missing values are not counted
                if pd.isna(c) or pd.isna(v):
                    continue
                value_counts[(c, v)] = value_counts.get((c, v), 0) + 1
                class_counts[c] = class_counts.get(c, 0) + 1

            self.feature_class_value_counts[f] = value_counts
            self.feature_class_counts[f] = class_counts

    def get_naive_bayes_probability(self, row, classes):
        """
        Estimate the class probabilities of one discretized row.

        row is a Series indexed by column name; classes is the sequence of
        class labels to score. Features that are missing from the row or have
        a missing value are ignored. Returns a Series indexed by classes with
        normalized probabilities; when every class gets probability 0 the
        class priors are returned instead.
        """
        probs = []
        for c in classes:
            # start from the prior and multiply in P(value | class) per feature
            prob = self.class_priors[c]
            for f, value_counts in self.feature_class_value_counts.items():
                if f not in row.index:
                    continue
                v = row[f]
                if pd.isna(v):
                    continue

                class_count = self.feature_class_counts[f].get(c, 0)
                if class_count == 0:
                    # the class was never observed with a value for this
                    # feature, so there is no support for this value
                    prob = 0.0
                    break
                # combinations never seen during training count as 0
                prob *= value_counts.get((c, v), 0) / class_count

            probs.append(prob)

        probs = pd.Series(probs, index=classes, dtype=float)

        # normalize; fall back to the class priors when the sum is zero
        total = probs.sum()
        if total != 0:
            probs = probs / total
        else:
            probs = pd.Series([self.class_priors[c] for c in classes], index=classes, dtype=float)

        return probs

    def predict(self, df):
        """
        Predict class probabilities for every row of df.

        Returns a DataFrame with one row per row of df and one column per class
        label seen during fit. df does not need a "CLASS" column.
        """
        # discretize with the bins learned in fit
        df_copy = apply_bins(df, self.binning)
        # the candidate classes are those seen during training, not those that
        # happen to occur in the data being predicted
        classes = list(self.class_priors.index)
        probs = df_copy.apply(self.get_naive_bayes_probability, axis=1, classes=classes)

        return probs

In [7]:
# Test your code (leave this part unchanged, except for if auc is undefined)

# Load the training and test sets from CSV files in the working directory
glass_train_df = pd.read_csv("glass_train.txt")

glass_test_df = pd.read_csv("glass_test.txt")

nb_model = NaiveBayes()

# Correct labels used to score the predictions
test_labels = glass_test_df["CLASS"]

# Every combination of number of bins and binning strategy is evaluated
nobins_values = [3,5,10]
bintype_values = ["equal-width","equal-size"]
parameters = [(nobins,bintype) for nobins in nobins_values for bintype in bintype_values]

# One result row (accuracy, Brier score, AUC) per parameter combination
results = np.empty((len(parameters),3))

for i in range(len(parameters)):
    # Time the fitting step for this parameter combination
    t0 = time.perf_counter()
    nb_model.fit(glass_train_df,nobins=parameters[i][0],bintype=parameters[i][1])
    print("Training time {0}: {1:.2f} s.".format(parameters[i],time.perf_counter()-t0))
    # Time the prediction step
    t0 = time.perf_counter()
    predictions = nb_model.predict(glass_test_df)
    print("Testing time {0}: {1:.2f} s.".format(parameters[i],time.perf_counter()-t0))
    results[i] = [accuracy(predictions,test_labels),brier_score(predictions,test_labels),
                  auc(predictions,test_labels)] # Assuming that you have defined auc - remove otherwise

# Label the rows by (nobins, bintype) and the columns by metric
results = pd.DataFrame(results,index=pd.MultiIndex.from_product([nobins_values,bintype_values]),
                       columns=["Accuracy","Brier score","AUC"])

# Last expression of the cell: displayed as the cell output in a notebook
results


Training time (3, 'equal-width'): 0.13 s.
Testing time (3, 'equal-width'): 0.39 s.
Training time (3, 'equal-size'): 0.13 s.
Testing time (3, 'equal-size'): 0.34 s.
Training time (5, 'equal-width'): 0.19 s.
Testing time (5, 'equal-width'): 0.37 s.
Training time (5, 'equal-size'): 0.17 s.
Testing time (5, 'equal-size'): 0.34 s.
Training time (10, 'equal-width'): 0.33 s.
Testing time (10, 'equal-width'): 0.36 s.
Training time (10, 'equal-size'): 0.31 s.
Testing time (10, 'equal-size'): 0.37 s.


Unnamed: 0,Unnamed: 1,Accuracy,Brier score,AUC
3,equal-width,0.616822,0.622116,0.724335
3,equal-size,0.607477,0.554782,0.780163
5,equal-width,0.64486,0.551101,0.771688
5,equal-size,0.598131,0.581556,0.796675
10,equal-width,0.654206,0.527569,0.812887
10,equal-size,0.588785,0.741668,0.751165


In [8]:
# Refit with the default parameters and evaluate on the training data itself
train_labels = glass_train_df["CLASS"]
nb_model.fit(glass_train_df)
predictions = nb_model.predict(glass_train_df)
print("Accuracy on training set: {0:.2f}".format(accuracy(predictions,train_labels)))
print("AUC on training set: {0:.2f}".format(auc(predictions,train_labels)))
print("Brier score on training set: {0:.2f}".format(brier_score(predictions,train_labels)))

Accuracy on training set: 0.85
AUC on training set: 0.97
Brier score on training set: 0.23


### Comment on assumptions, things that do not work properly, etc.
Everything works as expected. We followed all the constraints, and our results are the same as the expected ones.

- Regarding Hint 1 for the fit function: we did not use the groupby() function to calculate feature_class_value_counts and feature_class_counts.
- Instead, we counted the class/value combinations directly (and used the pandas value_counts() function for the class priors) to achieve the same results, because it is neat :)