In [3]:
import numpy as np
import pandas as pd
import time

In [4]:
def create_column_filter(df):
    new_df = df.copy()
    column_filter = df.copy()
    
    all_columns = new_df.columns
    saved_columns = []
    
    for column_name in new_df.columns:
        if column_name == 'CLASS':
            continue
        column_value = new_df.loc[:, column_name]
        values = []
        for item in column_value:
            if pd.isna(item):
                continue
            else:
                values.append(item)
        if len(set(values)) > 1 or len(values) == 1:
            saved_columns.append(column_name)
            
    deleted_colums = list(set(all_columns) ^ set(saved_columns))
    deleted_colums.remove('CLASS')
    
    for item in deleted_colums:
        new_df.pop(item)
    
    for item in saved_columns:
        column_filter.pop(item)
    return new_df, column_filter

def apply_column_filter(df, column_filter):
    new_df = df.copy()
    all_columns = list(new_df.columns)
    all_columns.remove('CLASS')
    
    remained_colums = list(set(all_columns) ^ set(column_filter))
    for item in remained_colums:
        new_df.pop(item)
    return new_df

def create_normalization(df, normalizationtype="minmax"):
    normalized_df = df
    normalization = {}
#     display(normalized_df)
    if normalizationtype == "minmax":
        print("minmax mode")
        normalized_df = (df-df.min())/(df.max()-df.min())
        display(df) 
        for index, row in df.iteritems():
#             print(index,row)
            if index in ["ID","CLASS"]:
                continue
            normalization[index]=("minmax",row.min(),row.max())
    if normalizationtype == "zscore":
        print("zscore mode")
        normalized_df = (df-df.mean())/df.std()
        for index, row in df.iteritems():
#             display(index,row)
            if index in ["ID","CLASS"]:
                continue
            normalization[index]=("minmax",row.mean(),row.std())
    normalized_df['ID'] = df['ID']
    normalized_df['CLASS'] = df['CLASS']
#     display(normalized_df)
    return normalized_df, normalization

def apply_normalization(df, normalization):
    df2 = df
    for index, row in df.iteritems():
        if index in ["ID","CLASS"]:
            continue
        n = normalization[index]
        if n[0] == "minmax":
            df2[index] = (row-n[1])/(n[2]-n[1])
        if n[0] == "zscore":
            df2[index] = (row-n[1])/n[2]
    return df2

def create_imputation(dataframe):
    my_df = dataframe.copy()
    imputation = {}
    
    for column in my_df.columns:
        ## the column names "CLASS" and "ID" have special meaning
        if column != "CLASS" and column != "ID":
            
            ## 如果是数字，用平均数代替
            if my_df[column].dtypes == "int" or my_df[column].dtypes == "float":
                my_df[column].fillna(my_df[column].mean(), inplace=True)
                imputation[column] = my_df[column].mean()
            ## 如果是类别，用众数代替
            else:
                my_df[column].fillna(my_df[column].mode()[0], inplace=True)
                imputation[column] = my_df[column].mode()[0]
    return my_df, imputation

def apply_imputation(df, imputation):
    my_df = df.copy()
    for column in imputation:
        my_df[column].fillna(imputation[column], inplace=True)
    return my_df

def create_bins(df, nobins=10, bintype="equal-width"):
    my_df = df.copy()
    binning = {}
    
    for column in my_df.columns:
        if column != "CLASS" and column != "ID" and (my_df[column].dtype in ["float64", "float32", "int64", "int32"]):
            if bintype == "equal-width":
                my_df[column], bins = pd.cut(my_df[column], nobins, retbins=True, duplicates="drop", labels=False)
                ## 记录分类的区间
                binning[column] = bins
            elif bintype == "equal-size":
                my_df[column], bins = pd.qcut(my_df[column], q=nobins, retbins=True, duplicates="drop", labels=False)
                binning[column] = bins
            ## Set all columns in the new dataframe to be of type "category"
            my_df[column] = my_df[column].astype("category")
            my_df[column] = my_df[column].cat.set_categories([str(i) for i in my_df[column].cat.categories], rename=True)
            binning[column][0] = -np.inf
            binning[column][-1] = np.inf
        else:
            my_df[column] = my_df[column].astype('category')
    return my_df, binning

def apply_bins(df, binning):
    my_df = df.copy()
    bins = {}
    for column in binning:
        my_df[column] = pd.cut(my_df[column], binning[column], labels=False)
        my_df[column] = my_df[column].astype("category")
        my_df[column] = my_df[column].cat.set_categories([str(i) for i in my_df[column].cat.categories], rename = True)        
    my_df = my_df.astype("category")
    return my_df

def create_one_hot(df):
    my_df = df.copy()
    df_new = df.copy()
    one_hot = {}
    for col in df.columns:
        if col != "CLASS" and col != "ID":  
            if str(my_df.dtypes[col]) == "category" or str(my_df.dtypes[col]) == "object":
                my_df[col] = my_df[col].astype("category")
               # one_hot[col] = df[col].cat.categories
                one_hot[col] = list(my_df[col].cat.categories)
                for i in one_hot[col]:
                    name = col + "_" + str(i)
                    # print(name)
                    new_col = my_df[col] == i
                    new_col = new_col.astype("float")
                    df_new[name] = new_col 
                df_new = df_new.drop(columns = col, axis = 1) 

    return df_new, one_hot

def split(dataframe, testfraction=0.5):
    
    df = dataframe.copy()
    
    df_random = df.reindex(np.random.permutation(df.index))
    
    trainingdf = df_random[0: int((1-testfraction)*df.shape[0])]
    testdf = df_random[int((1-testfraction)*df.shape[0])+1 : df.shape[0]]
    
    return trainingdf, testdf

def apply_one_hot(dataframe, one_hot):
    df = dataframe.copy()
    df_new = df.copy()
    for col in df.columns:
        if col in one_hot.keys():
            for i in one_hot[col]:
                name = col + "_" + str(i)
                new_col = df[col] == i
                new_col = pd.Series(new_col.astype("float"))
                df_new[name] = new_col
            df_new = df_new.drop(columns = col, axis = 1)

    return df_new

def split(dataframe, testfraction=0.5):
    
    df = dataframe.copy()
    # display(df)
    
    df_random = df.reindex(np.random.permutation(df.index))
    # display(df_random)
    
    trainingdf = df_random[0: int((1-testfraction)*df.shape[0])]
    testdf = df_random[int((1-testfraction)*df.shape[0])+1 : df.shape[0]]
    
    return trainingdf, testdf

def accuracy(dataframe, correctlabels):
    df = dataframe.copy()
    
    # 返回第一次出现的最大索引
    labels = df.idxmax(axis=1)
    display(labels)
    truelabels = (labels == correctlabels).sum(axis=0)
    accuracy = truelabels/len(df)
    return accuracy

# def folds(df,nofolds = 10):
#     indexes = np.random.permutation(df.index)
#     indexes = np.array_split(indexes,nofolds)
#     newarr = []
#     for i in range(0,nofolds):
#         newdf = pd.DataFrame(columns=df.columns)
#         for row in indexes[i]:
#             newdf = pd.concat([newdf,df[row:row+1]])
#         newarr.append(newdf)
#     return newarr

# def brier_score(df,labels):
#     l = len(labels)
#     newdf = df.copy()
#     for r in df.itertuples():
#         real = labels[r.Index]
#         for n in df.columns:
#             v = getattr(r,n)
#             if n is real:
#                 newdf.at[r.Index,n] = ((v-1)**2)
#             else:
#                 newdf.at[r.Index,n] = ((v-0)**2)
#     return (newdf.sum().sum()/l)

# def auc(df,labels):
#     f = []
#     for r in df.itertuples():
#         real = labels[r.Index]
#         for n in df.columns:
#             v = getattr(r,n)
#             if n is real:
#                 f.append((v,1))
#             else:
#                 f.append((v,0))

#     posNum = 0
#     negNum = 0
#     for i in f:
#         if(i[1]==1):
#             posNum+=1
#         else:
#             negNum+=1

#     rank = [v for _,v in sorted(f,key=lambda x:x[0])]
#     rankList = [i+1 for i in range(len(rank)) if rank[i]==1]
    
#     auc = 0
#     auc = (sum(rankList)- (posNum*(posNum+1))/2)/(posNum*negNum)
#     return auc

def folds(dataframe,nofolds=10):
    
    df = dataframe.copy()
    np.random.permutation(df.index) 
    folds = []
    for i in range(nofolds):
        folds.append(df[int(len(df)*i/nofolds) : int(len(df)*(i+1)/nofolds)])

    return folds


def brier_score(dataframe, corretlabels):
    
    df = dataframe.copy()
    correct_df = pd.get_dummies(corretlabels)
    brier_score = np.mean(np.sum((df - correct_df)**2, axis=1))
    
    return brier_score


# ROC_Henrik's way

def count_tp_fp(predictions_df, correctlabels):
    #print('predictions_df', predictions_df) # last column includes the real values

    Score = predictions_df.iloc[:, 0]
    #print('Score=', Score)

    sorted_unique_score = np.unique(Score)[::-1]
    #print('sorted_unique_score = ', sorted_unique_score)

    pos = np.zeros(len(sorted_unique_score))
    neg = np.zeros(len(sorted_unique_score))

    for s in range(len(sorted_unique_score)):
        for p in range(len(predictions_df)):
            if(sorted_unique_score[s] == predictions_df.iloc[p, 0]):
                if(predictions_df.columns[0] == correctlabels[p]):
                    pos[s] += 1
                else:
                    neg[s] += 1           
    
    #print('pos=', pos)
    #print('neg=', neg)
    
    #draw_ROC(pos, neg)
    
    return pos, neg

def draw_ROC(pos, neg):
    import matplotlib.pyplot as plt
    tpr = [cs/sum(pos) for cs in np.cumsum(pos)]
    print('tpr=', tpr)    
    fpr = [cs/sum(neg) for cs in np.cumsum(neg)]
    print('fpr=', fpr)
    plt.plot([0.0]+fpr+[1.0],[0.0]+tpr+[1.0],"-",label="1")
    plt.plot([0.0,1.0],[0.0,1.0],"--",label="Baseline")
    plt.xlabel("fpr")
    plt.ylabel("tpr")
    plt.legend()
    plt.show()

def calculate_AUC_Henrik_2(pos, neg):
    # AUC = Area under ROC curve
    AUC = 0
    Cov_tp = 0
    n_tp = len(pos)
    Tot_tp = sum(pos)
    Tot_fp = sum(neg)
    
    for i in range(n_tp):
        #print('i={}...pos[i]={}...neg[i]={}'.format(i, pos[i], neg[i]))        
        if(neg[i] == 0):
            Cov_tp += pos[i]
            #print('AUC_if = ', AUC)
        elif(pos[i] == 0):
            AUC += (Cov_tp/Tot_tp)*(neg[i]/Tot_fp)
            #print('AUC_elif = ', AUC)
        else:
            AUC += (Cov_tp/Tot_tp)*(neg[i]/Tot_fp) + (pos[i]/Tot_tp)*(neg[i]/Tot_fp)/2
            Cov_tp += pos[i]
            #print('AUC_else = ', AUC)
            
    return AUC

def auc(df, correctlabels):    
    class_frequency = dict(pd.Series(correctlabels).value_counts(normalize = True))   
    #print('class_frequency', class_frequency)
    AUC = 0
    #print(df.columns)
    #test_labels_unique = pd.Series(test_labels).value_counts(normalize = True)
    #print('correctlabels=', class_frequency.keys())
    for col in df.columns:
        if(col in class_frequency.keys()):
            #print('col=', col)
            predictions_df = pd.DataFrame(df[col], columns=[col])
            #list_reversed_tpr_fpr = get_tpr_fpr(prediction_vector, correctlabels, col)
            pos, neg = count_tp_fp(predictions_df, correctlabels)
            #area_col = calculate_AUC(list_reversed_tpr_fpr)
            area_col = calculate_AUC_Henrik_2(pos, neg)            
            #print('col={}__area_col={}'.format(col, area_col))
            AUC += class_frequency[col] * area_col        
    return AUC

In [5]:
# Define the class kNN with three functions __init__, fit and predict (after the comments):
#
# Input to __init__: 
# self - the object itself
#
# Output from __init__:
# <nothing>
# 
# This function does not return anything but just initializes the following attributes of the object (self) to None:
# column_filter, imputation, normalization, one_hot, labels, training_labels, training_data, training_time
#
# Input to fit:
# self              - the object itself
# df                - a dataframe (where the column names "CLASS" and "ID" have special meaning)
# normalizationtype - "minmax" (default) or "zscore"
#
# Output from fit:
# <nothing>
#
# The result of applying this function should be:
#
# self.column_filter   - a column filter (see Assignment 1) from df
# self.imputation      - an imputation mapping (see Assignment 1) from df
# self.normalization   - a normalization mapping (see Assignment 1), using normalizationtype from the imputed df
# self.one_hot         - a one-hot mapping (see Assignment 1)
# self.training_labels - a pandas series corresponding to the "CLASS" column, set to be of type "category" 
# self.labels          - a list of the categories (class labels) of the previous series
# self.training_data   - the values (an ndarray) of the transformed dataframe, i.e., after employing imputation, 
#                        normalization, and possibly one-hot encoding, and also after removing the "CLASS" and "ID" columns
#
# Note that the function does not return anything but just assigns values to the attributes of the object.
#
# Input to predict:
# self - the object itself
# df   - a dataframe
# k    - an integer >= 1 (default = 5)
# 
# Output from predict:
# predictions - a dataframe with class labels as column names and the rows corresponding to
#               predictions with estimated class probabilities for each row in df, where the class probabilities
#               are estimated by the relative class frequencies in the set of class labels from the k nearest 
#               (with respect to Euclidean distance) neighbors in training_data
#
# Hint 1: Drop any "CLASS" and "ID" columns first and then apply column filtering, imputation, normalization and one-hot
#
# Hint 2: Get the numerical values (as an ndarray) from the resulting dataframe and iterate over the rows 
#         calling some sub-function, e.g., get_nearest_neighbor_predictions(x_test,k), which for a test row
#         (numerical input feature values) finds the k nearest neighbors and calculate the class probabilities.
#
# Hint 3: This sub-function may first find the distances to all training instances, e.g., pairs consisting of
#         training instance index and distance, and then sort them according to distance, and then (using the indexes
#         of the k closest instances) find the corresponding labels and calculate the relative class frequencies

In [6]:
class kNN:
    
    def __init__(self):
        self.column_filter = None
        self.imputation = None
        self.normalization = None
        self.one_hot = None
        self.labels = None
        self.training_labels = None
        self.training_data = None
        self.training_time = None
        
    def fit(self, dataframe, normalizationtype="minmax"):
        df = dataframe.copy()
        df, self.column_filter = create_column_filter(df)
        df, self.imputation = create_imputation(df)
        df, self.normalization = create_normalization(df, normalizationtype)
        df, self.one_hot = create_one_hot(df)
        df["CLASS"] = df["CLASS"].astype("category")
        self.training_labels = df["CLASS"]
        self.labels = list(df["CLASS"].cat.categories)
        self.training_data = df.drop(columns=["ID","CLASS"], errors='ignore')
    
    def euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum(np.power(x1 - x2, 2), axis=0))
    
    def get_nearest_neighbor_predictions(self, x_test, k):
        neighbours = []
        distance = []
        data = pd.DataFrame(self.training_data)
        rows = data.shape[0]
        
        for i in range(rows):
            tmp_distance = 0.0
            values = np.array(data.iloc[i, :].values)
            tmp_distance = self.euclidean_distance(values, x_test)
            distance.append((i, tmp_distance))
            
        distance.sort(key=lambda x : x[1])
        
        for i in range(k):
            neighbours.append(distance[i][0])
            
        return neighbours
    
    def get_prob(self, kNN_neighbours):
        k = len(kNN_neighbours)
        training_labels, labels = self.training_labels, self.labels
        labels_prob = np.zeros(len(labels))
        
        for i in range(k):
            for j in range(len(labels)):
                if training_labels[kNN_neighbours[i]] == labels[j]:
                    labels_prob[j] += 1       
            prob = labels_prob/k

        return (prob)
    
    def predict(self, dataframe, k):
        df = dataframe.copy()
        df.drop(columns=["ID", "CLASS"], inplace=True)
        df = apply_normalization(df, self.normalization)
        df = apply_imputation(df, self.imputation)
        df = apply_one_hot(df, self.one_hot)
        predictions = np.zeros((df.shape[0], len(self.labels)))
        for i in range(df.shape[0]):
            values = np.array(df.iloc[i, :].values)
            neighbours = self.get_nearest_neighbor_predictions(values, k)
            prob = self.get_prob(neighbours)
            predictions[i] = prob
        
        predictions_df = pd.DataFrame(predictions, columns=self.labels)
        return predictions_df
    

In [7]:
glass_train_df = pd.read_csv("data/glass_train.csv")

glass_test_df = pd.read_csv("data/glass_test.csv")

knn_model = kNN()

t0 = time.perf_counter()
knn_model.fit(glass_train_df)
print("Training time: {0:.2f} s.".format(time.perf_counter()-t0))

test_labels = glass_test_df["CLASS"]

k_values = [1,3,5,7,9]
results = np.empty((len(k_values),3))

for i in range(len(k_values)):
    t0 = time.perf_counter()
    predictions = knn_model.predict(glass_test_df,k=k_values[i])
    print("Testing time (k={0}): {1:.2f} s.".format(k_values[i],time.perf_counter()-t0))
    results[i] = [accuracy(predictions,test_labels),brier_score(predictions,test_labels),
                  auc(predictions,test_labels)] # Assuming that you have defined auc - remove otherwise

results = pd.DataFrame(results,index=k_values,columns=["Accuracy","Brier score","AUC"])

print()
display("results",results)

minmax mode


Unnamed: 0,ID,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,CLASS
0,202,1.51653,11.95,0.00,1.19,75.18,2.70,8.93,0.00,0.00,7
1,124,1.51707,13.48,3.48,1.71,72.52,0.62,7.99,0.00,0.00,2
2,152,1.52127,14.32,3.90,0.83,71.50,0.00,9.49,0.00,0.00,3
3,197,1.51556,13.87,0.00,2.54,73.23,0.14,9.41,0.81,0.01,7
4,144,1.51709,13.00,3.47,1.79,72.72,0.66,8.18,0.00,0.00,2
...,...,...,...,...,...,...,...,...,...,...,...
102,178,1.51937,13.79,2.41,1.19,72.76,0.00,9.77,0.00,0.00,6
103,160,1.51796,13.50,3.36,1.63,71.94,0.57,8.81,0.00,0.09,3
104,88,1.51645,13.40,3.49,1.52,72.65,0.67,8.08,0.00,0.10,2
105,98,1.51743,12.20,3.25,1.16,73.55,0.62,8.90,0.00,0.24,2


Training time: 0.08 s.
Testing time (k=1): 1.06 s.


0      2
1      1
2      1
3      1
4      2
      ..
102    2
103    2
104    1
105    2
106    2
Length: 107, dtype: int64

Testing time (k=3): 1.03 s.


0      1
1      1
2      1
3      1
4      2
      ..
102    2
103    2
104    1
105    2
106    2
Length: 107, dtype: int64

Testing time (k=5): 1.02 s.


0      2
1      1
2      1
3      1
4      2
      ..
102    2
103    2
104    1
105    2
106    2
Length: 107, dtype: int64

Testing time (k=7): 1.05 s.


0      1
1      1
2      1
3      1
4      2
      ..
102    2
103    2
104    1
105    2
106    2
Length: 107, dtype: int64

Testing time (k=9): 1.25 s.


0      1
1      1
2      1
3      1
4      2
      ..
102    2
103    2
104    1
105    2
106    2
Length: 107, dtype: int64




'results'

Unnamed: 0,Accuracy,Brier score,AUC
1,0.747664,0.504673,0.81035
3,0.663551,0.488058,0.815859
5,0.579439,0.474019,0.833805
7,0.598131,0.470723,0.834465
9,0.616822,0.483674,0.828734


In [8]:
# Define the class NaiveBayes with three functions __init__, fit and predict (after the comments):
#
# Input to __init__: 
# self - the object itself
#
# Output from __init__:
# <nothing>
# 
# This function does not return anything but just initializes the following attributes of the object (self) to None:
# column_filter, binning, labels, class_priors, feature_class_value_counts, feature_class_counts
#
# Input to fit:
# self    - the object itself
# df      - a dataframe (where the column names "CLASS" and "ID" have special meaning)
# nobins  - no. of bins (default = 10)
# bintype - either "equal-width" (default) or "equal-size" 
#
# Output from fit:
# <nothing>
#
# The result of applying this function should be:
#
# self.column_filter              - a column filter (see Assignment 1) from df
# self.binning                    - a discretization mapping (see Assignment 1) from df
# self.class_priors               - a mapping (dictionary) from the labels (categories) of the "CLASS" column of df,
#                                   to the relative frequencies of the labels
# self.labels                     - a list of the categories (class labels) of the "CLASS" column of df
# self.feature_class_value_counts - a mapping from the feature (column name) to the number of
#                                   training instances with a specific combination of (non-missing, categorical) 
#                                   value for the feature and class label
# self.feature_class_counts       - a mapping from the feature (column name) to the number of
#                                   training instances with a specific class label and some (non-missing, categorical) 
#                                   value for the feature
#
# Note that the function does not return anything but just assigns values to the attributes of the object.
#
# Input to predict:
# self - the object itself
# df   - a dataframe
# 
# Output from predict:
# predictions - a dataframe with class labels as column names and the rows corresponding to
#               predictions with estimated class probabilities for each row in df, where the class probabilities
#               are estimated by the naive approximation of Bayes rule (see lecture slides)
#
# Hint 1: First apply the column filter and discretization
#
# Hint 2: Iterating over either columns or rows, and for each possible class label, calculate the relative
#         frequency of the observed feature value given the class (using feature_class_value_counts and 
#         feature_class_counts) 
#
# Hint 3: Calculate the non-normalized estimated class probabilities by multiplying the class priors to the
#         product of the relative frequencies
#
# Hint 4: Normalize the probabilities by dividing by the sum of the non-normalized probabilities; in case
#         this sum is zero, then set the probabilities to the class priors

In [9]:
# class NaiveBayes:
    
#     def __init__(self):
#         self.column_filter = None
#         self.binning = None
#         self.labels = None
#         self.class_priors = None
#         self.feature_class_value_counts = None
#         self.feature_class_counts = None
        
#     def fit(self, dataframe, nobins, bintype="equal-width"):
#         df = dataframe.copy()
#         df, self.binning = create_bins(df,nobins,bintype)
#         df["CLASS"] = df["CLASS"].astype("category")
#         self.labels = list(df["CLASS"].cat.categories)
#         self.class_priors = dict(df["CLASS"].value_counts(normalize = True))
        
#         feature_class_value_counts = {} # a mapping from a col to a dictionary((c,v),num of this combonation)
#         feature_class_counts = {}  # a mapping from a col to a dictionary (c, num of instances)
        
#         for col in df.columns:
#             if col not in ["CLASS", "ID"]:
#                 # drop rows which contain missing value
#                 df_temp = df.dropna(axis=0, how="any", subset=["CLASS", col])
#                 feature_class_counts[col] = dict(df_temp["CLASS"].value_counts())
#                 group = df_temp.groupby(["CLASS", col]).size()
#                 feature_class_value_counts[col] = dict(group)
                
#         ######################################
#         print("feature_class_counts")
#         display(feature_class_counts)
#         print("feature_class_value_counts")
#         display(feature_class_value_counts)
#         ######################################
        
#         self.feature_class_counts = feature_class_counts
#         self.feature_class_value_counts = feature_class_value_counts
        
        
#     def predict(self, dataframe):
#         df = dataframe.copy()
#         df = apply_bins(df, self.binning)
#         labels = self.class_labels
#         df.drop(columns=["ID","CLASS"],inplace=True)
        
#         nrow, ncol, nlabel = df.shape[0], df.shape[1], len(labels)
#         matrix = np.zeros([nlabel, nrow, ncol])

#         for col_num in range(ncol):
#             col = df.columns[col_num]
            
#             for label_num in range(nlabel):
#                 label = labels[label_num]

#                 for row_num in range(nrow):
#                     value = df.iloc[row_num, col_num]
#                     if((label, value) in self.feature_class_value_counts[col].keys()):
#                         features_value_count = self.feature_class_value_counts[col][(label, value)]
#                         feature_count = self.feature_class_counts[col][label]
#                         relative_freq = features_value_count / feature_count
#                     else:
#                         relative_freq = 0
                    
#                     matrix[label_num, row_num, col_num] = relative_freq
        
#         product = matrix.prod(axis=2)
        
#         prior = np.array([self.class_priors[labels[i]] for i in range(nlabel)])
#         #print(prior)
#         prior = np.tile(prior, nrow).reshape(nrow,nlabel).T  # notice the shape 
#         prob =  product * prior 
        
#         sum_prob = prob.sum(axis=0)
#         sum_zero = sum_prob==0.0
#         sum_prob += sum_zero.astype('float')
        
#         norm_prob = prob/sum_prob
        
#         predictions = pd.DataFrame(norm_prob.T, columns = labels)
        
#         return predictions

In [63]:
class NaiveBayes:
    
    def __init__(self):
        
        self.column_filter = None
        self.binning = None
        self.class_priors = None
        self.feature_class_value_counts = None
        self.feature_class_counts = None
        self.class_labels = None
        
    def fit(self, dataframe, nobins=10, bintype="equal-width"):
        df = dataframe.copy()
        df, self.column_filter = create_column_filter(df)
        df, self.binning = create_bins(df,nobins,bintype)
        df["CLASS"] = df["CLASS"].astype("category")
        
        # 拿到所有的 labels
        self.class_labels = list(df["CLASS"].cat.categories)
        
#         print("class_labels")
#         print(self.class_labels)
        
        # 将 labels 归一化, 统计出现的频率
        value_counts = df["CLASS"].value_counts(normalize = True)
        
#         print("value_counts")
#         print(value_counts)
#         print(type(value_counts))
        
#         self.class_priors = dict(df["CLASS"].value_counts(normalize = True))
        self.class_priors = value_counts.to_dict()
#         print("class_priors")
#         print(self.class_priors)
#         print("test")
        
        feature_class_value_counts = {}
        feature_class_counts = {}
 
        for col in df.columns:
            # 如果 col 是一个特征
            if col not in ["CLASS", "ID"]:
                # 删除 ["CLASS", col] 中包含 NaN 数据的行
                df_temp = df.dropna(axis= 0, how="any", subset=["CLASS", col])
#                 feature_class_counts[col] = dict(df_temp["CLASS"].value_counts())
                feature_class_counts[col] = df_temp["CLASS"].value_counts().to_dict()
                g = df_temp.groupby(["CLASS", col]).size()
                feature_class_value_counts[col] = dict(g)
        
        # feature_class_counts
        # {'RI': {2: 34, 1: 31, 7: 20, 3: 8, 5: 8, 6: 6}, 
        # 'Na': {2: 34, 1: 31, 7: 20, 3: 8, 5: 8, 6: 6}, 
        # 'Mg': {2: 34, 1: 31, 7: 20, 3: 8, 5: 8, 6: 6}, 
        # 'Al': {2: 34, 1: 31, 7: 20, 3: 8, 5: 8, 6: 6}, 
        # 'Si': {2: 34, 1: 31, 7: 20, 3: 8, 5: 8, 6: 6}, 
        # 'K': {2: 34, 1: 31, 7: 20, 3: 8, 5: 8, 6: 6}, 
        # 'Ca': {2: 34, 1: 31, 7: 20, 3: 8, 5: 8, 6: 6}, 
        # 'Ba': {2: 34, 1: 31, 7: 20, 3: 8, 5: 8, 6: 6}, 
        # 'Fe': {2: 34, 1: 31, 7: 20, 3: 8, 5: 8, 6: 6}}    
#         print("feature_class_value_counts")
#         print(feature_class_value_counts)
#         print("feature_class_counts")
#         print(feature_class_counts)
        
        self.feature_class_counts = feature_class_counts
        self.feature_class_value_counts = feature_class_value_counts
        
    def predict(self, dataframe):
        df = dataframe.copy()
        df = apply_bins(df, self.binning)
        labels = self.class_labels
        df.drop(columns=["ID","CLASS"],inplace=True)
        nrow, ncol, nlabel = df.shape[0], df.shape[1], len(labels)
        matrix = np.zeros([nlabel, nrow, ncol])
#         print("matrix")
#         print(matrix)
        for col_num in range(ncol):
            col = df.columns[col_num]
#             print("col")
#             print(col)
            for label_num in range(nlabel):
                label = labels[label_num]
            
                for row_num in range(nrow):
                    value = df.iloc[row_num, col_num]
#                     print("value")
#                     print(value)
#                     print("====")
#                     print(self.feature_class_value_counts[col].keys())
                    if((label, value) in self.feature_class_value_counts[col].keys()):
                        features_value_count = self.feature_class_value_counts[col][(label, value)]
#                         print("features_value_count")
#                         print(features_value_count)
                        feature_count = self.feature_class_counts[col][label]
                        relative_freq = features_value_count / feature_count
                    else:
                        relative_freq = 0
                    
                    matrix[label_num, row_num, col_num] = relative_freq
#         print(matrix)
        product = matrix.prod(axis=2)
#         print(product)
        prior = np.array([self.class_priors[labels[i]] for i in range(nlabel)])
        #print(prior)
        prior = np.tile(prior, nrow).reshape(nrow,nlabel).T  # notice the shape 
        prob =  product * prior 
        
        sum_prob = prob.sum(axis=0)
        sum_zero = sum_prob==0.0
        sum_prob += sum_zero.astype('float')
        
        norm_prob = prob/sum_prob
        
        predictions = pd.DataFrame(norm_prob.T, columns = labels)
        
        return predictions
        
                

In [64]:
glass_train_df = pd.read_csv("data/glass_train.csv")

glass_test_df = pd.read_csv("data/glass_test.csv")

nb_model = NaiveBayes()

test_labels = glass_test_df["CLASS"]

nobins_values = [3,5,10]
bintype_values = ["equal-width","equal-size"]
parameters = [(nobins,bintype) for nobins in nobins_values for bintype in bintype_values]

results = np.empty((len(parameters),3))

for i in range(len(parameters)):
    t0 = time.perf_counter()
    nb_model.fit(glass_train_df,nobins=parameters[i][0],bintype=parameters[i][1])
    print("Training time {0}: {1:.2f} s.".format(parameters[i],time.perf_counter()-t0))
    t0 = time.perf_counter()
    predictions = nb_model.predict(glass_test_df)
    print("Testing time {0}: {1:.2f} s.".format(parameters[i],time.perf_counter()-t0))
  
    results[i] = [accuracy(predictions,test_labels),brier_score(predictions,test_labels),
                  auc(predictions,test_labels)] # Assuming that you have defined auc - remove otherwise

results = pd.DataFrame(results,index=pd.MultiIndex.from_product([nobins_values,bintype_values]),
                       columns=["Accuracy","Brier score","AUC"])
results

Training time (3, 'equal-width'): 0.10 s.
Testing time (3, 'equal-width'): 0.19 s.


0      1
1      2
2      1
3      1
4      2
      ..
102    2
103    2
104    2
105    2
106    5
Length: 107, dtype: int64

Training time (3, 'equal-size'): 0.07 s.
Testing time (3, 'equal-size'): 0.19 s.


0      2
1      6
2      1
3      1
4      2
      ..
102    2
103    2
104    2
105    5
106    2
Length: 107, dtype: int64

Training time (5, 'equal-width'): 0.07 s.
Testing time (5, 'equal-width'): 0.20 s.


0      2
1      2
2      1
3      1
4      2
      ..
102    2
103    2
104    1
105    2
106    1
Length: 107, dtype: int64

Training time (5, 'equal-size'): 0.13 s.
Testing time (5, 'equal-size'): 0.23 s.


0      2
1      1
2      1
3      1
4      2
      ..
102    2
103    2
104    1
105    2
106    2
Length: 107, dtype: int64

Training time (10, 'equal-width'): 0.11 s.
Testing time (10, 'equal-width'): 0.23 s.


0      2
1      1
2      3
3      1
4      2
      ..
102    2
103    2
104    1
105    1
106    1
Length: 107, dtype: int64

Training time (10, 'equal-size'): 0.08 s.
Testing time (10, 'equal-size'): 0.24 s.


0      2
1      2
2      1
3      1
4      2
      ..
102    2
103    2
104    1
105    2
106    7
Length: 107, dtype: int64

In [12]:
train_labels = glass_train_df["CLASS"]
nb_model.fit(glass_train_df)
predictions = nb_model.predict(glass_train_df)
print("Accuracy on training set: {0:.2f}".format(accuracy(predictions,train_labels)))
print("AUC on training set: {0:.2f}".format(auc(predictions,train_labels)))
print("Brier score on training set: {0:.2f}".format(brier_score(predictions,train_labels)))

0      7
1      2
2      3
3      7
4      2
      ..
102    6
103    3
104    2
105    2
106    1
Length: 107, dtype: int64

Accuracy on training set: 0.85
AUC on training set: 0.97
Brier score on training set: 0.23
