# Naive Bayes Classifier



In [None]:
# Import packages

import pandas as pd
import numpy as np
import math
from collections import defaultdict
import matplotlib.pyplot as plt

In [None]:
# This function prepares the data by reading it from a file and converting it 
# into a useful format for training and testing

def preprocess():
    
    # Read the files
    trainset = pd.read_csv("train.csv", header=None)
    testset = pd.read_csv("test.csv", header=None)
    
    # Identify 9999 value as Null
    trainset = trainset.replace(9999, np.NaN)
    testset = testset.replace(9999, np.NaN)
    
    # Calculate medians for all pose categories
    medians = trainset.groupby(trainset[0]).median()

    # Median imputation for null value
    for i in range(trainset.shape[0]):
      for j in range(1, trainset.shape[1]):
        if (np.isnan(trainset.loc[i, j])):
            trainset.loc[i, j] = medians.loc[trainset.loc[i, 0], j]

    return [trainset, testset]

trainset = preprocess()[0]
testset = preprocess()[1]
print(trainset)

In [5]:
# This function calculates prior probabilities and likelihoods from the training data and using
# them to build a naive Bayes model

def train(trainset):
    
    # Extract all pose categories
    categories = list(trainset[0].drop_duplicates())
    instance_num = trainset.shape[0]

    prior = defaultdict(float)
    mean = pd.DataFrame()
    var = pd.DataFrame()

    # For each data point in each category, calculate its mean and variance
    for category in categories:
        curr_category = trainset[trainset[0] == category]
        prior[category] = curr_category.shape[0] / instance_num
        mean[category] = curr_category.iloc[:, 1: 23].mean()
        var[category] = curr_category.iloc[:, 1: 23].var()
    
    train_model = [prior, mean, var]
    return train_model

train_model = train(trainset)

In [6]:
# This function calculates the corresponding y value in a normal distribution
# given mean, variance and x value

def calc_normal(mean, var, x):
    coefficient = 1 / np.sqrt(2 * math.pi * var) 
    y =  coefficient * np.exp(- np.power((x - mean), 2) / (2 * var))

    return np.log(y, where = y > 0)

In [7]:
# This function predicts classes for new items in a test dataset (for the purposes of this assignment, you
# can re-use the training data as a test set)

def predict(testset, train_model):
    [prior, mean, var] = train_model
    predict_result = []

    for i in range(testset.shape[0]):
        test = testset.iloc[i, 1:].astype(float)
        result = pd.Series(dtype='float64')
        max_score = 0
        
        # Calculate the score for each category
        for category in train_model[0].keys():
            result[category] = 0
            
            # Only include values that are not null
            for i in range(1, len(test) + 1):
                if not np.isnan(test[i]):
                    result[category] += calc_normal(mean[category][i], var[category][i], test[i])
            
            # Add prior probability
            result[category] += math.log(prior[category])
            
            # Check if the current class scores the highest
            if max_score == 0 or result[category] > max_score:
                max_score = result[category]
                result_category = category
        
        # Append the result to the result list
        predict_result.append(result_category)
        
    return predict_result

predict_result = predict(testset, train_model)

In [8]:
# This function should evaliate the prediction performance by comparing your model’s class outputs to ground
# truth labels

def evaluate(predict_result, testset):
    correct_num = 0
    correct_result = testset.iloc[:, 0].tolist()
    
    # Calculate how much times the classifier predicts correctly
    for i in range(len(predict_result)):
        if (correct_result[i] == predict_result[i]):
            correct_num += 1

    # Return the accuracy
    return correct_num / len(predict_result)

accuracy = evaluate(predict_result, testset)
print("Overall accuracy: " + str(accuracy))

Overall accuracy: 0.7413793103448276


## Questions 


If you are in a group of 1, you will respond to **two** questions of your choosing.

If you are in a group of 2, you will respond to **four** questions of your choosing.

A response to a question should take about 100–250 words, and make reference to the data wherever possible.

#### NOTE: you may develope codes or functions to help respond to the question here, but your formal answer should be submitted separately as a PDF.

### Q1
Since this is a multiclass classification problem, there are multiple ways to compute precision, recall, and F-score for this classifier. Implement at least two of the methods from the "Model Evaluation" lecture and discuss any differences between them. (The implementation should be your own and should not just call a pre-existing function.)

In [9]:
# This function will return a dictionary of results containing the values of 
# accuracy, precision, recall and F-score for each class respectively

def calc_result(predict, correct, all_classes):
    class_weight = {}
    basic_result = {}
    computed_result = {}
    
    # Calculate statistics for each class respectively
    for curr_class in all_classes:
        tp = tn = fp = fn = number = 0
        
        # Check each predict result and classify into tp, fn, fp, tn
        for i in range(len(predict)):
            if (correct[i] == curr_class):
                number += 1
            if (correct[i] == curr_class and predict[i] == curr_class):
                tp += 1
            elif (correct[i] == curr_class and predict[i] != curr_class):
                fn += 1
            elif (correct[i] != curr_class and predict[i] == curr_class):
                fp += 1
            else:
                tn += 1
        
        class_weight[curr_class] = number
        basic_result[curr_class] = [tp, tn, fp, fn]
        
        # Calculate accuracy, precision, recall, F-score
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f_score = 2 * precision * recall / (precision + recall)
        computed_result[curr_class] = [accuracy, precision, recall, f_score]

    return [class_weight, basic_result, computed_result]

result = calc_result(predict_result, testset.iloc[:, 0].tolist(), train_model[0].keys())
class_weight = result[0]
basic_result = result[1]
computed_result = result[2]

In [10]:
# Plot basic results

column_names = ["tp", "tn", "fp", "fn"]
row_names = list(train_model[0].keys())
results = list(basic_result.values())

tab = plt.table(cellText = results, colLabels = column_names, rowLabels = row_names,
              loc = 'center', cellLoc = 'center', rowLoc = 'center')
plt.axis('off')
plt.savefig("basic_results.png", dpi = 500, bbox_inches = 'tight')
plt.close('all')

In [11]:
# Plot calculated results

column_names = ["Accuracy", "Precision", "Recall", "F-score"]
row_names = list(train_model[0].keys())
results = list(computed_result.values())

tab = plt.table(cellText = results, colLabels = column_names, rowLabels = row_names,
              loc = 'center', cellLoc = 'center', rowLoc = 'center')
plt.axis('off')
plt.savefig("computed_results.png", dpi = 500, bbox_inches = 'tight')
plt.close('all')

In [12]:
# Macro-averaging

macro_precision = macro_recall = 0
for curr_class in computed_result.keys():
    macro_precision += computed_result[curr_class][1]
    macro_recall += computed_result[curr_class][2]

macro_precision /= len(computed_result.keys())
macro_recall /= len(computed_result.keys())
macro_f_score = 2 * macro_precision * macro_recall / (macro_precision + macro_recall)
macro_result = [macro_precision, macro_recall, macro_f_score]

In [13]:
# Micro-averaging

total_tp = total_tn = total_fp = total_fn = 0
for curr_class in basic_result.keys():
    total_tp += basic_result[curr_class][0]
    total_tn += basic_result[curr_class][1]
    total_fp += basic_result[curr_class][2]
    total_fn += basic_result[curr_class][3]

micro_precision = total_tp / (total_tp + total_fp)
micro_recall = total_tp / (total_tp + total_fn)
micro_f_score = 2 * micro_precision * micro_recall / (micro_precision + micro_recall)
micro_result = [micro_precision, micro_recall, micro_f_score]

In [14]:
# Weighted averaging
weighted_precision = weighted_recall = 0
for curr_class in computed_result.keys():
    weighted_precision += computed_result[curr_class][1] * (class_weight[curr_class] / testset.shape[0])
    weighted_recall += computed_result[curr_class][2] * (class_weight[curr_class] / testset.shape[0])
weighted_f_score = 2 * weighted_precision * weighted_recall / (weighted_precision + weighted_recall)

weighted_result = [weighted_precision, weighted_recall, weighted_f_score]

In [15]:
# Plot overall precision and recall

x = np.arange(3)
precision = [macro_result[0], micro_result[0], weighted_result[0]]
recall = [macro_result[1], micro_result[1], weighted_result[1]]

bar_width = 0.35
tick_label = ["Macro", "Micro", "Weighted"]
plt.bar(x, precision, bar_width, color="c", align="center", label="Precision", alpha=0.5)
plt.bar(x + bar_width, recall, bar_width, color="b", align="center", label="Recall", alpha=0.5)
plt.ylim(0, 1.2)

plt.title("Precision and Recall based on Three Calculation Methods")
plt.xlabel("Different Methods to compute statistics")
plt.ylabel("Results")

plt.xticks(x + bar_width / 2, tick_label)
plt.legend()

plt.savefig("pre_recall.png", dpi = 500, bbox_inches = 'tight')
plt.close('all')

In [16]:
# Plot overall F-score

f_score = [macro_result[2], micro_result[2], weighted_result[2]]
plt.bar(x, f_score, bar_width, color='r', align="center")
plt.xticks(x, tick_label)
plt.ylim(0, 1.2)

plt.title("F-score based on Three Calculation Methods")
plt.xlabel("Different Methods to compute statistics")
plt.ylabel("Results")

plt.savefig("f_score.png", dpi = 500, bbox_inches = 'tight')
plt.close('all')

### Q2
The Gaussian naıve Bayes classifier assumes that numeric attributes come from a Gaussian distribution. Is this assumption always true for the numeric attributes in this dataset? Identify some cases where the Gaussian assumption is violated and describe any evidence (or lack thereof) that this has some effect on the classifier’s predictions.

### Q3
Implement a kernel density estimate (KDE) naive Bayes classifier and compare its performance to the Gaussian naive Bayes classifier. Recall that KDE has kernel bandwidth as a free parameter -- you can choose an arbitrary value for this, but a value in the range 5-25 is recommended. Discuss any differences you observe between the Gaussian and KDE naive Bayes classifiers. (As with the Gaussian naive Bayes, this KDE naive Bayes implementation should be your own and should not just call a pre-existing function.)

In [17]:
# This function calculates the normal pdf value

def normal_kernel(x, mean, sigma):
    
    # Handle missing value
    if (x == -2000 or mean == -2000):
        return 1e-40
    
    return (math.e ** (-0.5 * (((x - mean) / sigma) ** 2))) / (math.sqrt(2 * math.pi) * sigma)

In [18]:
# This function uses the trainset to develop the kde prediction model

def train_kde(categories, trainset):
    model = dict()
    for category in categories:
        
        # Calculate prior probability
        prior = trainset.groupby(0).size()[category] / len(trainset)
        
        # Extract all items
        model[category] = np.array([prior, trainset[trainset[0] == category].iloc[:,1:].to_numpy()], dtype=object)

    return model

In [19]:
# This function predicts testset pose based on kde naive bayes

def predict_kde(trainset, testset, h):
    
    # Extract all categories
    categories = np.unique(trainset[0])

    # Train kde model
    kde_model = train_kde(categories, trainset)
    
    result = []

    # Predict for each instance in testset
    for instance in testset:    
        probs = dict()

        # Calculate probability for each pose
        for category in categories:
            
            # Extracrt all instances from trainset
            samples = kde_model[category][1]

            # Add prior probability
            prob_sum = np.log(kde_model[category][0],
                              where = kde_model[category][0] > 0)
            
            # Calculate probability for each attributes
            for i in range(len(instance)):
                length = 0
                score = 0
                test_atr = instance[i]
                train_atr = samples[:, i]
                
                # Calculate probability for each attributes for each instance
                for one_atr in train_atr:
                    # Skip missing values
                    if (one_atr != -2000):
                        length += 1
                    score += normal_kernel(test_atr, one_atr, h)
                
                # Smoothing
                if score == 0: score = 1e-40
                if length == 0: length = 1
                score /= length 
                prob_sum += np.log(score, where = score > 0)
                
            probs[category] = prob_sum

        predict = max(probs, key = probs.get)
        result.append(predict)
    return result

In [20]:
# Main section for Q3
trainset = pd.read_csv("train.csv", header=None)
testset = pd.read_csv("test.csv", header=None)

# Use -2000 to represent missing value
trainset = trainset.replace(9999, -2000)
testset = testset.replace(9999, -2000)

# Arbitrary bandwidth is chosen
h = 10

# Predict and evaluate
predict_result = predict_kde(trainset, testset.iloc[:,1:23].to_numpy(), h)
accuracy = evaluate(predict_result, testset)
print("Overall accuracy: " + str(accuracy))

Overall accuracy: 0.7758620689655172


### Q4
Instead of using an arbitrary kernel bandwidth for the KDE naive Bayes classifier, use random hold-out or cross-validation to choose the kernel bandwidth. Discuss how this changes the model performance compared to using an arbitrary kernel bandwidth.

In [37]:
# It will take several minutes to run this block
from sklearn.model_selection import KFold

trainset = pd.read_csv("train.csv", header=None)
testset = pd.read_csv("test.csv", header=None)

# Shuffle the trainset
trainset = trainset.sample(frac=1, random_state = 80).reset_index(drop = True)

# Accuracies list for all bandwidth
accuracies = []

# Each round use 80% of trainset as train and 20% as test
kf = KFold(n_splits = 5)

for h in range(5, 26):
    total_accuracy = 0
    
    # Cross-validation for 5 folds
    for train_indices, test_indices in kf.split(trainset):
        
        # Prepare the trainset and testset
        one_train = trainset.iloc[train_indices,: ]
        one_test = trainset.iloc[test_indices,: ]
        one_train = one_train.reset_index(drop=True)
        one_test = one_test.reset_index(drop=True)
        
        # Predict testset
        predict = predict_kde(one_train, one_test.iloc[:,1:23].to_numpy(), h)
        result = evaluate(predict, one_test)
        total_accuracy += result 
    

    ave_accuracy = total_accuracy / 5
    print("For kernel bandwidth h = %d, the average accuracy is %.4f"
     % (h, ave_accuracy))

    accuracies.append(ave_accuracy)
    
print('The optimal kernel bandwidth for current trainset is %d' % (np.argmax(accuracies) + 5))

For kernel bandwidth h = 5, the average accuracy is 0.7751
For kernel bandwidth h = 6, the average accuracy is 0.7724
For kernel bandwidth h = 7, the average accuracy is 0.7791
For kernel bandwidth h = 8, the average accuracy is 0.7778
For kernel bandwidth h = 9, the average accuracy is 0.7858
For kernel bandwidth h = 10, the average accuracy is 0.7791
For kernel bandwidth h = 11, the average accuracy is 0.7764
For kernel bandwidth h = 12, the average accuracy is 0.7778
For kernel bandwidth h = 13, the average accuracy is 0.7791
For kernel bandwidth h = 14, the average accuracy is 0.7737
For kernel bandwidth h = 15, the average accuracy is 0.7724
For kernel bandwidth h = 16, the average accuracy is 0.7724
For kernel bandwidth h = 17, the average accuracy is 0.7684
For kernel bandwidth h = 18, the average accuracy is 0.7657
For kernel bandwidth h = 19, the average accuracy is 0.7671
For kernel bandwidth h = 20, the average accuracy is 0.7630
For kernel bandwidth h = 21, the average accu

### Q5
Naive Bayes ignores missing values, but in pose recognition tasks the missing values can be informative. Missing values indicate that some part of the body was obscured and sometimes this is relevant to the pose (e.g., holding one hand behind the back). Are missing values useful for this task? Implement a method that incorporates information about missing values and demonstrate whether it changes the classification results.

In [33]:
# Calculate the number of missing values in different classes

trainset = pd.read_csv("train.csv", header=None)
testset = pd.read_csv("test.csv", header=None)
trainset = trainset.replace(9999, np.NaN)
testset = testset.replace(9999, np.NaN)
class_num = defaultdict(int)
nan_num = defaultdict(int)

for i in range(trainset.shape[0]):
    class_num[trainset.iloc[i, 0]] += 1
    curr_instance = trainset.iloc[i, 1:].astype(float)
    for data in curr_instance:
        if np.isnan(data):
            nan_num[trainset.iloc[i, 0]] += 1

In [34]:
# Separate all classes into two categories, i.e. less_missing and more_missing

less_missing = []
more_missing = []

for curr_class in class_num.keys():
    class_num[curr_class] = nan_num[curr_class] / class_num[curr_class]
    if class_num[curr_class] > 2:
        more_missing.append(curr_class)
    else:
        less_missing.append(curr_class)

In [35]:
# Update the predict function

def predict_miss(testset, train_model):
    
    # Identify 9999 value as Null
    testset = testset.replace(9999, np.NaN)
    [prior, mean, var] = train_model
    predict_result = []

    for i in range(testset.shape[0]):
        test = testset.iloc[i, 1:].astype(float)
        result = pd.Series(dtype='float64')

        # Find category
        nan_num = 0
        for i in range(1, len(test) + 1):
            if np.isnan(test[i]):
                nan_num += 1
        if nan_num > 2:
            class_group = more_missing
        else:
            class_group = less_missing

        # Find the class that has the highest score
        max_score = 0
        for category in class_group:
            result[category] = 0
            
            # Only include values that are not null
            for i in range(1, len(test) + 1):
                if not np.isnan(test[i]):
                    result[category] += calc_normal(mean[category][i], var[category][i], test[i])
            result[category] += math.log(prior[category])
            
            # Check if the current class scores the highest
            if max_score == 0 or result[category] > max_score:
                max_score = result[category]
                result_category = category

        predict_result.append(result_category)
        
    return predict_result

predict_result = predict_miss(testset, train_model)

In [36]:
accuracy = evaluate(predict_result, testset)
print("Overall accuracy: " + str(accuracy))

Overall accuracy: 0.7068965517241379


### Q6
Engineer your own pose features from the provided keypoints. Instead of using the (x,y) positions of keypoints, you might consider the angles of the limbs or body, or the distances between pairs of keypoints. How does a naive Bayes classifier based on your engineered features compare to the classifier using (x,y) values? Please note that we are interested in explainable features for pose recognition, so simply putting the (x,y) values in a neural network or similar to get an arbitrary embedding will not receive full credit for this question. You should be able to explain the rationale behind your proposed features. Also, don't forget the conditional independence assumption of naive Bayes when proposing new features -- a large set of highly-correlated features may not work well.