# Gaussian (Multivariate Normal) Bayes Classifier

## Import Necessary Modules

In [1]:
import numpy as np
import os
import math
import random

## Load Data to Matrix and Map it to Numeric Values

In [2]:
# Load the file and convert it to numpy matrix.

# Load necessary modules
import numpy as np
import csv



def load_text_data(file, delimiter, header, label_col_is_str, label_col):
    """
    Load a delimited text file and convert it to a numpy matrix.

    If the label column holds strings, the labels are replaced by integer
    codes (via ``map_label``) and the whole matrix is converted to float.

    Input:
    file: file name / path of the text file.
    delimiter: field separator, e.g. "," or a tab character.
    header: True / False. When True the first row is dropped.
    label_col_is_str: True / False. When True the label column is mapped to
        integer codes.
    label_col: the column index of the label.

    Output:
    - label_col_is_str is False: ``Data`` only, a numpy matrix that still
      holds the raw strings read from the file.
    - label_col_is_str is True: ``(Data, Label_Map)`` where ``Data`` is a
      float numpy matrix and ``Label_Map`` is the label -> code dictionary.
    - label_col_is_str is True but the conversion to float fails (typically
      because a header row was left in): a message is printed and ``None``
      is returned.
    """
    # The context manager closes the file even if reading raises;
    # newline="" is what the csv module expects so that it can handle line
    # endings inside quoted fields itself.
    with open(file, "rt", newline="") as text_file:
        reader = csv.reader(text_file, delimiter=delimiter)
        # Blank lines (e.g. a trailing empty line) come back as empty rows
        # and would make the matrix ragged, so they are skipped.
        rows = [row for row in reader if row]

    # If there is a header, remove it.
    if header:
        rows = rows[1:]
    Data = np.asmatrix(rows)

    if not label_col_is_str:
        return Data

    try:
        return map_label(Data, label_col)
    except ValueError:
        print("The header is string data type. Please specify the header = True in the function.")
        return None
            




# Map the string labels in the label column to integer codes.


def map_label(Data, label_column):
    """
    Replace a string label column by integer codes and convert the data to float.

    The distinct labels are sorted (``np.unique`` order) and numbered
    0, 1, 2, ... in that order.

    Input:
    1. Data: 2-D numpy matrix / array whose entries are strings.
    2. label_column: the column index containing the label (a negative
       index counts from the end).

    Output:
    1. Data: a new float matrix (``np.matrix`` if the input was one,
       otherwise an ndarray) with the label column replaced by its integer
       code. The input is not modified.
    2. Label_Map: dictionary mapping each original label to its code.

    Raises:
    ValueError: if a non-label entry cannot be converted to float.
    IndexError: if label_column is out of range.
    """
    values = np.asarray(Data)
    # Normalise negative indices (and reject out-of-range ones) so the same
    # position can be used for both the delete and the insert below.
    column = range(values.shape[1])[label_column]

    classes, codes = np.unique(values[:, column], return_inverse=True)
    Label_Map = {name: code for code, name in enumerate(classes.tolist())}

    # Convert the features and the codes separately. Writing the codes back
    # into the fixed-width string array first would truncate multi-digit
    # codes when the strings are short (e.g. "10" stored as "1").
    features = np.delete(values, column, axis=1).astype(float)
    numeric = np.insert(features, column, codes.reshape(-1), axis=1)

    if isinstance(Data, np.matrix):
        numeric = np.asmatrix(numeric)

    return numeric, Label_Map

## Split the Data into Train and Test Sets

In [3]:
# train-test data
class split_train_test:
    """
    Randomly split a data matrix into train and test parts.

    After a successful construction the instance exposes:
    train, test: the selected rows of ``data`` (label column included).
    train_feature, test_feature: the same rows with the label column removed.
    train_label, test_label: the label column squeezed to a 1-D array.

    If ``percentage`` is invalid a message is printed and none of these
    attributes are set.
    """

    def __init__(self, data, percentage, label):
        '''
        Input:
        data: a numpy matrix (rows are samples).
        percentage: percentage of samples for the training. percentage must be greater than 0 and less than 100.
        label: column index of the label.
        '''
        # Imported locally so the notebook cell can run on its own.
        import random

        import numpy as np

        if percentage <= 0 or percentage >= 100:
            print("The percentage of training data must be greater than 0 and less than 100.")
            return

        def as_vector(column):
            # Turn an n*1 matrix column into a plain 1-D array.
            return np.squeeze(np.asarray(column))

        n_rows = data.shape[0]
        n_train = int(n_rows * percentage / 100)

        # Draw the training rows at random; every remaining row, in its
        # original order, goes to the test part.
        train_rows = random.sample(range(n_rows), n_train)
        chosen = set(train_rows)  # O(1) membership instead of scanning a list
        test_rows = [row for row in range(n_rows) if row not in chosen]

        # Split the data to train and test.
        self.train = data[train_rows, :]
        self.test = data[test_rows, :]

        # Split both train and test data to feature and label.
        self.train_feature = np.delete(self.train, label, 1)
        self.train_label = as_vector(self.train[:, label])

        self.test_feature = np.delete(self.test, label, 1)
        self.test_label = as_vector(self.test[:, label])
        

## Gaussian (Multivariate Normal) Bayes Classifier Algorithm

In [4]:
# Training of the model.

def train_naive_bayes(train_data, label):
    """
    Estimate the per-class Gaussian parameters and the class priors.

    Input:
    1. train_data: the train data with the label column. The data must be in 2d array(not matrix).
    2. label: the column index of the label in the train_data.

    Output:
    1. mean: dictionary with the mean vector of the features of each class.
    2. covariance: dictionary with the covariance of the features of each class.
    3. prior_prob: dictionary with the prior probability of each class
       (class row count divided by the total row count).
    4. class_labels: the sorted unique class labels.

    In the output dictionaries the keys are the class labels.
    """

    import numpy as np

    total_rows = train_data.shape[0]
    class_labels = np.unique(train_data[:, label])

    mean, covariance, prior_prob = {}, {}, {}

    for cls in class_labels:
        # Rows that belong to this class, then the same rows without the label column.
        in_class = train_data[:, label] == cls
        class_rows = train_data[in_class, :]
        class_features = np.delete(class_rows, label, axis=1)

        mean[cls] = class_features.mean(0)
        covariance[cls] = np.cov(class_features.T)
        prior_prob[cls] = class_features.shape[0] / total_rows

    return mean, covariance, prior_prob, class_labels
    


# Testing

# Normal Likelihood

def multivariate_gaussian_likelihood(x, mu, sigma):
    """
    Density of the multivariate normal distribution N(mu, sigma) at x.

        f(x) = exp(-0.5 * (x - mu)^T sigma^-1 (x - mu))
               / sqrt((2 * pi)^p * det(sigma))

    where p is the number of features.

    Input:
    1. x: feature vector of one sample (length p).
    2. mu: mean vector (length p).
    3. sigma: p x p covariance matrix. A 0-d value is accepted for p == 1,
       which is what ``np.cov`` returns for a single feature.

    Output:
    The density value as a float.

    Raises:
    numpy.linalg.LinAlgError: if sigma is singular or its determinant is
    not positive.
    """
    import math

    import numpy as np

    deviation = np.asarray(x, dtype=float).ravel() - np.asarray(mu, dtype=float).ravel()
    cov = np.atleast_2d(np.asarray(sigma, dtype=float))
    p = deviation.size

    # Solve sigma * z = (x - mu) instead of forming the explicit inverse.
    mahalanobis_sq = float(np.dot(deviation, np.linalg.solve(cov, deviation)))

    det = float(np.linalg.det(cov))
    if det <= 0:
        raise np.linalg.LinAlgError("Covariance matrix must have a positive determinant.")

    # The determinant enters under the square root: |sigma|^(1/2).
    normalizer = math.sqrt(pow(2 * math.pi, p) * det)

    return math.exp(-0.5 * mahalanobis_sq) / normalizer

    

    

# Bayes Numerator

def bayes_numerator(likelihood, prior):
    """
    Numerator of Bayes' rule: the class likelihood multiplied by the class prior.
    """
    numerator = likelihood * prior
    return numerator




def gaussian_bayes_classification(test_feature, mean, covariance, prior_prob, class_labels):
    """
    Assign each test sample to the class with the largest posterior probability.

    For every class the Bayes numerator (likelihood * prior) is computed.
    The posterior is that numerator divided by a factor that is the same
    for all classes, so the class with the largest numerator is also the
    class with the largest posterior. No division is performed, so a sample
    whose likelihoods all underflow to 0 does not cause a division by zero.
    Ties go to the first class in ``class_labels``.

    Input:
    1. test_feature: the test features in 2D array(not matrix).
    2. mean: the mean vector of the features in dictionary.(from the training step)
    3. covariance: the covariance of the features in dictionary.(from the training step)
    4. prior_prob : the prior probability of each of the labels dictionary.(from the training step)
    5. class_labels: the class labels (numeric), used as keys of the dictionaries above.

    Output:
    The predicted label in list (one float label per row of test_feature).
    """

    import numpy as np

    # Labels are reported as floats, matching the float-coded label column.
    label_values = np.asarray(class_labels, dtype=float)

    pred_label = []

    for x in test_feature:
        numerators = [
            bayes_numerator(
                multivariate_gaussian_likelihood(x, mean[c], covariance[c]),
                prior_prob[c],
            )
            for c in class_labels
        ]

        best = int(np.argmax(numerators))
        pred_label.append(label_values[best])

    return pred_label

## Performance and Accuracy Measurement

In [5]:
def confusion_matrix(true_label, pred_label, percentage = True):
    """
    Compute the overall accuracy and the confusion table.

    Rows and columns of the table are indexed by the sorted union of the
    labels found in ``true_label`` and ``pred_label``. A class that is
    never predicted (or never occurs as a true label) still gets its own
    row and column, so rows and columns always refer to the same class.

    Input:
    1. true_label: The True label.
    2. pred_label: The predicted label.
    3. percentage: If the result wanted in percentage. True/False

    Output:
    1. total accuracy, in percent, rounded to 2 decimals.
    2. confusion table. True label (rows) * Predicted label (columns).
       Counts, or percentages of all samples rounded to 2 decimals when
       ``percentage`` is True.

    If the two inputs differ in length a message is printed and None is
    returned. Empty inputs raise ZeroDivisionError.
    """

    import numpy as np

    true_arr = np.asarray(true_label).ravel()
    pred_arr = np.asarray(pred_label).ravel()

    if len(pred_arr) != len(true_arr):
        print("Predicted label and True label lengths are not same.")
        return None

    n_samples = len(pred_arr)
    n_correct = int(np.sum(pred_arr == true_arr))
    accuracy = round((n_correct / n_samples) * 100, 2)

    # One shared, sorted class list for both axes.
    classes = np.unique(np.concatenate([true_arr, pred_arr]))
    true_idx = np.searchsorted(classes, true_arr)
    pred_idx = np.searchsorted(classes, pred_arr)

    # Single pass over the samples; add.at accumulates repeated index pairs.
    table = np.zeros([len(classes), len(classes)])
    np.add.at(table, (true_idx, pred_idx), 1)

    if percentage:
        table = np.around((table / n_samples) * 100, decimals = 2)

    return accuracy, table

# Implementation with Iris Dataset

## Step 1: Load the Dataset

In [6]:
# NOTE(review): hard-coded, machine-specific working directory; change it to the folder that holds Iris.txt.
os.chdir(r"C:\Users\tahsi\OneDrive - University of Eastern Finland\Python Algorithm and Data Structure\GitHub\ml-algorithms-python-numpy\1. Codes")
# Read the comma-separated file without a header row. Column 4 is the string class label, so the
# loader returns the float data matrix together with the label -> integer map.
Iris, d_map = load_text_data(file = "Iris.txt", delimiter = ",", header = False , label_col_is_str = True , label_col = 4)

In [7]:
# Show the first 5 rows of the data. The 5th column (index 4) is the label. It was a string in the
# file and load_text_data mapped it to an integer code. The label map is stored in d_map.
print(Iris[0:5 ,:])

[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]


In [8]:
# Show which integer code was assigned to each original string label.
print(d_map)

{'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}


## Step 2: Split the Data into Train and Test Sets

In [9]:
# Use 50% of the rows for training; the label is in column 4.
# The rows are drawn with random.sample and no seed is set, so the split differs between runs.
train_test_data = split_train_test(data = Iris, percentage = 50, label = 4)

train_data_feature = train_test_data.train_feature
train_data_label = train_test_data.train_label
# The training and classification functions expect plain 2-D arrays (not np.matrix), hence np.asarray.
test_feature = np.asarray(train_test_data.test_feature) 
true_label = np.asarray(train_test_data.test_label)
# Training rows with the label column still included, as required by train_naive_bayes.
train_data_feature_label = np.asarray(train_test_data.train)

## Step 3: Implementation of Gaussian (Multivariate Normal) Bayes Classifier

### Train the Model

In [10]:
# Estimate the per-class mean vectors, covariance matrices and prior probabilities.
train_data = train_data_feature_label
label = 4  # column index of the label inside train_data
mean, covariance, prior_prob, class_labels = train_naive_bayes(train_data, label)

### Testing the Model

In [11]:
# Predict a class label for every row of the test features using the trained parameters.
predicted_label = gaussian_bayes_classification(test_feature , mean, covariance, prior_prob, class_labels)

## Step 4: Performance Measurement

In [12]:
# Compare predictions with the true test labels: overall accuracy (%) and the confusion table,
# expressed as percentages of all test samples because percentage = True.
acuraccy, con_table = confusion_matrix(true_label = true_label, pred_label = predicted_label, percentage = True)

In [13]:
# Overall accuracy in percent.
print(acuraccy)

97.33


In [14]:
# Confusion table: rows are true labels, columns are predicted labels.
print(con_table)

[[36.    0.    0.  ]
 [ 0.   26.67  2.67]
 [ 0.    0.   34.67]]
