# TinyML - GaussianNB (Classifier)

##  1 - Hardcode Implementation

### 1.1 - Importing libraries 

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.stats import multivariate_normal
from sklearn.metrics import accuracy_score

### 1.2 - Load Dataset

The Iris dataset is a classic dataset in the field of machine learning and statistics. It was introduced by Sir Ronald A. Fisher in 1936 as an example of discriminant analysis. The dataset is often used for educational purposes and is a common starting point for the practice of pattern classification.


Attributes:

- Sepal length (in centimeters)

- Sepal width (in centimeters)

- Petal length (in centimeters)

- Petal width (in centimeters)

Species:

- 0 - Setosa

- 1 - Versicolor

- 2 - Virginica

In [2]:
# Load iris dataset
data = load_iris()

# Create a DataFrame
df = pd.DataFrame(data.data, columns=data.feature_names)


# Add target variable to the DataFrame
df['target'] = data.target

# Remove NaN values
df = df.dropna(axis='rows') #remove NaN

# Display the DataFrame
print(df.head())

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


In [3]:
classes = np.array(pd.unique(df[df.columns[-1]]), dtype=int)

In [4]:
classes

array([0, 1, 2])

In [5]:
data = df.to_numpy()
nrow,ncol = df.shape
y = data[:,-1]
X = data[:,0:ncol-1]

### 1.3 - Normalization of the data, in order to avoid the effect of the scale of the attributes.

In [6]:
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [7]:
print('Processed data:')
print('Mean: ', np.mean(X, axis = 0))
print('STD:', np.std(X, axis = 0))

Processed data:
Mean:  [-4.73695157e-16 -7.81597009e-16 -4.26325641e-16 -4.73695157e-16]
STD: [1. 1. 1. 1.]


### 1.4 - Split into training and test data

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

### 1.5 - Bayesian classifier

Let's consider the parametric case, assuming that each variable is distributed according to a Normal distribution. Other distributions can also be used.

We have already selected the training and test sets. In the training set, we will calculate the mean and standard deviation of each attribute for each class. Next, we classify the data using Bayesian decision theory, i.e.: $X \in C_i$ if, and only if, $P(C_i|X) = \max P(C_j|X)$ for all $j$.



First, we define a function to calculate the joint probability density: $$p(\vec{x}|C_i) = \prod_{j=1}^d p(x_j|C_i), \quad i=1,\ldots, k$$ 
where $C_i$ are the classes. If the distribution is normal, each attribute $X_j$ has the following associated probability density function, for each class:
$$
p(x_j|C_i) = \frac{1}{\sqrt{2\pi\sigma_{C_i}}}\exp \left[ -\frac{1}{2}\left( \frac{x_j-\mu_{C_i}}{\sigma_{C_i}}\right)^2 \right], \quad i=1,2,\ldots, k.
$$
Thus, we have defined a function to calculate the likelihood function.

In [9]:
from scipy.stats import multivariate_normal

# P - stores the probability of belonging to each class
P = pd.DataFrame(data=np.zeros((X_test.shape[0], len(classes))), columns = classes) 

# Pc - stores the chances of belonging to each class
Pc = np.zeros(len(classes)) #fraction of elements in each class

# For each "i" being a class
for i in np.arange(0, len(classes)):
    # select the elements of class "i"
    elements = tuple(np.where(y_train == classes[i]))
    # Calculates the probability of belonging to class "i"
    Pc[i] = len(elements)/len(y_train)
    # Selects the elements belonging to class "i" in the training set
    Z = X_train[elements,:][0]
    # Calculates the average of the previously selected values
    m = np.mean(Z, axis = 0)
    # Calculates covariance matrix of previously selected elements
    cv = np.cov(np.transpose(Z))
    # Calculates the chance of belonging to each class for the test set
    for j in np.arange(0,X_test.shape[0]):
        x = X_test[j,:]
        # Likelihood function
        pj = multivariate_normal.pdf(x, mean=m, cov=cv, allow_singular=True)
        P[classes[i]][j] = pj*Pc[i]

In [10]:
print(P)

                0             1             2
0    7.252070e-85  5.399610e-04  1.512414e-05
1    3.797641e-03  1.823911e-30  9.724713e-66
2   9.079165e-258  1.554060e-15  3.121564e-07
3    1.017958e-78  1.665089e-02  4.510617e-05
4   5.954219e-101  3.949526e-03  2.384625e-06
5    5.020517e-03  9.996566e-24  6.535865e-57
6    4.844103e-44  1.078151e-03  1.737460e-10
7   2.558614e-137  6.946614e-14  6.004427e-05
8    2.417056e-93  2.382254e-05  1.204681e-05
9    3.835987e-54  1.904475e-02  1.824737e-08
10  4.353932e-117  1.103915e-05  1.352833e-03
11   1.868050e-02  4.283652e-21  3.319369e-49
12   4.377174e-03  1.032142e-29  4.282698e-68
13   1.387598e-02  4.104454e-22  5.877774e-50
14   3.048005e-02  1.217118e-30  1.016901e-62
15   9.119786e-86  3.773789e-03  2.183311e-06
16  2.290049e-159  3.817596e-08  4.405424e-03
17   1.005900e-53  1.596144e-02  1.749533e-07
18   5.056874e-74  5.160736e-03  4.141960e-05
19  2.878474e-152  1.473105e-09  2.934489e-03
20   2.781532e-02  3.773279e-21  3

In [11]:
y_pred = []
# For each row, calculate the column with the highest probability
for i in np.arange(0, X_test.shape[0]):
    c = np.argmax(np.array(P.iloc[[i]]))
    y_pred.append(classes[c])
y_pred = np.array(y_pred, dtype=int)
print(y_pred)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]


In [12]:
score = accuracy_score(y_pred, y_test)
print('Accuracy:', score)

Accuracy: 1.0


##  2 - Sklearn Implementation

In [13]:
#!pip install micromlgen  

### 2.1 - Importing libraries

In [14]:
from micromlgen import port
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score

### 2.2 - Load Dataset

The Iris dataset is a classic dataset in the field of machine learning and statistics. It was introduced by Sir Ronald A. Fisher in 1936 as an example of discriminant analysis. The dataset is often used for educational purposes and is a common starting point for the practice of pattern classification.


Attributes:

- Sepal length (in centimeters)

- Sepal width (in centimeters)

- Petal length (in centimeters)

- Petal width (in centimeters)

Species:

- 0 - Setosa

- 1 - Versicolor

- 2 - Virginica

In [15]:
X, y = load_iris(return_X_y=True)

In [16]:
print('Input shape: ', X.shape)
print('Target variable shape: ', y.shape)

Input shape:  (150, 4)
Target variable shape:  (150,)


### 1.3 - Split into training and test data

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

### 1.4 - Create the classification model

In [18]:
model = GaussianNB()

### 1.5 - Train the model

In [19]:
model.fit(X_train, y_train)

### 1.6 - Evaluating the model with the training data

In [20]:
training_predict = model.predict(X_train)

In [21]:
score = accuracy_score(training_predict, y_train)
print('Accuracy:', score)

Accuracy: 0.9428571428571428


In [22]:
print(metrics.classification_report(y_train, training_predict, digits = 3))

              precision    recall  f1-score   support

           0      1.000     1.000     1.000        31
           1      0.919     0.919     0.919        37
           2      0.919     0.919     0.919        37

    accuracy                          0.943       105
   macro avg      0.946     0.946     0.946       105
weighted avg      0.943     0.943     0.943       105



In [23]:
print(metrics.confusion_matrix(y_train, training_predict))

[[31  0  0]
 [ 0 34  3]
 [ 0  3 34]]


### 1.7 - Evaluating the model with test data

In [24]:
test_predict = model.predict(X_test)

In [25]:
score = accuracy_score(test_predict, y_test)
print('Accuracy:', score)

Accuracy: 0.9777777777777777


In [26]:
print(metrics.classification_report(y_test, test_predict, digits = 3))

              precision    recall  f1-score   support

           0      1.000     1.000     1.000        19
           1      1.000     0.923     0.960        13
           2      0.929     1.000     0.963        13

    accuracy                          0.978        45
   macro avg      0.976     0.974     0.974        45
weighted avg      0.979     0.978     0.978        45



In [27]:
print(metrics.confusion_matrix(y_test, test_predict))

[[19  0  0]
 [ 0 12  1]
 [ 0  0 13]]


### 1.8 - Obtaining the model to be implemented in the microcontroller

In [28]:
print(port(model))

#pragma once
#include <cstdarg>
namespace Eloquent {
    namespace ML {
        namespace Port {
            class GaussianNB {
                public:
                    /**
                    * Predict class for features vector
                    */
                    int predict(float *x) {
                        float votes[3] = { 0.0f };
                        float theta[4] = { 0 };
                        float sigma[4] = { 0 };
                        theta[0] = 4.964516129032; theta[1] = 3.377419354839; theta[2] = 1.464516129032; theta[3] = 0.248387096774;
                        sigma[0] = 0.111966704288; sigma[1] = 0.136586891592; sigma[2] = 0.033257026868; sigma[3] = 0.011529659543;
                        votes[0] = 0.295238095238 - gauss(x, theta, sigma);
                        theta[0] = 5.862162162162; theta[1] = 2.724324324324; theta[2] = 4.210810810811; theta[3] = 1.302702702703;
                        sigma[0] = 0.275325057719; sigma[1] = 0.087246168019; sigm

### 1.9 - Saves the template in a .h file

In [29]:
with open('./GaussianNB/GaussianNB.h', 'w') as file:   
    file.write(port(model, classname='GaussianNB'))
   