In [0]:
%matplotlib inline
from __future__ import print_function

Gaussian Naive Bayes classification of a 2D data set
===================

Define functions
-------------------
You can skip the details of this cell: it only defines the helper functions (density evaluation and plotting) used in the cells below.

In [0]:
import numpy as np
# Joint density of two independent normal variables
def N2d(x, mu, variance):
    """Evaluate a bivariate normal density with independent components.

    Parameters
    ----------
    x : 2-D array, one point per row; columns 0 and 1 are the two coordinates.
    mu : sequence whose entries 0 and 1 are the per-coordinate means.
    variance : sequence whose entries 0 and 1 are the per-coordinate variances.

    Returns
    -------
    1-D array with the density value for each row of ``x``.
    """
    # Un-normalised Gaussian factor along each coordinate.
    factor_0 = np.exp(-0.5 * (x[:, 0] - mu[0])**2 / variance[0])
    factor_1 = np.exp(-0.5 * (x[:, 1] - mu[1])**2 / variance[1])
    # Normalising constant: 2*pi*sqrt(var0*var1).
    normaliser = 2. * np.pi * np.sqrt(variance[0] * variance[1])
    return factor_0 * factor_1 / normaliser

from matplotlib import pyplot as plt
# Visualization of the estimated distributions
def plot2d_GaussianNB(model, X_train, y_train, X_test=None, y_test=None, cmap=None,
                      xlim=None, ylim=None, dxlim=0.5, dylim=0.5, levels=None,
                      linestyles=None, markers=None, colors=None,
                      filename='rnd2d_ex1_GNB.png'):
    """Plot the per-class Gaussian densities of a fitted Gaussian Naive Bayes model.

    For every class the density is drawn as a translucent colour map with
    contour lines; for two-class models the decision boundary (equal
    posterior probability) is added as a dashed line. Training points, and
    optionally test points, are scattered on top. The figure is saved to
    ``filename``.

    Parameters
    ----------
    model : fitted estimator exposing ``class_count_``, ``theta_``, the
        per-class variances (``var_``, or ``sigma_`` on older scikit-learn)
        and, for two classes, ``predict_log_proba``.
    X_train : 2-D array; columns 0 and 1 are plotted.
    y_train : 1-D array of class labels for ``X_train``.
    X_test, y_test : optional test points/labels; drawn (with a legend) only
        when both are given.
    cmap, markers, colors : optional lists cycled over the classes.
    xlim, ylim : optional ``[min, max]`` plot ranges; by default the range of
        the training data padded by ``dxlim`` / ``dylim``.
    levels : optional contour levels shared by all classes; by default eight
        evenly spaced levels up to each class's own peak density.
    linestyles : passed through to ``contour`` for the density contours.
    filename : path the figure is saved to; ``None`` skips saving.
    """
    plt.figure()
    ax = plt.axes()

    if xlim is None:
        xlim = [X_train[:, 0].min() - dxlim, X_train[:, 0].max() + dxlim]
    if ylim is None:
        ylim = [X_train[:, 1].min() - dylim, X_train[:, 1].max() + dylim]

    # Evaluation grid with roughly 300 steps along each axis.
    xg = np.arange(xlim[0], xlim[1], (xlim[1] - xlim[0]) / 300.)
    yg = np.arange(ylim[0], ylim[1], (ylim[1] - ylim[0]) / 300.)
    xx, yy = np.meshgrid(xg, yg)
    grid = np.c_[xx.ravel(), yy.ravel()]

    if cmap is None:
        cmap = ['Blues', 'Reds', 'Greens', 'BuPu', 'RdPu', 'YlGn']
    if markers is None:
        markers = ['o', 's', '^', '*', '+', 'x']
    if colors is None:
        colors = ['b', 'r', 'g', 'c', 'm', 'y']
    ncmap, nm, nc = len(cmap), len(markers), len(colors)

    # scikit-learn renamed `sigma_` to `var_` (1.0) and later removed `sigma_` (1.2).
    variances = model.var_ if hasattr(model, 'var_') else model.sigma_

    ncls = len(model.class_count_)
    # Peak density of each class (value of the density at its mean).
    vmax = 1. / (2. * np.pi * np.sqrt(variances[:, 0] * variances[:, 1]))
    for k in range(ncls):
        # mean and variance of each feature per class
        mu = model.theta_[k, :]
        variance = variances[k, :]
        Z = N2d(grid, mu, variance)

        # Hide the tails (below 3% of the peak) so overlapping classes stay visible.
        Zm = np.ma.masked_array(Z, Z < 0.03 * vmax[k])
        ax.pcolorfast(xg, yg, Zm.reshape(xx.shape), cmap=cmap[k % ncmap], alpha=0.5)

        # Keep the caller's `levels` untouched so that, when it is None, every
        # class gets levels scaled to its own peak rather than class 0's.
        class_levels = levels if levels is not None else np.arange(0, vmax[k], vmax[k] / 8.)
        ax.contour(xx, yy, Z.reshape(xx.shape), levels=class_levels, colors='k',
                   linestyles=linestyles, alpha=0.2)

    if ncls == 2:
        # Log posterior ratio; its zero level set is the decision boundary.
        # Working in log space avoids 0/0 and log(0) far away from the data.
        log_proba = model.predict_log_proba(grid)
        Z = (log_proba[:, 1] - log_proba[:, 0]).reshape(xx.shape)
        ax.contour(xx, yy, Z, levels=[0.], colors='k', linestyles=['--'], alpha=1)

    # Plot also the training points
    y = np.unique(y_train)
    for k in range(ncls):
        sel = y_train == y[k]
        ax.scatter(X_train[sel, 0], X_train[sel, 1], c=colors[k % nc],
                   marker=markers[k % nm], edgecolors='k', label='Training data', alpha=1)
    # and testing points if given
    if X_test is not None and y_test is not None:
        for k in range(ncls):
            sel = y_test == y[k]
            ax.scatter(X_test[sel, 0], X_test[sel, 1], c=colors[k % nc],
                       marker=markers[k % nm], edgecolors='k', label='Test data', alpha=1)
        legend = plt.legend(loc="upper right", fontsize=16, frameon=True)
        # `legendHandles` was renamed `legend_handles` in Matplotlib 3.7 and removed in 3.9.
        handles = getattr(legend, 'legend_handles', None)
        if handles is None:
            handles = legend.legendHandles
        for handle in handles[:2]:
            handle.set_color('k')

    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    plt.axis('tight')
    plt.xlabel('x1', fontsize=16)
    plt.ylabel('x2', fontsize=16)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.gca().set_aspect('equal')
    plt.tight_layout()
    if filename is not None:
        plt.savefig(filename, transparent=True, dpi=300)

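Make training data
------------------
Each of the following example cells defines `X` and `y`, overwriting any earlier definition. Run only the example you want to use; if all cells are run in order, the last one wins.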

In [0]:
# Example 1: six hand-picked points; the first four are labelled +1, the last two -1
X = np.array([
    [22, 1],
    [13, 2],
    [19, 5],
    [15, 8],
    [11, 10],
    [7, 0],
])
y = np.array([1] * 4 + [-1] * 2)

In [0]:
# Example 2: sample npos positive and nneg negative points from unit-variance
# Gaussians; the positive class is shifted to (3, 3), the negative stays at the origin
npos = 30
nneg = 30
np.random.seed(321)
# The list elements are evaluated left to right, so positives are drawn first.
X = np.vstack([np.random.randn(npos, 2) + [3, 3], np.random.randn(nneg, 2)])
# Labels: npos times +1 followed by nneg times -1
y = np.repeat([1, -1], [npos, nneg])

In [0]:
# Example 3: create moons using sklearn
from sklearn.datasets import make_moons
# Two interleaving half-moons with Gaussian noise; fixed seed for reproducibility.
X, y = make_moons(n_samples=100, noise=0.2, random_state=0)
# Relabel class 0 as -1 so the labels follow the +1 / -1 convention used above.
y = np.where(y == 0, -1, y)

In [0]:
# Example 4: create circles using sklearn
from sklearn.datasets import make_circles
# Two concentric noisy circles (inner radius 0.3 of the outer); fixed seed.
X, y = make_circles(n_samples=150, noise=0.1, random_state=0, factor=0.3)
# Relabel class 0 as -1 so the labels follow the +1 / -1 convention used above.
y = np.where(y == 0, -1, y)

Plot the training points

In [0]:
# Plot the training points: y > 0 as red squares, y <= 0 as blue circles.
# (The original bound the Figure to the name `ax` before overwriting it, and
# passed `cmap=` together with a literal colour, which Matplotlib ignores
# with a warning.)
fig, ax = plt.subplots()
ax.scatter(X[y>0, 0], X[y>0, 1], c='r', marker='s', edgecolors='k', label='Training data', alpha=1)
ax.scatter(X[y<=0, 0], X[y<=0, 1], c='b', marker='o', edgecolors='k', label='Training data', alpha=1)
ax.set_xlabel('x1', fontsize=16)
ax.set_ylabel('x2', fontsize=16)
ax.tick_params(axis='both', labelsize=16)
ax.set_aspect('equal')
# Leave a 0.5 margin around the data on every side.
ax.set_xlim(X[:,0].min()-0.5, X[:,0].max()+0.5)
ax.set_ylim(X[:,1].min()-0.5, X[:,1].max()+0.5)
fig.tight_layout()

Run the training
----------------

In [0]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier to the currently selected example data.
model = GaussianNB()
model.fit(X, y)

# number of training samples observed in each class
print("# of data: ", model.class_count_)

# probability of each class
print("Prior probs: ", model.class_prior_)

# mean and variance of each feature per class
print("Mean: ", model.theta_)
# `sigma_` was renamed to `var_` in scikit-learn 1.0 and removed in 1.2;
# fall back to the old name only when the new one is missing.
print("Variance: ", model.var_ if hasattr(model, "var_") else model.sigma_)

Visualize the Gaussian distributions
-----------------------------------------

In [0]:
# Per-class feature variances; `sigma_` was renamed to `var_` in scikit-learn 1.0
# and removed in 1.2, so fall back to the old name only when needed.
class_var = model.var_ if hasattr(model, "var_") else model.sigma_
# Plot margin per axis: half of the per-class standard deviations summed over classes.
dlim = np.sqrt(class_var).sum(axis=0)/2
plot2d_GaussianNB(model, X, y, dxlim=dlim[0], dylim=dlim[1])

In [0]:
# Classification test on a single query point.
# NOTE(review): (16, 6) presumably targets the scale of the Example 1 data — confirm.
Xt = np.array([[16,6]])
# `sigma_` was renamed to `var_` in scikit-learn 1.0 and removed in 1.2.
class_var = model.var_ if hasattr(model, "var_") else model.sigma_
# Class-conditional densities of the query point under the first two classes
print( N2d(Xt, model.theta_[0,:], class_var[0,:]) )
print( N2d(Xt, model.theta_[1,:], class_var[1,:]) )
# The original message omitted the value of Xt after "Xt = ".
print("Xt = ", Xt, "is classified into ", model.predict(Xt), "with probability", np.max(model.predict_proba(Xt)))

In [0]:
# Offer the saved figure as a browser download. `google.colab` exists only on
# Google Colab, so skip the download elsewhere instead of failing on import.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping download of 'rnd2d_ex1_GNB.png'.")
else:
    files.download("rnd2d_ex1_GNB.png")