In [1]:
import numpy as np
from data_loader import toy_dataset, load_digits
from gmm_sol import GMM as GMM_SOL
from gmm import GMM as GMM_SB
from utils import Figure
from matplotlib.patches import Ellipse

def compute_elipse_params(variance):
    '''
        Compute elipse params for plotting from variance

        The ellipse described is the level set x^T inv(variance) x = 1, i.e.
        the one-standard-deviation contour of a 2D Gaussian with this
        covariance.

        Parameters
        ----------
        variance : array-like of shape (2, 2)
            Covariance matrix. Must be invertible (np.linalg.inv raises
            numpy.linalg.LinAlgError otherwise).

        Returns
        -------
        (axis_1, axis_2, angle)
            axis_1 : semi-axis length along the direction given by `angle`.
            axis_2 : semi-axis length along the perpendicular direction.
            angle  : rotation of axis_1 from the x-axis, counter-clockwise,
                     in RADIANS, in the range [-pi/4, pi/4]. Convert with
                     np.degrees() before handing it to APIs that expect
                     degrees (e.g. matplotlib.patches.Ellipse).
    '''

    # http://www.cs.cornell.edu/cv/OtherPdf/Ellipse.pdf Slide 17
    # https://stackoverflow.com/a/41821484

    # Conic coefficients of a*x^2 + b*x*y + c*y^2 = 1.
    variance_inv = np.linalg.inv(variance)
    a = variance_inv[0, 0]
    c = variance_inv[1, 1]
    b = variance_inv[0, 1] + variance_inv[1, 0]

    # Symmetrise so the eigenvalues are guaranteed real; eigvalsh returns them
    # in ascending order, which removes any dependence on solver ordering.
    M = (variance_inv + variance_inv.T) / 2
    lambda_min, lambda_max = np.linalg.eigvalsh(M)

    # tan(2*angle) = b / (a - c). Using arctan2 with a non-negative second
    # argument reproduces arctan(b / (a - c)) whenever a != c, and stays well
    # defined when a == c (previously a division by zero that produced NaN for
    # isotropic covariances and an arbitrary axis assignment otherwise).
    if a >= c:
        # The direction at `angle` carries the larger eigenvalue of the
        # inverse covariance, i.e. the shorter semi-axis.
        lambda1, lambda2 = lambda_max, lambda_min
        angle = np.arctan2(b, a - c) / 2
    else:
        # The direction at `angle` carries the smaller eigenvalue of the
        # inverse covariance, i.e. the longer semi-axis.
        lambda1, lambda2 = lambda_min, lambda_max
        angle = np.arctan2(-b, c - a) / 2

    return np.sqrt(1 / lambda1), np.sqrt(1 / lambda2), angle


################################################################################
# GMM on 2D toy dataset
# The dataset is generated from N Gaussian distributions equally spaced on a
# circle of radius N.
# Here, N=4
# You should be able to visualize the learnt Gaussian distributions in the
# plots folder.
# Complete the implementation of the fit function for the GMM class in gmm.py
################################################################################
x, y = toy_dataset(4, 100)
init = ['k_means', 'random']
#init = ['k_means']

# Fit the reference implementation once per initialisation strategy, validate
# the fitted parameters, and draw one ellipse per mixture component.
for init_method in init:
    n_cluster = 4
    gmm = GMM_SOL(n_cluster=n_cluster, max_iter=1000, init=init_method, e=1e-6)
    iterations = gmm.fit(x)
    ll = gmm.compute_log_likelihood(x)

    assert gmm.means.shape == (n_cluster, 2), \
        'means should be numpy array with {}X2 shape'.format(n_cluster)

    assert gmm.variances.shape == (n_cluster, 2, 2), \
        'variances should be numpy array with {}X2X2 shape'.format(n_cluster)

    # The message previously had no '{}' placeholder, so the expected size was
    # silently dropped from the error text.
    assert gmm.pi_k.shape == (n_cluster,), \
        'pi_k should be numpy vector of size {}'.format(n_cluster)

    # Check the type first so a non-numeric return value fails with this
    # message rather than a TypeError from the comparison.
    assert type(iterations) == int and iterations > 0, \
        'Number of updates should be positive integer'

    assert type(ll) == float, 'log-likelihood should be float'

    print('GMM for toy dataset with {} init converged in {} iteration. Final log-likelihood of data: {}'.format(
        init_method, iterations, ll))

    #np.savez('results/gmm_toy_{}.npz'.format(init_method), iterations=iterations,
    #         variances=gmm.variances, pi_k=gmm.pi_k, means=gmm.means, log_likelihood=ll, x=x, y=y)

    # plot
    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=y)
    # fig.ax.scatter(gmm.means[:, 0], gmm.means[:, 1], c='red')
    for component in range(n_cluster):
        axis_1, axis_2, angle = compute_elipse_params(gmm.variances[component])
        # compute_elipse_params returns the rotation in radians, whereas
        # matplotlib's Ellipse takes `angle` in degrees.
        # width/height are full extents: 5 * std-dev = 2.5 std-devs per side.
        # The component weight is used as opacity.
        ellipse = Ellipse(xy=gmm.means[component], width=axis_1 * 5, height=axis_2 * 5,
                          angle=np.degrees(angle), alpha=gmm.pi_k[component])
        fig.ax.add_artist(ellipse)
    #fig.savefig('plots/gmm_toy_dataset_{}.png'.format(init_method))




GMM for toy dataset with k_means init converged in 9 iteration. Final log-likelihood of data: -1663.2697448261301
GMM for toy dataset with random init converged in 30 iteration. Final log-likelihood of data: -1663.2697447807122


In [2]:
# Same experiment as the reference run, but using the implementation from
# gmm.py (imported as GMM_SB): fit once per initialisation strategy, validate
# the fitted parameters, and draw one ellipse per mixture component.
for init_method in init:
    n_cluster = 4
    gmm_sb = GMM_SB(n_cluster=n_cluster, max_iter=1000, init=init_method, e=1e-6)
    iterations = gmm_sb.fit(x)
    ll = gmm_sb.compute_log_likelihood(x)

    assert gmm_sb.means.shape == (n_cluster, 2), \
        'means should be numpy array with {}X2 shape'.format(n_cluster)

    assert gmm_sb.variances.shape == (n_cluster, 2, 2), \
        'variances should be numpy array with {}X2X2 shape'.format(n_cluster)

    # The message previously had no '{}' placeholder, so the expected size was
    # silently dropped from the error text.
    assert gmm_sb.pi_k.shape == (n_cluster,), \
        'pi_k should be numpy vector of size {}'.format(n_cluster)

    # Check the type first so a non-numeric return value fails with this
    # message rather than a TypeError from the comparison.
    assert type(iterations) == int and iterations > 0, \
        'Number of updates should be positive integer'

    assert type(ll) == float, 'log-likelihood should be float'

    print('gmm_sb for toy dataset with {} init converged in {} iteration. Final log-likelihood of data: {}'.format(
        init_method, iterations, ll))

    #np.savez('results/gmm_sb_toy_{}.npz'.format(init_method), iterations=iterations,
    #         variances=gmm_sb.variances, pi_k=gmm_sb.pi_k, means=gmm_sb.means, log_likelihood=ll, x=x, y=y)

    # plot
    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=y)
    # fig.ax.scatter(gmm_sb.means[:, 0], gmm_sb.means[:, 1], c='red')
    for component in range(n_cluster):
        axis_1, axis_2, angle = compute_elipse_params(gmm_sb.variances[component])
        # compute_elipse_params returns the rotation in radians, whereas
        # matplotlib's Ellipse takes `angle` in degrees.
        # width/height are full extents: 5 * std-dev = 2.5 std-devs per side.
        # The component weight is used as opacity.
        ellipse = Ellipse(xy=gmm_sb.means[component], width=axis_1 * 5, height=axis_2 * 5,
                          angle=np.degrees(angle), alpha=gmm_sb.pi_k[component])
        fig.ax.add_artist(ellipse)
    #fig.savefig('plots/gmm_sb_toy_dataset_{}.png'.format(init_method))


gmm_sb for toy dataset with k_means init converged in 8 iteration. Final log-likelihood of data: -1663.2697448261306
gmm_sb for toy dataset with random init converged in 29 iteration. Final log-likelihood of data: -1663.2697447807122


In [3]:
# Inspect the mixing weights of the `gmm_sb` object left behind by the fitting
# loop, i.e. the model fitted with the last entry of `init` (assuming the cells
# were run in order).
gmm_sb.pi_k

array([0.25750014, 0.26733667, 0.2375816 , 0.2375816 ])

In [4]:
# Side-by-side dump of the fitted covariances: reference model first, then the
# gmm.py model.
# NOTE(review): in the recorded output the gmm_sb covariances are far larger
# and almost singular compared with the reference ones -- worth confirming
# against a fresh Restart & Run All.
print(gmm.variances, gmm_sb.variances, sep='\n')

[[[ 0.93780626 -0.039491  ]
  [-0.039491    0.86952032]]

 [[ 0.98443845 -0.12468089]
  [-0.12468089  1.05935247]]

 [[ 1.01095646  0.0670606 ]
  [ 0.0670606   1.01485421]]

 [[ 0.75326161 -0.01673937]
  [-0.01673937  1.04221188]]]
[[[17.53193796 17.53093796]
  [17.53093796 17.53193796]]

 [[17.18975191 17.18875191]
  [17.18875191 17.18975191]]

 [[18.04460737 18.04360737]
  [18.04360737 18.04460737]]

 [[18.04460737 18.04360737]
  [18.04360737 18.04460737]]]


In [11]:
# Scratch check: inner product of two 1-D vectors.
a = np.asarray([1, -2, 3])
b = np.asarray([2, 1, -1])
# Transposing a 1-D array is a no-op, so `a.T @ b` is simply `a @ b`.
a @ b

-3

In [14]:
# View `a` as a single column; -1 lets numpy infer the number of rows.
a.reshape(-1, 1)

array([[ 1],
       [-2],
       [ 3]])

In [9]:
# NOTE(review): this cell's execution count (9) is lower than that of the cell
# defining `b` as a 1-D array (11), and the recorded output (1, 3) does not
# match that 1-D definition -- the output is stale and comes from an earlier,
# no longer visible `b`. Re-run with Restart & Run All.
b.shape

(1, 3)