## Config

In [1]:
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

In [3]:
from numpy import outer, eye, ones, zeros, diag, log, sqrt, exp, pi
from numpy.linalg import inv, solve
from numpy.random import multivariate_normal as mvnormal, normal, gamma, beta, binomial
from scipy.special import gammaln
from scipy.stats import norm, multivariate_normal

from numpy import zeros
from numpy.random import randn

import numpy as np

import matplotlib.pyplot as plt
from numpy import arange, min, max, sqrt, mean, std
from scipy.spatial.distance import cosine

import copy

In [4]:
np.seterr(all='raise') # TODO REMOVE

{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'ignore'}

## EM-algorithm

In [14]:
# Aalto University, School of Science
# T-61.5140 Machine Learning: Advanced probabilistic Methods
# Author: antti.kangasraasio@aalto.fi, 2016

mm_cumulative_error = 0.0 # TODO REMOVE
mm_max_diff = 0.0 # TODO REMOVE

class EM_algo():
    """
        A superclass for different EM-fitted models.
    """

    def __init__(self, hyperparams, X=None, Y=None, ndata=0, pdata=0):
        """
            Initialize model based either on given data (X, Y) or
            on given data dimensionality (ndata, pdata).
        """
        if not X is None and not Y is None:
            self.X = X
            self.Y = Y
            self.ndata = len(self.X)
            self.pdata = len(self.X[0])
        if ndata and pdata:
            self.X = None
            self.Y = None
            self.ndata = ndata
            self.pdata = pdata
        self.h = hyperparams
        self.p = dict() # model parameters
        self.reset()
        if not X is None and not Y is None:
            self.current_logl, self.cll = self.logl()


    def reset(self):
        """
            Reset priors and draw parameter estimates from prior.
        """
        raise NotImplementedError("Subclass implements")


    def draw(self, item):
        """
            Draw a data sample from the current predictive distribution.
            Returns the drawn y and z-values.
        """
        raise NotImplementedError("Subclass implements")


    def logl(self):
        """
            Calculates the full log likelihood for this model.
            Returns the logl (and the values of each term for debugging purposes)
        """
        raise NotImplementedError("Subclass implements")


    def EM_iter(self):
        """
            Executes a single round of EM updates for this model.
        """
        raise NotImplementedError("Subclass implements")


    def EM_fit(self, alim=1e-10, maxit=1e4):
        """
            Calls the EM_iter repeatedly until the log likelihood
            of the model increases less than 'alim' in absolute
            value or after 'maxit' iterations have been done.

            Returns the number of EM-iterations, final log likelihood
            value and a string that explains the end condition.
        """
        logl, ll = self.logl()
        for i in range(int(maxit)):
            self.EM_iter()
            logl2, ll2 = self.logl()
            adiff = abs(logl2 - logl)
            if adiff < alim:
                return i+1, logl2, "alim"
            logl = logl2
        return maxit, logl2, "maxit"


    def assert_logl_increased(self, event):
        """
            Checks that the log likelihood increased since model
            initialization or the time this function was last called.
        """
        newlogl, ll = self.logl()
        global mm_cumulative_error # TODO REMOVE
        global mm_max_diff # TODO REMOVE
        if self.current_logl > newlogl:
            if mm_max_diff < self.current_logl - newlogl:
                mm_max_diff = self.current_logl - newlogl
            mm_cumulative_error = mm_cumulative_error + (self.current_logl - newlogl)
        if self.current_logl - newlogl > 0.5:
            print("Likelihood decreased by: " + str(self.current_logl - newlogl))
#         if self.current_logl - newlogl > 1e-3:
            raise ValueError("logl decreased after %s" % (event))
        self.current_logl, self.cll = newlogl, ll


    def get_p(self):
        """
            Returns a copy of the model parameters.
        """
        return copy.deepcopy(self.p)


    def set_p(self, p):
        """
            Sets the model parameters.
        """
        self.p = p.copy()


    def print_p(self):
        """
            Prints the model parameters, one at each line.
        """
        for k, v in self.p.items():
            print("%s = %s" % (k, v))


    def pretty_vector(self, x):
        """
            Returns a formatted version of a vector.
        """
        s = ["("]
        s.extend(["%.2f, " % (xi) for xi in x[:-1]])
        s.append("%.2f)" % (x[-1]))
        return "".join(s)


    def debug_logl(self, ll1, ll2):
        """
            Prints an analysis of the per-term change in
            log likelihood from ll1 to ll2.
        """
        print("Logl      before     after")
        for v1, v2, i in zip(ll1, ll2, range(len(ll1))):
            if v1 > v2:
                d = ">"
            elif v2 > v1:
                d = "<"
            else:
                d = "="
            print("Term %02d: %7.3f %s %7.3f" % (i, v1, d, v2))
        v1 = sum(ll1)
        v2 = sum(ll2)
        if v1 > v2:
            d = ">"
        elif v2 > v1:
            d = "<"
        else:
            d = "="
        diff = v2-v1
        print("Total    %7.3f %s %7.3f   diff: %7.3f" % (v1, d, v2, diff))



## Linear model

In [6]:
# Aalto University, School of Science
# T-61.5140 Machine Learning: Advanced probabilistic Methods
# Author: antti.kangasraasio@aalto.fi, 2016

class EM_algo_LM(EM_algo):
    """
        A linear gaussian model.
    """

    def reset(self):
        """
            Reset priors and draw parameter estimates from prior.
        """
        # priors
        self.lbd_phi0       = self.h["lbd_phi0"]
        self.alpha_s20      = self.h["alpha_s20"]
        self.beta_s20       = self.h["beta_s20"]
        self.sigma_phi0     = eye(self.pdata) * self.h["lbd_phi0"]
        self.sigma_phi0_inv = eye(self.pdata) / self.h["lbd_phi0"]
        self.mu_phi0        = ones(self.pdata) * self.h["mu_phi0"]

        # initial parameter estimates drawn from prior
        self.p           = dict()
        self.p["sigma2"] = 1.0 / gamma(self.alpha_s20, 1.0 / self.beta_s20) # inverse gamma
        self.p["phi"]    = mvnormal(self.mu_phi0, self.p["sigma2"] * self.sigma_phi0)


    def draw(self, item):
        """
            Draw a data sample from the current predictive distribution.
            Returns the y-value (and a constant z-value for compatibility)
        """
        mean = float(item.dot(self.p["phi"]))
        std  = sqrt(self.p["sigma2"])
        return normal(mean, std), 1


    def logl(self):
        """
            Calculates the full log likelihood for this model.
            Returns the logl (and the values of each term for debugging purposes)
        """
        ll    = zeros(8)
        phie  = self.p["phi"] - self.mu_phi0
        err   = (self.X.dot(self.p["phi"]) - self.Y) ** 2
        # p(y)
        ll[0] = - 0.5 * log(2 * pi * self.p["sigma2"]) * self.ndata
        ll[1] = sum(- 0.5 * err / self.p["sigma2"])
        # p(phi)
        ll[2] = - 0.5 * log(2 * pi * self.lbd_phi0 * self.p["sigma2"]) * self.pdata
        ll[3] = - 0.5 * phie.T.dot(phie) / (self.lbd_phi0 * self.p["sigma2"])
        # p(sigma2)
        ll[4] = self.alpha_s20 * log(self.beta_s20)
        ll[5] = - gammaln(self.alpha_s20)
        ll[6] = - (self.alpha_s20 + 1.0) * log(self.p["sigma2"])
        ll[7] = - self.beta_s20 / self.p["sigma2"]
        return sum(ll), ll


    def EM_iter(self):
        """
            Executes a single round of EM updates for this model.

            Has checks to make sure that updates increase logl and
            that parameter values stay in sensible limits.
        """
        # phi
        sumxx         = self.X.T.dot(self.X)
        sumxy         = self.X.T.dot(self.Y)
        sigma_mu      = self.sigma_phi0_inv.dot(self.mu_phi0)
        sigma_phi_inv = self.sigma_phi0_inv + sumxx
        self.p["phi"] = solve(sigma_phi_inv, sigma_mu + sumxy)
        self.assert_logl_increased("phi update")

        # sigma2
        phie = (self.p["phi"] - self.mu_phi0) ** 2
        err  = (self.X.dot(self.p["phi"]) - self.Y) ** 2
        num  = self.beta_s20 + 0.5 * sum(err) + 0.5 * sum(phie) / self.lbd_phi0
        den  = self.alpha_s20 + 1.0 + 0.5 * (self.ndata + self.pdata)
        self.p["sigma2"] = num / den
        if self.p["sigma2"] < 0.0:
            raise ValueError("sigma2 < 0.0")
        self.assert_logl_increased("sigma2 update")


    def print_p(self):
        """
            Prints the model parameters, one at each line.
        """
        print("phi    : %s" % (self.pretty_vector(self.p["phi"])))
        print("sigma2 : %.3f" % (self.p["sigma2"]))



## Generator

In [7]:
# Aalto University, School of Science
# T-61.5140 Machine Learning: Advanced probabilistic Methods
# Author: antti.kangasraasio@aalto.fi, 2016

def generate_X(ndata, pdata):
    """
        Return a matrix of normally distributed random values.
    """
    X = randn(ndata, pdata)
    return X


def generate_YZ(X, distribution):
    """
        Draw observations Y and latent variable values Z from a distribution.
    """
    ndata = len(X)
    Y = zeros(ndata)
    Z = zeros(ndata)
    for i in range(ndata):
        Y[i], Z[i] = distribution.draw(X[i])
    return Y, Z


def get_hyperp():
    """
        Return model hyperparameters.
    """
    return {
            "alpha_s20": 5.0,
            "beta_s20" : 1.0,
            "lbd_phi0" : 1.0,
            "mu_phi0"  : 0.0,
            "alpha_w0" : 3.0,
            "beta_w0"  : 3.0,
            }


## Mixture model

In [31]:
class EM_algo_MM(EM_algo):
    """
        A mixture of two linear models.
    """

    def reset(self):
        """
            Reset priors and draw parameter estimates from prior.
        """
        # priors
        self.alpha_w0       = self.h["alpha_w0"]
        self.beta_w0        = self.h["beta_w0"]

        # Same priors for phi1 and phi2, s2_1, s2_2, don't bother to copy vars twice
        # i.e. alpha_s2_1_0 = alpha_s2_2_0 = alpha_s20
        self.lbd_phi0       = self.h["lbd_phi0"]
        self.alpha_s20      = self.h["alpha_s20"]
        self.beta_s20       = self.h["beta_s20"]
        self.sigma_phi0     = eye(self.pdata) * self.h["lbd_phi0"]
        self.sigma_phi0_inv = eye(self.pdata) / self.h["lbd_phi0"]
        self.mu_phi0        = ones(self.pdata) * self.h["mu_phi0"]
        
        # Precalculations:
        self.w_gamma_ln_multiplier  = gammaln(self.alpha_w0 + self.beta_w0)
        self.w_gamma_ln_multiplier -= gammaln(self.alpha_w0)
        self.w_gamma_ln_multiplier -= gammaln(self.beta_w0)
        
        # initial parameter estimates drawn from prior
        self.p             = dict()
        # Weights
        self.p["w"]        = beta(self.alpha_w0, self.beta_w0)
        # Responsibilities
        self.gamma         = binomial(1, self.p["w"], self.ndata)
        # Component 1
        self.p["sigma2_1"] = 1.0 / gamma(self.alpha_s20, 1.0 / self.beta_s20) # inverse gamma
        self.p["phi_1"]    = mvnormal(self.mu_phi0, self.p["sigma2_1"] * self.sigma_phi0)
        # Component 2
        self.p["sigma2_2"] = 1.0 / gamma(self.alpha_s20, 1.0 / self.beta_s20) # inverse gamma
        self.p["phi_2"]    = mvnormal(self.mu_phi0, self.p["sigma2_2"] * self.sigma_phi0)


    def draw(self, item):
        """
            Draw a data sample from the current predictive distribution.
            Returns the y-value and z-value
        """
        mean1 = float(item.dot(self.p["phi_1"]))
        std1  = sqrt(self.p["sigma2_1"])
        mean2 = float(item.dot(self.p["phi_2"]))
        std2  = sqrt(self.p["sigma2_2"])
        
        if np.random.rand() < self.p["w"]:
            return normal(mean1, std1), 1
        else:
            return normal(mean2, std2), 0


    def logl(self, really=False):
        """
            Calculates the full log likelihood for this model.
            Returns the logl (and the values of each term for debugging purposes)
        """
        
        if not really:
            return self.incompletelogl()
        
        ll         = zeros(20)
        phi_1_diff = self.p["phi_1"] - self.mu_phi0
        phi_2_diff = self.p["phi_2"] - self.mu_phi0
        phi_1_err  = phi_1_diff.T.dot(phi_1_diff)
        phi_2_err  = phi_2_diff.T.dot(phi_2_diff)
        err_1      = (self.Y - self.X.dot(self.p["phi_1"])) ** 2
        err_2      = (self.Y - self.X.dot(self.p["phi_2"])) ** 2
        
        gamma = self.gamma
        
        ### posterior factorizes p(y,z,w,phi,sigma) = p(y,z)p(w)p(phi)p(sigma)
        #                                           = p(y)p(z)p(w)p(phi)p(sigma)
        
        ### p(y,z)
        ll[0] =     gamma.dot(    self.p["w"]  * norm.logpdf(self.Y, self.X.dot(self.p["phi_1"]), sqrt(self.p["sigma2_1"])) )
        ll[1] = (1-gamma).dot( (1-self.p["w"]) * norm.logpdf(self.Y, self.X.dot(self.p["phi_2"]), sqrt(self.p["sigma2_2"])) )
        
        ### p(z) already in p(y,z)
#         ll[4] = np.sum((gamma * log(self.p["w"])) + ((1 - gamma) * log(1 - self.p["w"])))
        
        ### p(w)
        ll[5] = self.w_gamma_ln_multiplier
        ll[6] = (self.alpha_w0 - 1) * self.p["w"]
        ll[7] = (self.beta_w0  - 1) * (1 - self.p["w"])

        ### p(phi)
        # phi_1
        ll[8]  = - 0.5 * ( self.pdata * log(2 * pi * self.p["sigma2_1"]) + log(self.lbd_phi0) )
        ll[9]  = - 0.5 * phi_1_err / (self.lbd_phi0 * self.p["sigma2_1"])
        # phi_2
        ll[10] = - 0.5 * ( self.pdata * log(2 * pi * self.p["sigma2_2"]) + log(self.lbd_phi0) )
        ll[11] = - 0.5 * phi_2_err / (self.lbd_phi0 * self.p["sigma2_2"])
        
        ### p(sigma2)
        # sigma2_1
        ll[12] = self.alpha_s20 * log(self.beta_s20)
        ll[13] = - gammaln(self.alpha_s20)
        ll[14] = - (self.alpha_s20 + 1.0) * log(self.p["sigma2_1"])
        ll[15] = - self.beta_s20 / self.p["sigma2_1"]
        # sigma2_2
        ll[16] = self.alpha_s20 * log(self.beta_s20)
        ll[17] = - gammaln(self.alpha_s20)
        ll[18] = - (self.alpha_s20 + 1.0) * log(self.p["sigma2_2"])
        ll[19] = - self.beta_s20 / self.p["sigma2_2"]
        
        return np.sum(ll), ll


    def incompletelogl(self):
        """
            Calculates the incomplete data log likelihood for this model.
            Returns the incomplete logl (and the values of each term for debugging purposes)
        """
        ll         = zeros(20)
        phi_1_diff = self.p["phi_1"] - self.mu_phi0
        phi_2_diff = self.p["phi_2"] - self.mu_phi0
        phi_1_err  = phi_1_diff.T.dot(phi_1_diff)
        phi_2_err  = phi_2_diff.T.dot(phi_2_diff)
        
        ### p(y)
        N1 = norm.pdf(self.Y, self.X.dot(self.p["phi_1"]), sqrt(self.p["sigma2_1"]))
        N2 = norm.pdf(self.Y, self.X.dot(self.p["phi_2"]), sqrt(self.p["sigma2_2"]))
        ll[0] = np.sum( np.log( self.p["w"]*N1 + (1-self.p["w"])*N2 ) )
        
        ### p(w)
        ll[1] = self.w_gamma_ln_multiplier
        ll[2] = (self.alpha_w0 - 1) * self.p["w"]
        ll[3] = (self.beta_w0  - 1) * (1 - self.p["w"])

        ### p(phi)
        # phi_1
        ll[4]  = - 0.5 * ( self.pdata * log(2 * pi * self.p["sigma2_1"]) + log(self.lbd_phi0) )
        ll[5]  = - 0.5 * phi_1_err / (self.lbd_phi0 * self.p["sigma2_1"])
        # phi_2
        ll[6] = - 0.5 * ( self.pdata * log(2 * pi * self.p["sigma2_2"]) + log(self.lbd_phi0) )
        ll[7] = - 0.5 * phi_2_err / (self.lbd_phi0 * self.p["sigma2_2"])
        
        ### p(sigma2)
        # sigma2_1
        ll[8] = self.alpha_s20 * log(self.beta_s20)
        ll[9] = - gammaln(self.alpha_s20)
        ll[10] = - (self.alpha_s20 + 1.0) * log(self.p["sigma2_1"])
        ll[11] = - self.beta_s20 / self.p["sigma2_1"]
        # sigma2_2
        ll[12] = self.alpha_s20 * log(self.beta_s20)
        ll[13] = - gammaln(self.alpha_s20)
        ll[14] = - (self.alpha_s20 + 1.0) * log(self.p["sigma2_2"])
        ll[15] = - self.beta_s20 / self.p["sigma2_2"]
        
        return np.sum(ll), ll


    def EM_iter(self):
        """
            Executes a single round of EM updates for this model.

            Has checks to make sure that updates increase logl and
            that parameter values stay in sensible limits.
        """
        
        # ==================== E-STEP ====================
      
        # norm.pdf works on a vector, returning probability for each separately
        propto_gamma1 =      self.p["w"]  * norm.pdf(self.Y, self.X.dot(self.p["phi_1"]), sqrt(self.p["sigma2_1"]))
        propto_gamma2 = (1 - self.p["w"]) * norm.pdf(self.Y, self.X.dot(self.p["phi_2"]), sqrt(self.p["sigma2_2"]))

        self.gamma = propto_gamma1 / (propto_gamma1 + propto_gamma2) # responsibilities

        # ==================== M-STEP ====================

        # ========== Component weights w ==========
        num = 2*np.sum(self.gamma) + self.alpha_w0 - 1
        den = 2*self.ndata + self.alpha_w0 + self.beta_w0 - 2
        self.p["w"] = num / den

        self.assert_logl_increased("w")
    
    
        # ========== Variances sigma2 ==========
        # phi_1 and phi_2 still have the previous value, i.e. from step s, we are calculating sigma for step s+1
        
        # sigma2_1
        phie = np.sum((self.p["phi_1"] - self.mu_phi0) ** 2)  / self.lbd_phi0
        phiX = self.p["phi_1"].dot(self.X.T)
        target_err = (self.Y - phiX)**2
        err = self.gamma.dot(target_err)
        num = 2*self.beta_s20 + err + phie
        den = 2*self.alpha_s20 + 2.0 + np.sum(self.gamma) + self.pdata
        self.p["sigma2_1"] = num / den
        if self.p["sigma2_1"] < 0.0:
            raise ValueError("sigma2_1 < 0.0")
        
        # sigma2_2
        phie = np.sum((self.p["phi_2"] - self.mu_phi0) ** 2)  / self.lbd_phi0
        phiX = self.p["phi_2"].dot(self.X.T)
        target_err = (self.Y - phiX)**2
        err = (1-self.gamma).dot(target_err)
        num = 2*self.beta_s20 + err + phie
        den = 2*self.alpha_s20 + 2.0 + np.sum(1-self.gamma) + self.pdata
        self.p["sigma2_2"] = num / den
        if self.p["sigma2_2"] < 0.0:
            raise ValueError("sigma2_2 < 0.0")

#         self.assert_logl_increased("sigma2 update")
        
        
        # ========== Variables phi ==========
        
        # phi_1
        sum_gammayx = self.gamma.T.dot( (Y * self.X.T).T )
        resp_matrix = eye(self.ndata) * self.gamma
        sum_gammaxx = self.X.T.dot(resp_matrix.dot(self.X))
        sigma_mu        = self.sigma_phi0_inv.dot(self.mu_phi0)
        sigma_phi_inv   = self.sigma_phi0_inv + sum_gammaxx
        self.p["phi_1"] = solve(sigma_phi_inv, sigma_mu + sum_gammayx)
        
        # phi_2
        sum_gammayx = (1-self.gamma).T.dot(  (Y * self.X.T).T  )
        resp_matrix = eye(self.ndata) * (1-self.gamma)
        sum_gammaxx = self.X.T.dot(resp_matrix.dot(self.X))
        sigma_mu        = self.sigma_phi0_inv.dot(self.mu_phi0)
        sigma_phi_inv   = self.sigma_phi0_inv + sum_gammaxx
        self.p["phi_2"] = solve(sigma_phi_inv, sigma_mu + sum_gammayx)

        self.assert_logl_increased("phi update")


In [19]:
# get hyperparameters for model
hyperp = get_hyperp()
# generate 50 training data and 20 validation data locations of dim=1
ndata = 50
ndata_v = 50
pdata = 3
X = generate_X(ndata, pdata)
X_v = generate_X(ndata_v, pdata)

true_model = EM_algo_MM(hyperp, ndata=ndata, pdata=pdata)
Y, Z = generate_YZ(X, true_model)
Y_v, Z_v = generate_YZ(X_v, true_model)
print("Generated %d training data and %d validation data from true model:" % \
    (ndata, ndata_v))
true_model.print_p()
print("")

Generated 50 training data and 50 validation data from true model:
phi_2 = [ 0.06157095 -0.04221531  0.31535768]
sigma2_1 = 0.14622875187497333
phi_1 = [ 0.1648381  -0.42739616  0.68170409]
w = 0.5654753035666211
sigma2_2 = 0.2009333607240098



In [None]:
# generate a model for estimating the parameters of the
# true model based on the observations (X, Y) we just made
# model = EM_algo_MM(hyperp, X, Y)
model = EM_algo_MM(hyperp, X, Y)
i, logl, r = model.EM_fit()
print("Model fit (logl %.2f) after %d iterations (%s reached)" % \
        (logl, i, r))
print("")
print("MAP estimate of true model parameters:")
model.print_p()
print("")

In [None]:
# if possible, plot samples, true model and estimated model
if pdata == 1:
    plt.figure(figsize=(20,10))
    plt.scatter(X, Y, s=20, c='black', label="Training data")
#         plt.scatter(X_v, Y_v, s=20, c='orange', label="Validation data")
    x = arange(min(X)-0.1, max(X)+0.1, 0.1)
#         print_linear_model(x, true_model.get_p()["phi"], \
#                 true_model.get_p()["sigma2"], 'red', "True model")
#         print_linear_model(x, model.get_p()["phi"], \
#                 model.get_p()["sigma2"], 'blue', "Predicted model")

    y = true_model.p["phi_1"] * x
    color = 'orange'
    plt.plot(x, y, color, label="true1")
#     plt.fill_between(x, y + 1.96 * sqrt(true_model.p["sigma2_1"]), y - 1.96 * sqrt(true_model.p["sigma2_1"]), alpha=0.1, facecolor=color, interpolate=True)
    
    y = true_model.p["phi_2"] * x
    color = 'green'
    plt.plot(x, y, color, label="true1")
#     plt.fill_between(x, y + 1.96 * sqrt(true_model.p["sigma2_2"]), y - 1.96 * sqrt(true_model.p["sigma2_2"]), alpha=0.1, facecolor=color, interpolate=True)

    # Components
    y = model.p["phi_1"] * x
    color = 'red'
    plt.plot(x, y, color, label="component1")
    plt.fill_between(x, y + 1.96 * sqrt(model.p["sigma2_1"]), y - 1.96 * sqrt(model.p["sigma2_1"]), alpha=0.25, facecolor=color, interpolate=True)

    y = model.p["phi_2"] * x
    color = 'blue'
    plt.plot(x, y, color, label="component2")
    plt.fill_between(x, y + 1.96 * sqrt(model.p["sigma2_2"]), y - 1.96 * sqrt(model.p["sigma2_2"]), alpha=0.25, facecolor=color, interpolate=True)

    plt.legend(loc=1)
    plt.xlim(min(x), max(x))
    plt.xlabel("x")
    plt.ylabel("y")
    plt.show()

In [None]:
mm_cumulative_error = 0.0
mm_max_diff = 0.0
foodict = {'w': 0, 'phi_1': 0, 'phi_2': 0, 'sigma2_1': 0, 'sigma2_2': 0}
bardict = {'w': 0, 'phi_1': 0, 'phi_2': 0, 'sigma2_1': 0, 'sigma2_2': 0}
counter = 0
counterOther = 0
counterAlim = 0
for truru in range(50):
    try:
        # generate a model for estimating the parameters of the
        # true model based on the observations (X, Y) we just made
        model = EM_algo_MM(hyperp, X, Y)
        i, logl, r = model.EM_fit()
        if r == "alim":
            counterAlim += 1
    except ValueError as e:
        counter = counter + 1
        foodict[str(e)] = foodict[str(e)] + 1
    except RuntimeError as e:
        counterOther = counterOther + 1
        bardict[str(e)] = bardict[str(e)] + 1

print(foodict)
print(counter, counterOther, counterAlim)
print(bardict)
print(mm_cumulative_error)
print(mm_max_diff)

In [None]:
# P=1 2*w check each
# {'phi_2': 0, 'w': 0, 'phi_1': 0, 'sigma2_2': 0, 'sigma2_1': 0}
# 0 191 809
# {'phi_2': 47, 'w': 0, 'phi_1': 39, 'sigma2_2': 104, 'sigma2_1': 1}
# 97.4147580405
# 3.59291931674
# P=1 w check each
# {'phi_2': 0, 'w': 0, 'phi_1': 0, 'sigma2_2': 0, 'sigma2_1': 0}
# 0 200 800
# {'phi_2': 71, 'w': 4, 'phi_1': 32, 'sigma2_2': 93, 'sigma2_1': 0}
# 92.1741867772
# 2.57140877529
# P=1 w check w and after
# {'phi_2': 0, 'w': 0, 'phi_1': 0, 'sigma2_2': 0, 'sigma2_1': 0}
# 0 2 998
# {'phi_2': 0, 'w': 2, 'phi_1': 0, 'sigma2_2': 0, 'sigma2_1': 0}
# 12.8199038625
# 0.191820591439
# P=1 2*w check w and after
# {'phi_2': 0, 'w': 0, 'phi_1': 0, 'sigma2_2': 0, 'sigma2_1': 0}
# 0 0 1000
# {'phi_2': 0, 'w': 0, 'phi_1': 0, 'sigma2_2': 0, 'sigma2_1': 0}
# 2.98402674426
# 0.0398580346214
# prev with update to phi logl calc formula...
# {'phi_2': 0, 'w': 0, 'phi_1': 0, 'sigma2_2': 0, 'sigma2_1': 0}
# 0 0 1000
# {'phi_2': 0, 'w': 0, 'phi_1': 0, 'sigma2_2': 0, 'sigma2_1': 0}
# 2.84134098953
# 0.0422773055005
# {'phi_2': 0, 'w': 0, 'phi_1': 0, 'sigma2_2': 0, 'sigma2_1': 0}
# 0 0 1000
# {'phi_2': 0, 'w': 0, 'phi_1': 0, 'sigma2_2': 0, 'sigma2_1': 0}
# 5.68757981874
# 0.0336705559807

In [None]:
# P=25 2*w check?
# {'phi_2': 0, 'w': 0, 'phi_1': 0, 'sigma2_2': 0, 'sigma2_1': 0}
# 0 0 1000
# {'phi_2': 0, 'w': 0, 'phi_1': 0, 'sigma2_2': 0, 'sigma2_1': 0}
# 3.08312064566
# 0.00761159372594
# {'phi_2': 0, 'w': 0, 'phi_1': 0, 'sigma2_2': 0, 'sigma2_1': 0}
# 0 0 1000
# {'phi_2': 0, 'w': 0, 'phi_1': 0, 'sigma2_2': 0, 'sigma2_1': 0}
# 3.37453377128
# 0.00951989469854

## Main

In [None]:
# Aalto University, School of Science
# T-61.5140 Machine Learning: Advanced probabilistic Methods
# Author: antti.kangasraasio@aalto.fi, 2016

def main():
    """
        Executed when program is run.
    """
    print("Starting program")
    print("")
    test_LM_model()


def test_LM_model():
    """
        Example that demonstrates how to call the model.
    """
    # get hyperparameters for model
    hyperp = get_hyperp()
    # generate 50 training data and 20 validation data locations of dim=1
    ndata = 20
    ndata_v = 50
    pdata = 3
    X = generate_X(ndata, pdata)
    X_v = generate_X(ndata_v, pdata)
    # intialize true model randomly and draw observations from it
    true_model = EM_algo_LM(hyperp, ndata=ndata, pdata=pdata)
    Y, Z = generate_YZ(X, true_model)
    Y_v, Z_v = generate_YZ(X_v, true_model)
    print("Generated %d training data and %d validation data from true model:" % \
            (ndata, ndata_v))
    true_model.print_p()
    print("")

    # generate a model for estimating the parameters of the
    # true model based on the observations (X, Y) we just made
    model = EM_algo_LM(hyperp, X, Y)
    i, logl, r = model.EM_fit()
    print("Model fit (logl %.2f) after %d iterations (%s reached)" % \
            (logl, i, r))
    print("")
    print("MAP estimate of true model parameters:")
    model.print_p()
    print("")

    # crossvalidate the estimated model with the validation data
    fit_params = model.get_p()
    model_v = EM_algo_LM(hyperp, X_v, Y_v)
    model_v.set_p(fit_params)
    logl, ll = model_v.logl()
    print("Crossvalidated logl: %.2f" % (logl))

    # if possible, plot samples, true model and estimated model
    if pdata != 1:
        return
    plt.figure(figsize=(20,10))
    plt.scatter(X, Y, s=20, c='black', label="Training data")
    plt.scatter(X_v, Y_v, s=20, c='orange', label="Validation data")
    x = arange(min(X)-0.1, max(X)+0.1, 0.1)
    print_linear_model(x, true_model.get_p()["phi"], \
            true_model.get_p()["sigma2"], 'red', "True model")
    print_linear_model(x, model.get_p()["phi"], \
            model.get_p()["sigma2"], 'blue', "Predicted model")
    plt.legend(loc=1)
    plt.xlim(min(x), max(x))
    plt.xlabel("x")
    plt.ylabel("y")
    plt.show()
    


def print_linear_model(x, phi, sigma2, color, label):
    """
        Print linear model mean and 95% confidence interval.
    """
    y = phi * x
    plt.plot(x, y, color, label=label)
    plt.fill_between(x, y + 1.96 * sqrt(sigma2), y - 1.96 * sqrt(sigma2), \
            alpha=0.25, facecolor=color, interpolate=True)


if __name__ == "__main__":
    main()



## Problem 4 - Test mixture model

Generate training and validation data from the mixture model. Analyze how well the model (trained with the training data) can explain the validation data with different data dimensionality and different amounts of generated data when you only do the fitting once. How do the results change, and why, when you start the EM from multiple locations and choose the best fit? 

In [27]:
dimensions = []
datasamples = []


for exponent in range(0, 5):
    dimensions.append(2 ** exponent)
for exponent in range(3, 8):
    datasamples.append(2 ** exponent)

dimensions.reverse()
datasamples.reverse()

dimensions, datasamples

([16, 8, 4, 2, 1], [128, 64, 32, 16, 8])

Loop over combinations of dimensions and dataset sizes and 

1. run a single EM with this setting 
2. run multiple EM's with the same setting
3. compare / log the results

In [33]:
validation_set_size = 0.3
num_multi_inits = 100
epoch = 0

results_table = ("P \t" + "T\t" 
                 + "Single Train\t" + "Multi Train\t" + "δ Train\t\t" 
                 + "Single Test\t" + "Multi Test\t" + "δ Test" + "\n")

for P in dimensions:
    for T in datasamples:
        
        ## INIT
        
        # get hyperparameters for model
        hyperp = get_hyperp()
        # generate data
        ndata = T
        ndata_v = np.ceil(T * 0.3)
        pdata = P
        X = generate_X(ndata, pdata)
        X_v = generate_X(ndata_v, pdata)
        # generate true model
        true_model = EM_algo_MM(hyperp, ndata=ndata, pdata=pdata)
        Y, Z = generate_YZ(X, true_model)
        Y_v, Z_v = generate_YZ(X_v, true_model)
        
        ## Do a single run of EM:
                
        # train
        model_train = EM_algo_MM(hyperp, X, Y)
        i, logl_train_single, r = model_train.EM_fit()
        # test / validate
        fit_params = model_train.get_p()
        model_test = EM_algo_MM(hyperp, X_v, Y_v)
        model_test.set_p(fit_params)
        logl_test_single, ll = model_test.logl()
                    
        ## Do multiple initializations of EM:
        
        # train
        best_model = EM_algo_MM(hyperp, X, Y)
        i, best_logl_train_multi, r = best_model.EM_fit()
        for k in range(0, num_multi_inits):
            
            # print progress
            epoch += 1
            total = len(dimensions) * len(datasamples) * num_multi_inits
            percent = epoch/total*100
            print("%.2f" % percent, end="\r")
            
            model_train = EM_algo_MM(hyperp, X, Y)
            i, logl_train_multi, r = model_train.EM_fit()
            if (logl_train_multi > best_logl_train_multi):
                best_logl_train_multi = logl_train_multi
                best_model = model_train
                
        # test / validate
        fit_params = best_model.get_p()
        model_test = EM_algo_MM(hyperp, X_v, Y_v)
        model_test.set_p(fit_params)
        logl_test_multi, ll = model_test.logl()
        
#         # nice print 
#         results_table += (str(P) + "\t" + str(T) + "\t" 
#                           + "%.2f" % logl_train_single + "\t\t"
#                           + "%.2f" % best_logl_train_multi + "\t\t"
#                           + "%.2f" % abs(logl_train_single - best_logl_train_multi) + "\t\t"
#                           + "%.2f" % logl_test_single + "\t\t"
#                           + "%.2f" % logl_test_multi + "\t\t"
#                           + "%.2f" % abs(logl_test_single - logl_test_multi) + "\n")

        # latex table print
        results_table += (str(P) + " & " + str(T) + " & " 
                          + "%.2f" % logl_train_single + " & "
                          + "%.2f" % best_logl_train_multi + " & "
                          + "%.2f" % (logl_train_single - best_logl_train_multi) + " & "
                          + "%.2f" % logl_test_single + " & "
                          + "%.2f" % logl_test_multi + " & "
                          + "%.2f" % (logl_test_single - logl_test_multi) + "\\\\\n")
            
print(" ")
print("DONE! Multi inits used " + str(num_multi_inits) +" initializations.")
print(" ")
print(results_table)

 
DONE! Multi inits used 100 initializations.
 
P 	T	Single Train	Multi Train	δ Train		Single Test	Multi Test	δ Test
16 & 128 & -239.79 & -150.54 & -89.25 & -118.51 & -77.28 & -41.23\\
16 & 64 & -47.09 & -47.09 & -0.00 & -60.07 & -60.07 & 0.00\\
16 & 32 & -28.29 & -22.46 & -5.84 & -48.46 & -48.81 & 0.35\\
16 & 16 & -3.55 & -3.55 & -0.00 & -105.22 & -105.22 & 0.00\\
16 & 8 & 15.53 & 15.53 & -0.00 & 12.41 & 12.41 & 0.00\\
8 & 128 & -116.32 & -116.32 & -0.00 & -45.82 & -45.82 & 0.00\\
8 & 64 & -27.83 & -27.83 & -0.00 & -17.98 & -17.98 & -0.00\\
8 & 32 & -41.25 & -26.56 & -14.69 & -37.56 & -25.33 & -12.23\\
8 & 16 & -5.51 & -5.01 & -0.51 & -6.24 & -6.14 & -0.10\\
8 & 8 & -4.47 & -0.37 & -4.09 & -9.77 & -14.14 & 4.36\\
4 & 128 & -78.04 & -78.04 & -0.00 & -15.99 & -15.99 & -0.00\\
4 & 64 & -35.86 & -35.86 & -0.00 & -17.61 & -17.61 & 0.00\\
4 & 32 & -19.28 & -19.28 & -0.00 & -8.96 & -8.96 & 0.00\\
4 & 16 & -5.36 & -5.36 & -0.00 & 0.27 & 0.27 & 0.00\\
4 & 8 & -6.85 & -6.85 & -0.00 & -3.41 & -3



## Problem 5 - Compare with linear

Compare the two models (simple linear model and mixture with two linear compo- nents). Do the analyses with both low (eg. 2) and high (eg. 10) data dimensionality as well as with small (eg. 10) and large (eg. 100) amount of samples. Use separate validation set as before.

In [46]:
dimensions_ex5 = [10, 2]
datasamples_ex5 = [100, 10]

Helper function for multi-init train/validation

In [51]:
def test_train(model_class, hyperp, X, Y, X_v, Y_v, inits=5, really=False):
    # train
    best_model = model_class(hyperp, X, Y)
    i, best_logl_train, r = best_model.EM_fit()
    for k in range(0, inits):
        model_train = model_class(hyperp, X, Y)
        i, logl_train, r = model_train.EM_fit()
        if (logl_train > best_logl_train):
            best_logl_train = logl_train
            best_model = model_train

    # test / validate
    fit_params = best_model.get_p()
    model_test = model_class(hyperp, X_v, Y_v)
    model_test.set_p(fit_params)
    if not really:
        logl_test, ll = model_test.logl()
    else:
        logl_test, ll = model_test.logl(really=True)
    
    return best_logl_train, logl_test

### Draw data from the simple linear model, analyze how well each of the candidate models is able to explain the data. 

In [53]:
validation_set_size = 0.3
inits = 5
epoch = 0

results_table = ("P &" + "T\t" + "LM Train\t" + " LM Test\t"
                 + "MM Train\t" + " MM Test\t" + "\n")

for P in dimensions_ex5:
    for T in datasamples_ex5:
        
        # print progress
        epoch += 1
        total = len(dimensions_ex5) * len(datasamples_ex5) 
        percent = epoch/total
        print("epoch " + str(epoch) + " / " + str(total) 
              + " : P=" + str(P) + " T=" + str(T), end="\r")
        
        ## INIT
        
        # get hyperparameters for model
        hyperp = get_hyperp()
        # generate data
        ndata = T
        ndata_v = np.ceil(T * 0.3)
        pdata = P
        X = generate_X(ndata, pdata)
        X_v = generate_X(ndata_v, pdata)

        # generate true linear model
        true_model = EM_algo_LM(hyperp, ndata=ndata, pdata=pdata)
        Y, Z = generate_YZ(X, true_model)
        Y_v, Z_v = generate_YZ(X_v, true_model)
                    
        ## Fit Linear Model:
        LM_best_logl_train, LM_logl_test = test_train(EM_algo_LM, hyperp, X, Y, X_v, Y_v)
        
        ## Fit Mixture Model:
        MM_best_logl_train, MM_logl_test = test_train(EM_algo_MM, hyperp, X, Y, X_v, Y_v, really=True)
        
        results_table += (str(P) + " & " + str(T) + " & " 
                          + ("%.2f" % LM_best_logl_train) + " & "
                          + ("%.2f" % MM_best_logl_train) + " & "
                          + ("%.2f" % LM_logl_test) + " & "
                          + ("%.2f" % MM_logl_test) + " \\\\ " 
                          + "\n")        

print(" ")
print("DONE! Drawn from linear model and results of linear model and mixture model")
print(" ")
print(results_table)

 
DONE! Drawn from linear model and results of linear model and mixture model
 
P &T	LM Train	 LM Test	MM Train	 MM Test	
10 & 100 & -92.81 & -82.74 & -31.51 & -25.79 \\ 
10 & 10 & -10.87 & -2.51 & -13.81 & -4.10 \\ 
2 & 100 & -73.44 & -69.05 & -17.29 & -2.97 \\ 
2 & 10 & -2.37 & 4.07 & 0.10 & 7.03 \\ 





### Draw data from the mixture model, analyze how well each of the candidate models is able to explain the data. 

In [58]:
validation_set_size = 0.3
inits = 5
epoch = 0

results_table = ("P &" + "T\t" + "LM Train\t" + "MM Train\t"
                 + "LM Test\t" + " MM Test\t" + "\n")

for P in dimensions_ex5:
    for T in datasamples_ex5:
        
        # print progress
        epoch += 1
        total = len(dimensions_ex5) * len(datasamples_ex5)
        percent = epoch/total
        print("epoch " + str(epoch) + " / " + str(total) 
              + " : P=" + str(P) + " T=" + str(T), end="\r")
        
        ## INIT
        
        # get hyperparameters for model
        hyperp = get_hyperp()
        # generate data
        ndata = T
        ndata_v = np.ceil(T * 0.3)
        pdata = P
        X = generate_X(ndata, pdata)
        X_v = generate_X(ndata_v, pdata)

        # generate true linear model
        true_model = EM_algo_MM(hyperp, ndata=ndata, pdata=pdata)
        Y, Z = generate_YZ(X, true_model)
        Y_v, Z_v = generate_YZ(X_v, true_model)
                    
        ## Fit Linear Model:
        LM_best_logl_train, LM_logl_test = test_train(EM_algo_LM, hyperp, X, Y, X_v, Y_v)
        
        ## Fit Mixture Model:
        MM_best_logl_train, MM_logl_test = test_train(EM_algo_MM, hyperp, X, Y, X_v, Y_v, really=True)
        
        results_table += (str(P) + " & " + str(T) + " & " 
                          + ("%.2f" % LM_best_logl_train) + " & "
                          + ("%.2f" % MM_best_logl_train) + " & "
                          + ("%.2f" % LM_logl_test) + " & "
                          + ("%.2f" % MM_logl_test) + " \\\\ " 
                          + "\n")        

print(" ")
print("DONE! Drawn from mixture model and results of linear model and mixture model")
print(" ")
print(results_table)

 
DONE! Drawn from mixture model and results of linear model and mixture model
 
P &T	LM Train	MM Train	LM Test	 MM Test	
10 & 100 & -129.27 & -78.85 & -63.96 & -131.84 \\ 
10 & 10 & -2.00 & 6.86 & -48.73 & -26.53 \\ 
2 & 100 & -120.92 & -108.22 & -38.02 & -16.98 \\ 
2 & 10 & -17.21 & -9.64 & -6.82 & 1.21 \\ 





### Draw data from the mixture model, analyze which candidate model is able to explain the data better as a function of the similarity of the two linear components in the true model (e.g. cosine similarity). Explain your findings. 