In [1]:
import numpy as np
import requests
import matplotlib.pyplot as plt
import math


In [2]:

# Define the myEM function to implement the EM algorithm:
# INPUT:
#   - data: the datase
#   - G: the number of components
#   - initial parameters
#   - itmax: the number of iterations
# OUTPUT:
#   - prob: a G-dimensional probability vector (p1, ..., pG)
#   - mean: A p-by-G matrix with the k-th column being μk, the p-dimensional mean for the k-th Gaussian component.
#   - Sigma: A p-by-p covariance matrix   shared by all G components;
#   - loglik: A number equal to sum_over_N(log(sum_over_G(pk*N(x;mu,sigma))))

def myEM(data, G, mu, sigma, p, itmax):

    likelihood = np.zeros((itmax,))

    for j in range(itmax):
        CondProb = Estep(data, mu, sigma, p, G)
        mu, sigma, p = Mstep(data,CondProb,G)
        likelihood[j] = loglik(data, mu, sigma, p, G, CondProb)

    return mu, sigma, p, likelihood


def Estep(data, mu, sigma, p, G):

    n = data.shape[0]
    d = data.shape[1]
    sigma_invers = np.linalg.inv(sigma)
    sigma_determ = np.linalg.det(sigma)

    Prob_matrix = np.zeros((n,n))
    gaussian_prob = np.zeros((n,G))
    CondProb = np.zeros((n,G))  # an nxG matrix, where each element represents a conditional probability
    factor = math.sqrt( ((2*math.pi)**d)* sigma_determ)
    Prob_ik = np.zeros((n,G))

    for i in range(G):
        mu_i = mu[:,i]
        p_i = p[i]
        dis_i = data - mu_i
        Prob_matrix = np.dot( np.dot(dis_i,sigma_invers), dis_i.T)
        Diag = np.array([Prob_matrix[i][i] for i in range(n)])
        gaussian_prob[:,i] = p_i/factor * np.exp((-1/2)*Diag)
    
    gaussiam_sum = np.sum(gaussian_prob,axis = 1).reshape(n,1)
    CondProb = gaussian_prob/gaussiam_sum

    return CondProb


def Mstep(data,CondProb,G):
    n = data.shape[0]
    d = data.shape[1]

    sum_CondProb = np.sum( CondProb, axis = 0) 
    p = sum_CondProb/n

    mu = np.zeros((d,G))
    sigma = np.zeros((d,d))
    
    for i in range(G):
        mu_i = []
        dis_i = []
        sigma_i = []
        CondProb_i = []
        
        CondProb_i = CondProb[:,i].reshape(n,1)
        mu_i = np.sum((CondProb_i * data),axis = 0)/np.sum(CondProb_i)
        dis_i = np.sqrt(CondProb_i)* (data - mu_i)
        sigma_i = np.dot(dis_i.T,dis_i)/np.sum(CondProb_i)
        mu[:,i] = mu_i
        sigma = sigma + sigma_i
    
    sigma = sigma/G
    
    return mu, sigma, p


def loglik(data, mu, sigma, p, G, CondProb):

    n = data.shape[0]
    d = data.shape[1]
    sigma_invers = np.linalg.inv(sigma)
    sigma_determ = np.linalg.det(sigma)
    factor = math.sqrt( ((2*math.pi)**d)* sigma_determ)

    # Prob_matrix = np.zeros((n,n))
    # gaussian_prob = np.zeros((n,G))
    Prob_ik = np.zeros((n,G))   

    for i in range(G):
        mu_i = mu[:,i]
        p_i = p[i]
        dis_i = data - mu_i
        CondProb_i = CondProb[:,i]
        
        Prob_matrix = np.dot( np.dot(dis_i,sigma_invers), dis_i.T)
        Diag = np.array([Prob_matrix[i][i] for i in range(n)])
        gaussian_prob = p_i/factor * np.exp((-1/2)*Diag)
        Prob_ik[:, i] = CondProb_i * np.log(gaussian_prob)
    
    g = np.sum(Prob_ik)
    return g


def Init(G,data):
    
    n = data.shape[0]
    dimX = data.shape[1]
    mu = np.zeros((dimX, G))
    sigma = np.zeros((dimX, dimX))
    p = np.zeros((G,))

    if G == 2:
        p[0] = 10/n
        p[1] = 1-p[0]
        mu[:,0] = np.mean(data[0:10,:], axis = 0)
        mu[:,1] = np.mean(data[10:,:], axis = 0)
        dis_0 = data[0:10,:] - mu[:,0]
        dis_1 = data[10:,] - mu[:,1]
        sigma = (np.dot(dis_0.T,dis_0) + np.dot(dis_1.T,dis_1))/n
        
    elif G == 3:
        p[0] = 10/n
        p[1] = 20/n
        p[2] = 1-p[0]-p[1]
        mu[:,0] = np.mean(data[0:10,:], axis = 0)
        mu[:,1] = np.mean(data[10:30,:], axis = 0)
        mu[:,2] = np.mean(data[30:,:], axis = 0)
        dis_0 = data[0:10,:] - mu[:,0]
        dis_1 = data[10:30,] - mu[:,1]
        dis_2 = data[30:,] - mu[:,2]
        sigma = (np.dot(dis_0.T,dis_0) + np.dot(dis_1.T,dis_1) + np.dot(dis_2.T,dis_2))/n

    else:
        print('Error: please enetr G=2 or G=3 ')
    
    # print(sigma.shape)
    if sigma.shape[0] != dimX:
        print('Error: wrong array dimension!')
    
    return mu, sigma, p

In [3]:
url = 'https://liangfgithub.github.io/Data/faithful.dat'
response = requests.get(url)
if response.status_code == 200:
    data = response.text

    lines = data.split('\n')[1:]
    data_list = []

    for line in lines:
        values = line.split()  # Assuming columns are separated by spaces
        if len(values) == 3:
            data_list.append([float(val) for val in values])

    # Convert the list of lists to a NumPy array
    data_array = np.array(data_list)
    print(data_array.shape)

else:
    print("Failed to download the data.")


(272, 3)


In [5]:
X = data_array[:,1:]

G = 2
itmax = 20

mu, sigma, p = Init(G,X)
print(p)
print(mu)
print(sigma)
print('\n')

mu_out, sigma_out, p_out, likelihood_out  = myEM(X, G, mu, sigma, p, itmax)

print(p_out)
print(mu_out)
print(sigma_out)

print(likelihood_out[itmax-1])



[0.03676471 0.96323529]
[[ 3.3032      3.49482824]
 [71.8        70.86259542]]
[[  1.29663847  13.93278021]
 [ 13.93278021 184.11269645]]


[0.03860024 0.96139976]
[[ 3.26732996  3.49663429]
 [72.58336774 70.82935345]]
[[  1.27249347  13.84625509]
 [ 13.84625509 185.26992199]]
-1331.5635363526021
