In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.special import logsumexp
from matplotlib import pyplot as plt


def get_random_psd(n):
    """Return a random symmetric positive semi-definite ``n`` x ``n`` matrix.

    An ``n`` x ``n`` matrix ``A`` of standard-normal draws is taken from
    NumPy's global random state and the Gram matrix ``A A^T`` is returned,
    which is symmetric and positive semi-definite by construction.
    """
    gaussian = np.random.normal(0, 1, size=(n, n))
    return gaussian.dot(gaussian.T)


def initialize_random_params(k, d=2):
    """Create a random starting point for a ``k``-component Gaussian mixture.

    Parameters
    ----------
    k : int
        Number of mixture components.
    d : int, optional
        Dimensionality of the mean vectors and covariance matrices.
        Defaults to 2, the value that was previously hard-coded.

    Returns
    -------
    dict
        ``'phi'``   -- ndarray of shape ``(k,)`` with mixing weights summing to 1,
        ``'mu'``    -- list of ``k`` mean vectors of shape ``(d,)``,
        ``'sigma'`` -- list of ``k`` matrices of shape ``(d, d)`` produced by
        ``get_random_psd``.

    All draws use NumPy's global random state, in the order phi, mu, sigma.
    """
    # Raw uniform draws do not sum to one. Normalising them makes phi a valid
    # categorical distribution, so the very first log-likelihood is not offset
    # by log(sum(phi)). The posteriors are unaffected by this rescaling.
    raw_weights = np.random.uniform(0, 1, size=(k,))
    params = {'phi': raw_weights / np.sum(raw_weights),
              'mu': [np.random.normal(0, 1, size=(d,)) for _ in range(k)],
              'sigma': [get_random_psd(d) for _ in range(k)]}
    return params



def learn_params(x_labeled, y_labeled):
    """Estimate Gaussian-mixture parameters from labelled samples.

    Parameters
    ----------
    x_labeled : array-like
        Samples; the first axis indexes samples (``x_labeled.shape[0]`` is
        used as the sample count).
    y_labeled : array-like
        One class label per sample. The labels may be any values accepted by
        ``np.unique``; they do not have to be ``0..k-1``.

    Returns
    -------
    dict
        ``'phi'``   -- list with the fraction of samples in each class,
        ``'mu'``    -- list with the per-class mean vector,
        ``'sigma'`` -- list with the per-class covariance (``np.cov`` with
        ``bias=True``, i.e. normalised by the class size).
        Entry ``i`` of every list belongs to the ``i``-th smallest label.
    """
    n = x_labeled.shape[0]
    # Iterate over the labels that actually occur. Assuming labels are exactly
    # 0..k-1 produced empty slices (and NaN parameters) for any other label set.
    masks = [y_labeled == label for label in np.unique(y_labeled)]
    phi = [np.sum(mask) / n for mask in masks]
    mu = [np.sum(x_labeled[mask], axis=0) / np.sum(mask) for mask in masks]
    sigma = [np.cov(x_labeled[mask].T, bias=True) for mask in masks]
    return {'phi': phi, 'mu': mu, 'sigma': sigma}


def e_step(x, params):
    """Evaluate the mixture on ``x``: per-sample log-likelihood and posteriors.

    Parameters
    ----------
    x : ndarray
        Samples; the first axis indexes samples.
    params : dict
        Mixture parameters with keys ``'phi'`` (weights), ``'mu'`` (means)
        and ``'sigma'`` (covariances), one entry per component.

    Returns
    -------
    tuple
        ``(log_likelihood, posterior)``: ``log_likelihood[n]`` is the log of
        the mixture density at sample ``n``; ``posterior[n, j]`` is the
        probability that sample ``n`` belongs to component ``j``.
    """
    weights, means, covariances = params['phi'], params['mu'], params['sigma']
    n_components = len(weights)

    # Column j holds log(phi_j) + log N(x | mu_j, sigma_j); every column is
    # filled below, so the buffer does not need to be zero-initialised.
    log_joint = np.empty((x.shape[0], n_components))
    for j in range(n_components):
        log_density = stats.multivariate_normal.logpdf(x, means[j], covariances[j])
        log_joint[:, j] = np.log(weights[j]) + log_density

    # Normalise in log space to avoid underflow when exponentiating.
    log_evidence = logsumexp(log_joint, axis=1)
    posterior = np.exp(log_joint - log_evidence[:, np.newaxis])
    return log_evidence, posterior


def m_step(x, params):
    """Perform one EM update and return the re-estimated mixture parameters.

    The posteriors are first computed with ``e_step`` under the current
    ``params``; weights, means and covariances are then re-estimated as
    posterior-weighted averages over the samples in ``x``.

    Returns a new dict with keys ``'phi'`` (ndarray), ``'mu'`` (list) and
    ``'sigma'`` (list); ``params`` itself is not modified.
    """
    n_components = len(params['phi'])
    n_samples = x.shape[0]
    _, posterior = e_step(x, params)

    phi = np.sum(posterior, axis=0) / n_samples

    mu = []
    sigma = []
    for j in range(n_components):
        resp = posterior[:, j, np.newaxis]      # column vector of responsibilities
        resp_total = np.sum(posterior[:, j])    # effective sample count of component j
        mean_j = np.sum(resp * x, axis=0) / resp_total
        centered = x - mean_j
        mu.append(mean_j)
        # Responsibility-weighted scatter matrix around the new mean.
        sigma.append(centered.T.dot(centered * resp) / resp_total)

    return {'phi': phi, 'mu': mu, 'sigma': sigma}


def get_avg_log_likelihood(x, params):
    """Return the mean per-sample log-likelihood of ``x`` under ``params``.

    The per-sample values are the first element returned by ``e_step``.
    """
    per_sample_loglikelihood = e_step(x, params)[0]
    return np.mean(per_sample_loglikelihood)


def run_em(x, params, tol=0.0001, max_iter=1000):
    """Fit a Gaussian mixture to ``x`` with EM, starting from ``params``.

    Parameters
    ----------
    x : ndarray
        Samples; the first axis indexes samples.
    params : dict
        Initial parameters with keys ``'phi'``, ``'mu'`` and ``'sigma'``.
    tol : float, optional
        Iteration stops once the average log-likelihood changes by less than
        ``tol`` between two consecutive evaluations (checked only after at
        least three evaluations, as before). Defaults to 0.0001.
    max_iter : int, optional
        Upper bound on the number of likelihood evaluations, so that a run
        which never meets ``tol`` still terminates. Defaults to 1000.

    Returns
    -------
    tuple
        ``(forecasts, posterior, avg_loglikelihoods)``: the most probable
        component per sample, the posterior matrix from ``e_step``, and the
        list of average log-likelihoods recorded at each evaluation.

    The final parameters and a convergence status line are printed.
    """
    avg_loglikelihoods = []
    converged = False
    for _ in range(max_iter):
        avg_loglikelihood = get_avg_log_likelihood(x, params)
        avg_loglikelihoods.append(avg_loglikelihood)
        # A NaN likelihood can never satisfy the tolerance test (every
        # comparison with NaN is False), so further iterations are pointless.
        if np.isnan(avg_loglikelihood):
            break
        # Stop when the log-likelihood is no longer changing.
        if len(avg_loglikelihoods) > 2 and abs(avg_loglikelihoods[-1] - avg_loglikelihoods[-2]) < tol:
            converged = True
            break
        params = m_step(x, params)

    if converged:
        print("EM algorithm converged.")
    else:
        print(f"EM algorithm stopped without converging after {len(avg_loglikelihoods)} likelihood evaluations.")
    print("Final Parameters:")
    print(f"\tphi: {params['phi']}")
    for i in range(len(params['mu'])):
        print(f"\tmu_{i}: {params['mu'][i]}")
    for i in range(len(params['sigma'])):
        print(f"\tsigma_{i}:\n{params['sigma'][i]}")

    # Hard assignment: each sample goes to its most probable component.
    _, posterior = e_step(x, params)
    forecasts = np.argmax(posterior, axis=1)
    return forecasts, posterior, avg_loglikelihoods

In [2]:
# Location of the data file (space-separated text without a header row)
file_path = "https://www.ccs.neu.edu/home/vip/teach/DMcourse/2_cluster_EM_mixt/HW2/2gaussian.txt"

# Parse the two columns into a DataFrame, then keep the raw array for EM
df = pd.read_csv(
    file_path,
    sep=" ",
    header=None,
    names=['x1', 'x2'],
)
x = df.to_numpy()

### Results for 2 gaussian

In [3]:
# Unsupervised learning: fit a 2-component mixture with EM from a random start.
# Seed NumPy's global RNG so that Restart & Run All reproduces the same
# initialisation; EM may reach a different local optimum from another seed.
np.random.seed(0)
print("Learned Parameter For 2 gaussian: ")
random_params = initialize_random_params(2)
unsupervised_forecasts, unsupervised_posterior, unsupervised_loglikelihoods = run_em(x, random_params)
print("total steps: ", len(unsupervised_loglikelihoods))

print(unsupervised_posterior)

Learned Parameter For 2 gaussian: 
EM algorithm converged.
Final Parameters:
	phi: [0.66766727 0.33233273]
	mu_0: [7.00548219 3.97874915]
	mu_1: [2.97974693 3.05400601]
	sigma_0:
[[0.98838313 0.50522166]
 [0.50522166 1.00761801]]
	sigma_1:
[[0.98688646 0.03010652]
 [0.03010652 2.94910468]]
total steps:  19
[[9.99990424e-01 9.57645720e-06]
 [9.99985386e-01 1.46135232e-05]
 [1.85820837e-03 9.98141792e-01]
 ...
 [9.78646884e-01 2.13531160e-02]
 [9.99999751e-01 2.49061671e-07]
 [1.48525386e-04 9.99851475e-01]]


In [4]:
# Split the samples by the component index stored in unsupervised_forecasts
cluster_0_data, cluster_1_data = (
    x[unsupervised_forecasts == label] for label in (0, 1)
)

# Empirical mean of each cluster
mean_cluster_0, mean_cluster_1 = (
    np.mean(points, axis=0) for points in (cluster_0_data, cluster_1_data)
)

# Covariance matrix of each cluster (np.cov with its default normalisation)
cov_cluster_0, cov_cluster_1 = (
    np.cov(points.T) for points in (cluster_0_data, cluster_1_data)
)

# Cluster sizes
print('n1=',len(cluster_0_data))
print('n2=',len(cluster_1_data))

# Per-cluster statistics
print(f"mean_cluster_0 = {mean_cluster_0}")
print(f"mean_cluster_1 = {mean_cluster_1}")
print(f"cov_cluster_0 = {cov_cluster_0}")
print(f"cov_cluster_1 = {cov_cluster_1}")

n1= 4015
n2= 1985
mean_cluster_0 = [7.00972229 3.98264004]
mean_cluster_1 = [2.95292525 3.04194494]
cov_cluster_0 = [[0.95711426 0.48533666]
 [0.48533666 0.99173716]]
cov_cluster_1 = [[0.90780697 0.01158188]
 [0.01158188 2.97341593]]


### Results for 3 gaussian

In [5]:
# Location of the data file (space-separated text without a header row)
file_path = "https://www.ccs.neu.edu/home/vip/teach/DMcourse/2_cluster_EM_mixt/HW2/3gaussian.txt"

# Parse the two columns into a DataFrame, then keep the raw array for EM
df = pd.read_csv(
    file_path,
    sep=" ",
    header=None,
    names=['x1', 'x2'],
)
x = df.to_numpy()

In [6]:
# Unsupervised learning: fit a 3-component mixture with EM from a random start.
# Seed NumPy's global RNG so that Restart & Run All reproduces the same
# initialisation; EM may reach a different local optimum from another seed.
np.random.seed(0)
# The label previously said "2 gaussian" although three components are fitted.
print("Learned Parameter For 3 gaussian: ")
random_params = initialize_random_params(3)
unsupervised_forecasts, unsupervised_posterior, unsupervised_loglikelihoods = run_em(x, random_params)
print("total steps: ", len(unsupervised_loglikelihoods))

Learned Parameter For 2 gaussian: 
EM algorithm converged.
Final Parameters:
	phi: [0.0239966  0.81364877 0.16235464]
	mu_0: [1.93805307 1.57741035]
	mu_1: [5.72489391 5.86096335]
	mu_2: [3.08911175 3.02423879]
	sigma_0:
[[ 0.54858554 -0.45352903]
 [-0.45352903  2.1229826 ]]
	sigma_1:
[[ 1.97720028 -1.05216388]
 [-1.05216388  3.0454697 ]]
	sigma_2:
[[ 0.83410566 -0.33407792]
 [-0.33407792  3.00906205]]
total steps:  30


In [7]:
# Split the samples by the component index stored in unsupervised_forecasts
cluster_0_data, cluster_1_data, cluster_2_data = (
    x[unsupervised_forecasts == label] for label in (0, 1, 2)
)

# Empirical mean of each cluster
mean_cluster_0, mean_cluster_1, mean_cluster_2 = (
    np.mean(points, axis=0)
    for points in (cluster_0_data, cluster_1_data, cluster_2_data)
)

# Covariance matrix of each cluster (np.cov with its default normalisation)
cov_cluster_0, cov_cluster_1, cov_cluster_2 = (
    np.cov(points.T)
    for points in (cluster_0_data, cluster_1_data, cluster_2_data)
)

# Cluster sizes
print('n1= ',len(cluster_0_data))
print('n2= ',len(cluster_1_data))
print('n3= ',len(cluster_2_data))

# Per-cluster statistics
print(f"mean_cluster_0 = {mean_cluster_0}")
print(f"mean_cluster_1 = {mean_cluster_1}")
print(f"mean_cluster_2 = {mean_cluster_2}")
print(f"cov_cluster_0 = {cov_cluster_0}")
print(f"cov_cluster_1 = {cov_cluster_1}")
print(f"cov_cluster_2 = {cov_cluster_2}")

n1=  175
n2=  8177
n3=  1648
mean_cluster_0 = [1.55056557 1.16725952]
mean_cluster_1 = [5.72474332 5.87154429]
mean_cluster_2 = [3.02083533 2.88852228]
cov_cluster_0 = [[ 0.34258652 -0.45329621]
 [-0.45329621  1.46507752]]
cov_cluster_1 = [[ 1.95327336 -1.09120231]
 [-1.09120231  3.02391928]]
cov_cluster_2 = [[ 0.70722046 -0.51892451]
 [-0.51892451  2.57829483]]
