In [132]:
import numpy as np
from scipy.special import logsumexp
from scipy.stats import binom
import pandas as pd

def initialize_random_params(k):
    params = {'phi': np.random.uniform(0, 1, size=k),
              'p': np.random.uniform(0, 1, size=k)}
    return params

def e_step(x, params):
    k = len(params['phi'])
    log_p_y_x = np.zeros((x.shape[0], k))
    for i in range(k):
        log_p_y_x[:, i] = np.log(params['phi'][i]) + np.sum(binom.logpmf(x, n=1, p=params['p'][i]), axis=1)
    log_p_y_x_norm = logsumexp(log_p_y_x, axis=1)
    p_y_x = np.exp(log_p_y_x - log_p_y_x_norm[:, np.newaxis])
    return log_p_y_x_norm, p_y_x

def m_step(x, params):
    k = len(params['phi'])
    n = x.shape[0]
    _, p_y_x = e_step(x, params)
    phi = np.sum(p_y_x, axis=0) / n
    p = np.mean([np.sum(p_y_x[:, i, np.newaxis] * x, axis=0) / np.sum(p_y_x[:, i]) for i in range(k)], axis=1)

    return {'phi': phi, 'p': p}

def get_avg_log_likelihood(x, params):
    loglikelihood, _ = e_step(x, params)
    return np.mean(loglikelihood)

def run_em(x, params):
    avg_loglikelihoods = []
    while True:
        avg_loglikelihood = get_avg_log_likelihood(x, params)
        avg_loglikelihoods.append(avg_loglikelihood)
        if len(avg_loglikelihoods) > 1 and abs(avg_loglikelihoods[-1] - avg_loglikelihoods[-2]) < 0.00001:
            break
        params = m_step(x, params)
    
    print("EM algorithm converged.")
    print("Final Parameters:")
    print(f"\tphi: {params['phi']}")
    print(f"\tp: {params['p']}")

    _, posterior = e_step(x, params)
    forecasts = np.argmax(posterior, axis=1)
    return forecasts, posterior, avg_loglikelihoods


In [133]:
# Provide the path to your text file
file_path = "https://www.ccs.neu.edu/home/vip/teach/DMcourse/2_cluster_EM_mixt/HW2/coin_flips_outcome.txt"

# Read the data into a dataframe
df = pd.read_csv(file_path, sep=" ", header=None)
x = df.values

In [134]:
# Unsupervised learning
print("Learned Parameter For binomial: ")
random_params = initialize_random_params(3)
unsupervised_forecasts, unsupervised_posterior, unsupervised_loglikelihoods = run_em(x, random_params)
print("total steps: ", len(unsupervised_loglikelihoods))

Learned Parameter For binomial: 
EM algorithm converged.
Final Parameters:
	phi: [0.3058543  0.51552678 0.17861892]
	p: [0.23640626 0.60961574 0.9317092 ]
total steps:  53
