Preparing the data

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import norm

# Load the raw panel; the columns read below are ID, Sales and Price.
data = pd.read_csv("data425288.csv")

# Panel dimensions: T periods per individual, N taken as the largest ID.
# NOTE(review): assumes IDs run 1..N without gaps -- TODO confirm.
T = 25
N = data["ID"].max()

# Log sales laid out as one row per individual and one column per period.
# NOTE(review): assumes rows are ordered by ID, then period -- TODO confirm.
sales = data["Sales"].values
y = np.log(sales.reshape(N, T))

# Design matrix: a column of ones (intercept) next to the log of the first T
# prices in the file.
# NOTE(review): presumably every individual faces the same price path; verify.
log_price = np.log(data["Price"].values[:T])
X = np.column_stack([np.ones(T), log_price])

OLS for reference

In [2]:
import statsmodels.formula.api as sm

# Pooled log-log regression of sales on price, used as a reference point.
# assign() returns a new frame, so the raw `data` frame is left untouched.
regdata = data.assign(
    logS=np.log(data["Sales"]),
    logP=np.log(data["Price"]),
)
ols_model = sm.ols(formula="logS ~ logP", data=regdata)
result = ols_model.fit()
display(result.summary())

0,1,2,3
Dep. Variable:,logS,R-squared:,0.261
Model:,OLS,Adj. R-squared:,0.261
Method:,Least Squares,F-statistic:,4408.0
Date:,"Mon, 09 Dec 2019",Prob (F-statistic):,0.0
Time:,17:19:31,Log-Likelihood:,-19197.0
No. Observations:,12500,AIC:,38400.0
Df Residuals:,12498,BIC:,38410.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,10.7127,0.023,462.944,0.000,10.667,10.758
logP,-1.3974,0.021,-66.393,0.000,-1.439,-1.356

0,1,2,3
Omnibus:,1.098,Durbin-Watson:,1.596
Prob(Omnibus):,0.577,Jarque-Bera (JB):,1.113
Skew:,-0.002,Prob(JB):,0.573
Kurtosis:,2.954,Cond. No.,4.4


Log-likelihood evaluation

In [3]:
def LogL(theta, pi, y, X):
    """Log-likelihood of a finite mixture of linear regressions.

    Every segment c has its own coefficient vector theta_c; an observation
    y_it is normal with mean x_t' theta_c and unit standard deviation.
    The number of individuals, periods and segments is taken from the
    argument shapes, and the whole computation is done in log space so that
    the product over periods cannot underflow to zero.

    Parameters
    ----------
    theta : ndarray, shape (K, P)
        Regression coefficients, one row per segment.
    pi : array-like, shape (K,)
        Mixing proportions of the segments.
    y : ndarray, shape (N, T)
        Dependent variable, one row per individual.
    X : ndarray, shape (T, P)
        Design matrix shared by all individuals.

    Returns
    -------
    float
        sum^N log sum^K pi_c prod^T p(y_it | theta_c)
    """
    mu = np.dot(X, theta.T)  # (T, K)

    # log pdfs; broadcasting (N, T, 1) against (1, T, K) gives (N, T, K)
    log_probs = norm.logpdf(y[:, :, np.newaxis], mu[np.newaxis, :, :], 1)

    # sum^T log p_t  ==  log prod^T p_t
    log_segments = log_probs.sum(axis=1)  # (N, K)

    # a zero pi_c yields log(0) = -inf, i.e. the segment contributes nothing
    with np.errstate(divide="ignore"):
        # log pi_c + log prod^T p_t
        log_terms = log_segments + np.log(pi)  # (N, K)

        # log sum^K exp(.) evaluated stably by factoring out the row maximum
        peak = log_terms.max(axis=1, keepdims=True)  # (N, 1)
        # rows that are entirely -inf keep a zero shift so the result is -inf
        peak = np.where(np.isfinite(peak), peak, 0.0)
        log_combined = peak[:, 0] + np.log(np.exp(log_terms - peak).sum(axis=1))  # (N,)

    # sum^N log sum^K pi_c prod^T probs
    return log_combined.sum()

Expectation step

In [4]:
def EStep(theta,pi,y,X, robust = True):
    """Expectation step: posterior segment-membership weights.

    Computes, for every individual i and segment c,
    w_ic = pi_c prod^T p(y_it | theta_c) / sum^K pi_k prod^T p(y_it | theta_k),
    where p is the normal density with mean x_t' theta_c and unit standard
    deviation. Dimensions are taken from the argument shapes.

    Parameters
    ----------
    theta : ndarray, shape (K, P)
        Regression coefficients, one row per segment.
    pi : array-like, shape (K,)
        Mixing proportions of the segments.
    y : ndarray, shape (N, T)
        Dependent variable, one row per individual.
    X : ndarray, shape (T, P)
        Design matrix shared by all individuals.
    robust : bool, default True
        If True, work with log-densities and shift every row by its own
        maximum before exponentiating, which prevents both underflow and
        overflow. If False, use the direct definition (product of pdfs).

    Returns
    -------
    ndarray, shape (N, K)
        Membership weights; each row sums to one.
    """
    pi = np.asarray(pi, dtype=float)
    mu = np.dot(X, theta.T)  # (T, K)

    # broadcasting views: (N, T, 1) against (1, T, K) gives (N, T, K)
    y_b = y[:, :, np.newaxis]
    mu_b = mu[np.newaxis, :, :]

    # calculated as exp(log()) to prevent underflow
    if robust:
        # prod^T probs as sum^T log p_t; logpdf stays finite where pdf would be 0
        log_segments = norm.logpdf(y_b, mu_b, 1).sum(axis=1)  # (N, K)

        # log(pi_c prod^T p_t); a zero pi_c simply gets zero weight
        with np.errstate(divide="ignore"):
            log_numerators = log_segments + np.log(pi)

        # The shift cancels in the ratio below. Using the row maximum makes
        # the largest exponent exactly 0, so exp() can neither overflow nor
        # underflow for the dominant segment of any individual.
        log_numerators = log_numerators - log_numerators.max(axis=1, keepdims=True)
        numerators = np.exp(log_numerators)

    # using the direct definition
    else:
        # prod^T probs
        segments = np.prod(norm.pdf(y_b, mu_b, 1), axis=1)  # (N, K)

        # prod^T probs times pi_c (same as multiplying by diag(pi))
        numerators = segments * pi

    # divide numerators by denominators (= sum of row)
    W = numerators / numerators.sum(axis=1, keepdims=True)

    return W

Maximization step

In [33]:
def MStep(W,y,X):
    """Maximization step: update mixing proportions and segment coefficients.

    The mixing proportions are the column means of W. Each segment's
    coefficients solve the weighted least-squares normal equations
    (sum^N w_ic X'X) theta_c = X' sum^N w_ic y_i.
    Dimensions are taken from the argument shapes.

    Parameters
    ----------
    W : ndarray, shape (N, K)
        Membership weights from the expectation step.
    y : ndarray, shape (N, T)
        Dependent variable, one row per individual.
    X : ndarray, shape (T, P)
        Design matrix shared by all individuals.

    Returns
    -------
    theta : ndarray, shape (K, P)
        Updated regression coefficients, one row per segment.
    pi : ndarray, shape (K,)
        Updated mixing proportions.

    Raises
    ------
    numpy.linalg.LinAlgError
        If a segment's weighted X'X is singular (e.g. all its weights are 0).
    """
    n_segments = W.shape[1]

    # 1/N sum^N w_ic
    pi = W.mean(axis=0) #(K,)

    # X'X is the same for every individual and segment, so build it once;
    # sum^N w_ic X'X is then just the segment's total weight times X'X
    XtX = np.dot(X.T, X) #(P,P)
    weight_totals = W.sum(axis=0) #(K,)

    # X' sum^N w_ic y_i for all segments at once
    Xty = np.dot(X.T, np.dot(y.T, W)) #(P,K)

    theta = np.zeros((n_segments, X.shape[1]))

    for c in range(n_segments):
        # solve the normal equations directly instead of forming an inverse
        theta[c,:] = np.linalg.solve(weight_totals[c] * XtX, Xty[:,c])

    return theta, pi

EM algorithm

In [None]:
def EM(K,y,X,tolerance=0.0001, verbose=0, max_iter=1000):
    """Fit a K-segment mixture of linear regressions with the EM algorithm.

    Starts from random membership weights and alternates MStep and EStep
    until the log-likelihood (LogL) changes by less than ``tolerance``
    between two consecutive iterations, or ``max_iter`` iterations are done.

    MStep, EStep and LogL are called without K, so whatever number of
    segments they work with must agree with the K given here.

    Parameters
    ----------
    K : int
        Number of segments.
    y : ndarray, shape (N, T)
        Dependent variable, one row per individual.
    X : ndarray
        Design matrix, passed through to MStep, EStep and LogL.
    tolerance : float, default 0.0001
        Convergence threshold on the absolute change in log-likelihood.
    verbose : int, default 0
        If non-zero, print the log-likelihood after every iteration.
    max_iter : int, default 1000
        Upper bound on the number of iterations; guarantees termination even
        if the log-likelihood becomes NaN or never settles.

    Returns
    -------
    theta : ndarray
        Segment coefficients from the last M-step (None if max_iter < 1).
    pi : ndarray
        Mixing proportions from the last M-step (None if max_iter < 1).
    log_likelihood : float
        Log-likelihood at the returned parameters (-inf if max_iter < 1).
    """
    #W initialized randomly, each row normalized to sum to one
    W = np.random.rand(y.shape[0], K)
    W = W/W.sum(axis=1, keepdims=True)
    theta = None
    pi = None
    log_likelihood = -np.inf

    for iteration in range(1, max_iter + 1):
        previous = log_likelihood

        # M-step first: the random W is the only starting information
        theta, pi = MStep(W, y, X)
        W = EStep(theta, pi, y, X)
        log_likelihood = LogL(theta, pi, y, X)

        if verbose:
            print("iteration %d: log-likelihood = %.6f" % (iteration, log_likelihood))

        # a NaN difference compares False, so the max_iter bound ends the loop
        if abs(log_likelihood - previous) < tolerance:
            break

    return theta, pi, log_likelihood



EM(2,y,X)

Estimation implementation

In [None]:
def Estimate(K, X=X, y=y, seed=1234):