Preparing the data

In [37]:
# !pip install numdifftools
import numpy as np
import pandas as pd
from scipy.stats import norm

# Load the raw sales observations from disk.
data = pd.read_csv("data425288.csv")

# T is the number of periods per individual, N the number of individuals.
# NOTE(review): N is taken as the largest ID, which presumes IDs run 1..N
# without gaps -- confirm against the CSV.
T = 25
N = data.ID.max()

# Log-sales arranged as an (N, T) array.
# NOTE(review): the reshape presumes the rows are ordered by individual and
# then by period, with exactly T rows per individual -- confirm.
y = np.log(data.Sales.values.reshape((N,T)))

# Design matrix with a column of ones (intercept) and a log-price column.
# NOTE(review): only the first T prices are used, which presumes the price
# path is identical for every individual -- confirm.
X = np.ones((T,2))
X[:,1] = np.log(data.Price[:T])

OLS for reference

In [38]:
import statsmodels.formula.api as sm

# Pooled log-log regression of sales on price, fitted on all rows at once.
# The log columns are added to a derived frame so `data` itself stays untouched.
regdata = data.assign(
    logS=np.log(data.Sales),
    logP=np.log(data.Price),
)
result = sm.ols(formula="logS ~ logP", data=regdata).fit()
display(result.summary())

0,1,2,3
Dep. Variable:,logS,R-squared:,0.261
Model:,OLS,Adj. R-squared:,0.261
Method:,Least Squares,F-statistic:,4408.0
Date:,"Tue, 10 Dec 2019",Prob (F-statistic):,0.0
Time:,00:49:48,Log-Likelihood:,-19197.0
No. Observations:,12500,AIC:,38400.0
Df Residuals:,12498,BIC:,38410.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,10.7127,0.023,462.944,0.000,10.667,10.758
logP,-1.3974,0.021,-66.393,0.000,-1.439,-1.356

0,1,2,3
Omnibus:,1.098,Durbin-Watson:,1.596
Prob(Omnibus):,0.577,Jarque-Bera (JB):,1.113
Skew:,-0.002,Prob(JB):,0.573
Kurtosis:,2.954,Cond. No.,4.4


Log-likelihood evaluation

In [39]:
def LogL(theta, pi, y, X):
    """Log-likelihood of a finite mixture of unit-variance normal regressions.

    For every row i of ``y`` the likelihood contribution is
    ``sum_c pi_c * prod_t phi(y_it - x_t' theta_c)``; the function returns the
    sum over rows of the log of that quantity.

    Parameters
    ----------
    theta : array of shape (K, P)
        Regression coefficients, one row per mixture component.
    pi : array-like of shape (K,)
        Mixing proportions.
    y : array of shape (n, T)
        Observations, one row per individual.
    X : array of shape (T, P)
        Design matrix shared by all individuals.

    Returns
    -------
    float
        The total log-likelihood.

    Notes
    -----
    Everything is computed on the log scale with a row-wise log-sum-exp.
    Multiplying T raw densities underflows to 0 for poorly fitting parameter
    values and would make the result ``-inf``. Shapes are taken from the
    arguments, so the function does not depend on any notebook-level variable.
    """
    mu = np.dot(X, theta.T)  # (T, K)

    # log phi(y_it - mu_tc), broadcast to (n, T, K) without materialising copies
    log_dens = norm.logpdf(y[:, :, np.newaxis], mu[np.newaxis, :, :], 1)

    # sum^T log p_t  ->  (n, K)
    log_segments = log_dens.sum(axis=1)

    # log pi_c + sum^T log p_t; a zero proportion legitimately maps to -inf
    with np.errstate(divide="ignore"):
        log_weighted = log_segments + np.log(pi)

    # log sum^K exp(.) evaluated stably by factoring out the row-wise maximum
    row_max = log_weighted.max(axis=1, keepdims=True)
    log_mix = row_max[:, 0] + np.log(np.exp(log_weighted - row_max).sum(axis=1))

    # sum^N log sum^K pi_c prod^T probs
    return log_mix.sum()

Expectation step

In [16]:
def EStep(theta,pi,y,X, robust = True):
    """Expectation step: posterior class-membership weights.

    Parameters
    ----------
    theta : array of shape (K, P)
        Regression coefficients, one row per mixture component.
    pi : array-like of shape (K,)
        Mixing proportions.
    y : array of shape (n, T)
        Observations, one row per individual.
    X : array of shape (T, P)
        Design matrix shared by all individuals.
    robust : bool, default True
        If True, work on the log scale to avoid underflow; if False, use the
        direct definition with raw density products.

    Returns
    -------
    W : array of shape (n, K)
        ``W[i, c]`` is proportional to ``pi_c * prod_t phi(y_it - x_t' theta_c)``
        and every row sums to one.

    Notes
    -----
    The robust branch uses ``norm.logpdf`` (taking the log of an underflowed
    pdf gives ``-inf``) and shifts every row by its own maximum. Because the
    weights are normalised per row, that shift cancels exactly, and it keeps
    all exponents <= 0 so ``exp`` cannot overflow.
    """
    mu = np.dot(X, theta.T)  # (T, K)

    # broadcast views: y -> (n, T, 1), mu -> (1, T, K)
    y_b = y[:, :, np.newaxis]
    mu_b = mu[np.newaxis, :, :]

    # computed as exp(log()) to prevent underflow
    if robust:
        # prod^T probs as sum^T log p_t to prevent small numbers  -> (n, K)
        log_segments = norm.logpdf(y_b, mu_b, 1).sum(axis=1)

        # log pi_c + sum^T log p_t; a zero proportion maps to -inf (weight 0)
        with np.errstate(divide="ignore"):
            log_numerators = log_segments + np.log(pi)

        # subtract the row-wise maximum so the largest exponent in a row is 0
        log_numerators = log_numerators - log_numerators.max(axis=1, keepdims=True)
        numerators = np.exp(log_numerators)

    # using the direct definition
    else:
        # prod^T probs  -> (n, K)
        segments = np.prod(norm.pdf(y_b, mu_b, 1), axis=1)

        # prod^T probs times pi_c, column by column
        numerators = segments * pi

    # divide numerators by denominators (= sum of row)
    W = numerators / numerators.sum(axis=1, keepdims=True)

    return W

Maximization step

In [40]:
def MStep(W,y,X):
    """Maximization step: update mixing proportions and regression coefficients.

    Parameters
    ----------
    W : array of shape (n, K)
        Class-membership weights, one row per individual.
    y : array of shape (n, T)
        Observations, one row per individual.
    X : array of shape (T, P)
        Design matrix shared by all individuals.

    Returns
    -------
    theta : array of shape (K, P)
        Weighted least-squares coefficients per class,
        ``(sum_i w_ic * X'X)^{-1} X' sum_i w_ic y_i``.
    pi : array of shape (K,)
        Column means of ``W``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If a class has zero total weight (its normal equations are singular),
        or if ``X'X`` itself is singular.

    Notes
    -----
    ``X'X`` does not depend on the class, so a single linear solve serves all
    classes; the class-specific scalar ``sum_i w_ic`` is divided out afterwards.
    The number of regressors is taken from ``X`` rather than fixed at two.
    """
    # sum^N w_ic, the total weight each class receives
    class_mass = W.sum(axis=0)  # (K,)
    if np.any(class_mass == 0):
        raise np.linalg.LinAlgError("Singular matrix: a class has zero total weight")

    # 1/N sum^N w_ic
    pi = W.mean(axis=0)  # (K,)

    # X' sum^N w_ic y_i for every class at once  -> (P, K)
    weighted_moments = np.linalg.multi_dot([X.T, y.T, W])

    # solve (X'X) b_c = X' sum^N w_ic y_i, then divide by sum^N w_ic
    unscaled = np.linalg.solve(np.dot(X.T, X), weighted_moments)  # (P, K)
    theta = (unscaled / class_mass).T  # (K, P)

    return theta, pi

EM algorithm

In [18]:
def EM(K,y,X,tolerance=0.0001, verbose=0, max_iter=500):
    """Fit a K-component mixture regression with the EM algorithm.

    Starts from random membership weights (drawn from numpy's global random
    state) and alternates ``MStep`` and ``EStep`` until two successive weight
    matrices agree element-wise within ``tolerance`` or ``max_iter``
    iterations have been run.

    Parameters
    ----------
    K : int
        Number of mixture components.
    y : array of shape (n, T)
        Observations, one row per individual.
    X : array of shape (T, P)
        Design matrix shared by all individuals.
    tolerance : float, default 0.0001
        Used as both relative and absolute tolerance in the convergence check.
    verbose : int, default 0
        0 prints nothing, >= 1 prints the final log-likelihood, 2 additionally
        prints the log-likelihood after every iteration.
    max_iter : int, default 500
        Upper bound on the number of iterations.

    Returns
    -------
    theta, pi
        The coefficient matrix and mixing proportions from the last M-step.

    Raises
    ------
    ValueError
        If ``max_iter`` is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    # number of individuals comes from the data, not from a notebook-level variable
    n_obs = y.shape[0]

    # W initialized randomly, rows normalised to sum to one
    W = np.random.rand(n_obs, K)
    W = W / W.sum(axis=1, keepdims=True)
    theta = None
    pi = None
    iterations = 0
    different = True

    while different and iterations < max_iter:
        iterations += 1
        theta, pi = MStep(W, y, X)
        currentW = EStep(theta, pi, y, X, robust=True)

        # elementwise check for equal weights within tolerance
        different = not np.allclose(W, currentW, rtol=tolerance, atol=tolerance)

        W = currentW

        if verbose == 2:
            print("Iteration "+str(iterations))
            print("Log-likelihood: "+str(LogL(theta, pi, y, X)))
            print("\n")

    # Only report the cap when the run actually failed to converge; a run that
    # converges exactly on the last allowed iteration is not a failure.
    if different:
        print("Maximum iterations reached, iters: ",max_iter)
    if verbose >= 1:
        print("Log-likelihood: "+str(LogL(theta, pi, y, X)))

    return theta, pi

Estimation implementation

In [19]:
import numdifftools as nd

def Estimate(K, X=X, y=y, tolerance=0.1, seed=1234,verbose=2, n_est=10):
    """Estimate a K-segment mixture regression and tabulate the results.

    Runs ``EM`` ``n_est`` times from random starts, keeps the run with the
    highest log-likelihood, and derives standard errors for the regression
    coefficients from the numerical Hessian of the log-likelihood.

    Parameters
    ----------
    K : int
        Number of mixture components.
    X, y : arrays
        Design matrix and observations, passed through to ``EM`` and ``LogL``.
        The defaults are whatever ``X`` and ``y`` were when this function was
        defined.
    tolerance : float, default 0.1
        Convergence tolerance passed to ``EM``.
    seed : int, default 1234
        Seed for numpy's global random state, set once before the first run.
    verbose : int, default 2
        Passed to ``EM``; values >= 1 also print a summary of the best run.
    n_est : int, default 10
        Number of random restarts.

    Returns
    -------
    list of dict
        One record for the log-likelihood followed by three records per class
        (intercept, slope, proportion), each with keys ``"p"``, ``"value"``
        and ``"K"``.

    Raises
    ------
    RuntimeError
        If no restart produced a comparable (non-NaN) log-likelihood.

    Notes
    -----
    The proportions are parameterised as a softmax of logits ``gamma`` with the
    last logit pinned at 0. Only the K-1 free logits enter the Hessian: the
    softmax is unchanged when a constant is added to every logit, so
    differentiating with respect to all K of them gives a Hessian that is
    singular by construction and an unreliable inverse.
    """
    theta = None
    pi = None
    Likelihood = -np.inf
    np.random.seed(seed)

    # performing the estimations
    print("\nPerforming estimation for K = ",K)
    for i in range(n_est):
        if verbose >= 1: print("\nEstimation ",i+1)

        t, p = EM(K,y,X,tolerance=tolerance, verbose=verbose)
        currentLogL = LogL(t, p, y, X)

        if currentLogL >= Likelihood:
            theta = t
            pi = p
            Likelihood = currentLogL

    if theta is None:
        raise RuntimeError("no estimation run produced a usable log-likelihood")

    if verbose >= 1:
        print("\nThe estimation is finished")
        print("\nLog likelihood: "+str(Likelihood))
        print("\nTheta:")
        print(theta)
        print("\nPi:")
        print(pi)

    # calculating the standard errors
    n_theta = theta.size

    # free logits relative to the last class, whose logit is fixed at 0
    gamma_free = np.log(pi[:-1]) - np.log(pi[-1])

    def hes_eval(params):
        # unpack the flat parameter vector into coefficients and free logits
        theta_eval = params[:n_theta].reshape(theta.shape)
        gamma_full = np.append(params[n_theta:], 0.0)

        # Restricted log-likelihood: proportions as softmax of the logits
        # (shifted by the maximum, which leaves the softmax unchanged)
        weights = np.exp(gamma_full - gamma_full.max())
        return LogL(theta_eval, weights / weights.sum(), y, X)

    start = np.concatenate((theta, gamma_free), axis=None)
    hessian = nd.Hessian(hes_eval)(start)
    stderrors = np.sqrt(np.linalg.inv(-1 * hessian).diagonal())

    # the leading entries belong to theta; the rest are the free logits
    theta_std = stderrors[:n_theta].reshape(theta.shape)

    # preparing results
    results = [{
            "p": "LogL",
            "value": Likelihood,
            "K": K
        }]

    for c in range(K):
        results += [{
            "p": "α "+str(c+1),
            "value": "%.3f (%.3f)" % (theta[c,0], theta_std[c,0]),
            "K": K
        },{
            "p": "β "+str(c+1),
            "value": "%.3f (%.3f)" % (theta[c,1], theta_std[c,1]),
            "K": K
        },{
            "p": "π "+str(c+1),
            "value": "%.3f" % (pi[c]),
            "K": K
        }]

    return results

Performing the estimations

In [20]:
!!time
n_estimations = 10
tol = 0.0001

table = pd.DataFrame.from_dict(
     Estimate(K=2, X=X, y=y, tolerance=tol, verbose=1, n_est=n_estimations)
    +Estimate(K=3, X=X, y=y, tolerance=tol, verbose=1, n_est=n_estimations)
    +Estimate(K=4, X=X, y=y, tolerance=tol, verbose=1, n_est=n_estimations)
    +Estimate(K=5, X=X, y=y, tolerance=tol, verbose=1, n_est=n_estimations)
)

pivot = table.pivot(index='p', columns='K', values='value')
display(pivot)
print(pivot.to_latex())


Performing estimation for K =  2

Estimation  1
Log-likelihood: -18224.217260087746

Estimation  2
Log-likelihood: -18224.217260068814

Estimation  3
Log-likelihood: -18224.21726008094

Estimation  4
Log-likelihood: -18224.21726007237

Estimation  5
Log-likelihood: -18224.2172601136

Estimation  6
Log-likelihood: -18224.21726006908

Estimation  7
Log-likelihood: -18224.217260124853

Estimation  8
Log-likelihood: -18224.217260083577

Estimation  9
Log-likelihood: -18224.217260081048

Estimation  10
Log-likelihood: -18224.217260065954

The estimation is finished

Log likelihood: -18224.217260065954

Theta:
[[11.2182371  -1.45823668]
 [10.13854985 -1.371914  ]]

Pi:
[0.55955165 0.44044835]

Performing estimation for K =  3

Estimation  1
Log-likelihood: -18190.812626402185

Estimation  2
Log-likelihood: -18190.812620987723

Estimation  3
Log-likelihood: -18190.81262663784

Estimation  4
Log-likelihood: -18220.512194413022

Estimation  5
Log-likelihood: -18190.812626272687

Estimation  6





Performing estimation for K =  4

Estimation  1
Log-likelihood: -18186.56911113627

Estimation  2
Maximum iterations reached, iters:  500
Log-likelihood: -18186.56966416497

Estimation  3
Log-likelihood: -18186.569103091886

Estimation  4
Log-likelihood: -18186.569099042048

Estimation  5
Log-likelihood: -18186.569114829465

Estimation  6
Log-likelihood: -18186.56910297783

Estimation  7
Log-likelihood: -18186.56910354025

Estimation  8
Log-likelihood: -18186.569098168275

Estimation  9
Log-likelihood: -18186.569096390293

Estimation  10
Log-likelihood: -18186.569101327368

The estimation is finished

Log likelihood: -18186.569096390293

Theta:
[[11.5897447  -1.80749454]
 [10.03628632 -1.32984357]
 [10.82283459 -1.07217604]
 [10.45675401 -1.49235796]]

Pi:
[0.28830204 0.32508516 0.26218517 0.12442763]





Performing estimation for K =  5

Estimation  1
Maximum iterations reached, iters:  500
Log-likelihood: -18186.129864639086

Estimation  2
Log-likelihood: -18185.84508435549

Estimation  3
Maximum iterations reached, iters:  500
Log-likelihood: -18185.847110155948

Estimation  4
Log-likelihood: -18185.844821706407

Estimation  5
Maximum iterations reached, iters:  500
Log-likelihood: -18185.850465823198

Estimation  6
Log-likelihood: -18186.569115580874

Estimation  7
Log-likelihood: -18185.84485133183

Estimation  8
Log-likelihood: -18185.844768396844

Estimation  9
Log-likelihood: -18185.844835589254

Estimation  10
Log-likelihood: -18185.84486774559

The estimation is finished

Log likelihood: -18185.844768396844

Theta:
[[11.61459345 -1.82016352]
 [10.02402097 -1.32217715]
 [10.41180203 -1.4886561 ]
 [10.80977595 -1.02635241]
 [10.94016554 -1.3441822 ]]

Pi:
[0.26993184 0.3050805  0.13828967 0.20676052 0.07993748]




K,2,3,4,5
p,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LogL,-18224.2,-18190.8,-18186.6,-18185.8
α 1,11.218 (0.027),10.140 (0.030),11.590 (0.070),11.615 (0.081)
α 2,10.139 (0.030),11.587 (0.073),10.036 (0.073),10.024 (0.080)
α 3,,10.826 (0.070),10.823 (0.070),10.412 (0.170)
α 4,,,10.457 (0.162),10.810 (0.083)
α 5,,,,10.940 (0.278)
β 1,-1.458 (0.024),-1.373 (0.027),-1.807 (0.064),-1.820 (0.068)
β 2,-1.372 (0.027),-1.809 (0.065),-1.330 (0.047),-1.322 (0.051)
β 3,,-1.083 (0.069),-1.072 (0.068),-1.489 (0.101)
β 4,,,-1.492 (0.091),-1.026 (0.095)


\begin{tabular}{lllll}
\toprule
K &               2 &               3 &               4 &               5 \\
p    &                 &                 &                 &                 \\
\midrule
LogL &        -18224.2 &        -18190.8 &        -18186.6 &        -18185.8 \\
α 1  &  11.218 (0.027) &  10.140 (0.030) &  11.590 (0.070) &  11.615 (0.081) \\
α 2  &  10.139 (0.030) &  11.587 (0.073) &  10.036 (0.073) &  10.024 (0.080) \\
α 3  &             NaN &  10.826 (0.070) &  10.823 (0.070) &  10.412 (0.170) \\
α 4  &             NaN &             NaN &  10.457 (0.162) &  10.810 (0.083) \\
α 5  &             NaN &             NaN &             NaN &  10.940 (0.278) \\
β 1  &  -1.458 (0.024) &  -1.373 (0.027) &  -1.807 (0.064) &  -1.820 (0.068) \\
β 2  &  -1.372 (0.027) &  -1.809 (0.065) &  -1.330 (0.047) &  -1.322 (0.051) \\
β 3  &             NaN &  -1.083 (0.069) &  -1.072 (0.068) &  -1.489 (0.101) \\
β 4  &             NaN &             NaN &  -1.492 (0.091) &  -1.026 (0.095) \\
β 