In [1]:
import numpy as np
import pandas as pd
from numba import njit
from scipy.stats import norm
from sklearn.cluster import KMeans

In [2]:
df = pd.read_csv('data450759.csv', sep=',')
df.head(10)

Unnamed: 0.1,Unnamed: 0,ID,Sales,Price
0,1-1,1,8436,1.19
1,2-1,1,9598,1.72
2,3-1,1,18576,1.96
3,4-1,1,15499,2.04
4,5-1,1,14176,1.73
5,6-1,1,1139,2.28
6,7-1,1,8364,5.24
7,8-1,1,613,4.58
8,9-1,1,582,2.3
9,10-1,1,15494,1.8


In [3]:
df['ID'][df['ID'] == 3].count()

25

In [4]:
df['Constant'] = pd.Series(np.ones(len(df)))
df['Time'] = pd.Series([i%25+1 for i in range(0,len(df))])
df['log Sales'] = np.log(df['Sales'])
df['log Price'] = np.log(df['Price'])
df.head(30)

Unnamed: 0.1,Unnamed: 0,ID,Sales,Price,Constant,Time,log Sales,log Price
0,1-1,1,8436,1.19,1.0,1,9.040264,0.173953
1,2-1,1,9598,1.72,1.0,2,9.16931,0.542324
2,3-1,1,18576,1.96,1.0,3,9.829626,0.672944
3,4-1,1,15499,2.04,1.0,4,9.648531,0.71295
4,5-1,1,14176,1.73,1.0,5,9.559306,0.548121
5,6-1,1,1139,2.28,1.0,6,7.037906,0.824175
6,7-1,1,8364,5.24,1.0,7,9.031692,1.656321
7,8-1,1,613,4.58,1.0,8,6.418365,1.521699
8,9-1,1,582,2.3,1.0,9,6.36647,0.832909
9,10-1,1,15494,1.8,1.0,10,9.648208,0.587787


In [5]:
df = df.drop(['Sales','Unnamed: 0', 'Price'], axis =1)
df.head()

Unnamed: 0,ID,Constant,Time,log Sales,log Price
0,1,1.0,1,9.040264,0.173953
1,1,1.0,2,9.16931,0.542324
2,1,1.0,3,9.829626,0.672944
3,1,1.0,4,9.648531,0.71295
4,1,1.0,5,9.559306,0.548121


In [6]:
y = np.zeros((500,25))
X = df[['Constant', 'log Price']].iloc[:25].values
for i in range(1, 501):
    y[i-1,:] = df['log Sales'].iloc[(i-1)*25:i*25]

In [7]:
def thetashape(theta):
    #theta is 1xK*2 [alpha_1, beta_1,..,alpha_k,beta_k]
    #return 2xK, each column corresponds to a cluster
    K = len(theta)//2
    t = np.zeros((2,K))
    for i in range(K):
        t[0,i] = theta[i*2]
        t[1,i] = theta[i*2+1]
    return t        

In [8]:
def logl(theta,pi,y,X):
    #y: NxT of log sales
    #X: Tx2 constant and price
    #theta: alpha_c and beta_c for each cluster
    #pi: probabilities that x_i is in cluster c
    N = len(y)
    T = len(X)
    K = len(pi)
    l = 0
    theta = thetashape(theta)
    for i in range(N):
            y_i = np.prod(norm.pdf(y[i,:][:,None]-X@theta),axis=0)
#             print(y[i,:][:,None]-X@theta)
#             print(y_i)
            l += np.log(np.sum(pi*y_i))
    return l
theta = np.ones(3*2)*4
pi = np.ones(3)/3
print(logl(theta,pi,y,X))

-88885.64029539793


In [33]:
def Estep(theta,pi,y,X):
    #y: NxT of log sales
    #X: Tx2 constant and price
    #theta: alpha_c and beta_c for each cluster
    #pi: probabilities that x_i is in cluster c
    #return cluster probabilities
    N = len(y)
    T = len(X)
    K = len(pi)
    W = np.zeros((N,K))
    theta = thetashape(theta)
    for i in range(N):
            y_i = np.prod(norm.pdf(y[i,:][:,None]-X@theta), axis=0)
            W[i,:] = (pi * y_i) / np.sum(pi * y_i)
            if np.any(np.isnan(W)):
                print('the matrix of conditional cluster probabilities contains NaNs')
                print(W[i,:])
                break
    return W
(Estep(np.array([9.43, 9.43, 8.52, 3.11, 0, 9]), np.array([0.2,0.3,0.5]), y, X))

array([[0.00000000e+000, 1.00000000e+000, 3.97478511e-080],
       [0.00000000e+000, 1.00000000e+000, 1.55033123e-128],
       [0.00000000e+000, 1.00000000e+000, 4.59548812e-094],
       ...,
       [0.00000000e+000, 1.00000000e+000, 7.78168756e-134],
       [0.00000000e+000, 1.00000000e+000, 1.07535301e-090],
       [0.00000000e+000, 1.00000000e+000, 1.31383666e-080]])

In [18]:
def Mstep(W, y, X, theta):
    #W: cluster probabilities (N,K)
    #return new theta and pi
    T = len(X)
    N = len(y)
    K = W.shape[1]
    theta = thetashape(theta)
    alpha = np.sum(y,axis=1).T@W/(T*np.sum(W,axis=0)) - theta[1,:]*np.sum(X[:,1])/T
    beta = np.sum(y*X[:,1],axis=1).T@W/(np.sum(W,axis=0)*np.sum(X[:,1])) - T*theta[0,:]/np.sum(X[:,1])
    if np.all(np.sum(W,axis = 0)) == 0:
        print('NAN, W[:,c] contains all zeros and thus alpha contains NaN')
    theta = np.zeros(K*2)
    for c in range(K):
        theta[c*2] = alpha[c]
        theta[c*2+1] = beta[c]
    pi = np.mean(W, axis = 0)
    return theta, pi
    

In [42]:
def EM(theta,pi,y,X, tolerance):
    prev = 0
    i = 0
    prev_pi = np.inf
    check = 0
    while check < tolerance and np.isnan(logl(theta, pi, y, X)) == False :
        prev_pi = pi
        W = Estep(theta,pi,y,X)
        theta, pi = Mstep(W,y,X,theta)
        if np.sum(np.abs(prev_pi - pi))  == 0:
            check +=1
        else:
            check = 0
        print(logl(theta, pi, y, X))
        print(pi)
        i += 1
    print('iterations: %i' %(i))
    return theta, pi 

In [44]:
%%time
def estimate(K):
    km = KMeans(n_clusters = K, random_state=0).fit(y)
    labels = km.labels_
    pi_1 = len(labels[labels == 0])/500
    pi_2 = len(labels[labels == 1])/500
    pi_3 = 1 - pi_2 - pi_1
    pi = np.array([pi_1,pi_2,pi_3])
    theta = np.ones(2*K)*6
    tolerance = 2
    theta, pi = EM(theta,pi, y, X, tolerance)
    print(logl(theta,pi,y,X))
    return theta, pi
estimate(3)

-109522.45668570836
[0.246 0.43  0.324]
-105930.43404545468
[0.246 0.43  0.324]
-95163.5516497543
[0.246 0.43  0.324]
-81593.3520674756
[0.246 0.43  0.324]
-86913.74116978902
[0.24599999 0.43000001 0.324     ]
-63365.36464548362
[0.2460001  0.42999991 0.324     ]
-84773.02524531833
[0.24599904 0.43000092 0.32400003]
-51246.47173970817
[0.24601658 0.42998398 0.32399943]
-88741.38991652515
[0.24586184 0.43013347 0.32400469]
-45235.82637627904
[0.24881932 0.42728457 0.3238961 ]
-98403.73003267814
[0.22676469 0.4489816  0.32425371]
-38619.72196087165
[0.77887393 0.07393387 0.1471922 ]
-60562.534953245384
[4.25901893e-05 9.79479183e-01 2.04782273e-02]
-34854.454971300154
[9.98537837e-01 6.32510454e-20 1.46216268e-03]
-73063.39528748461
[9.83648912e-01 1.76439597e-17 1.63510883e-02]
-51292.67238511624
[1.00000000e+00 9.16044995e-32 5.64576939e-12]
-97481.70639381533
[1.00000000e+00 3.02344370e-31 1.19715915e-11]
-73797.6882653818
[1.00000000e+00 3.28189627e-49 1.00366859e-27]
-128017.3551779

  
  if __name__ == '__main__':
  return (a <= x) & (x <= b)
  return (a <= x) & (x <= b)


nan
[1. 0. 0.]
iterations: 26
nan
Wall time: 6.31 s


(array([ 21.15866291, -12.4394273 ,          nan,          nan,
                 nan,          nan]), array([1., 0., 0.]))