Loading the data

id: household

Choices:  
- heinz41  
- heinz32  
- heinz28  
- hunts32  

Variables:  
- disp (1/0)  
- feat (1/0)  
- price (Log?)  

In [1]:
import numpy as np
import pandas as pd
# Read the pickled purchase data. Unpickling can execute arbitrary code, so
# only load a file that comes from a trusted source.
data_path = "data.pkl"
df = pd.read_pickle(data_path)

# The `choiceindex` column appears to be stored in the pickle already; for
# reference, it can be rebuilt from the brand names like this:
#   choices = ['heinz41', 'heinz32', 'heinz28', 'hunts32']
#   df['choiceindex'] = list(map(lambda x: choices.index(x), df.choice))
df.head(n=5)


Unnamed: 0,id,disp.heinz41,disp.heinz32,disp.heinz28,disp.hunts32,feat.heinz41,feat.heinz32,feat.heinz28,feat.hunts32,price.heinz41,price.heinz32,price.heinz28,price.hunts32,choice,choiceindex
0,1,0,0,0,0,0,0,0,0,4.6,3.7,5.2,3.4,heinz28,2
1,1,0,0,0,0,0,0,0,0,4.6,4.3,5.2,4.4,heinz28,2
2,1,0,0,0,0,0,1,0,0,4.6,2.5,4.6,4.8,heinz28,2
3,1,0,0,0,0,0,0,0,0,4.6,3.7,5.2,3.4,heinz28,2
4,1,0,0,0,0,0,0,1,0,4.6,3.0,4.6,4.8,heinz28,2


Defining the likelihood function using numba and numpy arrays and multiple draws

In [8]:
from numba import jit, prange

@jit(nopython=True, parallel=True)
def likelihood(c, data, draws, verbose=False):
    """Negative simulated log-likelihood of a random-coefficient logit model.

    For every household the probability of its whole sequence of observed
    choices is computed under each draw, averaged over the draws, and the
    negative sum of the logs of those averages is returned.

    All probabilities are handled in log space (log-sum-exp). A plain product
    of per-observation probabilities underflows to 0 for households with many
    observations, and exp(utility) overflows for large utilities; either one
    turns the objective into inf/nan and derails the optimizer.

    Parameters
    ----------
    c : sequence of 9 numbers
        c[0:3]  constants of alternatives 0, 1 and 2 (alternative 3 has a
                constant of zero),
        c[3:6]  mean coefficients of the three attribute groups,
        c[6:9]  coefficients scaling the draws of the three attribute groups.
    data : 2-D array
        Column 0 is the household id (matched against 1..n_q), columns 1-4,
        5-8 and 9-12 hold the three attribute groups for alternatives 0..3,
        and column 13 holds the index of the chosen alternative.
    draws : 3-D array indexed as [draw, household, attribute group]
        Its first two dimensions define the number of draws and households.
    verbose : bool
        If true, print the value before returning it.

    Returns
    -------
    float
        The negative simulated log-likelihood.
    """
    n_alt = 4    # alternatives per observation
    n_var = 3    # attribute groups per alternative
    n_r = draws.shape[0]
    n_q = draws.shape[1]
    log_n_r = np.log(float(n_r))

    # log of the simulated likelihood of every household
    log_estimates = np.zeros(n_q)

    # Iterate over households. Only this outermost loop runs in parallel;
    # numba executes prange loops nested inside another prange serially, so
    # the inner loops are written as plain range loops.
    for q in prange(n_q):
        rows = data[np.where(data[:, 0] == q + 1)]
        n_rows = len(rows)

        # log-probability of the household's choice sequence under each draw
        sequence_ll = np.zeros(n_r)

        # iterate over draws per household
        for r in range(n_r):
            total = 0.0

            # iterate over observations per household
            for t in range(n_rows):
                utilities = np.zeros(n_alt)

                # utility of every alternative for this observation
                for j in range(n_alt):
                    utility = 0.0  # start with alpha (zero for the last alternative)
                    if j < n_alt - 1:
                        utility += c[j]
                    for l in range(n_var):
                        x = rows[t, 1 + n_alt * l + j]
                        utility += c[3 + l] * x + c[6 + l] * draws[r, q, l] * x
                    utilities[j] = utility

                # log of the logit denominator, shifted by the max for stability
                u_max = np.max(utilities)
                log_denominator = u_max + np.log(np.sum(np.exp(utilities - u_max)))
                total += utilities[int(rows[t, 13])] - log_denominator

            sequence_ll[r] = total

        # log of the mean over draws of exp(sequence_ll), again via log-sum-exp
        ll_max = np.max(sequence_ll)
        log_estimates[q] = ll_max + np.log(np.sum(np.exp(sequence_ll - ll_max))) - log_n_r

    res = -log_estimates.sum()
    if verbose: print(res)
    return res



Benchmarking and comparing the pandas and numba implementations

In [21]:
import time

# Seed the generator so every benchmark run evaluates the same draws.
np.random.seed(1234)
n = 1
draws = np.random.randn(500, 300, 3)

# Float array rather than a list of ints: numba compiles one specialization per
# argument type, and the optimizer calls `likelihood` with a float64 array.
# NOTE(review): the third constant was labelled "hunts32", but `likelihood`
# applies c[2] to choice index 2, which is heinz28 in the loaded data -- confirm.
coefficients = np.array([1, #alpha choice index 0 (heinz41)
                         3, #alpha choice index 1 (heinz32)
                         2, #alpha choice index 2
                         2, #mu    display
                         4, #mu    feat
                         8, #mu    price
                         3, #sigma display
                         1, #sigma feat
                         2, #sigma price
                         ], dtype=np.float64)

# Build the numeric matrix once, outside the timed region, so that pandas
# overhead is not attributed to the likelihood.
data_matrix = df.drop(columns='choice').values

# Untimed warm-up call: the first call with a given argument signature triggers
# JIT compilation, which must not be reported as execution time.
likelihood(coefficients, data_matrix, draws)

# NOW THE FUNCTION IS COMPILED, RE-TIME IT EXECUTING FROM CACHE
start = time.perf_counter()
for i in range(n):
    print(likelihood(coefficients, data_matrix, draws))
end = time.perf_counter()
print("Elapsed (compiled) for %d iterations = %s" % (n, (end - start)))
print((end - start)/n)

8713.191274475854
Elapsed (compiled) for 1 iterations = 2.9022457599639893
2.9022457599639893


Optimize the parameters

In [18]:
from scipy.optimize import minimize

# Reseed so that the simulation draws, and with them the optimum, are the same
# on every run.
np.random.seed(1234)
draws = np.random.randn(100, 300, 3)

# Starting values, grouped in the order `likelihood` reads them.
# NOTE(review): the third constant was labelled "hunts32", but `likelihood`
# applies it to choice index 2, which is heinz28 in the loaded data -- confirm.
start_alpha = [1, 3, 2]   # constants of choice indices 0, 1, 2
start_mu = [2, 4, 8]      # mu:    display, feat, price
start_sigma = [3, 1, 2]   # sigma: display, feat, price
coefficients = start_alpha + start_mu + start_sigma

res = minimize(
    likelihood,
    coefficients,
    args=(df.drop(columns='choice').values, draws, False),
    method='BFGS',
)

In [17]:
res

      fun: 2514.8416450681693
 hess_inv: array([[ 0.00389356,  0.00030789, -0.00205455,  0.00135123,  0.00129012,
         0.00066497, -0.00214754,  0.00083673, -0.00024953],
       [ 0.00030789,  0.00419057, -0.0031861 ,  0.0019029 ,  0.00612043,
         0.00421966, -0.00233658,  0.00088101, -0.00018094],
       [-0.00205455, -0.0031861 ,  0.00496448, -0.00221899, -0.00853704,
        -0.00448319,  0.00308346, -0.00241269, -0.00016274],
       [ 0.00135123,  0.0019029 , -0.00221899,  0.00251095,  0.00300672,
         0.00202024, -0.00213545,  0.00023689, -0.00050022],
       [ 0.00129012,  0.00612043, -0.00853704,  0.00300672,  0.01820571,
         0.00819528, -0.0051413 ,  0.00509624,  0.0009581 ],
       [ 0.00066497,  0.00421966, -0.00448319,  0.00202024,  0.00819528,
         0.00631077, -0.00279672,  0.00187061, -0.00016232],
       [-0.00214754, -0.00233658,  0.00308346, -0.00213545, -0.0051413 ,
        -0.00279672,  0.003162  , -0.0013309 , -0.00014097],
       [ 0.00083673, 

In [19]:
res

      fun: 2298.2193386202657
 hess_inv: array([[ 1.26431723e-02,  2.07263799e-03,  6.36368277e-03,
        -1.40962710e-03, -4.60567245e-03, -4.48184977e-03,
        -4.68466899e-03,  1.78027217e-03, -6.41547579e-04],
       [ 2.07263799e-03,  3.41190041e-03,  2.51218344e-03,
        -7.23017520e-04, -2.22334216e-03, -6.22038206e-04,
        -3.99799023e-03,  8.99831064e-04,  1.16075922e-07],
       [ 6.36368277e-03,  2.51218344e-03,  7.91808126e-03,
        -8.74254649e-04, -6.44476363e-03, -5.09243712e-03,
        -7.63268753e-04,  4.34257960e-03,  7.23570363e-04],
       [-1.40962710e-03, -7.23017520e-04, -8.74254649e-04,
         8.47303098e-03, -1.65001177e-03, -7.16787263e-04,
        -1.81736553e-03, -9.01279026e-04,  1.67484346e-04],
       [-4.60567245e-03, -2.22334216e-03, -6.44476363e-03,
        -1.65001177e-03,  1.76299579e-02,  4.75144726e-03,
        -1.62653364e-02,  2.13383493e-04, -2.64783982e-03],
       [-4.48184977e-03, -6.22038206e-04, -5.09243712e-03,
        -7