Loading the data

id: household

Choices:  
- heinz41  
- heinz32  
- heinz28  
- hunts32  

variables  
- disp (1/0)  
- feat (1/0)  
- price (Log?)  

In [1]:
import numpy as np
import pandas as pd
df = pd.read_pickle("data.pkl") 
df.head()
# choices = ['heinz41', 'heinz32', 'heinz28', 'hunts32']
# df['choiceindex'] = list(map(lambda x: choices.index(x), df.choice))


Unnamed: 0,id,disp.heinz41,disp.heinz32,disp.heinz28,disp.hunts32,feat.heinz41,feat.heinz32,feat.heinz28,feat.hunts32,price.heinz41,price.heinz32,price.heinz28,price.hunts32,choice,choiceindex
0,1,0,0,0,0,0,0,0,0,4.6,3.7,5.2,3.4,heinz28,2
1,1,0,0,0,0,0,0,0,0,4.6,4.3,5.2,4.4,heinz28,2
2,1,0,0,0,0,0,1,0,0,4.6,2.5,4.6,4.8,heinz28,2
3,1,0,0,0,0,0,0,0,0,4.6,3.7,5.2,3.4,heinz28,2
4,1,0,0,0,0,0,0,1,0,4.6,3.0,4.6,4.8,heinz28,2


Defining the likelihood function using pandas directly

In [218]:
%%time
def likelihood(c, data, draws):
    def sigmoid(row):
        def utility(alpha, choice):
            x = [row[1+choice], row[5+choice], row[9+choice]]
            return np.exp(alpha +  np.dot(c[3:6],x) + np.dot(np.multiply(c[6:],draws[row.id-1]),x))
        
        choices = [
        utility(c[0], 0), # heinz41
        utility(c[1],1), #heinz32
        utility(c[2],2), #heinz28
        utility(0,3) #huts32
        ]
        
        return np.log(choices[row.choiceindex] / np.sum(choices))
    
    return data.agg(sigmoid, axis="columns").sum()


Wall time: 0 ns


Defining the likelihood function using numba and numpy arrays

In [204]:
# %%time
import time
from numba import jit, prange

@jit(nopython=True, parallel=True)
def numba_likelihood(c, data, draws):
    res = np.zeros(len(data))
    
    for i in prange(len(data)):
        row = data[i]
        
        choices = np.zeros(4)
        for j in prange(4):
            utility = 0; #start with alpha
            if j < 3: utility = c[j]
            x = [row[1+j], row[5+j], row[9+j]]
            mu =  c[3:6]
            sigma = c[6:]
            #np.exp(alpha +  np.dot(c[3:6],x) + np.dot(np.multiply(c[6:],draws[row[0]-1]),x))
            for l in range(3):
                  utility += mu[l] * x[l] + sigma[l] * draws[int(row[0])-1,l] * x[l]
                      
            choices[j] = np.exp(utility)
        
        res[i] = np.log(choices[int(row[13])] / np.sum(choices))
            
    return res.sum()


benchmarking and comparing pandas and numba implementation

In [223]:
%%time
np.random.seed(1234)
draws = np.random.randn(100, 300, 3); 


coefficients = [1, #alpha heinz41
                3, #alpha heinz32 
                2, #alpha hunts32
                2, #mu    display
                4, #mu    feat
                8, #mu    price
                3, #sigma dispay 
                1, #sigma feat 
                2, #sigma price 
                ]

# for i in range(0,100):
#     likelihood(coefficients, df, draws)

npdata = np.array(df.drop(columns='choice'))

numba_likelihood(coefficients, np.array(df.drop(columns='choice')), draws)


n=1
# DO NOT REPORT THIS... COMPILATION TIME IS INCLUDED IN THE EXECUTION TIME!
start = time.time()
for i in range(n):
    print(likelihood(coefficients, df, draws))
end = time.time()
print("Elapsed (pandas) for %d iterations = %s" % (n, (end - start)))

# # NOW THE FUNCTION IS COMPILED, RE-TIME IT EXECUTING FROM CACHE
# start = time.time()
# for i in range(n):
#     print(numba_likelihood(coefficients, npdata, draws))
# end = time.time()
# print("Elapsed (numba) for %d iterations = %s" % (n, (end - start)))


-30589.430422747806
Elapsed (pandas) for 1 iterations = 0.5600981712341309
-30589.4304227477
Elapsed (numba) for 1 iterations = 0.0
Wall time: 577 ms
