# Project 3: Getting Started 

This notebook is intended to help you get off to a flying start with the cars dataset. You don't have to use this notebook, and you can discard any parts you do not like; they are purely intended as a help to get started. 

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
sns.set_theme()

# optimization
from scipy import optimize

import clogit as clogit
import estimate as est

import statsmodels.formula.api as smf

%load_ext autoreload
%autoreload 2

# Read in data

The dataset, `cars.csv`, contains cleaned and processed data. If you want to make changes, the notebook, `materialize.ipynb`, creates the data from the raw source datasets. 

In [2]:
# Load the cleaned panel
cars = pd.read_csv('cars.csv')

# Variable labels, looked up by variable name
lbl_vars = pd.read_csv('labels_variables.csv').set_index('variable')

# Value labels: one {row index: label} dict per column, with missing entries dropped
lbl_vals = {
    col: labels.dropna().to_dict()
    for col, labels in pd.read_csv('labels_values.csv').items()
}

## Overview of the dataset

In [3]:
# Show each variable's label next to its sample mean (two decimals; non-numeric columns get NaN)
lbl_vars.join(cars.mean(numeric_only=True).map('{: .2f}'.format).to_frame('Mean'))

Unnamed: 0_level_0,label,Mean
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
ye,year (=first dimension of panel),84.5
ma,market (=second dimension of panel),3.0
co,model code (=third dimension of panel),207.5
zcode,alternative model code (predecessors and succe...,177.76
brd,brand code,16.79
type,name of brand and model,
brand,name of brand,
model,name of model,
org,"origin code (demand side, country with which c...",2.72
loc,"location code (production side, country where ...",5.17


In [4]:
# Which brands have home advantage in some market?

all_brands = cars['brand'].unique()
home_brands = cars.loc[cars['home'].eq(1), 'brand'].unique()  # flagged home at least once
not_home_brands = np.setdiff1d(all_brands, home_brands)       # sorted remainder

print(
    'The following brands are based in one of the five countries:\n'
    f'{home_brands}\n\n'
    'The following are not:\n'
    f'{not_home_brands}'
)

The following brands are based in one of the five countries:
['citroen' 'peugeot' 'renault' 'tal/simca' 'talbot' 'tal/matra' 'audi'
 'ford' 'mercedes' 'opel' 'BMW' 'VW' 'MCC' 'fiat' 'innocenti' 'lancia'
 'alfa romeo' 'rover' 'tal/sunb' 'tal/hillman']

The following are not:
['daewoo' 'daf' 'honda' 'hyundai' 'mazda' 'mitsubishi' 'nissan' 'saab'
 'seat' 'skoda' 'suzuki' 'toyota' 'volvo']


# Set up for analysis

In [5]:
# Price measure used throughout; the model works with its natural log
price_var = 'princ'
cars['logp'] = cars[price_var].pipe(np.log)

# Interaction term: lets the price coefficient differ for cars sold in their home market
cars['logp_x_home'] = cars['logp'].mul(cars['home'])

### Dummy variables

For working with matrices, we want to have a column for each dummy variable. 

In [6]:
# Brand dummies: one column per brand, the first brand is the omitted reference category
categorical_var = 'brand' # name of categorical variable
dummies = pd.get_dummies(cars[categorical_var]) # creates a matrix of dummies for each value of dummyvar
x_vars_dummies = list(dummies.columns[1:].values) # omit a reference category, here it is the first (hence columns[1:])

# Class dummies: same construction, the first class is the omitted reference category
categorical_var1 = 'cla' # name of categorical variable
dummies1 = pd.get_dummies(cars[categorical_var1]) # creates a matrix of dummies for each value of dummyvar
dummies1.columns = dummies1.columns.astype(str) # convert to string (names of these columns are numbers as it is)
x_vars_dummies += list(dummies1.columns[1:].values) # omit a reference category, here it is the first (hence columns[1:])

# add dummies to the dataframe
# Guard against re-running this cell: check *every* dummy column from both sets, not
# just the first brand, because any duplicated column name would later make
# cars[x_vars] return extra columns and break the reshape into (N, J, K).
_already_present = [c for c in list(dummies.columns) + list(dummies1.columns) if c in cars.columns]
assert not _already_present, f'It looks like you have already added this dummy to the dataframe. Avoid duplicates! '
cars = pd.concat([cars,dummies, dummies1], axis=1) # put the dummies in the dataframe

### Scaling of non-dummy control-variables

In [7]:
# Rescale the continuous controls by their sample means so each averages 1.0

contr_vars = ['cy', 'hp', 'we', 'li'] # non-dummy control variables
x_contr = cars[contr_vars].copy()
print(x_contr.mean(axis=0))                                   # means before scaling
x_contr = x_contr.div(x_contr.mean(axis=0), axis='columns')
print(x_contr.mean(axis=0))                                   # means after scaling
cars[contr_vars] = x_contr

cy    1337.090417
hp      50.100017
we     934.488833
li       7.873775
dtype: float64
cy    1.0
hp    1.0
we    1.0
li    1.0
dtype: float64


### `x_vars`: List of regressors to be used 

In [8]:
# Display the dataframe after the dummy columns and rescaled controls have been added
cars

Unnamed: 0,ye,ma,co,zcode,brd,type,brand,model,org,loc,...,tal/simca,tal/sunb,talbot,toyota,volvo,1,2,3,4,5
0,70,1,15,14,2,audi 100/200,audi,100/200,2,4,...,False,False,False,False,False,False,False,False,True,False
1,70,1,26,35,4,citroen 2 CV 6 - 2 CV 4,citroen,2CV6,1,3,...,False,False,False,False,False,True,False,False,False,False
2,70,1,36,36,4,citroen dyane,citroen,dyane,1,3,...,False,False,False,False,False,True,False,False,False,False
3,70,1,64,67,7,fiat 128,fiat,128,3,5,...,False,False,False,False,False,True,False,False,False,False
4,70,1,71,80,8,ford escort,ford,escort,2,4,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,99,5,914,161,16,mercedes A,mercedes,A,2,4,...,False,False,False,False,False,False,True,False,False,False
5996,99,5,936,80,8,ford focus,ford,focus,7,4,...,False,False,False,False,False,False,True,False,False,False
5997,99,5,951,208,20,peugeot 206,peugeot,206,1,3,...,False,False,False,False,False,True,False,False,False,False
5998,99,5,953,282,29,toyota avensis,toyota,avensis,4,12,...,False,False,False,True,False,False,False,True,False,False


In [9]:
# Main x-variables for the analysis. Their order matters: they become columns 0, 1, 2
# of every design array below, and later cells index those positions directly.
core_vars = ['logp','home','logp_x_home']
x_vars = np.concatenate((core_vars,contr_vars,x_vars_dummies))
x_vars_nodum = np.concatenate((core_vars,contr_vars))
x_vars_core = core_vars.copy()
print(f'K = {len(x_vars)} variables selected.')

# N = number of market-year cells; J = number of car models per cell.
# J is derived from the data instead of being hard-coded, and we check that the rows
# fill an (N, J) grid exactly before reshaping.
N = cars.ma.nunique() * cars.ye.nunique()
J, _rows_left_over = divmod(len(cars), N)
assert _rows_left_over == 0, f'{len(cars)} rows cannot be arranged as {N} market-years with an equal number of cars each.'
K = len(x_vars)
K_nodum = len(x_vars_nodum)
K_core = len(x_vars_core)

# reshape into x- and y-variables
# NOTE(review): reshape relies on the rows of `cars` being stored in contiguous blocks
# of J cars per market-year — confirm the ordering of cars.csv.
x = cars[x_vars].values.reshape((N,J,K)).astype(np.float64).copy()
x_nodum = cars[x_vars_nodum].values.reshape((N,J,K_nodum)).astype(np.float64).copy()
x_core = cars[x_vars_core].values.reshape((N,J,K_core)).astype(np.float64).copy()
y = cars['s'].values.reshape((N,J)).copy()

x.shape # (N market-years, J models, K variables)


K = 43 variables selected.


(150, 40, 43)

# Estimate parameters

In [10]:
# Starting values for the full, no-dummy and core-only specifications (in that order)
theta0, theta0_nodum, theta0_core = (
    clogit.starting_values(y, design) for design in (x, x_nodum, x_core)
)

In [11]:
# Full specification (core variables, controls and dummies).
# NOTE(review): the trailing 'Sandwich' argument presumably selects the covariance
# estimator — confirm against estimate.py.
res = est.estimate(clogit.q, theta0, y, x,'Sandwich')

Optimization terminated successfully.
         Current function value: 3.470836
         Iterations: 315
         Function evaluations: 13904
         Gradient evaluations: 316


In [12]:
# Specification without dummies (core variables and continuous controls only)
res_nodum = est.estimate(clogit.q, theta0_nodum, y, x_nodum,'Sandwich')

Optimization terminated successfully.
         Current function value: 3.501097
         Iterations: 49
         Function evaluations: 400
         Gradient evaluations: 50


In [13]:
# Core specification: logp, home and their interaction only
res_core = est.estimate(clogit.q, theta0_core, y, x_core,'Sandwich')

Optimization terminated successfully.
         Current function value: 3.513370
         Iterations: 15
         Function evaluations: 64
         Gradient evaluations: 16


In [14]:
# Estimates, standard errors and t-values of the full model, one row per regressor
tab = pd.DataFrame(
    {'theta': res['theta'], 'se': res['se'], 't': res['t']},
    index=x_vars,
)
tab

Unnamed: 0,theta,se,t
logp,-0.45518,0.140788,-3.233081
home,1.428441,0.04555,31.360137
logp_x_home,0.147185,0.062157,2.367937
cy,-0.074788,0.155736,-0.480223
hp,-0.693474,0.125323,-5.533477
we,0.482975,0.189099,2.55409
li,-0.356829,0.128193,-2.78353
MCC,-1.412699,0.110888,-12.739879
VW,0.034861,0.189776,0.183697
alfa romeo,-0.887584,0.152294,-5.828079


In [27]:
# Estimates, standard errors and t-values of the model without dummies
tab_nodum = pd.DataFrame(
    {'theta': res_nodum['theta'], 'se': res_nodum['se'], 't': res_nodum['t']},
    index=x_vars_nodum,
)
tab_nodum.round(3)

Unnamed: 0,theta,se,t
logp,0.294,0.107,2.749
home,1.222,0.039,31.009
logp_x_home,-0.196,0.055,-3.596
cy,0.316,0.121,2.6
hp,-1.347,0.106,-12.755
we,0.414,0.144,2.877
li,-0.1,0.169,-0.591


In [28]:
# Estimates, standard errors and t-values of the core-only model
tab_core = pd.DataFrame(
    {'theta': res_core['theta'], 'se': res_core['se'], 't': res_core['t']},
    index=x_vars_core,
)
tab_core.round(3)

Unnamed: 0,theta,se,t
logp,-0.482,0.042,-11.472
home,1.152,0.035,32.767
logp_x_home,-0.404,0.046,-8.792


In [17]:
# For now continue with the full model
# thetahat is the estimated parameter vector of `res`; its entries follow the order of x_vars.
thetahat = res['theta']

## Price elasticities 


### Calculating elasticities incl. standard errors using the delta-method

In [18]:
def E(x,theta):
    """Numerical own- and cross-price elasticities of the choice probabilities.

    For each car j, its log price is raised by 0.01 (a 1 percent price increase), the
    choice probabilities are re-evaluated with `clogit.choice_prob`, and the relative
    change in each probability is divided by 0.01.

    Assumes the column layout used when `x` is built in this notebook: column 0 is the
    log price, column 1 the home dummy and column 2 their interaction, so the
    interaction is shifted by 0.01 * home to stay consistent with the new price.

    Args:
        x: array of shape (N, J, K) with the regressors.
        theta: parameter vector passed on to `clogit.choice_prob`.

    Returns:
        E_own: (N, J) array; entry [n, j] is the elasticity of car j's probability in
            market n with respect to its own price.
        E_cross: (N, J) array; entry [n, j] is the average, over the other cars k != j
            in market n, of the elasticity of car k's probability with respect to the
            price of car j.
    """
    N,J,K = x.shape

    # remember price is in logs, so a 1 percent change in price is 0.01
    rel_change_x = 0.01

    # Baseline probabilities do not depend on j, so evaluate them once outside the loop.
    ms1 = clogit.choice_prob(theta, x)

    E_own   = np.zeros((N, J))
    E_cross = np.zeros((N, J))

    for j in range(J):
        # A. perturb the price of car j (and its home interaction) in a copy of x
        x2 = x.copy()
        x2[:,j,0] = x[:,j,0] + rel_change_x
        x2[:,j,2] = x[:,j,2] + rel_change_x*x[:,j,1]

        # B. probabilities after the price change
        ms2 = clogit.choice_prob(theta, x2)

        # C. elasticity = relative change in probability per relative change in price
        rel_change_y = (ms2-ms1)/ms1
        elasticity = rel_change_y/rel_change_x

        E_own[:,j] = elasticity[:,j]

        # D. average over the other cars *within each market* (axis=1). Without the
        #    axis argument the mean pools all markets and writes one common number
        #    into every row of column j.
        k_not_j = [k for k in range(J) if k != j] # indices for all other cars than j
        E_cross[:,j] = np.mean(elasticity[:,k_not_j], axis=1)

    return E_own, E_cross

In [19]:
# Delta-method standard error of the average own-price elasticity

def h(theta):
    """Average own-price elasticity as a function of the parameter vector."""
    own, _ = E(x, theta)
    return np.mean(own)

grad = est.centered_grad(h, thetahat)

Avar_theta = res['cov']
Avar_E = grad @ Avar_theta @ grad.T   # gradient * Avar(theta) * gradient'

se_E_own = np.sqrt(Avar_E)

se_E_own

array([[0.13742895]])

In [20]:
# Delta-method standard error of the average cross-price elasticity

def h(theta):
    """Average cross-price elasticity as a function of the parameter vector."""
    _, cross = E(x, theta)
    return np.mean(cross)

grad = est.centered_grad(h, thetahat)

Avar_theta = res['cov']
Avar_E = grad @ Avar_theta @ grad.T   # gradient * Avar(theta) * gradient'

se_E_cross = np.sqrt(Avar_E)

se_E_cross

array([[0.0035862]])

In [21]:
# E() re-evaluates the choice probabilities once per car, so call it a single time
# and take both averages from that one result.
_E_own_mat, _E_cross_mat = E(x, thetahat)
E_own = np.mean(_E_own_mat)
E_cross = np.mean(_E_cross_mat)
print(f'Average own-price elasticity:  {E_own.round(5)}')
print(f'average cross-price elasticity: {E_cross.round(5)}')

Average own-price elasticity:  -0.39847
average cross-price elasticity: 0.00933


In [22]:
# Boolean (N, J) masks built from the home dummy, which is column 1 of x
home_mask = np.equal(x[:, :, 1], 1)
foreign_mask = np.equal(x[:, :, 1], 0)

In [23]:
# elasticities contingent on home status

# Evaluate the own-price elasticities once and reuse them for both groups
# (each call to E re-evaluates the choice probabilities once per car).
_E_own_all = E(x,thetahat)[0]
E_home = np.mean(_E_own_all[home_mask])
E_foreign = np.mean(_E_own_all[foreign_mask])
print(f'Average own-price elasticity (home):  {E_home.round(5)}')
print(f'Average own-price elasticity (foreign): {E_foreign.round(5)}')

# prepare gradient calculation

def _h(x,theta):
    """Average own-price elasticity over the observations selected by home_mask."""
    _E = E(x,theta)[0]
    Elast = np.mean(_E[home_mask])

    return Elast

def _f(x,theta):
    """Average own-price elasticity over the observations selected by foreign_mask."""
    _E = E(x,theta)[0]
    Elast = np.mean(_E[foreign_mask])

    return Elast

# Calculate variance of elasticity for home vehicles by the delta-method
def h(theta):
    """Home-car average own-price elasticity as a function of theta only."""
    return _h(x,theta)

grad = est.centered_grad(h,thetahat)

Avar_theta = res['cov']
Avar_E = grad@Avar_theta@grad.T

se_E_home = np.sqrt(Avar_E)

# Calculate variance of elasticity for foreign vehicles by the delta-method
def h(theta):
    """Foreign-car average own-price elasticity as a function of theta only."""
    return _f(x,theta)

grad = est.centered_grad(h,thetahat)

Avar_theta = res['cov']
Avar_E = grad@Avar_theta@grad.T

se_E_foreign = np.sqrt(Avar_E)

print(f'SE for home is {se_E_home}, and SE for foreign is {se_E_foreign}')
    

Average own-price elasticity (home):  -0.29413
Average own-price elasticity (foreign): -0.44675
SE for home is [[0.14507087]], and SE for foreign is [[0.13787314]]


## Value of home bias

In [24]:
# Algebraically dp = -(theta_home + theta_logp_x_home * mean(logp)) / theta_logp,
# where indices 0, 1, 2 of thetahat follow the order of core_vars (logp, home, logp_x_home)
# and x[:,:,0] is the log price.
# NOTE(review): presumably read as the change in log price that offsets the home
# effect at the average log price — confirm the intended interpretation.
dp = -thetahat[1]/thetahat[0]-thetahat[2]/thetahat[0]*np.mean(x[:,:,0])
dp

3.0215793298859124

In [25]:
# Column 0 of x holds 'logp' (the log of the price variable), not the price level,
# so the summary is labelled accordingly; that is also why the mean can be negative.
_logp = x[:,:,0]
print(f'Log prices are on average {np.mean(_logp).round(3)}, with the max log price being {np.max(_logp).round(3)} and the min log price at {np.min(_logp).round(3)}')

Prices are on average -0.361, with the max price being 1.088 and the min price at -1.324


In [26]:
# NOTE(review): `stop` is not defined in any visible cell, so this cell raises a NameError.
# Presumably it is a deliberate barrier that halts "Run All" before the
# cells below — confirm, and consider removing it before sharing the notebook.
stop

NameError: name 'stop' is not defined

In [None]:
def s_err(x_home,theta,s_home,j,p):
    """Squared gap between a target share and car j's counterfactual share as a foreign car.

    A copy of the market is modified so that car j has log price `p`, home dummy 0 and
    price-home interaction 0 (columns 0, 1 and 2 respectively); the choice probability
    of car j in that counterfactual market is compared with `s_home`.

    Args:
        x_home: (J, K) regressors of one market; it is not modified.
        theta: parameter vector passed to `clogit.choice_prob`.
        s_home: target choice probability for car j.
        j: index of the car whose status is changed.
        p: counterfactual log price; a scalar or a size-1 array (as passed by
            scipy.optimize.minimize).

    Returns:
        (s_home - counterfactual share of car j) ** 2.
    """
    x_tilde = x_home.copy()

    # Optimizers hand over a length-1 array; unwrap it explicitly, because assigning a
    # size-1 array into a scalar slot is deprecated since NumPy 1.25.
    x_tilde[j,0] = np.asarray(p, dtype=float).item() # counterfactually change price of home cars
    x_tilde[j,1] = x_tilde[j,2] = 0.0 # counterfactually change to foreign

    x_tilde = x_tilde[np.newaxis,:,:] # give dimension N=1

    s_tilde = clogit.choice_prob(theta,x_tilde)[0,j] # counterfactual share of the relevant model

    return (s_home - s_tilde)**2 # squared error

In [None]:
p = np.zeros(J) #counterfactual price
vhb = np.zeros(J) # value of home bias

sales = cars['qu_tot'].values.reshape((N,J)).astype(np.float64).copy()
pop = cars['pop'].values.reshape((N,J)).astype(np.float64).copy()

for j in range(J):

    # Markets in which the car in position j is a home car
    _home_mask = (x[:, j, 1] == 1)

    # No home market for this position: nothing to average, report NaN explicitly
    if not _home_mask.any():
        p[j] = vhb[j] = np.nan
        continue

    # Use boolean indexing to select the subset of observations
    _x_home = x[_home_mask,:,:].copy() #subset of x for which car j is home
    _s_home = clogit.choice_prob(thetahat,_x_home) # predicted shares with home status

    # one counterfactual price per home market
    _n_home = _x_home.shape[0]
    _p_i = np.zeros(_n_home)

    for i in range(_n_home):

        _p0 = _x_home[i,j,0] # initial guess is unchanged price

        def _fun(p_vec, i=i, j=j):
            # minimize passes a length-1 array; hand s_err the scalar inside it
            return s_err(_x_home[i,:,:],thetahat,_s_home[i,j],j,p_vec[0])

        # Do not call this `res`: that name holds the estimation results used by
        # earlier cells (res['theta'], res['cov']) and must survive re-runs.
        _opt_res = optimize.minimize(_fun,_p0)

        # .x is a length-1 array; take the scalar explicitly (implicit conversion of
        # size-1 arrays to scalars is deprecated since NumPy 1.25)
        _p_i[i] = _opt_res.x[0]

    p[j] = np.mean(_p_i)
    vhb[j] = np.mean(_x_home[:,j,0] - _p_i) #*np.mean(sales[:,j])

vhb


array([3.09464254, 3.09040734, 3.0388813 , 3.02072197, 2.988898  ,
       3.01404068, 3.01659285, 3.03272009, 3.01375118, 3.01469281,
       3.01915742, 3.03665446, 3.05680876, 3.04731029, 3.01885868,
       3.05907334, 3.05306355, 3.06206039, 3.02007743, 3.0712614 ,
       3.07540363, 3.08639037, 3.02901644, 3.01117339, 3.03132912,
       3.0198503 , 3.00277845, 3.02020896, 3.03760279, 3.04331265,
       3.06399644, 3.08048371, 3.08792948, 3.04732611, 2.93348218,
       2.98989967, 2.96637384, 2.99234637, 2.98885347, 2.75353317])