# Project 3: Getting Started 

This notebook is intended to help you get off to a flying start with the cars dataset. You don't have to use this notebook and you can discard any parts you do not like, they are purely intended as a help to get started. 

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
sns.set_theme()

# optimization
from scipy import optimize
import estimation as est
import clogit_post as clogit


import statsmodels.formula.api as smf

# Read in data

The dataset, `cars.csv`, contains cleaned and processed data. If you want to make changes, the notebook, `materialize.ipynb`, creates the data from the raw source datsets. 

In [3]:
cars = pd.read_csv('cars.csv')
lbl_vars = pd.read_csv('labels_variables.csv')
lbl_vals = pd.read_csv('labels_values.csv')

# convert from dataframe to dict
lbl_vals = {c: lbl_vals[c].dropna().to_dict() for c in lbl_vals.columns}

In [4]:
lbl_vars.set_index('variable', inplace=True)

## Overview of the dataset

In [5]:
lbl_vars.join(cars.mean(numeric_only=True).apply(lambda x: f'{x: .2f}').to_frame('Mean'))

Unnamed: 0_level_0,label,Mean
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
ye,year (=first dimension of panel),84.5
ma,market (=second dimension of panel),3.0
co,model code (=third dimension of panel),207.5
zcode,alternative model code (predecessors and succe...,177.76
brd,brand code,16.79
type,name of brand and model,
brand,name of brand,
model,name of model,
org,"origin code (demand side, country with which c...",2.72
loc,"location code (production side, country where ...",5.17


# Set up for analysis

In [6]:
price_var = 'princ'

In [7]:
cars['logp'] = np.log(cars[price_var])

In [8]:
# new variable: price elasticity heterogeneous for home-region 
cars['logp_x_home'] = cars[price_var] * cars['home']

### Dummy variables

For working with matrices, we want to have a column for each dummy variable. 

In [9]:
categorical_var = 'brand' # name of categorical variable
dummies = pd.get_dummies(cars[categorical_var]) # creates a matrix of dummies for each value of dummyvar
x_vars_dummies = list(dummies.columns[1:].values) # omit a reference category, here it is the first (hence columns[1:])

# add dummies to the dataframe 
assert dummies.columns[0] not in cars.columns, f'It looks like you have already added this dummy to the dataframe. Avoid duplicates! '
cars = pd.concat([cars,dummies], axis=1)

### `x_vars`: List of regressors to be used 

In [10]:
x_vars = ['logp', 'home', 'cy', 'hp', 'we', 'li'] + x_vars_dummies # <--- !!! choose your preferred variables here 
print(f'K = {len(x_vars)} variables selected.')

K = 38 variables selected.


In [11]:
print(x_vars_dummies)

['MCC', 'VW', 'alfa romeo', 'audi', 'citroen', 'daewoo', 'daf', 'fiat', 'ford', 'honda', 'hyundai', 'innocenti', 'lancia', 'mazda', 'mercedes', 'mitsubishi', 'nissan', 'opel', 'peugeot', 'renault', 'rover', 'saab', 'seat', 'skoda', 'suzuki', 'tal/hillman', 'tal/matra', 'tal/simca', 'tal/sunb', 'talbot', 'toyota', 'volvo']


In [12]:
K = len(x_vars)
N = cars.ma.nunique() * cars.ye.nunique()
J = 40 
x = cars[x_vars].values.reshape((N,J,K))
y = np.log(cars['s'].values.reshape((N,J)))

# standardize x
# x = ((x - x.mean(0).mean(0))/(x.std(0).std(0)))
print(x.shape)

(150, 40, 38)


### Understanding the sorting 

Just to be sure that we understand the relation between the pandas dataframe and the numpy 3d array, consider the following: 

In [13]:
# let's check that we get the same row from x as we can find in the original pandas dataframe
# we'll pick the first 5 "observations"
j = 1
k = 0 
x[:5, j, k] == cars.groupby(['ma','ye']).nth(j)[x_vars[k]].head(5).values

array([ True,  True,  True,  True,  True])

In [14]:
# ... and let's check it for the 5 first cars (in the first market)
k = 0
x[0, :5, k] == cars[x_vars[k]].head(5).values
# note that with i = 3 (4th element), x[i,t,k] gives ma=1 and ye=73 (first market, fourth year)
x[3, :5, k] == cars.query('(ma == 1) & (ye == 73)')[x_vars[k]].head(5).values

array([ True,  True,  True,  True,  True])

In [15]:
# and let's print out some rows along with some labels 
obs_labs = cars[['ma', 'ye', 'type', 's']].values.reshape(N,J,4) # notice that we are extracting the values from the dataframe in the same way as we did for x

i=3 # obs. index 3 is the first market in the fourth (3+1) year, i.e. 73
print(obs_labs[i,:5,:])

i = 130 # obs. index 130 is the 5th country (130/30>4) and the 11th year (130%30 = index 10)
print(obs_labs[i,:5,:])

[[1 73 'audi 80/90' 0.0198967806548532]
 [1 73 'audi 100/200' 0.0115738123314003]
 [1 73 'citroen 2 CV 6 - 2 CV 4' 0.020470221461224]
 [1 73 'citroen GSA/GSX' 0.0231960844492545]
 [1 73 'citroen dyane' 0.0232687741289353]]
[[5 80 'alfasud' 0.0061322294468038]
 [5 80 'citroen GSA' 0.0097859984028077]
 [5 80 'fiat 127' 0.0082314207084408]
 [5 80 'fiat 131F' 0.0099803206146036]
 [5 80 'ford fiesta' 0.0781217905939526]]


... and just checking that we can find those same columns in the pandas dataframe

In [16]:
cars.query('(ma == 5) & (ye == 80) & (type == "ford fiesta")').s

5204    0.078122
Name: s, dtype: float64

# OLS Example

Let's compute the OLS estimator just to test that we can do algebra with the arrays. 

***Note:*** This particular choice of $y$ and $x$ variables might not make sense, it is just to help you get started doing algebra on these arrays. 

In [17]:
Y = y.reshape(N*J,) # Make Y 1-dimensional 
X = np.hstack([x.reshape(N*J,K), np.ones((N*J,1))]).astype(np.float64) # append a constant term and ensure type = float

In [18]:
# compute the OLS estimator 
bet = np.linalg.inv(X.T @ X) @ X.T @ Y

# print
varnames = x_vars + ['const'] # we added the constant as the K+1'th column 
pd.DataFrame({'Estimate':bet}, index=varnames)

Unnamed: 0,Estimate
logp,-0.350433
home,0.973116
cy,-0.00032
hp,-0.00974
we,0.001151
li,-0.056361
MCC,-1.176538
VW,0.159212
alfa romeo,-0.481739
audi,-0.059687


# Towards logit 

In order to work with the logit model, you have to be able to compute the utility indices, which typically take the form of some inner product of an $x$-vector and a $\theta$ vector. This is illustrated for you below. Since `x` is `(N,J,K)` (i.e. `x[i,j,:]` gives the $K$-vector of regressors for the car `j` in market-period `i`), we just have to form the matrix product `x @ theta`, and Python will do the sum over the 3rd dimension of `x`. 

In [19]:
theta0 = np.zeros((K,))
v = (x @ theta0).astype(np.float64) # how to multiply a trial value with the matrix of regressors 
np.exp(v) / np.sum(np.exp(v), 1, keepdims=True) # choice probabilities 

array([[0.025, 0.025, 0.025, ..., 0.025, 0.025, 0.025],
       [0.025, 0.025, 0.025, ..., 0.025, 0.025, 0.025],
       [0.025, 0.025, 0.025, ..., 0.025, 0.025, 0.025],
       ...,
       [0.025, 0.025, 0.025, ..., 0.025, 0.025, 0.025],
       [0.025, 0.025, 0.025, ..., 0.025, 0.025, 0.025],
       [0.025, 0.025, 0.025, ..., 0.025, 0.025, 0.025]])

# Conditional logit estimation



In [20]:
y = cars['s'].values.reshape((N,J))


In [21]:
res = est.estimate(clogit.q, theta0,  y, x)

tab = pd.DataFrame({v:res[v] for v in ['theta', 'se', 't']}, index= x_vars)
tab

  temp = np.multiply(y, np.log(ccp))
  df = fun(x) - f0
  temp = np.multiply(y, np.log(ccp))


Optimization terminated successfully.
         Current function value: 3.473342
         Iterations: 299
         Function evaluations: 11943
         Gradient evaluations: 306


Unnamed: 0,theta,se,t
logp,-0.161464,11.10833,-0.014535
home,1.361748,2.613309,0.521082
cy,-0.000127,0.009066,-0.014019
hp,-0.014301,0.213132,-0.067097
we,0.000509,0.01702,0.029913
li,-0.031275,1.489971,-0.02099
MCC,-1.358908,1010.736621,-0.001344
VW,0.181579,12.589559,0.014423
alfa romeo,-0.711884,18.901034,-0.037664
audi,-0.106221,18.220248,-0.00583


# Price Elasticities

In [42]:
thetahat = res['theta']
# Calculate the original choice probabilities using the estimated parameters
ccp1 = clogit.choice_prob(thetahat, x)

E_own   = np.zeros((N, J))

# Due to log price variable being the first element of x_vars, we can use k_price = 0
k_price = 0 

for j in range(J):
    # A. copy 
    x2 = x.copy()
    
    # B. increase price just for car j 
    rel_change_x = 1e-3
    x2[:, j, k_price] *= (1.0+rel_change_x)
    
    # C. evaluate CCPs.  calculate the new choice probabilities with the increased price.
    ccp2 = clogit.choice_prob(thetahat, x2)
    
    # D. percentage change in CCPs 
    rel_change_y = ccp2 / ccp1 - 1.0 
    
    # E. elasticities 
    elasticity = rel_change_y / rel_change_x 
    
    E_own[:, j] = elasticity[:, j]

print(f'Own-price elasticity:  {np.mean(E_own).round(4)}')
    

Own-price elasticity:  0.0565


In [43]:
home = 1
# Create two indexed, from where idx1 is for domestic cars
# and idx0 is for imported cars.
idx1 = x[:, :, home]==1
idx0 = x[:, :, home]==0 
print(f'Elasticity, Domestic cars:   {np.mean(E_own[idx1]).round(4)}')
print(f'Elasticity, Imported cars: {np.mean(E_own[idx0]).round(4)}')

Elasticity, Domestic cars:   0.0454
Elasticity, Imported cars: 0.0617
