In [1]:
# import packages we'll use here
import pandas as pd
import numpy as np
import scipy.optimize as opt
import time
from scipy.optimize import differential_evolution
from geopy.distance import vincenty as vc

In [2]:
# Read in the data
ps4_data = pd.read_excel('radio_merger_data.xlsx')

In [3]:
ps4_data.describe()

Unnamed: 0,year,buyer_id,target_id,buyer_lat,buyer_long,target_lat,target_long,price,hhi_target,num_stations_buyer,population_target,corp_owner_buyer
count,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
mean,2007.545455,25.454545,25.454545,36.909475,-94.392918,37.476019,-94.365015,4141370.0,96.353535,18.828283,322958.8,0.020202
std,0.500464,14.70962,14.70962,5.068201,13.957259,4.853933,13.695796,14712550.0,82.242463,69.837719,1098850.0,0.141407
min,2007.0,1.0,1.0,25.558428,-122.7106,25.369355,-122.43342,57240.72,10.0,0.0,2553.0,0.0
25%,2007.0,13.0,13.0,32.886576,-104.49768,33.687825,-104.876615,491358.4,54.5,1.0,21698.0,0.0
50%,2008.0,25.0,25.0,36.653256,-92.662675,36.900901,-92.122592,1013110.0,72.0,2.0,49948.0,0.0
75%,2008.0,37.5,37.5,40.85879,-83.813483,41.145355,-84.212299,2330153.0,91.0,5.5,153855.0,0.0
max,2008.0,54.0,54.0,48.704839,-71.4115,48.906401,-70.018443,139300000.0,388.0,591.0,9818605.0,1.0


In [4]:
# Scale variables
ps4_data['pop_ths_log'] = np.log(ps4_data['population_target'] / 1000)
ps4_data['price_ths_log'] = np.log(ps4_data['price'] / 1000)
ps4_data['num_stations_log'] = np.log(1 + ps4_data['num_stations_buyer'])
ps4_data['hhi_log'] = np.log(ps4_data['hhi_target'])

In [5]:
def distance_calc1 (row):
    start = (row['buyer_lat'], row['buyer_long'])
    stop = (row['target_lat'], row['target_long'])
    return np.log(vc(start, stop).miles)

In [6]:
# Calculate the variables for observed matches
ps4_data['var1'] = ps4_data['num_stations_log'] * ps4_data['pop_ths_log']
ps4_data['var2'] = ps4_data['corp_owner_buyer'] * ps4_data['pop_ths_log']
ps4_data['var3'] = ps4_data.apply (lambda row: distance_calc1 (row),axis = 1)

In [7]:
# Create dataframes for different years
ps4_data_2007 = ps4_data[(ps4_data['year'] == 2007)].copy()
ps4_data_2007['index'] = ps4_data_2007['buyer_id'] - 1 # This creates my own index to fix the indexing/location problem
ps4_data_2007 = ps4_data_2007.set_index('index')
ps4_data_2008 = ps4_data[(ps4_data['year'] == 2008)].copy()
ps4_data_2008['index'] = ps4_data_2008['buyer_id'] - 1
ps4_data_2008 = ps4_data_2008.set_index('index')

In [8]:
# Define a function to calculate the distance
def distance_calc (data,row1,row2):
    start = (data.iloc[row1, 3], data.iloc[row1, 4])
    stop = (data.iloc[row2, 5], data.iloc[row2, 6])
    return np.log(vc(start, stop).miles)

In [9]:
def payoff(data):
    # Define some arrays to store the output numbers
    np_temp1 = np.zeros(10, dtype=np.int).reshape(1,10)
    np_temp2 = np.zeros(5, dtype=np.int).reshape(1,5)
    np_temp3 = np.zeros(5, dtype=np.int).reshape(1,5)
    for b in data['buyer_id']:
        for t in data['target_id']:
            if b < t:
                ob1 = data['var1'][b - 1]
                ob2 = data['var2'][b - 1]
                ob3 = data['var3'][b - 1]
                ob4 = data['var1'][t - 1]            
                ob5 = data['var2'][t - 1]
                ob6 = data['var3'][t - 1]
                
                ob7 = data['hhi_log'][b - 1]
                ob8 = data['price_ths_log'][b - 1]
                ob9 = data['hhi_log'][t - 1]
                ob10 = data['price_ths_log'][t - 1]
                np_temp1 = np.vstack([np_temp1, [ob1, ob2, ob3, ob4, ob5, ob6, ob7, ob8, ob9, ob10]])
                # This returns the six variables on the left hand side of the inequalities (observed matches)
                
                cf1 = data['num_stations_log'][b - 1] * data['pop_ths_log'][t - 1]
                cf2 = data['corp_owner_buyer'][b - 1] * data['pop_ths_log'][t - 1]
                cf3 = distance_calc(data, b-1, t-1)
                
                cf7 = data['hhi_log'][t - 1]
                cf8 = data['price_ths_log'][t - 1]
                np_temp2 = np.vstack([np_temp2, [cf1, cf2, cf3, cf7, cf8]])
                # This returns the three variables of the first part of the right hand side (counterfatual matches)
               
            if b > t:
                cf4 = data['num_stations_log'][b - 1] * data['pop_ths_log'][t - 1]
                cf5 = data['corp_owner_buyer'][b - 1] * data['pop_ths_log'][t - 1]
                cf6 = distance_calc(data, b-1, t-1)
                
                cf9 = data['hhi_log'][t - 1]
                cf10 = data['price_ths_log'][t - 1]
                np_temp3 = np.vstack([np_temp3, [cf4, cf5, cf6, cf9, cf10]])
                # This returns the other three variables of the second part of the right hand side (counterfactual matches)

    # Drop the first row of the array
    np_temp1 = np.delete(np_temp1, 0, 0)
    np_temp2 = np.delete(np_temp2, 0, 0)
    np_temp3 = np.delete(np_temp3, 0, 0)
    # Combine all the variables (stored in arrays) to one dataframe
    ps4_mse = pd.DataFrame({'ob1':np_temp1[:,0], 'ob2':np_temp1[:,1], 'ob3':np_temp1[:,2], 'ob4':np_temp1[:,3], 'ob5':np_temp1[:,4],
                            'ob6':np_temp1[:,5], 'ob7':np_temp1[:,6], 'ob8':np_temp1[:,7], 'ob9':np_temp1[:,8], 'ob10':np_temp1[:,9],
                            'cf1':np_temp2[:,0], 'cf2':np_temp2[:,1], 'cf3':np_temp2[:,2], 'cf7':np_temp2[:,3], 'cf8':np_temp2[:,4],
                            'cf4':np_temp3[:,0], 'cf5':np_temp3[:,1], 'cf6':np_temp3[:,2], 'cf9':np_temp3[:,3], 'cf10':np_temp3[:,4]})

    return ps4_mse

In [10]:
# Append the two dataframes together
ps4_mse_2007 = payoff(ps4_data_2007)
ps4_mse_2008 = payoff(ps4_data_2008)
together = [ps4_mse_2007, ps4_mse_2008]
ps4_mse_both = pd.concat(together, ignore_index=True)

In [11]:
ps4_mse_both.describe()

Unnamed: 0,cf1,cf10,cf2,cf3,cf4,cf5,cf6,cf7,cf8,cf9,ob1,ob10,ob2,ob3,ob4,ob5,ob6,ob7,ob8,ob9
count,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0,2421.0
mean,7.253673,7.039892,0.145376,6.694385,5.769424,0.0376,6.726361,4.35306,7.036103,4.320385,7.598311,7.036103,0.172585,4.63778,5.970846,0.041945,4.502906,4.320385,7.039892,4.35306
std,6.712584,1.453306,0.807183,0.708819,5.547556,0.43868,0.736627,0.624195,1.226876,0.663254,8.114762,1.226876,0.939508,1.724544,6.227313,0.465693,1.777316,0.663254,1.453306,0.624195
min,0.0,4.047266,0.0,2.652507,0.0,0.0,0.290254,2.302585,4.047266,2.302585,0.0,4.047266,0.0,0.290254,0.0,0.0,0.290254,2.302585,4.047266,2.302585
25%,2.88837,6.165758,0.0,6.300884,2.448048,0.0,6.347827,4.043051,6.227633,3.931826,2.489868,6.227633,0.0,3.397824,2.255482,0.0,3.128433,3.931826,6.165758,4.043051
50%,5.226165,6.852859,0.0,6.80292,4.28492,0.0,6.85831,4.290459,6.95822,4.248495,4.255998,6.95822,0.0,4.534633,4.042383,0.0,4.181881,4.248495,6.852859,4.290459
75%,9.160107,7.753689,0.0,7.222888,7.509749,0.0,7.249201,4.521789,7.753689,4.532599,8.449969,7.753689,0.0,5.944319,6.964886,0.0,6.270384,4.532599,7.753689,4.521789
max,52.64638,11.844385,8.247251,7.912068,58.677412,9.192034,7.892801,5.961005,11.844385,5.961005,36.440193,11.844385,6.247199,7.875643,36.440193,6.247199,7.875643,5.961005,11.844385,5.961005


In [12]:
# Write indicator function
def mse(coefs):
    alpha, beta = coefs
    #total = 0
    for i in ps4_mse_both.index:
        indicator = (ps4_mse_both['ob1'] + alpha * ps4_mse_both['ob2'] + beta * ps4_mse_both['ob3'] + 
                     ps4_mse_both['ob4'] + alpha * ps4_mse_both['ob5'] + beta * ps4_mse_both['ob6'] >=
                     ps4_mse_both['cf1'] + alpha * ps4_mse_both['cf2'] + beta * ps4_mse_both['cf3'] +
                     ps4_mse_both['cf4'] + alpha * ps4_mse_both['cf5'] + beta * ps4_mse_both['cf6'])
        total = -1 * sum(indicator)
        return total

In [13]:
# Write indicator function for the transfered verision
def mse_tansf(coefs):
    sigma, alpha, gamma, beta = coefs
    for i in ps4_mse_both.index:
        indicator = ((sigma * ps4_mse_both['ob1'] + alpha * ps4_mse_both['ob2'] + beta * ps4_mse_both['ob3'] + 
                      gamma * ps4_mse_both['ob7'] - ps4_mse_both['ob8'] >= 
                      sigma * ps4_mse_both['cf1'] + alpha * ps4_mse_both['cf2'] + beta * ps4_mse_both['cf3'] +
                      gamma * ps4_mse_both['cf7'] - ps4_mse_both['cf8']) & 
                     (sigma * ps4_mse_both['ob4'] + alpha * ps4_mse_both['ob5'] + beta * ps4_mse_both['ob6'] +
                      gamma * ps4_mse_both['ob9'] - ps4_mse_both['ob10'] >= 
                      sigma * ps4_mse_both['cf4'] + alpha * ps4_mse_both['cf5'] + beta * ps4_mse_both['cf6'] +
                      gamma * ps4_mse_both['cf9'] - ps4_mse_both['cf10'])) 
        total = -1 * sum(indicator)
        return total

In [14]:
# Call the minimizer
bnds = [(-5, 5), (-5, 5)]
mse_results = differential_evolution(mse, bnds, maxiter = 100, tol = 0.001)
# Because this method does not set the initial guess, so it randomly guesses the initials everytime. This is why
# you get different coefficients everytime. This also means it is sensitive to the initial guess.

In [15]:
params_initial = [1,1]
mse_results = opt.minimize(mse, params_initial, method='Nelder-Mead', tol = 1e-12, options={'maxiter': 5000})

In [16]:
# Call the minimizer of transfered model
params_initial_transf = [1,1,1,1]
mse_results_transf = opt.minimize(mse_tansf, params_initial_transf, method='Nelder-Mead', tol = 1e-12, options={'maxiter': 5000})

In [17]:
coefs = (['alpha', 'beta'])
for i in range(2):
    print('Estimated ', coefs[i], "in model(1) = ", mse_results['x'][i])

Estimated  alpha in model(1) =  7.375
Estimated  beta in model(1) =  -11.7


In [18]:
coefs_transf = (['sigma', 'alpha', 'gamma', 'beta'])
for i in range(4):
    print('Estimated ', coefs_transf[i], "in model(2) = ", mse_results_transf['x'][i])

Estimated  sigma in model(2) =  11.8554358734
Estimated  alpha in model(2) =  37.9998072781
Estimated  gamma in model(2) =  15.0825336215
Estimated  beta in model(2) =  -99.462796926
