# Regressions

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
from os import *
import sys
sys.path.append("../")
import urllib.request
import requests
import json
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Change directory
def change_dir(newpath):
    """Change the working directory to ``newpath`` under the parent of ``sys.path[0]``.

    The target is built from the parent directory of ``sys.path[0]`` joined
    with ``newpath``; it does not depend on the current working directory, so
    calling the function repeatedly with the same argument lands in the same
    place.

    Args:
        newpath: Directory path relative to the parent of ``sys.path[0]``.

    Raises:
        OSError: Propagated from ``os.chdir`` when the target directory does
            not exist or cannot be entered.
    """
    # NOTE(review): presumably sys.path[0] is the directory the notebook was
    # launched from -- confirm for the kernel in use.
    # os.path understands the platform's separator(s). The previous manual
    # split/join on "\\" only worked for backslash-separated Windows paths; on
    # any other path the whole string was discarded and `newpath` was resolved
    # against the current working directory instead.
    parent_dir = os.path.dirname(sys.path[0])
    os.chdir(os.path.join(parent_dir, newpath))

## Regression 1: Total employment

In [3]:
# Move into the stage-2 output folder so the relative CSV path below resolves,
# then load the table used by the regressions in this notebook.
change_dir('output//stage2')
data = pd.read_csv("output.csv")
# Bare last expression: the notebook displays the available column names.
data.columns

Index(['Unnamed: 0', 'area_code', 'hispanic', 'white_nonhispanic',
       'black_nonhispanic', 'other_nonhispanic', 'asian_nonhispanic',
       'above_65', 'prepandemic_growth', 'leisure_by_emp', 'total_emp_feb2020',
       'manufacturing_emp_feb2020', 'service_emp_feb2020', 'total_emp_jun2021',
       'manufacturing_emp_jun2021', 'service_emp_jun2021', 'total_employment',
       'manufacturing', 'service', 'emp_tot_change_feb2020_jun2021',
       'manufacturing_change_feb2020_jun2021',
       'service_change_feb2020_jun2021', 'occ_15', 'total_pop', 'pop_excl',
       'less_than_9grade', 'hs_nodiploma', 'ged', 'college_nodegree',
       'associates', 'bachelors_graduate', 'CBSA Title', 'division', 'region',
       'state', 'pop_by_metro'],
      dtype='object')

In [4]:
# No transformations: regressors enter the model exactly as stored in `data`.
y = data['emp_tot_change_feb2020_jun2021']
x = data.loc[:, [
    'leisure_by_emp',
    'occ_15',
    'total_pop',
    'bachelors_graduate',
    'white_nonhispanic',
]].copy()
# An intercept column is added to the regressors; rows containing missing
# values are dropped before fitting.
model1 = sm.OLS(endog=y, exog=sm.add_constant(x), missing='drop').fit()
print(model1.summary())

                                  OLS Regression Results                                  
Dep. Variable:     emp_tot_change_feb2020_jun2021   R-squared:                       0.084
Model:                                        OLS   Adj. R-squared:                  0.069
Method:                             Least Squares   F-statistic:                     5.741
Date:                            Wed, 11 Aug 2021   Prob (F-statistic):           4.36e-05
Time:                                    15:58:01   Log-Likelihood:                 550.44
No. Observations:                             320   AIC:                            -1089.
Df Residuals:                                 314   BIC:                            -1066.
Df Model:                                       5                                         
Covariance Type:                        nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
---

In [5]:
# Log transformed: each regressor is replaced by log(1 + value), stored in a
# new column of `data` named "log-<original name>".
columns = [
    'leisure_by_emp',
    'occ_15',
    'total_pop',
    'bachelors_graduate',
    'white_nonhispanic',
]
for col in columns:
    data["log-" + col] = np.log(data[col] + 1)
y = data['emp_tot_change_feb2020_jun2021']
# Pick the "log-" columns in the same order as `columns`.
x = data.loc[:, ["log-" + name for name in columns]].copy()
model2 = sm.OLS(endog=y, exog=sm.add_constant(x), missing="drop").fit()
print(model2.summary())

                                  OLS Regression Results                                  
Dep. Variable:     emp_tot_change_feb2020_jun2021   R-squared:                       0.065
Model:                                        OLS   Adj. R-squared:                  0.050
Method:                             Least Squares   F-statistic:                     4.388
Date:                            Wed, 11 Aug 2021   Prob (F-statistic):           0.000709
Time:                                    15:58:01   Log-Likelihood:                 547.25
No. Observations:                             320   AIC:                            -1083.
Df Residuals:                                 314   BIC:                            -1060.
Df Model:                                       5                                         
Covariance Type:                        nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]

## Regression 2: Manufacturing

In [6]:
# No transformations: same regressors as stored in `data`, with the
# manufacturing change as the dependent variable.
y = data['manufacturing_change_feb2020_jun2021']
x = data.loc[:, [
    'leisure_by_emp',
    'occ_15',
    'total_pop',
    'bachelors_graduate',
    'white_nonhispanic',
]].copy()
# An intercept column is added to the regressors; rows containing missing
# values are dropped before fitting.
model3 = sm.OLS(endog=y, exog=sm.add_constant(x), missing='drop').fit()
print(model3.summary())

                                     OLS Regression Results                                     
Dep. Variable:     manufacturing_change_feb2020_jun2021   R-squared:                       0.064
Model:                                              OLS   Adj. R-squared:                  0.049
Method:                                   Least Squares   F-statistic:                     4.263
Date:                                  Wed, 11 Aug 2021   Prob (F-statistic):           0.000915
Time:                                          15:58:01   Log-Likelihood:                 365.73
No. Observations:                                   320   AIC:                            -719.5
Df Residuals:                                       314   BIC:                            -696.9
Df Model:                                             5                                         
Covariance Type:                              nonrobust                                         
                         coef 

In [7]:
# Log transformed: (re)computes the "log-<name>" columns as log(1 + value) and
# regresses the manufacturing change on them.
columns = [
    'leisure_by_emp',
    'occ_15',
    'total_pop',
    'bachelors_graduate',
    'white_nonhispanic',
]
for col in columns:
    data["log-" + col] = np.log(data[col] + 1)
y = data['manufacturing_change_feb2020_jun2021']
# Pick the "log-" columns in the same order as `columns`.
x = data.loc[:, ["log-" + name for name in columns]].copy()
model4 = sm.OLS(endog=y, exog=sm.add_constant(x), missing="drop").fit()
print(model4.summary())

                                     OLS Regression Results                                     
Dep. Variable:     manufacturing_change_feb2020_jun2021   R-squared:                       0.069
Model:                                              OLS   Adj. R-squared:                  0.054
Method:                                   Least Squares   F-statistic:                     4.677
Date:                                  Wed, 11 Aug 2021   Prob (F-statistic):           0.000392
Time:                                          15:58:01   Log-Likelihood:                 366.72
No. Observations:                                   320   AIC:                            -721.4
Df Residuals:                                       314   BIC:                            -698.8
Df Model:                                             5                                         
Covariance Type:                              nonrobust                                         
                             c

## Regression 3: Added variables for population over 65 and divisions

In [8]:
# No transformations
# Expand `division` into indicator columns, dropping the first category so it
# acts as the reference level. dtype=float keeps the design matrix purely
# numeric: pandas >= 2.0 produces bool dummies by default, and mixing those
# with float regressors yields an object array that statsmodels OLS rejects.
division_dummies = pd.get_dummies(data['division'], drop_first=True, dtype=float)
# Remove dummy columns left over from an earlier run of this cell before
# concatenating, so re-running the cell does not create duplicate columns.
data = pd.concat(
    (data.drop(columns=division_dummies.columns, errors='ignore'), division_dummies),
    axis=1,
)
x = data[['leisure_by_emp','occ_15','total_pop','bachelors_graduate','white_nonhispanic','above_65','East South Central Division',
          'Middle Atlantic Division','Mountain Division', 'New England Division', 'Pacific Division','South Atlantic Division',
          'West North Central Division','West South Central Division']].copy()
y = data['emp_tot_change_feb2020_jun2021']
# An intercept column is added to the regressors; rows containing missing
# values are dropped before fitting.
model5 = sm.OLS(y,sm.add_constant(x),missing='drop').fit()
print(model5.summary())

                                  OLS Regression Results                                  
Dep. Variable:     emp_tot_change_feb2020_jun2021   R-squared:                       0.241
Model:                                        OLS   Adj. R-squared:                  0.206
Method:                             Least Squares   F-statistic:                     6.919
Date:                            Wed, 11 Aug 2021   Prob (F-statistic):           2.41e-12
Time:                                    15:58:01   Log-Likelihood:                 580.58
No. Observations:                             320   AIC:                            -1131.
Df Residuals:                                 305   BIC:                            -1075.
Df Model:                                      14                                         
Covariance Type:                        nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0

In [9]:
# Log transformed: the continuous regressors are replaced by log(1 + value),
# stored in `data` as "log-<original name>".
columns = [
    'leisure_by_emp',
    'occ_15',
    'total_pop',
    'bachelors_graduate',
    'white_nonhispanic',
    'above_65',
]
for col in columns:
    data["log-" + col] = np.log(data[col] + 1)
# NOTE(review): 'prepandemic_growth' enters here (untransformed) although the
# untransformed model in the previous cell does not include it -- confirm
# that the two specifications are meant to differ.
x = data.loc[:, ["log-" + name for name in columns] + [
    'prepandemic_growth',
    'East South Central Division',
    'Middle Atlantic Division',
    'Mountain Division',
    'New England Division',
    'Pacific Division',
    'South Atlantic Division',
    'West North Central Division',
    'West South Central Division',
]].copy()
# Alternative dependent variable: 'manufacturing_change_feb2020_jun2021'.
y = data['emp_tot_change_feb2020_jun2021']
model6 = sm.OLS(endog=y, exog=sm.add_constant(x), missing="drop").fit()
print(model6.summary())

                                  OLS Regression Results                                  
Dep. Variable:     emp_tot_change_feb2020_jun2021   R-squared:                       0.232
Model:                                        OLS   Adj. R-squared:                  0.194
Method:                             Least Squares   F-statistic:                     6.111
Date:                            Wed, 11 Aug 2021   Prob (F-statistic):           3.29e-11
Time:                                    15:58:01   Log-Likelihood:                 578.62
No. Observations:                             320   AIC:                            -1125.
Df Residuals:                                 304   BIC:                            -1065.
Df Model:                                      15                                         
Covariance Type:                        nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0

## Regression 4: Added variables for pandemic policies and health stats

In [10]:
# Re-select the stage-2 output folder (change_dir builds its target from
# sys.path[0], so repeating the call is harmless) and load the extended table.
change_dir('output//stage2')
output = pd.read_csv("final_output.csv")
# Bare last expression: the notebook displays the available column names.
output.columns

Index(['Unnamed: 0', 'area_code', 'hispanic', 'white_nonhispanic',
       'black_nonhispanic', 'other_nonhispanic', 'asian_nonhispanic',
       'above_65', 'prepandemic_growth', 'leisure_by_emp', 'total_emp_feb2020',
       'manufacturing_emp_feb2020', 'service_emp_feb2020', 'total_emp_jun2021',
       'manufacturing_emp_jun2021', 'service_emp_jun2021', 'total_employment',
       'manufacturing', 'service', 'emp_tot_change_feb2020_jun2021',
       'manufacturing_change_feb2020_jun2021',
       'service_change_feb2020_jun2021', 'occ_15', 'total_pop', 'pop_excl',
       'less_than_9grade', 'hs_nodiploma', 'ged', 'college_nodegree',
       'associates', 'bachelors_graduate', 'CBSA Title', 'division', 'region',
       'state', 'pop_by_metro', 'state_name', 'StringencyIndex',
       'GovernmentResponseIndex', 'ContainmentHealthIndex',
       'EconomicSupportIndex', 'deaths', 'deaths_by_pop'],
      dtype='object')

In [11]:
# No transformations
# Expand `division` into indicator columns, dropping the first category so it
# acts as the reference level. dtype=float keeps the design matrix purely
# numeric: pandas >= 2.0 produces bool dummies by default, and mixing those
# with float regressors yields an object array that statsmodels OLS rejects.
division_dummies = pd.get_dummies(output['division'], drop_first=True, dtype=float)
# Remove dummy columns left over from an earlier run of this cell before
# concatenating, so re-running the cell does not create duplicate columns.
output = pd.concat(
    (output.drop(columns=division_dummies.columns, errors='ignore'), division_dummies),
    axis=1,
)
x = output[['leisure_by_emp','occ_15','total_pop','bachelors_graduate','white_nonhispanic','black_nonhispanic','asian_nonhispanic',
          'hispanic','above_65','East South Central Division','Middle Atlantic Division','Mountain Division',
          'New England Division', 'Pacific Division','South Atlantic Division','West North Central Division','West South Central Division',
          'StringencyIndex','GovernmentResponseIndex', 'ContainmentHealthIndex','EconomicSupportIndex','deaths_by_pop']].copy()
y = output['emp_tot_change_feb2020_jun2021']
# An intercept column is added to the regressors; rows containing missing
# values are dropped before fitting.
model7 = sm.OLS(y,sm.add_constant(x),missing='drop').fit()
print(model7.summary())

                                  OLS Regression Results                                  
Dep. Variable:     emp_tot_change_feb2020_jun2021   R-squared:                       0.329
Model:                                        OLS   Adj. R-squared:                  0.279
Method:                             Least Squares   F-statistic:                     6.619
Date:                            Wed, 11 Aug 2021   Prob (F-statistic):           6.59e-16
Time:                                    15:58:01   Log-Likelihood:                 600.28
No. Observations:                             320   AIC:                            -1155.
Df Residuals:                                 297   BIC:                            -1068.
Df Model:                                      22                                         
Covariance Type:                        nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0

In [12]:
# Log transformed: the listed columns are replaced by log(1 + value), stored
# in `output` as "log-<original name>".
columns = [
    'leisure_by_emp',
    'occ_15',
    'total_pop',
    'bachelors_graduate',
    'white_nonhispanic',
    'black_nonhispanic',
    'asian_nonhispanic',
    'hispanic',
    'above_65',
    'StringencyIndex',
    'GovernmentResponseIndex',
    'ContainmentHealthIndex',
    'EconomicSupportIndex',
    'deaths_by_pop',
]
for col in columns:
    output["log-" + col] = np.log(output[col] + 1)
# NOTE(review): the log columns for black_nonhispanic, asian_nonhispanic and
# hispanic are computed above but are not used as regressors below, whereas
# the untransformed model includes those three variables -- confirm intent.
x = output.loc[:, [
    'log-leisure_by_emp',
    'log-occ_15',
    'log-total_pop',
    'log-bachelors_graduate',
    'log-white_nonhispanic',
    'log-above_65',
    'prepandemic_growth',
    'East South Central Division',
    'Middle Atlantic Division',
    'Mountain Division',
    'New England Division',
    'Pacific Division',
    'South Atlantic Division',
    'West North Central Division',
    'West South Central Division',
    'log-StringencyIndex',
    'log-GovernmentResponseIndex',
    'log-ContainmentHealthIndex',
    'log-EconomicSupportIndex',
    'log-deaths_by_pop',
]].copy()
# Alternative dependent variable: 'manufacturing_change_feb2020_jun2021'.
y = output['emp_tot_change_feb2020_jun2021']
model8 = sm.OLS(endog=y, exog=sm.add_constant(x), missing="drop").fit()
print(model8.summary())

                                  OLS Regression Results                                  
Dep. Variable:     emp_tot_change_feb2020_jun2021   R-squared:                       0.279
Model:                                        OLS   Adj. R-squared:                  0.231
Method:                             Least Squares   F-statistic:                     5.789
Date:                            Wed, 11 Aug 2021   Prob (F-statistic):           9.38e-13
Time:                                    15:58:01   Log-Likelihood:                 588.82
No. Observations:                             320   AIC:                            -1136.
Df Residuals:                                 299   BIC:                            -1057.
Df Model:                                      20                                         
Covariance Type:                        nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0

## Correlations

In [13]:
# Pairwise correlation matrix over the columns listed below.
variables = [
    'hispanic',
    'white_nonhispanic',
    'black_nonhispanic',
    'other_nonhispanic',
    'asian_nonhispanic',
    'above_65',
    'prepandemic_growth',
    'leisure_by_emp',
    'total_emp_feb2020',
    'manufacturing_emp_feb2020',
    'service_emp_feb2020',
    'total_emp_jun2021',
    'manufacturing_emp_jun2021',
    'service_emp_jun2021',
    'total_employment',
    'manufacturing',
    'service',
    'emp_tot_change_feb2020_jun2021',
    'manufacturing_change_feb2020_jun2021',
    'service_change_feb2020_jun2021',
    'occ_15',
    'total_pop',
    'less_than_9grade',
    'hs_nodiploma',
    'ged',
    'college_nodegree',
    'associates',
    'bachelors_graduate',
]
# Bare last expression: the notebook renders the resulting table.
data.loc[:, variables].corr()

Unnamed: 0,hispanic,white_nonhispanic,black_nonhispanic,other_nonhispanic,asian_nonhispanic,above_65,prepandemic_growth,leisure_by_emp,total_emp_feb2020,manufacturing_emp_feb2020,...,manufacturing_change_feb2020_jun2021,service_change_feb2020_jun2021,occ_15,total_pop,less_than_9grade,hs_nodiploma,ged,college_nodegree,associates,bachelors_graduate
hispanic,1.0,-0.767776,-0.259605,-0.044461,0.110625,-0.214445,0.299847,0.075919,0.127974,0.126157,...,-0.026371,0.043626,-0.079276,0.166088,0.839957,0.434803,-0.311682,0.08427,-0.247393,-0.189461
white_nonhispanic,-0.767776,1.0,-0.319018,-0.10585,-0.351676,0.34207,-0.256355,-0.142451,-0.260989,-0.253751,...,0.110332,0.065406,-0.037652,-0.282156,-0.698706,-0.456452,0.35076,-0.042227,0.339841,0.07858
black_nonhispanic,-0.259605,-0.319018,1.0,-0.168078,-0.096969,-0.138016,-0.065051,-0.015361,0.121188,0.105191,...,-0.179705,0.015919,0.028668,0.095343,-0.110138,0.250354,0.081834,-0.075295,-0.203539,-0.009016
other_nonhispanic,-0.044461,-0.10585,-0.168078,1.0,0.290225,-0.09272,-0.117746,0.264857,-0.048624,-0.057344,...,0.088357,-0.138049,-0.016646,-0.050381,-0.103136,-0.148151,-0.121861,0.281944,0.152445,0.01535
asian_nonhispanic,0.110625,-0.351676,-0.096969,0.290225,1.0,-0.180329,0.170805,0.149586,0.305248,0.325977,...,0.045882,-0.335892,0.359137,0.31498,0.087768,-0.247389,-0.383263,-0.131667,-0.057464,0.368241
above_65,-0.214445,0.34207,-0.138016,-0.09272,-0.180329,1.0,0.014718,0.271457,-0.147011,-0.171578,...,0.077756,0.047241,-0.252684,-0.142389,-0.226544,-0.007533,0.344368,0.034827,0.099417,-0.18987
prepandemic_growth,0.299847,-0.256355,-0.065051,-0.117746,0.170805,0.014718,1.0,0.178521,0.16013,0.177219,...,0.174551,0.185385,0.166018,0.184324,0.186836,-0.046949,-0.467627,0.125066,-0.077923,0.236942
leisure_by_emp,0.075919,-0.142451,-0.015361,0.264857,0.149586,0.271457,0.178521,1.0,-0.056061,-0.086559,...,0.091372,-0.140565,-0.16168,-0.035713,-0.019355,-0.066812,-0.083861,0.138651,-0.036514,0.033125
total_emp_feb2020,0.127974,-0.260989,0.121188,-0.048624,0.305248,-0.147011,0.16013,-0.056061,1.0,0.955248,...,-0.052454,-0.11202,0.338661,0.970697,0.093606,-0.090717,-0.260718,-0.227752,-0.194279,0.306814
manufacturing_emp_feb2020,0.126157,-0.253751,0.105191,-0.057344,0.325977,-0.171578,0.177219,-0.086559,0.955248,1.0,...,-0.083066,-0.093475,0.359648,0.962826,0.0983,-0.088459,-0.265179,-0.207791,-0.19281,0.29911


In [14]:
# Pairwise correlation matrix restricted to a shorter list of columns,
# including the two employment-change variables.
variables = [
    'white_nonhispanic',
    'above_65',
    'leisure_by_emp',
    'occ_15',
    'total_pop',
    'bachelors_graduate',
    'prepandemic_growth',
    'manufacturing_change_feb2020_jun2021',
    'emp_tot_change_feb2020_jun2021',
]
# Bare last expression: the notebook renders the resulting table.
data.loc[:, variables].corr()

Unnamed: 0,white_nonhispanic,above_65,leisure_by_emp,occ_15,total_pop,bachelors_graduate,prepandemic_growth,manufacturing_change_feb2020_jun2021,emp_tot_change_feb2020_jun2021
white_nonhispanic,1.0,0.34207,-0.142451,-0.037652,-0.282156,0.07858,-0.256355,0.110332,0.12015
above_65,0.34207,1.0,0.271457,-0.252684,-0.142389,-0.18987,0.014718,0.077756,0.075458
leisure_by_emp,-0.142451,0.271457,1.0,-0.16168,-0.035713,0.033125,0.178521,0.091372,-0.133986
occ_15,-0.037652,-0.252684,-0.16168,1.0,0.303812,0.64804,0.166018,0.176395,-0.004606
total_pop,-0.282156,-0.142389,-0.035713,0.303812,1.0,0.27061,0.184324,-0.051123,-0.104965
bachelors_graduate,0.07858,-0.18987,0.033125,0.64804,0.27061,1.0,0.236942,0.168255,-0.184515
prepandemic_growth,-0.256355,0.014718,0.178521,0.166018,0.184324,0.236942,1.0,0.174551,0.198965
manufacturing_change_feb2020_jun2021,0.110332,0.077756,0.091372,0.176395,-0.051123,0.168255,0.174551,1.0,0.397307
emp_tot_change_feb2020_jun2021,0.12015,0.075458,-0.133986,-0.004606,-0.104965,-0.184515,0.198965,0.397307,1.0
