# Regressions

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
from os import *
import sys
sys.path.append("../")
import urllib.request
import requests
import json
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Change directory
def change_dir(newpath):
    """Change the working directory to ``newpath`` placed beside ``sys.path[0]``.

    The target is built by dropping the last component of ``sys.path[0]``
    and appending ``newpath`` to what remains; the process then ``chdir``s
    into it.

    Paths are handled with ``os.path`` rather than by splitting/joining on a
    literal backslash, so the function gives the same result on Windows and
    also works on POSIX systems (where a backslash split would discard
    ``sys.path[0]`` altogether and silently fall back to a bare relative
    ``chdir``).

    Parameters
    ----------
    newpath : str
        Directory to enter, relative to the parent of ``sys.path[0]``.
        Because ``os.path.join`` is used, an absolute ``newpath`` is taken
        as-is.

    Raises
    ------
    OSError
        Propagated from ``os.chdir`` when the target does not exist or is
        not accessible.
    """
    # NOTE(review): this assumes sys.path[0] is the folder the notebook runs
    # from -- confirm for the kernel in use. If it is '', the parent is also
    # '' and ``newpath`` is resolved against the current working directory.
    parent_dir = os.path.dirname(sys.path[0])
    os.chdir(os.path.join(parent_dir, newpath))

## Regression 1: Total employment

In [3]:
# Switch the working directory to output//stage2 (change_dir resolves it against
# the parent of sys.path[0]) so the relative file name below is read from there.
change_dir('output//stage2')
data = pd.read_csv("output.csv")
# Last expression of the cell: displays the column labels of the loaded table.
data.columns

Index(['Unnamed: 0', 'area_code', 'hispanic', 'white_nonhispanic',
       'black_nonhispanic', 'other_nonhispanic', 'asian_nonhispanic',
       'above_65', 'prepandemic_growth', 'leisure_by_emp', 'total_emp_feb2020',
       'manufacturing_emp_feb2020', 'service_emp_feb2020', 'total_emp_jun2021',
       'manufacturing_emp_jun2021', 'service_emp_jun2021', 'total_employment',
       'manufacturing', 'service', 'emp_tot_change_feb2020_jun2021',
       'manufacturing_change_feb2020_jun2021',
       'service_change_feb2020_jun2021', 'occ_15', 'total_pop', 'pop_excl',
       'less_than_9grade', 'hs_nodiploma', 'ged', 'college_nodegree',
       'associates', 'bachelors_graduate', 'CBSA Title', 'division', 'region',
       'pop_by_metro'],
      dtype='object')

In [4]:
# No transformations: regress emp_tot_change_feb2020_jun2021 on the raw predictors
y = data.loc[:, 'emp_tot_change_feb2020_jun2021']
x = data.loc[:, [
    'leisure_by_emp',
    'occ_15',
    'total_pop',
    'bachelors_graduate',
    'white_nonhispanic',
]].copy()
# add_constant supplies the intercept column; missing='drop' excludes rows containing NaN
model1 = sm.OLS(endog=y, exog=sm.add_constant(x), missing='drop').fit()
print(model1.summary())

                                  OLS Regression Results                                  
Dep. Variable:     emp_tot_change_feb2020_jun2021   R-squared:                       0.097
Model:                                        OLS   Adj. R-squared:                  0.083
Method:                             Least Squares   F-statistic:                     6.757
Date:                            Wed, 11 Aug 2021   Prob (F-statistic):           5.33e-06
Time:                                    08:08:18   Log-Likelihood:                 552.80
No. Observations:                             320   AIC:                            -1094.
Df Residuals:                                 314   BIC:                            -1071.
Df Model:                                       5                                         
Covariance Type:                        nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
---

In [5]:
# Log transformed: each predictor enters the model as log(1 + value)
columns = [
    'leisure_by_emp',
    'occ_15',
    'total_pop',
    'bachelors_graduate',
    'white_nonhispanic',
]
for col in columns:
    # Stored on `data` under a "log-" prefixed name so the raw column is kept
    data[f"log-{col}"] = np.log(data[col] + 1)
y = data.loc[:, 'emp_tot_change_feb2020_jun2021']
x = data.loc[:, ["log-" + name for name in columns]].copy()
# add_constant supplies the intercept column; missing="drop" excludes rows containing NaN
model2 = sm.OLS(endog=y, exog=sm.add_constant(x), missing="drop").fit()
print(model2.summary())

                                  OLS Regression Results                                  
Dep. Variable:     emp_tot_change_feb2020_jun2021   R-squared:                       0.076
Model:                                        OLS   Adj. R-squared:                  0.061
Method:                             Least Squares   F-statistic:                     5.169
Date:                            Wed, 11 Aug 2021   Prob (F-statistic):           0.000142
Time:                                    08:08:18   Log-Likelihood:                 549.10
No. Observations:                             320   AIC:                            -1086.
Df Residuals:                                 314   BIC:                            -1064.
Df Model:                                       5                                         
Covariance Type:                        nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]

## Regression 2: Manufacturing

In [6]:
# No transformations: regress manufacturing_change_feb2020_jun2021 on the raw predictors
y = data.loc[:, 'manufacturing_change_feb2020_jun2021']
x = data.loc[:, [
    'leisure_by_emp',
    'occ_15',
    'total_pop',
    'bachelors_graduate',
    'white_nonhispanic',
]].copy()
# add_constant supplies the intercept column; missing='drop' excludes rows containing NaN
model3 = sm.OLS(endog=y, exog=sm.add_constant(x), missing='drop').fit()
print(model3.summary())

                                     OLS Regression Results                                     
Dep. Variable:     manufacturing_change_feb2020_jun2021   R-squared:                       0.064
Model:                                              OLS   Adj. R-squared:                  0.049
Method:                                   Least Squares   F-statistic:                     4.284
Date:                                  Wed, 11 Aug 2021   Prob (F-statistic):           0.000876
Time:                                          08:08:18   Log-Likelihood:                 365.78
No. Observations:                                   320   AIC:                            -719.6
Df Residuals:                                       314   BIC:                            -697.0
Df Model:                                             5                                         
Covariance Type:                              nonrobust                                         
                         coef 

In [7]:
# Log transformed: each predictor enters the model as log(1 + value)
columns = [
    'leisure_by_emp',
    'occ_15',
    'total_pop',
    'bachelors_graduate',
    'white_nonhispanic',
]
for col in columns:
    # Recomputed here so this cell does not rely on an earlier cell having created the columns
    data[f"log-{col}"] = np.log(data[col] + 1)
y = data.loc[:, 'manufacturing_change_feb2020_jun2021']
x = data.loc[:, ["log-" + name for name in columns]].copy()
# add_constant supplies the intercept column; missing="drop" excludes rows containing NaN
model4 = sm.OLS(endog=y, exog=sm.add_constant(x), missing="drop").fit()
print(model4.summary())

                                     OLS Regression Results                                     
Dep. Variable:     manufacturing_change_feb2020_jun2021   R-squared:                       0.070
Model:                                              OLS   Adj. R-squared:                  0.055
Method:                                   Least Squares   F-statistic:                     4.740
Date:                                  Wed, 11 Aug 2021   Prob (F-statistic):           0.000344
Time:                                          08:08:18   Log-Likelihood:                 366.87
No. Observations:                                   320   AIC:                            -721.7
Df Residuals:                                       314   BIC:                            -699.1
Df Model:                                             5                                         
Covariance Type:                              nonrobust                                         
                             c

## Regression 3: Total employment with added variables (population over 65 and division dummies)

In [8]:
# No transformations
# One indicator column per value of `division`; drop_first=True omits the first
# category so the remaining dummies are not collinear with the constant.
# dtype=float keeps the indicators numeric: newer pandas versions default to bool,
# which turns the mixed float/bool design matrix into object dtype and makes
# statsmodels reject it.
division_dummies = pd.get_dummies(data['division'], drop_first=True, dtype=float)
# Remove dummy columns left behind by a previous run of this cell before attaching
# the fresh ones; otherwise re-running the cell duplicates them and the selection
# below returns repeated (perfectly collinear) regressors.
data = pd.concat(
    (data.drop(columns=division_dummies.columns, errors='ignore'), division_dummies),
    axis=1,
)
x = data[['leisure_by_emp','occ_15','total_pop','bachelors_graduate','white_nonhispanic','above_65','East South Central Division', 
          'Middle Atlantic Division','Mountain Division', 'New England Division', 'Pacific Division','South Atlantic Division', 
          'West North Central Division','West South Central Division']].copy()
y = data['emp_tot_change_feb2020_jun2021']
# add_constant supplies the intercept column; missing='drop' excludes rows containing NaN
model5 = sm.OLS(y,sm.add_constant(x),missing='drop').fit()
print(model5.summary())

                                  OLS Regression Results                                  
Dep. Variable:     emp_tot_change_feb2020_jun2021   R-squared:                       0.254
Model:                                        OLS   Adj. R-squared:                  0.219
Method:                             Least Squares   F-statistic:                     7.403
Date:                            Wed, 11 Aug 2021   Prob (F-statistic):           2.53e-13
Time:                                    08:08:18   Log-Likelihood:                 583.26
No. Observations:                             320   AIC:                            -1137.
Df Residuals:                                 305   BIC:                            -1080.
Df Model:                                      14                                         
Covariance Type:                        nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0

In [22]:
# Log transformed
columns = ['leisure_by_emp','occ_15','total_pop','bachelors_graduate','white_nonhispanic','above_65']
for col in columns:
    # Stored on `data` under a "log-" prefixed name so the raw column is kept
    data["log-" + col] = np.log(1 + data[col])
# prepandemic_growth and the division dummies enter untransformed. The dummy columns
# are the ones attached to `data` by the preceding "No transformations" cell of this
# section, so that cell must have been run first.
# astype(float) returns a new all-float frame: depending on the pandas version the
# dummies may be bool/uint8, and a mixed float/bool design matrix becomes object
# dtype, which statsmodels rejects.
x = data[['log-leisure_by_emp','log-occ_15','log-total_pop','log-bachelors_graduate', 'log-white_nonhispanic', 
         'log-above_65','prepandemic_growth','East South Central Division', 'Middle Atlantic Division','Mountain Division', 'New England Division', 
          'Pacific Division','South Atlantic Division', 'West North Central Division','West South Central Division']].astype(float)
# Dependent variable; swap in 'manufacturing_change_feb2020_jun2021' to fit the manufacturing variant
y = data['emp_tot_change_feb2020_jun2021']
# add_constant supplies the intercept column; missing="drop" excludes rows containing NaN
model6 = sm.OLS(y,sm.add_constant(x),missing="drop").fit()
print(model6.summary())

                                  OLS Regression Results                                  
Dep. Variable:     emp_tot_change_feb2020_jun2021   R-squared:                       0.245
Model:                                        OLS   Adj. R-squared:                  0.207
Method:                             Least Squares   F-statistic:                     6.559
Date:                            Wed, 11 Aug 2021   Prob (F-statistic):           3.57e-12
Time:                                    08:35:33   Log-Likelihood:                 581.31
No. Observations:                             320   AIC:                            -1131.
Df Residuals:                                 304   BIC:                            -1070.
Df Model:                                      15                                         
Covariance Type:                        nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0

## Correlations

In [10]:
# Columns included in the full pairwise correlation table
variables = [
    'hispanic',
    'white_nonhispanic',
    'black_nonhispanic',
    'other_nonhispanic',
    'asian_nonhispanic',
    'above_65',
    'prepandemic_growth',
    'leisure_by_emp',
    'total_emp_feb2020',
    'manufacturing_emp_feb2020',
    'service_emp_feb2020',
    'total_emp_jun2021',
    'manufacturing_emp_jun2021',
    'service_emp_jun2021',
    'total_employment',
    'manufacturing',
    'service',
    'emp_tot_change_feb2020_jun2021',
    'manufacturing_change_feb2020_jun2021',
    'service_change_feb2020_jun2021',
    'occ_15',
    'total_pop',
    'less_than_9grade',
    'hs_nodiploma',
    'ged',
    'college_nodegree',
    'associates',
    'bachelors_graduate',
]
# Last expression of the cell: the correlation matrix is shown as the cell output
data.loc[:, variables].corr()

Unnamed: 0,hispanic,white_nonhispanic,black_nonhispanic,other_nonhispanic,asian_nonhispanic,above_65,prepandemic_growth,leisure_by_emp,total_emp_feb2020,manufacturing_emp_feb2020,...,manufacturing_change_feb2020_jun2021,service_change_feb2020_jun2021,occ_15,total_pop,less_than_9grade,hs_nodiploma,ged,college_nodegree,associates,bachelors_graduate
hispanic,1.0,-0.769992,-0.257587,-0.044393,0.115404,-0.217538,0.303124,0.074948,0.135874,0.131548,...,-0.024911,0.043357,-0.073798,0.162769,0.841954,0.429063,-0.319263,0.074926,-0.261344,-0.17531
white_nonhispanic,-0.769992,1.0,-0.317044,-0.108344,-0.355175,0.340793,-0.250248,-0.130857,-0.265105,-0.257527,...,0.109901,0.071648,-0.034011,-0.285645,-0.704804,-0.457694,0.347349,-0.053812,0.34585,0.085052
black_nonhispanic,-0.257587,-0.317044,1.0,-0.169605,-0.098584,-0.131105,-0.079321,-0.027709,0.108297,0.099658,...,-0.184401,0.004908,0.010951,0.105273,-0.103918,0.263386,0.100835,-0.043292,-0.186554,-0.046091
other_nonhispanic,-0.044393,-0.108344,-0.169605,1.0,0.287015,-0.096387,-0.120773,0.256816,-0.049234,-0.058075,...,0.084402,-0.133504,-0.017841,-0.050401,-0.096943,-0.139348,-0.117665,0.276946,0.143544,0.013294
asian_nonhispanic,0.115404,-0.355175,-0.098584,0.287015,1.0,-0.184339,0.172906,0.141309,0.326032,0.337401,...,0.046143,-0.339955,0.371094,0.319057,0.090867,-0.254245,-0.390032,-0.136905,-0.069469,0.383549
above_65,-0.217538,0.340793,-0.131105,-0.096387,-0.184339,1.0,0.018673,0.301415,-0.150105,-0.175361,...,0.075463,0.057716,-0.255646,-0.145639,-0.230858,-0.00643,0.341647,0.027204,0.109528,-0.189363
prepandemic_growth,0.303124,-0.250248,-0.079321,-0.120773,0.172906,0.018673,1.0,0.178521,0.16013,0.177219,...,0.174551,0.185385,0.166018,0.173205,0.189242,-0.054856,-0.473282,0.129102,-0.086641,0.247749
leisure_by_emp,0.074948,-0.130857,-0.027709,0.256816,0.141309,0.301415,0.178521,1.0,-0.056061,-0.086559,...,0.091372,-0.140565,-0.16168,-0.045593,-0.017506,-0.058935,-0.070886,0.148473,-0.023132,0.015296
total_emp_feb2020,0.135874,-0.265105,0.108297,-0.049234,0.326032,-0.150105,0.16013,-0.056061,1.0,0.955248,...,-0.052454,-0.11202,0.338661,0.996901,0.095515,-0.104564,-0.261991,-0.218192,-0.199625,0.311454
manufacturing_emp_feb2020,0.131548,-0.257527,0.099658,-0.058075,0.337401,-0.175361,0.177219,-0.086559,0.955248,1.0,...,-0.083066,-0.093475,0.359648,0.956323,0.098453,-0.099629,-0.268,-0.202718,-0.199724,0.306963


In [18]:
# Reduced correlation table: the regressors used above plus the two outcome columns
variables = [
    'white_nonhispanic',
    'above_65',
    'leisure_by_emp',
    'occ_15',
    'total_pop',
    'bachelors_graduate',
    'prepandemic_growth',
    'manufacturing_change_feb2020_jun2021',
    'emp_tot_change_feb2020_jun2021',
]
# Last expression of the cell: the correlation matrix is shown as the cell output
data.loc[:, variables].corr()

Unnamed: 0,white_nonhispanic,above_65,leisure_by_emp,occ_15,total_pop,bachelors_graduate,prepandemic_growth,manufacturing_change_feb2020_jun2021,emp_tot_change_feb2020_jun2021
white_nonhispanic,1.0,0.340793,-0.130857,-0.034011,-0.285645,0.085052,-0.250248,0.109901,0.125373
above_65,0.340793,1.0,0.301415,-0.255646,-0.145639,-0.189363,0.018673,0.075463,0.083954
leisure_by_emp,-0.130857,0.301415,1.0,-0.16168,-0.045593,0.015296,0.178521,0.091372,-0.133986
occ_15,-0.034011,-0.255646,-0.16168,1.0,0.317818,0.651748,0.166018,0.176395,-0.004606
total_pop,-0.285645,-0.145639,-0.045593,0.317818,1.0,0.289065,0.173205,-0.051514,-0.109234
bachelors_graduate,0.085052,-0.189363,0.015296,0.651748,0.289065,1.0,0.247749,0.169969,-0.195915
prepandemic_growth,-0.250248,0.018673,0.178521,0.166018,0.173205,0.247749,1.0,0.174551,0.198965
manufacturing_change_feb2020_jun2021,0.109901,0.075463,0.091372,0.176395,-0.051514,0.169969,0.174551,1.0,0.397307
emp_tot_change_feb2020_jun2021,0.125373,0.083954,-0.133986,-0.004606,-0.109234,-0.195915,0.198965,0.397307,1.0
