In [1]:
library(dplyr)
library(MASS)    # for negative binomial regression
library(tidyr)
library(readr)   # read csv file
library(texreg)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Attaching package: 'MASS'

The following object is masked from 'package:dplyr':

    select

Version:  1.37.5
Date:     2020-06-17
Author:   Philip Leifeld (University of Essex)

Consider submitting praise using the praise or praise_interactive functions.
Please cite the JSS article in your publications -- see citation("texreg").

Attaching package: 'texreg'

The following object is masked from 'package:tidyr':

    extract



### <div class='info-circle alert alert-block alert-info'> Load combined data</div>

In [2]:
# Read the combined city-level table from disk, then report its size
# as (rows, columns).
reg_df <- read_csv(file = 'Data/combined_data.csv')
dim(reg_df)

Parsed with column specification:
cols(
  .default = col_double(),
  city_cn = col_character(),
  city = col_character(),
  province_cn = col_character(),
  province = col_character(),
  tier = col_character()
)
See spec(...) for full column specifications.


In [3]:
# Preview the first six rows of the loaded table.
head(reg_df, n = 6L)

city_cn,city,city_id,province_cn,province,tier,lat,lng,population,GDP,...,date_0324,date_0325,date_0326,date_0327,date_0328,date_0329,date_0330,date_0331,date_0401,hubei_city
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
北京市,Beijing,110000,北京市,Beijing,Tier 1,39.91092,116.4134,2153.6,35371.3,...,559,565,569,572,576,577,580,580,582,0
天津市,Tianjin,120000,天津市,Tianjin,new Tier 1,39.09367,117.2095,1561.83,14104.0,...,145,147,152,156,163,166,174,174,176,0
石家庄市,Shijiazhuang,130100,河北省,Hebei,Tier 2,38.04831,114.5215,1039.42,5809.9,...,29,29,29,29,29,29,29,29,29,0
唐山市,Tangshan,130200,河北省,Hebei,Tier 3,39.63658,118.1865,796.4,6890.0,...,58,58,58,58,58,58,58,58,58,0
秦皇岛市,Qinhuangdao,130300,河北省,Hebei,Tier 3,39.94175,119.6085,314.63,1612.02,...,10,10,10,10,10,10,10,10,10,0
邯郸市,Handan,130400,河北省,Hebei,Tier 3,36.63126,114.5456,954.97,3486.0,...,32,32,32,32,32,32,32,32,32,0


In [4]:
# Exclude the row(s) whose `city` is 'Wuhan'; every other city is kept.
reg_df <- dplyr::filter(reg_df, city != 'Wuhan')

In [5]:
# Report (rows, columns) again now that Wuhan has been filtered out.
dim(reg_df)

### <div class="alert alert-block alert-success"> Regression Analysis </div>
### Spatial Spread of COVID-19

* OLS Regression
* Negative Binomial Regression

Regressions are performed in both Python and R. This tutorial presents the R implementations. See 'Figure2-spatial analysis and plot.ipynb' for the Python implementations.

#### OLS Regression
log_cumulative_cases ~ log_local_flow + log_pagerank + log_population + log_distance + mean_intensity + city_tier1 + city_tier2 + city_tier3 + city_tier4

In [6]:
# ==============
# ols regression

# Outcome: cumulative cases on '2020-02-09' (column date_0209).
reg_df$cumulative_cases <- reg_df$date_0209

# Build the regression data once, outside the model call, so the call
# recorded on the fitted object (and printed by summary()) is just
# `data = ols_df` instead of the whole transformation pipeline.
# The `+ 1` offsets keep log() finite when the underlying value is 0.
ols_df <- reg_df %>%
  mutate(
    log_cumulative_cases = log(cumulative_cases + 1),
    log_local_flow = log(local_flow + 1),
    log_pagerank = log(pagerank),
    log_population = log(population),
    log_distance = log(distance + 1)
  )

# Ordinary least squares (identity link) on the log-transformed outcome.
ols_model <- lm(
  log_cumulative_cases ~ log_local_flow +
    log_pagerank +
    log_population +
    log_distance +
    mean_intensity +
    city_tier1 + city_tier2 + city_tier3 + city_tier4,
  data = ols_df
)

summary(ols_model)


Call:
lm(formula = log_cumulative_cases ~ log_local_flow + log_pagerank + 
    log_population + log_distance + mean_intensity + city_tier1 + 
    city_tier2 + city_tier3 + city_tier4, data = reg_df %>% mutate(log_cumulative_cases = log(cumulative_cases + 
    1)) %>% mutate(log_local_flow = log(local_flow + 1)) %>% 
    mutate(log_pagerank = log(pagerank)) %>% mutate(log_population = log(population)) %>% 
    mutate(log_distance = log(distance + 1)))

Residuals:
     Min       1Q   Median       3Q      Max 
-2.34376 -0.42763 -0.00446  0.35829  2.50113 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.271233   1.456934  -0.873 0.383505    
log_local_flow  0.818665   0.060912  13.440  < 2e-16 ***
log_pagerank   -0.228125   0.111730  -2.042 0.041917 *  
log_population  0.241025   0.063239   3.811 0.000163 ***
log_distance   -0.172071   0.100267  -1.716 0.087014 .  
mean_intensity  0.090084   0.054868   1.642 0.101515    
city_tier1      0.580346  

#### Negative Binomial Regression
cumulative_cases ~ exp(log_local_flow + log_pagerank + log_population + log_distance + mean_intensity + city_tier1 + city_tier2 + city_tier3 + city_tier4)

In [7]:
# negative Binomial model
# in the MASS package

# Outcome: cumulative cases on '2020-02-09' (column date_0209).
reg_df$cumulative_cases <- reg_df$date_0209

# Build the regression data once, outside the model call, so the call
# recorded on the fitted object is just `data = negb_df`.
# Only the predictors are log-transformed: the response enters the model
# as the raw count, so no log_cumulative_cases column is needed here.
# The `+ 1` offsets keep log() finite when the underlying value is 0.
negb_df <- reg_df %>%
  mutate(
    log_local_flow = log(local_flow + 1),
    log_pagerank = log(pagerank),
    log_population = log(population),
    log_distance = log(distance + 1)
  )

# Negative binomial GLM on the raw case counts.
negb_model <- glm.nb(
  cumulative_cases ~ log_local_flow +
    log_pagerank +
    log_population +
    log_distance +
    mean_intensity +
    city_tier1 + city_tier2 + city_tier3 + city_tier4,
  data = negb_df
)

summary(negb_model)


Call:
glm.nb(formula = cumulative_cases ~ log_local_flow + log_pagerank + 
    log_population + log_distance + mean_intensity + city_tier1 + 
    city_tier2 + city_tier3 + city_tier4, data = reg_df %>% mutate(log_cumulative_cases = log(cumulative_cases + 
    1)) %>% mutate(log_local_flow = log(local_flow + 1)) %>% 
    mutate(log_pagerank = log(pagerank)) %>% mutate(log_population = log(population)) %>% 
    mutate(log_distance = log(distance + 1)), init.theta = 2.249561936, 
    link = log)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9110  -0.9914  -0.3612   0.2851   4.6693  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -3.638971   1.632921  -2.229  0.02585 *  
log_local_flow  0.833505   0.065673  12.692  < 2e-16 ***
log_pagerank   -0.397239   0.121905  -3.259  0.00112 ** 
log_population  0.412455   0.076828   5.369 7.94e-08 ***
log_distance   -0.141796   0.111713  -1.269  0.20434    
mean_intensity  0.160380   0.0

**Parameter $\alpha$ (`alpha`) of the Python negative binomial regression, computed here as $1/\theta$ from the R `glm.nb` fit**

In [8]:
# Reciprocal of the theta estimated by glm.nb; the note above identifies
# this value with the alpha parameter of the Python implementation.
1 / negb_model[["theta"]]

### Hierarchical regression analysis

In [9]:
# Hierarchical OLS
# --------------

# Outcome: cumulative cases on '2020-02-09' (column date_0209).
# This must be set BEFORE tmp_df is derived from reg_df; otherwise the
# assignment has no effect on the models fitted in this cell.
reg_df$cumulative_cases <- reg_df$date_0209

# Keep only rows with all three predictors present so that every model
# below is fitted on the same set of rows, then add the log-transformed
# columns. The `+ 1` offsets keep log() finite when the value is 0.
tmp_df <- reg_df %>%
  drop_na(c('local_flow', 'population', 'pagerank')) %>%
  mutate(
    log_cumulative_cases = log(cumulative_cases + 1),
    log_local_flow = log(local_flow + 1),
    log_pagerank = log(pagerank),
    log_population = log(population)
  )

# Models 1-3: one predictor each. Model 4: all three together.
lm1 <- lm(log_cumulative_cases ~ log_local_flow, data = tmp_df)

lm2 <- lm(log_cumulative_cases ~ log_population, data = tmp_df)

lm3 <- lm(log_cumulative_cases ~ log_pagerank, data = tmp_df)

lm4 <- lm(
  log_cumulative_cases ~ log_local_flow + log_pagerank + log_population,
  data = tmp_df
)


In [10]:
# Emit a LaTeX table comparing the four OLS fits (lm1-lm4).
list(lm1, lm2, lm3, lm4) %>%
  texreg() %>%
  print()


\begin{table}
\begin{center}
\begin{tabular}{l c c c c}
\hline
 & Model 1 & Model 2 & Model 3 & Model 4 \\
\hline
(Intercept)      & $0.45^{***}$ & $-2.94^{***}$ & $12.43^{***}$ & $-2.33^{**}$ \\
                 & $(0.07)$     & $(0.34)$      & $(0.63)$      & $(0.86)$     \\
log\_local\_flow & $1.01^{***}$ &               &               & $0.93^{***}$ \\
                 & $(0.03)$     &               &               & $(0.03)$     \\
log\_population  &              & $1.01^{***}$  &               & $0.31^{***}$ \\
                 &              & $(0.06)$      &               & $(0.06)$     \\
log\_pagerank    &              &               & $1.59^{***}$  & $-0.20^{*}$  \\
                 &              &               & $(0.10)$      & $(0.10)$     \\
\hline
R$^2$            & $0.81$       & $0.43$        & $0.40$        & $0.82$       \\
Adj. R$^2$       & $0.81$       & $0.43$        & $0.40$        & $0.82$       \\
Num. obs.        & $364$        & $364$         & $364$   

In [11]:
# Hierarchical Negative Binomial
# --------------

# Restrict to rows where all three predictors are present, then add the
# log-transformed predictors. The response stays as the raw count
# (cumulative_cases), so no log-transformed outcome column is created.
tmp_df <- reg_df %>%
  drop_na(c('local_flow', 'population', 'pagerank')) %>%
  mutate(
    log_local_flow = log(local_flow + 1),
    log_pagerank = log(pagerank),
    log_population = log(population)
  )

# Models 1-3: one predictor each. Model 4: all three together.
nbm1 <- glm.nb(cumulative_cases ~ log_local_flow, data = tmp_df)

nbm2 <- glm.nb(cumulative_cases ~ log_population, data = tmp_df)

nbm3 <- glm.nb(cumulative_cases ~ log_pagerank, data = tmp_df)

nbm4 <- glm.nb(
  cumulative_cases ~ log_local_flow + log_population + log_pagerank,
  data = tmp_df
)


In [12]:
# Emit a LaTeX table comparing the four negative binomial fits (nbm1-nbm4).
list(nbm1, nbm2, nbm3, nbm4) %>%
  texreg() %>%
  print()


\begin{table}
\begin{center}
\begin{tabular}{l c c c c}
\hline
 & Model 1 & Model 2 & Model 3 & Model 4 \\
\hline
(Intercept)      & $0.68^{***}$ & $-1.66^{***}$ & $14.37^{***}$ & $-3.96^{***}$ \\
                 & $(0.08)$     & $(0.45)$      & $(0.79)$      & $(0.97)$      \\
log\_local\_flow & $0.99^{***}$ &               &               & $0.91^{***}$  \\
                 & $(0.03)$     &               &               & $(0.04)$      \\
log\_population  &              & $0.99^{***}$  &               & $0.45^{***}$  \\
                 &              & $(0.08)$      &               & $(0.07)$      \\
log\_pagerank    &              &               & $1.72^{***}$  & $-0.37^{***}$ \\
                 &              &               & $(0.13)$      & $(0.11)$      \\
\hline
AIC              & $2780.61$    & $3310.38$     & $3306.88$     & $2741.73$     \\
BIC              & $2792.30$    & $3322.07$     & $3318.57$     & $2761.22$     \\
Log Likelihood   & $-1387.30$   & $-1652.19$    