In [1]:
library(conflicted)
library(readxl)
library(gtools)
library(plm)
library(tidyverse)
library(modelsummary)
library(huxtable)
library(dplyr)
#library(bda)
library(writexl)
#library(Hmisc)
library(sjPlot)
library(DescTools)
library(caTools)
library(psych)
library(pcse)
conflict_prefer("lag", "stats")
conflict_prefer("Mean", "modelsummary")
conflict_prefer("N", "modelsummary")
conflict_prefer("SD", "modelsummary")
conflicted::conflicts_prefer(plm::lag)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
Learn more about sjPlot with 'browseVignettes("sjPlot")'.

[1m[22m[90m[conflicted][39m Will prefer [1m[34mstats[39m[22m::lag over any other package.
[1m[22m[90m[conflicted][39m Will prefer [1m[34mmodelsummary[39m[22m::Mean over any other package.
[1m[22m[90m[conflicted][39m Will prefer [1m[34mmodelsummary[39m[22m::N over any other package.
[1m[22m[90m[conflicted][39m Will prefer [1m[34mmodelsummary[39m[22m::SD over any other package.
[1m[22m[90m[conflicted][39m Removing existing preference.
[1m[22m[90m[

In [2]:
df <- pdata.frame(read.csv(file.path(path.expand('~'), 'OneDrive/PhD Dissertation/Data_Code/Data/index_data_regvariables.csv')),
                  index=c("Numeric","Year"))
df["lnFDIstock_w"] <- as.vector(winsor(df["lnFDIstock"], trim = .01))
df_sum <- df %>% select('SDI','lnFDIstock_w','financial','urban','property','tax', 'war_intensity')
df_withoutsdi <- df %>% select('lnFDIstock_w','financial','urban','property','tax','war_intensity')


In [3]:
###统计性描述
datasummary(All(df_sum) ~ N + Min + Max + Mean + SD, data = df, fmt = 2,
            output = file.path(path.expand('~'), 'OneDrive/PhD Dissertation/Regression Results/summary.docx'))


In [4]:
#' correlation_matrix
#' Creates a publication-ready / formatted correlation matrix, using `Hmisc::rcorr` in the backend.
#'
#' @param df dataframe; containing numeric and/or logical columns to calculate correlations for
#' @param type character; specifies the type of correlations to compute; gets passed to `Hmisc::rcorr`; options are `"pearson"` or `"spearman"`; defaults to `"pearson"`
#' @param digits integer/double; number of decimals to show in the correlation matrix; gets passed to `formatC`; defaults to `3`
#' @param decimal.mark character; which decimal.mark to use; gets passed to `formatC`; defaults to `.`
#' @param use character; which part of the correlation matrix to display; options are `"all"`, `"upper"`, `"lower"`; defaults to `"all"`
#' @param show_significance boolean; whether to add `*` to represent the significance levels for the correlations; defaults to `TRUE`
#' @param replace_diagonal boolean; whether to replace the correlations on the diagonal; defaults to `FALSE`
#' @param replacement character; what to replace the diagonal and/or upper/lower triangles with; defaults to `""` (empty string)
#'
#' @return a correlation matrix
#' @export
#'
#' @examples
#' `correlation_matrix(iris)`
#' `correlation_matrix(mtcars)`
correlation_matrix <- function(df, 
                               type = "pearson",
                               digits = 3, 
                               decimal.mark = ".",
                               use = "all", 
                               show_significance = TRUE, 
                               replace_diagonal = FALSE, 
                               replacement = ""){
  
  # check arguments
  stopifnot({
    is.numeric(digits)
    digits >= 0
    use %in% c("all", "upper", "lower")
    is.logical(replace_diagonal)
    is.logical(show_significance)
    is.character(replacement)
  })
  # we need the Hmisc package for this
  require(Hmisc)
  
  # retain only numeric and boolean columns
  isNumericOrBoolean = vapply(df, function(x) is.numeric(x) | is.logical(x), logical(1))
  if (sum(!isNumericOrBoolean) > 0) {
    cat('Dropping non-numeric/-boolean column(s):', paste(names(isNumericOrBoolean)[!isNumericOrBoolean], collapse = ', '), '\n\n')
  }
  df = df[isNumericOrBoolean]
  
  # transform input data frame to matrix
  x <- as.matrix(df)
  
  # run correlation analysis using Hmisc package
  correlation_matrix <- Hmisc::rcorr(x, type = )
  R <- correlation_matrix$r # Matrix of correlation coeficients
  p <- correlation_matrix$P # Matrix of p-value 
  
  # transform correlations to specific character format
  Rformatted = formatC(R, format = 'f', digits = digits, decimal.mark = decimal.mark)
  
  # if there are any negative numbers, we want to put a space before the positives to align all
  if (sum(R < 0) > 0) {
    Rformatted = ifelse(R > 0, paste0(' ', Rformatted), Rformatted)
  }
  
  # add significance levels if desired
  if (show_significance) {
    # define notions for significance levels; spacing is important.
    stars <- ifelse(is.na(p), "   ", ifelse(p < .001, "***", ifelse(p < .01, "** ", ifelse(p < .05, "*  ", "   "))))
    Rformatted = paste0(Rformatted, stars)
  }
  # build a new matrix that includes the formatted correlations and their significance stars
  Rnew <- matrix(Rformatted, ncol = ncol(x))
  rownames(Rnew) <- colnames(x)
  colnames(Rnew) <- paste(colnames(x), "", sep =" ")
  
  # replace undesired values
  if (use == 'upper') {
    Rnew[lower.tri(Rnew, diag = replace_diagonal)] <- replacement
  } else if (use == 'lower') {
    Rnew[upper.tri(Rnew, diag = replace_diagonal)] <- replacement
  } else if (replace_diagonal) {
    diag(Rnew) <- replacement
  }
  
  return(Rnew)
}

#' save_correlation_matrix
#' Creates and save to file a fully formatted correlation matrix, using `correlation_matrix` and `Hmisc::rcorr` in the backend
#' @param df dataframe; passed to `correlation_matrix`
#' @param filename either a character string naming a file or a connection open for writing. "" indicates output to the console; passed to `write.csv`
#' @param ... any other arguments passed to `correlation_matrix`
#'
#' @return NULL
#'
#' @examples
#' `save_correlation_matrix(df = iris, filename = 'iris-correlation-matrix.csv')`
#' `save_correlation_matrix(df = mtcars, filename = 'mtcars-correlation-matrix.csv', digits = 3, use = 'lower')`
save_correlation_matrix = function(df, filename, ...) {
  write.csv2(correlation_matrix(df, ...), file = filename)
}

In [5]:
###相关性系数
library("Hmisc")
cor <- cor(df_withoutsdi, method = c("pearson"))

# library("GGally")
# library("tidyverse")
# df_withoutsdi %>% ggpairs
correlation_matrix(df_withoutsdi, type = "spearman", show_significance = TRUE, digits =2, use="lower")

df_withoutsdi %>% correlation_matrix(type="spearman", show_significance=TRUE, digits =2, use="lower") %>% as_hux() %>% quick_docx(file=file.path(path.expand('~'), 'OneDrive/PhD Dissertation/Regression Results/correlation.docx'))

Unnamed: 0,lnFDIstock_w,financial,urban,property,tax,war_intensity
lnFDIstock_w,1.00,,,,,
financial,0.31***,1.00,,,,
urban,-0.07,-0.38***,1.00,,,
property,0.08,0.33***,-0.11*,1.00,,
tax,-0.38***,-0.03,-0.22***,0.18**,1.00,
war_intensity,0.25***,-0.13*,0.22***,-0.13*,-0.18***,1.0


In [6]:
###方差膨胀因子
library(car)
reg.lm <- lm(SDI~lnFDIstock_w + financial + urban + property + tax + war_intensity, data = df)

vif(reg.lm)
vif_values <- vif(reg.lm)
mean(vif_values)

VIF_table<-hux(变量名 = names(vif_values), VIF = as.numeric(vif_values), VIF2 = 1/as.numeric(vif_values), add_colnames = TRUE)
quick_docx(VIF_table, file = path.expand("~/OneDrive/PhD Dissertation/Regression Results/VIFtable.docx"))


Loading required package: carData



In [7]:
####横截面依赖性检验
CD_test_results <- tibble(
  var_name = character(),
  z = numeric(),
  p.value = numeric(),
)
for (name in names(df_sum)) {
  if (name == "Numeric" || name == "Year" || name == "Region" || name == "incomegroup" || name == "Alpha.3.code" || name == "CountryName_CN") {
    next
  }
  result <- tryCatch({
    pcdtest(df_sum[[name]])
  }, warning = function(warning_condition) {
  }, error = function(error_condition) {
      print(paste("Error in", name))
  })
  CD_test_results <- CD_test_results %>% add_row(var_name = name, z = result$statistic[["z"]], p.value = result$p.value)
}


CD_test_results %>% as_hux() %>%
    set_number_format(2) %>%
    set_width(0.4) %>%
    quick_docx(file = path.expand("~/OneDrive/PhD Dissertation/Regression Results/cdtest.docx"))

In [72]:
###单位根检验
library(tseries)
###If p-value < 0.05 then no unit roots present.

###Augmented Dickey-Fuller Test
# lapply(df_sum, function(x) adf.test(x[!is.na(x)], 
#            alternative='stationary', k=1))

###Phillips-Perron
# lapply(df_sum, function(x) pp.test(x[!is.na(x)], 
#            alternative='stationary'))

###CIPS
# cipstest(df_sum$SDI, type = "none")

###kpss
# lapply(df_sum, function(x) kpss.test(x[!is.na(x)], 
#            "Level"))

###purtest concludes many tests, levinlin is one of them
purtest(df_sum, test = "levinlin", exo = "intercept", pmax=5)
tests_results <- tibble(
  var_name = character(),
  LLC = character(),
  CIPS = character(),
  ADF_Fisher = character(),
  PP_Fisher = character(),
  KPSS = character()
)
p.value.stars <- function (p) {
  if (p < 0.001) {
    return ("***")
  } else if (p < 0.01) {
    return ("**")
  } else if (p < 0.05) {
    return ("*")
  } else {
    return ("")
  }
}
for (name in names(df_sum)) {
  if (name == "Numeric" || name == "Year" || name == "Region" || name == "incomegroup" || name == "Alpha.3.code" || name == "CountryName_CN" || name == "war_intensity") {
    next
  }
  result <- purtest(df_sum[[name]], test = "levinlin", exo = "intercept", pmax=5)
  z <- result$statistic$statistic[['z']]
  p.value <- result$statistic$p.value[[1]]
  LLC <- paste(z, p.value.stars(p.value))
  result <- cipstest(df_sum[[name]], type = "none")
  z <- result$statistic[[1]]
  p.value <- result$p.value
  CIPS <- paste(z, p.value.stars(p.value))
  result <- adf.test(df_sum[[name]], alternative='stationary', k=1)
  z <- result$statistic[[1]]
  p.value <- result$p.value
  ADF_Fisher <- paste(z, p.value.stars(p.value))
  result <- pp.test(df_sum[[name]], alternative='stationary')
  z <- result$statistic[[1]]
  p.value <- result$p.value
  PP_Fisher <- paste(z, p.value.stars(p.value))
  result <- kpss.test(df_sum[[name]], "Level")
  z <- result$statistic[[1]]
  p.value <- result$p.value
  KPSS <- paste(z, p.value.stars(p.value))
  tests_results <- tests_results %>% add_row(var_name = name, LLC = LLC, CIPS = CIPS, ADF_Fisher = ADF_Fisher, PP_Fisher = PP_Fisher, KPSS = KPSS)
}
tests_results


	Levin-Lin-Chu Unit-Root Test (ex. var.: Individual Intercepts)

data:  df_sum
z = -5.4574, p-value = 2.416e-08
alternative hypothesis: stationarity


“the time series is short”
“p-value greater than printed p-value”
“p-value smaller than printed p-value”
“p-value smaller than printed p-value”
“p-value smaller than printed p-value”
“the time series is short”
“p-value greater than printed p-value”
“p-value smaller than printed p-value”
“p-value smaller than printed p-value”
“p-value greater than printed p-value”
“the time series is short”
“p-value smaller than printed p-value”
“p-value smaller than printed p-value”
“p-value greater than printed p-value”
“the time series is short”
“p-value greater than printed p-value”
“p-value greater than printed p-value”
“the time series is short”
“p-value smaller than printed p-value”
“p-value smaller than printed p-value”
“p-value smaller than printed p-value”
“p-value greater than printed p-value”
“the time series is short”
“p-value greater than printed p-value”
“p-value smaller than printed p-value”
“p-value smaller than printed p-value”
“p-value smaller than printed p-value”


var_name,LLC,CIPS,ADF_Fisher,PP_Fisher,KPSS
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
SDI,-2.84652718346144 **,-0.955618815746401,-4.4970348204141 *,-46.5733432892945 *,2.12184210935408 *
lnFDIstock_w,-9.40372380886515 ***,-1.50457747086295,-4.66815175363104 *,-43.3370367210953 *,0.131759659992849
financial,-4.47902711767813 ***,-1.74841393565695 *,-4.10130565598318 *,-32.2609667838565 *,0.171151442838443
urban,3.88031535025202,-0.784579339655591,-3.52664752147851 *,-23.4917633961703 *,0.261484868019208
property,-8.00612952794883 ***,-1.91464681868227 *,-4.3269194206098 *,-41.6620651183981 *,0.134821213598956
tax,13.1350773521207,-1.42053192441821,-5.11444537997827 *,-52.8844809500145 *,0.812654644832893 *


In [9]:
###如果p值高于0.05，表明homoskedasticity，不存在异方差
library(lmtest)
bptest(SDI~lnFDIstock_w + financial + urban + property + tax + war_intensity, data = df, studentize=F)

Loading required package: zoo


Attaching package: ‘zoo’


The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric





	Breusch-Pagan test

data:  SDI ~ lnFDIstock_w + financial + urban + property + tax + war_intensity
BP = 32.181, df = 6, p-value = 1.506e-05


In [10]:
###协整检验

In [84]:
summary(df$lnFDIstock_w)

total sum of squares: 873.6633 
       id      time 
0.8045207 0.1373423 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  4.780   7.424   8.436   8.430   9.399  11.950 

In [102]:
lm <- plm(SDI ~ lnFDIstock_w + I(lnFDIstock_w^2) + financial + urban + property + tax + war_intensity, data=df, model="within", effect = "twoways")
a <- lm$coefficients[["lnFDIstock_w"]]
b <- lm$coefficients[["I(lnFDIstock_w^2)"]]
breakpoint <- -a/(2*b)
df$lnFDIstock_lower <- ifelse(df$lnFDIstock_w < breakpoint, df$lnFDIstock_w, 0)
df$lnFDIstock_upper <- ifelse(df$lnFDIstock_w >= breakpoint, df$lnFDIstock_w, 0)
df$lnFDIstock_is_upper <- ifelse(df$lnFDIstock_w >= breakpoint, 1, 0)
glm(SDI ~ lnFDIstock_lower + lnFDIstock_upper + lnFDIstock_is_upper, family="poisson", data=df) %>% summary()

“non-integer x = 0.470712”
“non-integer x = 0.472522”
“non-integer x = 0.479859”
“non-integer x = 0.496331”
“non-integer x = 0.510256”
“non-integer x = 0.518066”
“non-integer x = 0.522195”
“non-integer x = 0.531574”
“non-integer x = 0.531259”
“non-integer x = 0.542569”
“non-integer x = 0.538403”
“non-integer x = 0.536336”
“non-integer x = 0.542654”
“non-integer x = 0.537347”
“non-integer x = 0.534268”
“non-integer x = 0.526083”
“non-integer x = 0.481692”
“non-integer x = 0.480943”
“non-integer x = 0.484220”
“non-integer x = 0.479380”
“non-integer x = 0.480322”
“non-integer x = 0.477173”
“non-integer x = 0.484679”
“non-integer x = 0.481885”
“non-integer x = 0.496404”
“non-integer x = 0.508730”
“non-integer x = 0.502168”
“non-integer x = 0.517337”
“non-integer x = 0.518197”
“non-integer x = 0.511475”
“non-integer x = 0.525115”
“non-integer x = 0.528299”
“non-integer x = 0.442556”
“non-integer x = 0.449327”
“non-integer x = 0.460085”
“non-integer x = 0.458473”
“non-integer x = 0.466595”
“


Call:
glm(formula = SDI ~ lnFDIstock_lower + lnFDIstock_upper + lnFDIstock_is_upper, 
    family = "poisson", data = df)

Deviance Residuals: 
      Min         1Q     Median         3Q        Max  
-0.112491  -0.042601  -0.007873   0.040174   0.134132  

Coefficients: (2 not defined because of singularities)
                     Estimate Std. Error z value Pr(>|z|)
(Intercept)         -0.630487   0.405907  -1.553    0.120
lnFDIstock_lower    -0.004872   0.047364  -0.103    0.918
lnFDIstock_upper           NA         NA      NA       NA
lnFDIstock_is_upper        NA         NA      NA       NA

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1.0234  on 335  degrees of freedom
Residual deviance: 1.0129  on 334  degrees of freedom
AIC: Inf

Number of Fisher Scoring iterations: 4
