In [1]:
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from pandas.api.types import is_numeric_dtype
from sklearn.linear_model import LinearRegression

route = Path.home() / "OneDrive" / "PhD Dissertation" / "Data_Code" / "Data"
if not route.exists():
    route.mkdir(parents=True)
###Choose index_data
data1 = pd.read_csv(route / "index_data.csv")

###Choose final data
df_final = (
    pd.read_csv(Path.home() / "OneDrive" / "Rawdata" / "Data cleaning" / "df_final.csv")
    .astype({"Numeric": "int32", "Year": "int32"})
    .set_index(["Numeric", "Year"])
    .sort_index(level=["Numeric", "Year"])
)
df_final.drop(
    [col for col in df_final.columns if col.endswith("_y")], axis=1, inplace=True
)
df_final['taxpercent'] = np.log(df_final['Taxes less subsidies on products (current US$)']/df_final['Population, total'])

  pd.read_csv(Path.home() / "OneDrive" / "Rawdata" / "Data cleaning" / "df_final.csv")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [3]:
### Benchmark data
### Choose regression variables
VARIABLES = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='基准回归'")

data_raw = df_final[VARIABLES["Variables"]].merge(
    data1, on=["Numeric", "Year"], how="right"
)

###Check whether NA
data_raw.loc[:, data_raw.isnull().any()].isnull().sum().sort_values()

#data_raw.to_csv(route / "data_raw.csv", index=True)

FDIstock                            3
WGI                                37
Military expenditure (% of GDP)    49
taxpercent                         68
dtype: int64

In [4]:
data_filtered = data_raw.copy().set_index(["Numeric", "Year"])
years = list(range(2001, 2021))

In [5]:
###Interpolate benchmark data
from pandas.api.types import is_numeric_dtype
from sklearn.preprocessing import PolynomialFeatures

def interpolate_with_linear_regression(threshold=10, polynomial=False):

    def inner(df):
        df = df.copy()
        for col in df.columns:
            if df[col].isna().values.any():
                na_mask = df[col].isna()
                if (~na_mask).sum() < threshold:
                    print(
                        f"Country {df.index[0][0]} '{col}' has less than {threshold} values, skip interpolation."
                    )
                    continue
                if not is_numeric_dtype(df[col]):
                    print(
                        f"Country {df.index[0][0]} '{col}' is not numeric dtype, skip interpolation."
                    )
                    continue
                lin_reg = LinearRegression()
                not_na_years = df.loc[~na_mask].index.get_level_values(1).values.reshape((-1, 1))
                na_years = df.loc[na_mask].index.get_level_values(1).values.reshape((-1, 1))
                if polynomial:
                    poly = PolynomialFeatures(2, include_bias=False)
                    X = poly.fit_transform(not_na_years)
                    lin_reg.fit(X, df.loc[~na_mask, col])
                    pred = lin_reg.predict(poly.fit_transform(na_years))
                else:
                    X = not_na_years
                    lin_reg.fit(X, df.loc[~na_mask, col])
                    pred = lin_reg.predict(na_years)
                df.loc[na_mask, col] = pred
        return df
    return inner

def select_countries(df, key):
    def count_years(df):
        return df.index.get_level_values(1).unique().shape[0]
    selected_countries = df.groupby("Numeric")[key].count() == count_years(df)
    selected_countries = selected_countries[selected_countries].index
    df = df.query("Numeric in @selected_countries")
    return df


def lnFDI(df, key):
    df.loc[df[key] < 1, key] = np.nan
    df[f"ln{key}"] = np.log(df[key])
    return df



In [6]:
###计算所需指标
data_filtered = lnFDI(data_filtered, "FDIstock")

data_interpolated = (
    data_filtered
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression(8))
    .droplevel(0)
    .sort_index()
)

data_interpolated = select_countries(data_interpolated, "taxpercent")

##rename the column names
data_interpolated = data_interpolated.rename(
    columns={
        "Total natural resources rents (% of GDP)_x": "natural",
        "Urban population growth (annual %)_x": "urban",
        "taxpercent":"economic",
        "WGI":"WGI",
        "Military expenditure (% of GDP)":"military",
        "Fuel-exporting":"Fuelexporting"

    }
)

data_interpolated["Eastsouth"] = (
    (data_interpolated["Region"] == "E") | (data_interpolated["Region"] == "S")
).astype(int)



for year in range(2001, 2021):
    data_interpolated[f"incomegroup{year}"] = (
        (data_interpolated.loc[(slice(None), year), "incomegroup"] != "L")
        .astype(int)
        .droplevel(1)
        .loc[data_interpolated.index.get_level_values(0)]
        .values
    )

data_interpolated0 = data_interpolated.reset_index()
data_interpolated0["time_2008"] = (data_interpolated0["Year"] >= 2008).astype(int)
data_interpolated0["time_2009"] = (data_interpolated0["Year"] >= 2009).astype(int)

# natural_group = (data_interpolated0.groupby("Numeric")["Adjusted savings: natural resources depletion (% of GNI)"].mean() > data_interpolated0["Adjusted savings: natural resources depletion (% of GNI)"].mean()).astype(int)
# data_interpolated0["natural_group"] = data_interpolated0["Numeric"].map(natural_group)

data_interpolated0.to_csv(route / "index_data_regvariables.csv", index=True)
data_interpolated0.to_excel(route / "index_data_regvariables.xlsx", index=True)

Country 384 'taxpercent' has less than 8 values, skip interpolation.
Country 430 'taxpercent' has less than 8 values, skip interpolation.


In [7]:
data_interpolated0.describe()

Unnamed: 0,Numeric,Year,FDIstock,economic,WGI,natural,urban,military,IMFgroup_3,landlock,...,incomegroup2013,incomegroup2014,incomegroup2015,incomegroup2016,incomegroup2017,incomegroup2018,incomegroup2019,incomegroup2020,time_2008,time_2009
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,...,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,453.571429,2010.5,9795.013729,-inf,-0.565537,11.095096,3.683536,1.559499,0.971429,0.371429,...,0.428571,0.457143,0.428571,0.428571,0.428571,0.485714,0.542857,0.542857,0.65,0.6
std,254.387231,5.770404,24822.477841,,0.594884,10.062322,1.34714,1.004594,0.910461,0.483532,...,0.495226,0.498516,0.495226,0.495226,0.495226,0.500153,0.498516,0.498516,0.477311,0.490248
min,24.0,2001.0,-3731.739882,-inf,-1.726665,0.00236,-0.15095,0.0054,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,204.0,2005.75,658.04675,3.341067,-1.00833,4.46401,3.07573,0.889453,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,454.0,2010.5,2257.811941,4.11486,-0.587098,7.906814,3.889231,1.357919,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
75%,694.0,2015.25,6169.882939,4.859429,-0.238618,14.118181,4.510475,1.949634,2.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,894.0,2020.0,179564.8109,7.317111,0.875715,53.314762,7.604489,7.955662,2.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
data_interpolated0.loc[:, data_interpolated0.isnull().any()].isnull().sum().sort_values()

Series([], dtype: float64)

In [9]:
(8468)/28997

0.2920302100217264

In [10]:
data_interpolated.describe()

Unnamed: 0,FDIstock,economic,WGI,natural,urban,military,IMFgroup_3,landlock,SDI_Economy,SDI_Society,...,incomegroup2011,incomegroup2012,incomegroup2013,incomegroup2014,incomegroup2015,incomegroup2016,incomegroup2017,incomegroup2018,incomegroup2019,incomegroup2020
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,...,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,9795.013729,-inf,-0.565537,11.095096,3.683536,1.559499,0.971429,0.371429,0.389724,0.440463,...,0.428571,0.428571,0.428571,0.457143,0.428571,0.428571,0.428571,0.485714,0.542857,0.542857
std,24822.477841,,0.594884,10.062322,1.34714,1.004594,0.910461,0.483532,0.073294,0.085862,...,0.495226,0.495226,0.495226,0.498516,0.495226,0.495226,0.495226,0.500153,0.498516,0.498516
min,-3731.739882,-inf,-1.726665,0.00236,-0.15095,0.0054,0.0,0.0,0.150045,0.267529,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,658.04675,3.341067,-1.00833,4.46401,3.07573,0.889453,0.0,0.0,0.344427,0.384467,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2257.811941,4.11486,-0.587098,7.906814,3.889231,1.357919,1.0,0.0,0.369182,0.423367,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
75%,6169.882939,4.859429,-0.238618,14.118181,4.510475,1.949634,2.0,1.0,0.422502,0.482564,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,179564.8109,7.317111,0.875715,53.314762,7.604489,7.955662,2.0,1.0,0.63016,0.766186,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
print("、".join(data_interpolated.query("indebt==0")["CountryName_CN"].unique()))

UndefinedVariableError: name 'indebt' is not defined

In [None]:
print("、".join(data_interpolated.query("incomegroup2020==1")["CountryName_CN"].unique()))

安哥拉、博茨瓦纳、喀麦隆、佛得角、刚果（布）、贝宁、加蓬、加纳、肯尼亚、莱索托、毛里求斯、纳米比亚、尼日利亚、塞内加尔、南非、津巴布韦、斯威士兰、坦桑尼亚、赞比亚


In [None]:
data_interpolated.query("landlock==1 & Year==2001")[["CountryName_CN","incomegroup2020","Eastsouth"]]#.query("Eastsouth==1")

Unnamed: 0_level_0,Unnamed: 1_level_0,CountryName_CN,incomegroup2020,Eastsouth
Numeric,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
72,2001,博茨瓦纳,1,1
140,2001,中非,0,0
148,2001,乍得,0,0
231,2001,埃塞俄比亚,0,1
426,2001,莱索托,1,1
454,2001,马拉维,0,1
466,2001,马里,0,0
562,2001,尼日尔,0,0
646,2001,卢旺达,0,1
716,2001,津巴布韦,1,1


In [None]:
data_interpolated.query("Eastsouth==1").describe()

Unnamed: 0,FDIstock,economic,WGI,natural,urban,military,IMFgroup_3,indebt,landlock,Fuelexporting,...,incomegroup2011,incomegroup2012,incomegroup2013,incomegroup2014,incomegroup2015,incomegroup2016,incomegroup2017,incomegroup2018,incomegroup2019,incomegroup2020
count,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,...,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0
mean,12750.533308,4.455867,-0.355384,7.977364,3.430666,1.680952,0.8125,0.4375,0.5,0.0625,...,0.5,0.5,0.5,0.5625,0.5625,0.5625,0.5625,0.625,0.6875,0.6875
std,30337.916636,1.241426,0.59971,8.525506,1.688271,1.027702,0.951405,0.496855,0.500783,0.242441,...,0.500783,0.500783,0.500783,0.496855,0.496855,0.496855,0.496855,0.484881,0.464238,0.464238
min,56.913562,1.02693,-1.607977,0.00236,-0.15095,0.0054,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,972.705951,3.55775,-0.741155,3.203926,2.32013,1.029101,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3597.53109,4.264452,-0.455258,5.50072,3.892724,1.453399,0.0,0.0,0.5,0.0,...,0.5,0.5,0.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,8062.447027,5.406474,0.013477,8.912067,4.757959,2.131057,2.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,179564.8109,7.317111,0.875715,52.560773,7.604489,6.306711,2.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
#### instrumental variables data
VARIABLES_IV = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='iv'")

data_raw_iv = df_final[VARIABLES_IV["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
###补齐数据
df_iv = (
    data_raw_iv
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression(8))
    .droplevel(0)
    .sort_index()
)

df_iv["lniv1"] = np.log(
    df_iv["capitalport1"] * df_iv["Energy"] +1
)
df_iv["lniv1square"] = df_iv["lniv1"] * df_iv["lniv1"]

# df_iv['lniv2'] = np.log(df_iv['capitalport1'] * df_iv['Official exchange rate (LCU per US$, period average)'])
# df_iv['lniv2square'] = df_iv['lniv2']*df_iv['lniv2']
df_iv = select_countries(df_iv, "lniv1")
df_iv.to_csv(route / "ivdata.csv")

Country 132 'capitalport1' has less than 8 values, skip interpolation.
Country 180 'capitalport1' has less than 8 values, skip interpolation.


In [None]:
#### industry variables data
VARIABLES_industry = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='industry'")

data_raw_industry = df_final[VARIABLES_industry["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
###补齐数据
df_industry = (
    data_raw_industry
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression(10))
    .droplevel(0)
    .sort_index()
)
##rename
df_industry = df_industry.rename(
    columns={
        "Employment in industry (% of total employment) (modeled ILO estimate)_x": "indu_employ",
        "Employment in services (% of total employment) (modeled ILO estimate)_x": "serv_employ",
    }
)
df_industry = select_countries(df_industry, ["indu_employ","serv_employ"])
df_industry.to_csv(route / "industrydata.csv")

###environment product export data
VARIABLES_industry_2 = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='industry_2'")

data_raw_industry_2 = df_final[VARIABLES_industry_2["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
###补齐数据
data_raw_industry_2["lnenviron"] = np.log(
    data_raw_industry_2[
        "Environmental goods exports|US Dollars|ECBTGX|Trade in Environmental Goods; Exports|Environment, Climate Change, Cross-Border, Trade-Related, Trade in Environmental Goods, Exports|Exports"
    ]+1
)

df_industry_2 = (
    data_raw_industry_2
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression(10))
    .droplevel(0)
    .sort_index()
)

df_industry_2 = select_countries(df_industry_2, "lnenviron")
df_industry_2.to_csv(route / "industrydata_2.csv")

Country 24 'Environmental goods exports|US Dollars|ECBTGX|Trade in Environmental Goods; Exports|Environment, Climate Change, Cross-Border, Trade-Related, Trade in Environmental Goods, Exports|Exports' has less than 10 values, skip interpolation.
Country 24 'lnenviron' has less than 10 values, skip interpolation.
Country 148 'Environmental goods exports|US Dollars|ECBTGX|Trade in Environmental Goods; Exports|Environment, Climate Change, Cross-Border, Trade-Related, Trade in Environmental Goods, Exports|Exports' has less than 10 values, skip interpolation.
Country 148 'lnenviron' has less than 10 values, skip interpolation.
Country 180 'Environmental goods exports|US Dollars|ECBTGX|Trade in Environmental Goods; Exports|Environment, Climate Change, Cross-Border, Trade-Related, Trade in Environmental Goods, Exports|Exports' has less than 10 values, skip interpolation.
Country 180 'lnenviron' has less than 10 values, skip interpolation.
Country 266 'Environmental goods exports|US Dollars|EC

In [None]:
#### Technology variables data
VARIABLES_technology = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='technology'")

data_raw_technology = df_final[VARIABLES_technology["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
###补齐数据
df_technology = (
    data_raw_technology
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression(10))
    .droplevel(0)
    .sort_index()
)
##rename
df_technology = df_technology.rename(
    columns={
        "Overall index": "overallin",
        "  ICT": "ict",
        "  Research and Development": "research",
    }
)
df_technology = select_countries(df_technology, "overallin")
df_technology.to_csv(route / "technologydata.csv")

Country 24 'Overall index' has less than 10 values, skip interpolation.
Country 24 '  ICT' has less than 10 values, skip interpolation.
Country 24 '  Research and Development' has less than 10 values, skip interpolation.
Country 140 'Overall index' has less than 10 values, skip interpolation.
Country 140 '  ICT' has less than 10 values, skip interpolation.
Country 140 '  Research and Development' has less than 10 values, skip interpolation.
Country 148 'Overall index' has less than 10 values, skip interpolation.
Country 148 '  ICT' has less than 10 values, skip interpolation.
Country 148 '  Research and Development' has less than 10 values, skip interpolation.
Country 426 'Overall index' has less than 10 values, skip interpolation.
Country 426 '  ICT' has less than 10 values, skip interpolation.
Country 426 '  Research and Development' has less than 10 values, skip interpolation.
Country 562 'Overall index' has less than 10 values, skip interpolation.
Country 562 '  ICT' has less than 

In [None]:
#### institution variables data
VARIABLES_institution = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='institution'")

data_raw_institution = df_final[VARIABLES_institution["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
###补齐数据
df_institution = (
    data_raw_institution
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression(10))
    .droplevel(0)
    .sort_index()
)
##rename
df_institution = df_institution.rename(columns={"Property Rights": "property"})

df_institution = select_countries(df_institution, "property")
df_institution.to_csv(route / "institutiondata.csv")

#####环境税数据
VARIABLES_institution_2 = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='institution_2'")

data_raw_institution_2 = df_final[VARIABLES_institution_2["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
###补齐数据
df_institution_2 = (
    data_raw_institution_2
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression(10))
    .droplevel(0)
    .sort_index()
)
##rename
df_institution_2 = df_institution_2.rename(
    columns={
        "Taxes on Pollution|ECGTEP|Taxes on Pollution|Environment, Climate Change, Government Policy, Taxes, Environmental Taxes, Taxes on Pollution|Percent of GDP": "enviro_regu",
    }
)
df_institution_2 = select_countries(df_institution_2, "enviro_regu")
df_institution_2.to_csv(route / "df_institution_2.csv")

Country 24 'Taxes on Pollution|ECGTEP|Taxes on Pollution|Environment, Climate Change, Government Policy, Taxes, Environmental Taxes, Taxes on Pollution|Percent of GDP' has less than 10 values, skip interpolation.
Country 72 'Taxes on Pollution|ECGTEP|Taxes on Pollution|Environment, Climate Change, Government Policy, Taxes, Environmental Taxes, Taxes on Pollution|Percent of GDP' has less than 10 values, skip interpolation.
Country 140 'Taxes on Pollution|ECGTEP|Taxes on Pollution|Environment, Climate Change, Government Policy, Taxes, Environmental Taxes, Taxes on Pollution|Percent of GDP' has less than 10 values, skip interpolation.
Country 204 'Taxes on Pollution|ECGTEP|Taxes on Pollution|Environment, Climate Change, Government Policy, Taxes, Environmental Taxes, Taxes on Pollution|Percent of GDP' has less than 10 values, skip interpolation.
Country 231 'Taxes on Pollution|ECGTEP|Taxes on Pollution|Environment, Climate Change, Government Policy, Taxes, Environmental Taxes, Taxes on Pol

In [None]:
#### finance variables data
VARIABLES_finance = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='finance'")

data_raw_finance = df_final[VARIABLES_finance["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
###补齐数据
df_finance = (
    data_raw_finance
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression(10))
    .droplevel(0)
    .sort_index()
)
##rename
df_finance = df_finance.rename(
    columns={"Commercial bank branches (per 100,000 adults)_x": "finance"}
)
df_finance = select_countries(df_finance, "finance")
df_finance.to_csv(route / "financedata.csv")

VARIABLES_finance_2 = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='finance_2'")

data_raw_finance_2 = df_final[VARIABLES_finance_2["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
###补齐数据
df_finance_2 = (
    data_raw_finance_2
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression(10))
    .droplevel(0)
    .sort_index()
)
df_finance_2 = select_countries(df_finance_2, "di02")
df_finance_2.to_csv(route / "financedata_2.csv")

Country 231 'Commercial bank branches (per 100,000 adults)_x' has less than 10 values, skip interpolation.
Country 266 'Commercial bank branches (per 100,000 adults)_x' has less than 10 values, skip interpolation.
Country 694 'Commercial bank branches (per 100,000 adults)_x' has less than 10 values, skip interpolation.
Country 231 'di02' has less than 10 values, skip interpolation.


In [None]:
#### 绿地投资
###fdi variables data
VARIABLES = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='greenfdistock'")

rawdata_greenfdi = df_final[VARIABLES ["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
rawdata_greenfdi = lnFDI(rawdata_greenfdi, "greenfdistock")

###补齐数据
df_greenfdi = (
    rawdata_greenfdi
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression(10))
    .droplevel(0)
    .sort_index()
)

df_greenfdi = select_countries(df_greenfdi, "lngreenfdistock")

df_greenfdi.to_csv(route / "fdidata.csv")

####跨国并购
VARIABLES = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='mafdistock'")

rawdata_mafdi = df_final[VARIABLES ["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
rawdata_mafdi = lnFDI(rawdata_mafdi, "mafdistock")

###补齐数据
df_mafdi = (
    rawdata_mafdi
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression(10))
    .droplevel(0)
    .sort_index()
)


df_mafdi = select_countries(df_mafdi, "lnmafdistock")

df_mafdi.to_csv(route / "fdidata_ma.csv")

Country 180 'mafdistock' has less than 10 values, skip interpolation.
Country 180 'lnmafdistock' has less than 10 values, skip interpolation.
Country 204 'mafdistock' has less than 10 values, skip interpolation.
Country 204 'lnmafdistock' has less than 10 values, skip interpolation.
Country 270 'mafdistock' has less than 10 values, skip interpolation.
Country 270 'lnmafdistock' has less than 10 values, skip interpolation.
Country 426 'mafdistock' has less than 10 values, skip interpolation.
Country 426 'lnmafdistock' has less than 10 values, skip interpolation.
Country 562 'mafdistock' has less than 10 values, skip interpolation.
Country 562 'lnmafdistock' has less than 10 values, skip interpolation.
Country 624 'mafdistock' has less than 10 values, skip interpolation.
Country 624 'lnmafdistock' has less than 10 values, skip interpolation.
Country 748 'mafdistock' has less than 10 values, skip interpolation.
Country 748 'lnmafdistock' has less than 10 values, skip interpolation.
Countr

In [None]:
VARIABLES = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='did'")

df_fdi = df_final[VARIABLES ["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
#rename
df_fdi = df_fdi.rename(
    columns={
        "Number of announced greenfield FDI projects, by destination": "numgreenfdi",
        "Number of net cross-border M&As by region/economy of seller": "nummergerfdi",
    }
).reset_index()

df_fdi["posti"] = (df_fdi["strategy"] <= df_fdi["Year"]).astype(int)
df_fdi["posti_2015"] = (df_fdi["Year"] >= 2015).astype(int)

df_fdi["treatment_greenfdi"] = (df_fdi["numgreenfdi"] > 0).astype(int)
df_fdi["treatment_mafdi"] = (df_fdi["nummergerfdi"] > 0).astype(int)
df_fdi.to_csv(route / "didfdi.csv")

In [None]:
print("、".join(df_mafdi["CountryName_CN"].unique()))

安哥拉、博茨瓦纳、喀麦隆、佛得角、中非、乍得、刚果（布）、埃塞俄比亚、加蓬、加纳、几内亚、肯尼亚、马达加斯加、马拉维、马里、毛里求斯、纳米比亚、尼日利亚、卢旺达、塞内加尔、塞拉利昂、南非、津巴布韦、乌干达、坦桑尼亚、布基纳法索、赞比亚


In [None]:
print("、".join(set(df_greenfdi["CountryName_CN"].unique())-set(df_mafdi["CountryName_CN"].unique())))

斯威士兰、尼日尔、莱索托、刚果（金）、贝宁、冈比亚、多哥、几内亚（比绍）


In [None]:
###fdi by different countries
##中国
VARIABLES_FDI_CN = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='FDI_CN'")

data_raw_FDI_CN = df_final[VARIABLES_FDI_CN["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)

data_raw_FDI_CN = lnFDI(data_raw_FDI_CN, "stock")
data_raw_FDI_CN['lnstock']/= 100



df_FDI_CN = (
    data_raw_FDI_CN
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression(10))
    .droplevel(0)
    .sort_index()
    .query('Year>2002')
)
df_FDI_CN = select_countries(df_FDI_CN, "lnstock")
df_FDI_CN.to_csv(route / "FDI_CN_data.csv")
df_FDI_CN.loc[:, df_FDI_CN.isnull().any()].isnull().sum().sort_values()
# df_FDI_CN[['lnstock']].describe()
df_FDI_CN.isnull().any().sum()

Country 748 'stock' has less than 10 values, skip interpolation.
Country 748 'lnstock' has less than 10 values, skip interpolation.
Country 854 'stock' has less than 10 values, skip interpolation.
Country 854 'lnstock' has less than 10 values, skip interpolation.


0

In [None]:
print("、".join(df_FDI_CN["CountryName_CN"].unique()))

安哥拉、博茨瓦纳、喀麦隆、佛得角、中非、乍得、刚果（布）、刚果（金）、贝宁、埃塞俄比亚、加蓬、冈比亚、加纳、几内亚、肯尼亚、莱索托、马达加斯加、马拉维、马里、毛里求斯、纳米比亚、尼日尔、尼日利亚、几内亚（比绍）、卢旺达、塞内加尔、塞拉利昂、南非、津巴布韦、多哥、乌干达、坦桑尼亚、赞比亚


In [None]:
###美国
VARIABLES_FDI_USBEA = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='FDI_USBEA'")



data_raw_FDI_USBEA = df_final[VARIABLES_FDI_USBEA["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
).query("Year>2000")


data_raw_FDI_USBEA = lnFDI(data_raw_FDI_USBEA, "FDI_USBEA")

df_FDI_USBEA = (
    data_raw_FDI_USBEA
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression(10))
    .droplevel(0)
    .sort_index()
)


df_FDI_USBEA = select_countries(df_FDI_USBEA, "lnFDI_USBEA")

df_FDI_USBEA.to_csv(route / "FDI_USBEA_data.csv")
df_FDI_USBEA

Country 132 'FDI_USBEA' has less than 10 values, skip interpolation.
Country 132 'lnFDI_USBEA' has less than 10 values, skip interpolation.
Country 140 'FDI_USBEA' has less than 10 values, skip interpolation.
Country 140 'lnFDI_USBEA' has less than 10 values, skip interpolation.
Country 148 'FDI_USBEA' has less than 10 values, skip interpolation.
Country 148 'lnFDI_USBEA' has less than 10 values, skip interpolation.
Country 270 'FDI_USBEA' has less than 10 values, skip interpolation.
Country 270 'lnFDI_USBEA' has less than 10 values, skip interpolation.
Country 324 'FDI_USBEA' has less than 10 values, skip interpolation.
Country 324 'lnFDI_USBEA' has less than 10 values, skip interpolation.
Country 450 'FDI_USBEA' has less than 10 values, skip interpolation.
Country 450 'lnFDI_USBEA' has less than 10 values, skip interpolation.
Country 466 'FDI_USBEA' has less than 10 values, skip interpolation.
Country 466 'lnFDI_USBEA' has less than 10 values, skip interpolation.
Country 562 'FDI_USB

Unnamed: 0_level_0,Unnamed: 1_level_0,FDI_USBEA,FDIstock,economic,WGI,natural,urban,military,incomegroup,IMFgroup_3,indebt,...,incomegroup2012,incomegroup2013,incomegroup2014,incomegroup2015,incomegroup2016,incomegroup2017,incomegroup2018,incomegroup2019,incomegroup2020,lnFDI_USBEA
Numeric,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
24,2001,1220.000000,10122.63300,2.101953,-1.293468,44.219477,5.627442,4.524354,L,1.0,0.0,...,1,1,1,1,1,1,1,1,1,7.106606
24,2002,1110.000000,11866.13700,1.952752,-1.338348,32.883716,5.623762,2.869932,L,1.0,0.0,...,1,1,1,1,1,1,1,1,1,7.012115
24,2003,1067.000000,15443.10880,1.026930,-1.235333,28.406697,5.645138,3.761479,L,1.0,0.0,...,1,1,1,1,1,1,1,1,1,6.972606
24,2004,1206.000000,17640.33660,2.424751,-1.301410,33.770717,5.685845,3.471180,LM,1.0,0.0,...,1,1,1,1,1,1,1,1,1,7.095064
24,2005,1197.000000,16336.49970,2.532351,-1.209038,42.760074,5.674547,3.692241,LM,1.0,0.0,...,1,1,1,1,1,1,1,1,1,7.087574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,2016,61.000000,18855.10083,4.075756,-0.348764,11.677213,4.406529,1.429305,LM,2.0,1.0,...,1,1,1,1,1,1,1,1,1,4.110874
894,2017,60.000000,19866.47730,4.588901,-0.371218,13.226440,4.373369,1.309596,LM,2.0,1.0,...,1,1,1,1,1,1,1,1,1,4.094345
894,2018,49.000000,20435.42730,4.761907,-0.398209,11.748346,4.322060,1.409462,LM,2.0,1.0,...,1,1,1,1,1,1,1,1,1,3.891820
894,2019,107.594771,19134.26163,4.567412,-0.476908,7.915065,4.265728,1.219375,LM,2.0,1.0,...,1,1,1,1,1,1,1,1,1,4.662371


In [None]:
print("、".join(df_FDI_NL["CountryName_CN"].unique()))

NameError: name 'df_FDI_NL' is not defined

In [None]:
###法国
VARIABLES_FDI_FR = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='FDI_FR'")


data_raw_FDI_FR = df_final[VARIABLES_FDI_FR["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
).query("Year >= 2008")

df_FDI_FR = (
    data_raw_FDI_FR
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression(6))
    .droplevel(0)
    .sort_index()
)
df_FDI_FR['FDI_FR'] /= 1_000_000

df_FDI_FR = lnFDI(df_FDI_FR, "FDI_FR")

df_FDI_FR = select_countries(df_FDI_FR, "lnFDI_FR")

df_FDI_FR.to_csv(route / "FDI_FR_data.csv")
df_FDI_FR

In [None]:
print("、".join(df_FDI_FR["CountryName_CN"].unique()))

In [None]:
###荷兰
VARIABLES_FDI_NL = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='FDI_NL'")

data_raw_FDI_NL = df_final[VARIABLES_FDI_NL["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
).query("Year>=2008")

df_FDI_NL = (
    data_raw_FDI_NL
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression(6))
    .droplevel(0)
    .sort_index()
)
df_FDI_NL['FDI_NL'] /= 1_000_000

df_FDI_NL = lnFDI(df_FDI_NL, "FDI_NL")

df_FDI_NL = select_countries(df_FDI_NL, "lnFDI_NL")

df_FDI_NL.to_csv(route / "FDI_NL_data.csv")
df_FDI_NL

In [None]:
580/20

In [None]:
data_interpolated["CountryName_CN"] = df_final["CountryName_CN"]
data_interpolated["CountryName_CN"].unique()

import matplotlib.pyplot as plt
for col in pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="Sheet2",
    na_values="..",
).query("一级指标 == '经济'")["Variables"]:
    if col not in data_interpolated:
        print(col)
        continue
    fig, ax = plt.subplots()
    sns.scatterplot(data=data_interpolated, x="lnFDIstock", y=col, hue="Numeric", ax=ax)

In [None]:
data_corrplot = data_filled.merge(
    df_final[[*VARIABLES["Variables"], "FDIstock"]], on=["Numeric", "Year"]
)
Variables_corrplot = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="Sheet2",
    na_values="..",
)

In [None]:
economy_variables = Variables_corrplot.query("一级指标 == '经济'")["Variables"]
society_variables = Variables_corrplot.query("一级指标 == '社会'")["Variables"]
resource_variables = Variables_corrplot.query("一级指标 == '资源'")["Variables"]
ecology_variables = Variables_corrplot.query("一级指标 == '生态'")["Variables"]

In [None]:
# df_final = (
#     pd.read_excel(Path.home() / "Desktop" / "effects variables counting.xlsx")
# ).set_index(['Category','变量名'])
# df_final['count'] = df_final.count(axis='columns').sort_values()
# df_final.to_excel(Path.home() / "Desktop" / "counting.xlsx")

sns.heatmap(
    data_corrplot.drop(
        [
            "Numeric",
            "Year",
            "CountryName_CN",
            "Alpha-3 code",
            "incomegroup",
            "IMFgroup",
        ],
        axis=1,
    )[["lnFDIstock", *economy_variables]].corr(),
    cmap=sns.color_palette("icefire", as_cmap=True),
)

In [None]:
###生成heckman两阶段数据
variables_heckman = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='基准回归'")

country_heckman = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="Countries",
    na_values="..",
)

data_heckman = (
    df_final[variables_heckman["Variables"]]
    .reset_index()
    .merge(country_heckman[["Numeric", "Alpha-3 code", "CountryName_CN"]], on="Numeric")
    .query("Numeric!=175 & Numeric!=728")
)
# data_heckman["war_intensity"] = data_heckman["war_intensity"].fillna(0)
data_heckman.loc[:, data_heckman.isnull().any()].isnull().sum().sort_values()

In [None]:
data_heckman_drop = data_heckman.copy().set_index(["Numeric", "Year"])
# data_filtered.to_excel(route / "raw_data.xlsx", index=False)


dropped_countries = defaultdict(list)
for numeric, new_df in data_heckman_drop.groupby(level=0):
    for col in new_df:
        continuous_nan = 0
        num_nan = 0
        current_continuous_nan = 0
        for year in years:
            if pd.isna(new_df.loc[(numeric, year), col]):
                num_nan += 1
                current_continuous_nan += 1
                if current_continuous_nan > continuous_nan:
                    continuous_nan = current_continuous_nan
            else:
                current_continuous_nan = 0
        if num_nan > 6 or continuous_nan > 5:
            dropped_countries[col].append(numeric)
mask = np.unique(np.array([v for value in dropped_countries.values() for v in value]))
print(52 - len(mask))
data_heckman_drop = data_heckman_drop.query("Numeric not in @mask").reset_index()

In [None]:
data_heckman_merge = data_heckman_drop.merge(
    data1[
        [
            "Numeric",
            "Year",
            "SDI",
            "SDI_Economy",
            "SDI_Society",
            "SDI_Resource",
            "SDI_Ecology",
        ]
    ],
    on=["Numeric", "Year"],
    how="left",
).set_index(["SDI", "SDI_Economy", "SDI_Society", "SDI_Resource", "SDI_Ecology"])
data_heckman_merge.loc[
    :, data_heckman_merge.isnull().any()
].isnull().sum().sort_values()

In [None]:
data_heckman_interpo = (
    data_heckman_merge.groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression)
    .drop(["Numeric"], axis=1)
    .reset_index()
)
data_heckman_interpo.loc[
    :, data_heckman_interpo.isnull().any()
].isnull().sum().sort_values()

data_heckman_interpo["lnFDI instock (Millions of dollars)"] = np.log(data_heckman_interpo["FDI instock (Millions of dollars)"] + 1)

data_heckman_interpo["lngnic"] = np.log(data_heckman_interpo["GNI per capita (US$)"])

data_heckman_interpo = data_heckman_interpo.rename(
    columns={
        "Urban population growth (annual %)_x": "urban",
        "Total natural resources rents (% of GDP)_x": "natural",
        "Commercial bank branches (per 100,000 adults)_x": "financial",
        "Statistical Capacity Score (Overall Average) (scale 0 - 100)_x": "stats",
        "Military expenditure (% of GDP)": "military",
    }
)

In [None]:
# Generate SSA and Region dummies
data_heckman_interpo["SDI_dum"] = data_heckman_interpo["SDI"].apply(
    lambda r: 1 if r > 0 else 0
)

In [None]:
data_heckman_interpo  # .query("SDI_dum==0")

In [None]:
data_heckman_interpo.to_csv(route / "heckmandata.csv", index=True)
data_heckman_interpo.reset_index().to_excel(route / "heckmandata.xlsx")

In [None]:
data_heckman_interpo["CountryName_CN"].unique()

In [None]:
data_heckman_interpo.describe()