In [1]:
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from pandas.api.types import is_numeric_dtype
from sklearn.linear_model import LinearRegression

route = Path.home() / "OneDrive" / "PhD Dissertation" / "Data_Code" / "Data"
if not route.exists():
    route.mkdir(parents=True)
###Choose index_data
data1 = pd.read_csv(route / "index_data.csv")

###Choose final data
df_final = (
    pd.read_csv(Path.home() / "OneDrive" / "Rawdata" / "Data cleaning" / "df_final.csv")
    .astype({"Numeric": "int32", "Year": "int32"})
    .set_index(["Numeric", "Year"])
    .sort_index(level=["Numeric", "Year"])
)
df_final.drop(
    [col for col in df_final.columns if col.endswith("_y")], axis=1, inplace=True
)
df_final['taxpercent'] = df_final['Taxes less subsidies on products (current US$)']/df_final['Population, total']

  pd.read_csv(Path.home() / "OneDrive" / "Rawdata" / "Data cleaning" / "df_final.csv")


In [2]:
### Benchmark data
### Choose regression variables
VARIABLES = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='基准回归'")

data_raw = df_final[VARIABLES["Variables"]].merge(
    data1, on=["Numeric", "Year"], how="right"
)

data_raw.loc[data_raw['FDIstock']<1, "FDIstock"] = np.nan

###Check whether NA
data_raw.loc[:, data_raw.isnull().any()].isnull().sum().sort_values()
data_raw["lnFDIstock"] = np.log(data_raw["FDIstock"])
data_raw.to_csv(route / "data_raw.csv", index=True)

In [3]:
data_filtered = data_raw.copy().set_index(["Numeric", "Year"])
years = list(range(2001, 2021))

###drop countries
missing_by_country = data_filtered.groupby("Numeric").apply(lambda x: x.isnull().sum())
selected_countries = missing_by_country.loc[(missing_by_country < 12).all(axis=1)].index

data_filtered = data_filtered.query("Numeric in @selected_countries")
sorted_by_country = missing_by_country.sum(axis=1).sort_values().index
sorted_by_variable = missing_by_country.sum(axis=0).sort_values().index
missing_by_country.loc[sorted_by_country, sorted_by_variable].merge(data_raw[['Numeric', 'CountryName_CN']], left_index=True, right_on='Numeric').to_excel(route / 'Missing.xlsx')

In [4]:
###Interpolate benchmark data
from pandas.api.types import is_numeric_dtype
from sklearn.preprocessing import PolynomialFeatures

def interpolate_with_linear_regression(df, polynomial=False):
    df = df.copy()
    for col in df.columns:
        if df[col].isna().values.any():
            na_mask = df[col].isna()
            if (~na_mask).sum() < 8:
                print(na_mask.sum(), (~na_mask).sum())
                print(
                    f"Country {df.index[0][0]} '{col}' has less than 12 values, skip interpolation."
                )
                continue
            if not is_numeric_dtype(df[col]):
                print(
                    f"Country {df.index[0][0]} '{col}' is not numeric dtype, skip interpolation."
                )
                continue
            lin_reg = LinearRegression()
            not_na_years = df.loc[~na_mask].index.get_level_values(1).values.reshape((-1, 1))
            na_years = df.loc[na_mask].index.get_level_values(1).values.reshape((-1, 1))
            if polynomial:
                poly = PolynomialFeatures(2, include_bias=False)
                X = poly.fit_transform(not_na_years)
                lin_reg.fit(X, df.loc[~na_mask, col])
                pred = lin_reg.predict(poly.fit_transform(na_years))
            else:
                X = not_na_years
                lin_reg.fit(X, df.loc[~na_mask, col])
                pred = lin_reg.predict(na_years)
            df.loc[na_mask, col] = pred
    return df


data_interpolated = (
    data_filtered
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression)
    .droplevel(0)
    .sort_index()
)


##rename the column names
data_interpolated = data_interpolated.rename(
    columns={
        "Total natural resources rents (% of GDP)_x": "natural",
        "Urban population growth (annual %)_x": "urban",
        "taxpercent":"economic",
        "WGI":"WGI",
        "Military expenditure (% of GDP)":"military",    

    }
)

In [5]:
###计算所需指标

# data_interpolated["lngnic"] = np.log(data_interpolated["GNI per capita (constant 2015 US$)_x"])

data_interpolated["Eastsouth"] = (
    (data_interpolated["Region"] == "E") | (data_interpolated["Region"] == "S")
).astype(int)



for year in range(2001, 2021):
    data_interpolated[f"incomegroup{year}"] = (
        (data_interpolated.loc[(slice(None), year), "incomegroup"] != "L")
        .astype(int)
        .droplevel(1)
        .loc[data_interpolated.index.get_level_values(0)]
        .values
    )

data_interpolated0 = data_interpolated.reset_index()
data_interpolated0["time_2008"] = (data_interpolated0["Year"] >= 2008).astype(int)
data_interpolated0["time_2009"] = (data_interpolated0["Year"] >= 2009).astype(int)

data_interpolated0.to_csv(route / "index_data_regvariables.csv", index=True)
data_interpolated0.to_excel(route / "index_data_regvariables.xlsx", index=True)

In [6]:
print("，".join(data_interpolated.query("IMFgroup==0")["CountryName_CN"].unique()))

佛得角，贝宁，埃塞俄比亚，冈比亚，肯尼亚，莱索托，马达加斯加，马拉维，毛里求斯，几内亚（比绍），卢旺达，塞内加尔，斯威士兰，多哥，乌干达


In [7]:
len(data_interpolated["CountryName_CN"].unique())

35

In [8]:
#### instrumental variables data
VARIABLES_IV = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='iv'")

data_raw_iv = df_final[VARIABLES_IV["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
###补齐数据
df_iv = (
    data_raw_iv
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression)
    .droplevel(0)
    .sort_index()
)

df_iv["lniv1"] = np.log(
    df_iv["seadistance"] * df_iv["Official exchange rate (LCU per US$, period average)"]
)
df_iv["lniv1square"] = df_iv["lniv1"] * df_iv["lniv1"]

# df_iv['lniv2'] = np.log(df_iv['capitalport1'] * df_iv['Official exchange rate (LCU per US$, period average)'])
# df_iv['lniv2square'] = df_iv['lniv2']*df_iv['lniv2']
df_iv.to_csv(route / "ivdata.csv")

In [9]:
#### industry variables data
VARIABLES_industry = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='industry'")

data_raw_industry = df_final[VARIABLES_industry["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
###补齐数据
df_industry = (
    data_raw_industry
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression)
    .droplevel(0)
    .sort_index()
)
##rename
df_industry = df_industry.rename(
    columns={
        "Employment in industry (% of total employment) (modeled ILO estimate)_x": "indu_employ",
        "Employment in services (% of total employment) (modeled ILO estimate)_x": "serv_employ",
    }
)
df_industry.to_csv(route / "industrydata.csv")

###environment product export data
VARIABLES_industry_2 = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='industry_2'")

data_raw_industry_2 = df_final[VARIABLES_industry_2["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
###补齐数据
df_industry_2 = (
    data_raw_industry_2
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression)
    .droplevel(0)
    .sort_index()
)
df_industry_2["lnenviron"] = np.log(
    df_industry_2[
        "Environmental goods exports|US Dollars|ECBTGX|Trade in Environmental Goods; Exports|Environment, Climate Change, Cross-Border, Trade-Related, Trade in Environmental Goods, Exports|Exports"
    ]
)
df_industry_2.to_csv(route / "industrydata_2.csv")

14 6
Country 24 'Environmental goods exports|US Dollars|ECBTGX|Trade in Environmental Goods; Exports|Environment, Climate Change, Cross-Border, Trade-Related, Trade in Environmental Goods, Exports|Exports' has less than 12 values, skip interpolation.
20 0
Country 148 'Environmental goods exports|US Dollars|ECBTGX|Trade in Environmental Goods; Exports|Environment, Climate Change, Cross-Border, Trade-Related, Trade in Environmental Goods, Exports|Exports' has less than 12 values, skip interpolation.
14 6
Country 180 'Environmental goods exports|US Dollars|ECBTGX|Trade in Environmental Goods; Exports|Environment, Climate Change, Cross-Border, Trade-Related, Trade in Environmental Goods, Exports|Exports' has less than 12 values, skip interpolation.
17 3
Country 624 'Environmental goods exports|US Dollars|ECBTGX|Trade in Environmental Goods; Exports|Environment, Climate Change, Cross-Border, Trade-Related, Trade in Environmental Goods, Exports|Exports' has less than 12 values, skip interpol

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [10]:
#### Technology variables data
VARIABLES_technology = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='technology'")

data_raw_technology = df_final[VARIABLES_technology["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
###补齐数据
df_technology = (
    data_raw_technology
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression)
    .droplevel(0)
    .sort_index()
)
##rename
df_technology = df_technology.rename(
    columns={
        "Overall index": "overallin",
        "  ICT": "ict",
        "  Research and Development": "research",
    }
)
df_technology.to_csv(route / "technologydata.csv")

20 0
Country 24 'Overall index' has less than 12 values, skip interpolation.
20 0
Country 24 '  ICT' has less than 12 values, skip interpolation.
20 0
Country 24 '  Research and Development' has less than 12 values, skip interpolation.
20 0
Country 140 'Overall index' has less than 12 values, skip interpolation.
20 0
Country 140 '  ICT' has less than 12 values, skip interpolation.
20 0
Country 140 '  Research and Development' has less than 12 values, skip interpolation.
20 0
Country 148 'Overall index' has less than 12 values, skip interpolation.
20 0
Country 148 '  ICT' has less than 12 values, skip interpolation.
20 0
Country 148 '  Research and Development' has less than 12 values, skip interpolation.
20 0
Country 426 'Overall index' has less than 12 values, skip interpolation.
20 0
Country 426 '  ICT' has less than 12 values, skip interpolation.
20 0
Country 426 '  Research and Development' has less than 12 values, skip interpolation.
20 0
Country 562 'Overall index' has less than 

In [11]:
#### institution variables data
VARIABLES_institution = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='institution'")

data_raw_institution = df_final[VARIABLES_institution["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
###补齐数据
df_institution = (
    data_raw_institution
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression)
    .droplevel(0)
    .sort_index()
)
##rename
df_institution = df_institution.rename(columns={"Property Rights": "property"})
df_institution.to_csv(route / "institutiondata.csv")

VARIABLES_institution_2 = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='institution_2'")

data_raw_institution_2 = df_final[VARIABLES_institution_2["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
###补齐数据
df_institution_2 = (
    data_raw_institution_2
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression)
    .droplevel(0)
    .sort_index()
)
##rename
df_institution_2 = df_institution_2.rename(
    columns={
        "Environmental Taxes|ECGTE|Environmental Taxes|Environment, Climate Change, Government Policy, Taxes, Environmental Taxes|Percent of GDP": "environmenttax",
    }
)
df_institution_2.to_csv(route / "institutiondata_2.csv")

20 0
Country 24 'Environmental Taxes|ECGTE|Environmental Taxes|Environment, Climate Change, Government Policy, Taxes, Environmental Taxes|Percent of GDP' has less than 12 values, skip interpolation.
17 3
Country 72 'Environmental Taxes|ECGTE|Environmental Taxes|Environment, Climate Change, Government Policy, Taxes, Environmental Taxes|Percent of GDP' has less than 12 values, skip interpolation.
20 0
Country 140 'Environmental Taxes|ECGTE|Environmental Taxes|Environment, Climate Change, Government Policy, Taxes, Environmental Taxes|Percent of GDP' has less than 12 values, skip interpolation.
20 0
Country 204 'Environmental Taxes|ECGTE|Environmental Taxes|Environment, Climate Change, Government Policy, Taxes, Environmental Taxes|Percent of GDP' has less than 12 values, skip interpolation.
20 0
Country 231 'Environmental Taxes|ECGTE|Environmental Taxes|Environment, Climate Change, Government Policy, Taxes, Environmental Taxes|Percent of GDP' has less than 12 values, skip interpolation.
20

In [12]:
#### finance variables data
VARIABLES_finance = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='finance'")

data_raw_finance = df_final[VARIABLES_finance["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
###补齐数据
df_finance = (
    data_raw_finance
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression)
    .droplevel(0)
    .sort_index()
)
##rename
df_finance = df_finance.rename(
    columns={"Commercial bank branches (per 100,000 adults)_x": "finance"}
)
df_finance.to_csv(route / "financedata.csv")

VARIABLES_finance_2 = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='finance_2'")

data_raw_finance_2 = df_final[VARIABLES_finance_2["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)
###补齐数据
df_finance_2 = (
    data_raw_finance_2
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression)
    .droplevel(0)
    .sort_index()
)
df_finance_2.to_csv(route / "financedata_2.csv")

In [13]:
#### fdi variables data
VARIABLES_fdi = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='fdi'")

data_raw_fdi = df_final[VARIABLES_fdi["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)

data_raw_fdi.loc[data_raw_fdi["mafdistock"] < 1, "mafdistock"] = 1
data_raw_fdi.loc[data_raw_fdi["greenfdistock"] < 1, "greenfdistock"] = 1
###补齐数据
df_fdi = (
    data_raw_fdi
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression)
    .droplevel(0)
    .sort_index()
)
##rename
df_fdi = df_fdi.rename(
    columns={
        "Number of announced greenfield FDI projects, by destination": "numgreenfdi",
        "Number of net cross-border M&As by region/economy of seller": "nummergerfdi",
    }
).reset_index()
df_fdi["posti"] = (df_fdi["strategy"] <= df_fdi["Year"]).astype(int)
df_fdi["posti_2015"] = (df_fdi["Year"] >= 2015).astype(int)

df_fdi["treatment_greenfdi"] = (df_fdi["numgreenfdi"] > 0).astype(int)
df_fdi["treatment_mafdi"] = (df_fdi["nummergerfdi"] > 0).astype(int)

df_fdi["lngreenfdistock"] = np.log(df_fdi["greenfdistock"])
df_fdi["lnmafdistock"] = np.log(df_fdi["mafdistock"])

df_fdi.to_csv(route / "fdidata.csv")

20 0
Country 132 'strategy' has less than 12 values, skip interpolation.
20 0
Country 132 'treatment' has less than 12 values, skip interpolation.
20 0
Country 140 'strategy' has less than 12 values, skip interpolation.
20 0
Country 140 'treatment' has less than 12 values, skip interpolation.
20 0
Country 148 'strategy' has less than 12 values, skip interpolation.
20 0
Country 148 'treatment' has less than 12 values, skip interpolation.
20 0
Country 180 'Number of net cross-border M&As by region/economy of seller' has less than 12 values, skip interpolation.
20 0
Country 180 'mafdistock' has less than 12 values, skip interpolation.
20 0
Country 231 'strategy' has less than 12 values, skip interpolation.
20 0
Country 231 'treatment' has less than 12 values, skip interpolation.
20 0
Country 324 'strategy' has less than 12 values, skip interpolation.
20 0
Country 324 'treatment' has less than 12 values, skip interpolation.
20 0
Country 426 'strategy' has less than 12 values, skip interpol

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [14]:
###fdi by different countries
##中国
VARIABLES_FDI_CN = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='FDI_CN'")

data_raw_FDI_CN = df_final[VARIABLES_FDI_CN["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
)

df_FDI_CN = (
    data_raw_FDI_CN
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression)
    .droplevel(0)
    .sort_index()
)
df_FDI_CN.loc[df_FDI_CN['stock']<1, "stock"] = np.nan
df_FDI_CN['lnFDI_CN'] = np.log(df_FDI_CN['stock'])

missing_by_country = df_FDI_CN.groupby("Numeric").apply(lambda x: x.isnull().sum())
selected_countries = missing_by_country.loc[(missing_by_country < 10).all(axis=1)].index

df_FDI_CN.query('Year>2002').to_csv(route / "FDI_CN_data.csv")

20 0
Country 748 'stock' has less than 12 values, skip interpolation.
13 7
Country 854 'stock' has less than 12 values, skip interpolation.


In [15]:
###美国
VARIABLES_FDI_USBEA = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='FDI_USBEA'")



def interpolate_with_linear_regression_us(df, polynomial=False):
    df = df.copy()
    for col in df.columns:
        if df[col].isna().values.any():
            na_mask = df[col].isna()
            if (~na_mask).sum() < 10:
                print(
                    f"Country {df.index[0][0]} '{col}' has less than 10 values, skip interpolation."
                )
                continue
            if not is_numeric_dtype(df[col]):
                print(
                    f"Country {df.index[0][0]} '{col}' is not numeric dtype, skip interpolation."
                )
                continue
            lin_reg = LinearRegression()
            not_na_years = df.loc[~na_mask].index.get_level_values(1).values.reshape((-1, 1))
            na_years = df.loc[na_mask].index.get_level_values(1).values.reshape((-1, 1))
            if polynomial:
                poly = PolynomialFeatures(2, include_bias=False)
                X = poly.fit_transform(not_na_years)
                lin_reg.fit(X, df.loc[~na_mask, col])
                pred = lin_reg.predict(poly.fit_transform(na_years))
            else:
                X = not_na_years
                lin_reg.fit(X, df.loc[~na_mask, col])
                pred = lin_reg.predict(na_years)
            df.loc[na_mask, col] = pred
    return df

data_raw_FDI_USBEA = df_final[VARIABLES_FDI_USBEA["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
).query("Year>2000")

data_raw_FDI_USBEA.loc[data_raw_FDI_USBEA['FDI_USBEA']<1, "FDI_USBEA"] = np.nan
data_raw_FDI_USBEA['lnFDI_USBEA'] = np.log(data_raw_FDI_USBEA['FDI_USBEA'])


df_FDI_USBEA = (
    data_raw_FDI_USBEA
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression_us)
    .droplevel(0)
    .sort_index()
)


# missing_by_country = df_FDI_USBEA.groupby("Numeric").apply(lambda x: x.isnull().sum())
# selected_countries = missing_by_country.loc[(missing_by_country < 6).all(axis=1)].index
# df_FDI_USBEA = df_FDI_USBEA.query("Numeric in @selected_countries")

# 
selected_countries = df_FDI_USBEA.groupby("Numeric")["FDI_USBEA"].count() == len(years)
selected_countries = selected_countries[selected_countries].index
df_FDI_USBEA = df_FDI_USBEA.query("Numeric in @selected_countries")

df_FDI_USBEA.to_csv(route / "FDI_USBEA_data.csv")

Country 132 'FDI_USBEA' has less than 10 values, skip interpolation.
Country 132 'lnFDI_USBEA' has less than 10 values, skip interpolation.
Country 140 'FDI_USBEA' has less than 10 values, skip interpolation.
Country 140 'lnFDI_USBEA' has less than 10 values, skip interpolation.
Country 148 'FDI_USBEA' has less than 10 values, skip interpolation.
Country 148 'lnFDI_USBEA' has less than 10 values, skip interpolation.
Country 270 'FDI_USBEA' has less than 10 values, skip interpolation.
Country 270 'lnFDI_USBEA' has less than 10 values, skip interpolation.
Country 324 'FDI_USBEA' has less than 10 values, skip interpolation.
Country 324 'lnFDI_USBEA' has less than 10 values, skip interpolation.
Country 450 'FDI_USBEA' has less than 10 values, skip interpolation.
Country 450 'lnFDI_USBEA' has less than 10 values, skip interpolation.
Country 466 'FDI_USBEA' has less than 10 values, skip interpolation.
Country 466 'lnFDI_USBEA' has less than 10 values, skip interpolation.
Country 562 'FDI_USB

In [29]:
df_FDI_USBEA["lnFDI_USBEA"].isna().sum()

0

In [16]:
###美国
VARIABLES_FDI_US = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='FDI_US'")

def interpolate_with_linear_regression_us(df, polynomial=False):
    df = df.copy()
    for col in df.columns:
        if df[col].isna().values.any():
            na_mask = df[col].isna()
            if (~na_mask).sum() < 6:
                print(
                    f"Country {df.index[0][0]} '{col}' has less than 6 values, skip interpolation."
                )
                continue
            if not is_numeric_dtype(df[col]):
                print(
                    f"Country {df.index[0][0]} '{col}' is not numeric dtype, skip interpolation."
                )
                continue
            lin_reg = LinearRegression()
            not_na_years = df.loc[~na_mask].index.get_level_values(1).values.reshape((-1, 1))
            na_years = df.loc[na_mask].index.get_level_values(1).values.reshape((-1, 1))
            if polynomial:
                poly = PolynomialFeatures(2, include_bias=False)
                X = poly.fit_transform(not_na_years)
                lin_reg.fit(X, df.loc[~na_mask, col])
                pred = lin_reg.predict(poly.fit_transform(na_years))
            else:
                X = not_na_years
                lin_reg.fit(X, df.loc[~na_mask, col])
                pred = lin_reg.predict(na_years)
            df.loc[na_mask, col] = pred
    return df

data_raw_FDI_US = df_final[VARIABLES_FDI_US["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
).query("Year>2008")

df_FDI_US = (
    data_raw_FDI_US
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression_us)
    .droplevel(0)
    .sort_index()
)
df_FDI_US.loc[df_FDI_US['FDI_US']<1, "FDI_US"] = np.nan

df_FDI_US['lnFDI_US'] = np.log(df_FDI_US['FDI_US'])
missing_by_country = df_FDI_US.groupby("Numeric").apply(lambda x: x.isnull().sum())
selected_countries = missing_by_country.loc[(missing_by_country < 6).all(axis=1)].index
df_FDI_US.to_csv(route / "FDI_US_data.csv")
df_FDI_US

Country 140 'FDI_US' has less than 6 values, skip interpolation.
Country 148 'FDI_US' has less than 6 values, skip interpolation.
Country 180 'FDI_US' has less than 6 values, skip interpolation.
Country 231 'FDI_US' has less than 6 values, skip interpolation.
Country 450 'FDI_US' has less than 6 values, skip interpolation.
Country 716 'FDI_US' has less than 6 values, skip interpolation.
Country 748 'FDI_US' has less than 6 values, skip interpolation.
Country 768 'FDI_US' has less than 6 values, skip interpolation.


Unnamed: 0_level_0,Unnamed: 1_level_0,FDI_US,FDIstock,economic,WGI,natural,urban,military,incomegroup,IMFgroup,landlock,...,incomegroup2012,incomegroup2013,incomegroup2014,incomegroup2015,incomegroup2016,incomegroup2017,incomegroup2018,incomegroup2019,incomegroup2020,lnFDI_US
Numeric,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
24,2009,2.540000e+09,19289.71180,72.758781,-1.019059,30.311466,4.975504,4.709610,LM,1.0,0.0,...,1,1,1,1,1,1,1,1,1,21.655430
24,2010,4.460000e+09,32457.88286,37.982163,-1.025741,38.162310,4.995581,4.177585,LM,1.0,0.0,...,1,1,1,1,1,1,1,1,1,22.218415
24,2011,5.430000e+09,43275.21878,22.556918,-1.074126,39.022678,4.997269,3.255660,UM,1.0,0.0,...,1,1,1,1,1,1,1,1,1,22.415205
24,2012,8.460000e+08,41810.59079,-67.378877,-1.009019,34.731008,4.973864,3.236659,UM,1.0,0.0,...,1,1,1,1,1,1,1,1,1,20.556030
24,2013,1.214000e+09,34690.57336,-69.202350,-1.070175,29.400045,4.926419,4.455239,UM,1.0,0.0,...,1,1,1,1,1,1,1,1,1,20.917187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,2016,6.100000e+07,18855.10083,58.895006,-0.348764,11.677213,4.406529,1.429305,LM,1.0,1.0,...,1,1,1,1,1,1,1,1,1,17.926384
894,2017,6.000000e+07,19866.47730,98.386222,-0.371218,13.226440,4.373369,1.309596,LM,1.0,1.0,...,1,1,1,1,1,1,1,1,1,17.909855
894,2018,4.900000e+07,20435.42730,116.968766,-0.398209,11.748346,4.322060,1.409462,LM,1.0,1.0,...,1,1,1,1,1,1,1,1,1,17.707331
894,2019,4.200000e+07,19134.26163,96.294570,-0.476908,7.915065,4.265728,1.219375,LM,1.0,1.0,...,1,1,1,1,1,1,1,1,1,17.553180


In [17]:
###法国
VARIABLES_FDI_FR = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='FDI_FR'")

data_raw_FDI_FR = df_final[VARIABLES_FDI_FR["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
).query("Year>2008")

df_FDI_FR = (
    data_raw_FDI_FR
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression_us)
    .droplevel(0)
    .sort_index()
)
df_FDI_FR.loc[df_FDI_FR['FDI_FR']<1, "FDI_FR"] = np.nan

df_FDI_FR['lnFDI_FR'] = np.log(df_FDI_FR['FDI_FR'])
missing_by_country = df_FDI_FR.groupby("Numeric").apply(lambda x: x.isnull().sum())
selected_countries = missing_by_country.loc[(missing_by_country < 6).all(axis=1)].index
df_FDI_FR.to_csv(route / "FDI_FR_data.csv")

Country 72 'FDI_FR' has less than 6 values, skip interpolation.
Country 132 'FDI_FR' has less than 6 values, skip interpolation.
Country 140 'FDI_FR' has less than 6 values, skip interpolation.
Country 148 'FDI_FR' has less than 6 values, skip interpolation.
Country 231 'FDI_FR' has less than 6 values, skip interpolation.
Country 270 'FDI_FR' has less than 6 values, skip interpolation.
Country 324 'FDI_FR' has less than 6 values, skip interpolation.
Country 426 'FDI_FR' has less than 6 values, skip interpolation.
Country 454 'FDI_FR' has less than 6 values, skip interpolation.
Country 516 'FDI_FR' has less than 6 values, skip interpolation.
Country 624 'FDI_FR' has less than 6 values, skip interpolation.
Country 646 'FDI_FR' has less than 6 values, skip interpolation.
Country 694 'FDI_FR' has less than 6 values, skip interpolation.
Country 716 'FDI_FR' has less than 6 values, skip interpolation.
Country 748 'FDI_FR' has less than 6 values, skip interpolation.
Country 768 'FDI_FR' has l

In [18]:
###荷兰
VARIABLES_FDI_NL = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='FDI_NL'")

data_raw_FDI_NL = df_final[VARIABLES_FDI_NL["Variables"]].merge(
    data_interpolated, on=["Numeric", "Year"], how="right"
).query("Year>2008")

df_FDI_NL = (
    data_raw_FDI_NL
    .groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression_us)
    .droplevel(0)
    .sort_index()
)
df_FDI_NL.loc[df_FDI_NL['FDI_NL']<1, "FDI_NL"] = np.nan

df_FDI_NL['lnFDI_NL'] = np.log(df_FDI_NL['FDI_NL'])
missing_by_country = df_FDI_NL.groupby("Numeric").apply(lambda x: x.isnull().sum())
selected_countries = missing_by_country.loc[(missing_by_country < 6).all(axis=1)].index
df_FDI_NL.to_csv(route / "FDI_NL_data.csv")

In [19]:
data_interpolated["CountryName_CN"] = df_final["CountryName_CN"]
data_interpolated["CountryName_CN"].unique()

array(['安哥拉', '博茨瓦纳', '喀麦隆', '佛得角', '中非', '乍得', '刚果（布）', '刚果（金）', '贝宁',
       '埃塞俄比亚', '加蓬', '冈比亚', '加纳', '几内亚', '肯尼亚', '莱索托', '马达加斯加', '马拉维',
       '马里', '毛里求斯', '纳米比亚', '尼日尔', '尼日利亚', '几内亚（比绍）', '卢旺达', '塞内加尔',
       '塞拉利昂', '南非', '津巴布韦', '斯威士兰', '多哥', '乌干达', '坦桑尼亚', '布基纳法索', '赞比亚'],
      dtype=object)

import matplotlib.pyplot as plt
for col in pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="Sheet2",
    na_values="..",
).query("一级指标 == '经济'")["Variables"]:
    if col not in data_interpolated:
        print(col)
        continue
    fig, ax = plt.subplots()
    sns.scatterplot(data=data_interpolated, x="lnFDIstock", y=col, hue="Numeric", ax=ax)

In [20]:
data_filled = (
    pd.read_excel(route / "data_filled.xlsx")
    .iloc[:, 2:]
    .rename(columns={"Numeric.1": "Numeric"})
)

In [21]:
data_corrplot = data_filled.merge(
    df_final[[*VARIABLES["Variables"], "FDIstock"]], on=["Numeric", "Year"]
)
Variables_corrplot = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="Sheet2",
    na_values="..",
)

In [22]:
economy_variables = Variables_corrplot.query("一级指标 == '经济'")["Variables"]
society_variables = Variables_corrplot.query("一级指标 == '社会'")["Variables"]
resource_variables = Variables_corrplot.query("一级指标 == '资源'")["Variables"]
ecology_variables = Variables_corrplot.query("一级指标 == '生态'")["Variables"]

In [23]:
# df_final = (
#     pd.read_excel(Path.home() / "Desktop" / "effects variables counting.xlsx")
# ).set_index(['Category','变量名'])
# df_final['count'] = df_final.count(axis='columns').sort_values()
# df_final.to_excel(Path.home() / "Desktop" / "counting.xlsx")

sns.heatmap(
    data_corrplot.drop(
        [
            "Numeric",
            "Year",
            "CountryName_CN",
            "Alpha-3 code",
            "incomegroup",
            "IMFgroup",
        ],
        axis=1,
    )[["lnFDIstock", *economy_variables]].corr(),
    cmap=sns.color_palette("icefire", as_cmap=True),
)

In [24]:
###生成heckman两阶段数据
variables_heckman = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="regvariables",
    na_values="..",
).query("二级指标=='基准回归'")

country_heckman = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="Countries",
    na_values="..",
)

data_heckman = (
    df_final[variables_heckman["Variables"]]
    .reset_index()
    .merge(country_heckman[["Numeric", "Alpha-3 code", "CountryName_CN"]], on="Numeric")
    .query("Numeric!=175 & Numeric!=728")
)
# data_heckman["war_intensity"] = data_heckman["war_intensity"].fillna(0)
data_heckman.loc[:, data_heckman.isnull().any()].isnull().sum().sort_values()

FDIstock                                      11545
landlock                                      12091
incomegroup                                   12118
Urban population growth (annual %)_x          12165
IMFgroup                                      12199
Total natural resources rents (% of GDP)_x    12205
WGI                                           12306
taxpercent                                    12352
Military expenditure (% of GDP)               12381
dtype: int64

In [25]:
data_heckman_drop = data_heckman.copy().set_index(["Numeric", "Year"])
# data_filtered.to_excel(route / "raw_data.xlsx", index=False)


dropped_countries = defaultdict(list)
for numeric, new_df in data_heckman_drop.groupby(level=0):
    for col in new_df:
        continuous_nan = 0
        num_nan = 0
        current_continuous_nan = 0
        for year in years:
            if pd.isna(new_df.loc[(numeric, year), col]):
                num_nan += 1
                current_continuous_nan += 1
                if current_continuous_nan > continuous_nan:
                    continuous_nan = current_continuous_nan
            else:
                current_continuous_nan = 0
        if num_nan > 6 or continuous_nan > 5:
            dropped_countries[col].append(numeric)
mask = np.unique(np.array([v for value in dropped_countries.values() for v in value]))
print(52 - len(mask))
data_heckman_drop = data_heckman_drop.query("Numeric not in @mask").reset_index()

38


In [26]:
data_heckman_merge = data_heckman_drop.merge(
    data1[
        [
            "Numeric",
            "Year",
            "SDI",
            "SDI_Economy",
            "SDI_Society",
            "SDI_Resource",
            "SDI_Ecology",
        ]
    ],
    on=["Numeric", "Year"],
    how="left",
).set_index(["SDI", "SDI_Economy", "SDI_Society", "SDI_Resource", "SDI_Ecology"])
data_heckman_merge.loc[
    :, data_heckman_merge.isnull().any()
].isnull().sum().sort_values()

FDIstock                                      8497
incomegroup                                   8944
IMFgroup                                      8944
landlock                                      8944
Total natural resources rents (% of GDP)_x    8979
Urban population growth (annual %)_x          8979
taxpercent                                    9014
Military expenditure (% of GDP)               9035
WGI                                           9084
dtype: int64

In [27]:
data_heckman_interpo = (
    data_heckman_merge.groupby("Numeric", group_keys=True)
    .apply(interpolate_with_linear_regression)
    .drop(["Numeric"], axis=1)
    .reset_index()
)
data_heckman_interpo.loc[
    :, data_heckman_interpo.isnull().any()
].isnull().sum().sort_values()

data_heckman_interpo["lnFDI instock (Millions of dollars)"] = np.log(data_heckman_interpo["FDI instock (Millions of dollars)"] + 1)

data_heckman_interpo["lngnic"] = np.log(data_heckman_interpo["GNI per capita (US$)"])

data_heckman_interpo = data_heckman_interpo.rename(
    columns={
        "Urban population growth (annual %)_x": "urban",
        "Total natural resources rents (% of GDP)_x": "natural",
        "Commercial bank branches (per 100,000 adults)_x": "financial",
        "Statistical Capacity Score (Overall Average) (scale 0 - 100)_x": "stats",
        "Military expenditure (% of GDP)": "military",
    }
)

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Generate SSA and Region dummies
data_heckman_interpo["SDI_dum"] = data_heckman_interpo["SDI"].apply(
    lambda r: 1 if r > 0 else 0
)

In [None]:
data_heckman_interpo  # .query("SDI_dum==0")

In [None]:
data_heckman_interpo.to_csv(route / "heckmandata.csv", index=True)
data_heckman_interpo.reset_index().to_excel(route / "heckmandata.xlsx")

In [None]:
data_heckman_interpo["CountryName_CN"].unique()

In [None]:
data_heckman_interpo.describe()