In [1]:
%matplotlib inline
from functools import partial, reduce
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize
from sklearn.linear_model import LinearRegression

route = Path.home() / "OneDrive" / "PhD Dissertation" / "Data_Code" / "Data"
if not route.exists():
    route.mkdir(parents=True)

df_final = (
    pd.read_csv(Path.home() / "OneDrive" / "Rawdata" / "Data cleaning" / "df_final.csv")
    .set_index(["Numeric", "Year"])
)


df_final = df_final.sort_index(level=["Numeric", "Year"])
df_final.drop(
    [col for col in df_final.columns if col.endswith("_y")], axis=1, inplace=True
)
years = list(range(2001, 2021))

  pd.read_csv(Path.home() / "OneDrive" / "Rawdata" / "Data cleaning" / "df_final.csv")


In [2]:
df_final[
    "Net official development assistance and official aid received (current US$)(% of GNI)"
] = (
    df_final[
        "Net official development assistance and official aid received (current US$)_x"
    ]
    / df_final["GNI (current US$)"]
)


df_final["gini_pct_change"] = df_final["gini"].groupby("Numeric").pct_change()


df_final["Scientific and technical journal articles_pct_change"] = (
    df_final["Scientific and technical journal articles"]
    .groupby("Numeric")
    .pct_change()
)


df_final["Life expectancy at birth, total (years)_pct_change"] = (
    df_final["Life expectancy at birth, total (years)"].groupby("Numeric").pct_change()
)

df_final[
    "Solar, tide, wave, fuel cell electricity installed capacity (million kilowatts)_per_capita"
] = (
    df_final[
        "Solar, tide, wave, fuel cell electricity installed capacity (million kilowatts)"
    ]
    / df_final["Population, total"]
) * 10_000

df_final[
    "Biomass and waste electricity installed capacity (million kilowatts)_per_capita"
] = (
    df_final["Biomass and waste electricity installed capacity (million kilowatts)"]
    / df_final["Population, total"]
) * 10_000

df_final[
    "Biomass and waste electricity net generation (million metric tons of oil equivalent)_per_capita"
] = (
    df_final[
        "Biomass and waste electricity net generation (million metric tons of oil equivalent)"
    ]
    / df_final["Population, total"]
) * 10_000

df_final["Red List Index_pct_change"] = df_final.groupby("Numeric")[
    "sdg15_redlist"
].pct_change()

df_final["Methane emissions (kt of CO2 equivalent)_per_capita"] = (
    df_final["Methane emissions (kt of CO2 equivalent)"] / df_final["Population, total"]
)

df_final["Black Carbon Emissions_per_capita"] = (
    df_final["Black Carbon Emissions"] / df_final["Population, total"]
)

df_final["NOx emissions [Gg]_per_capita"] = (
    df_final["NOx emissions [Gg]"] / df_final["Population, total"]
)

df_final["N2O emissions [Gg]_per_capita"] = (
    df_final["N2O emissions [Gg]"] / df_final["Population, total"]
)

df_final["Electricity distribution losses (billion kilowatthours)_percent"] = (
    df_final["Electricity distribution losses (billion kilowatthours)"]
    / df_final["Electricity net generation (billion kilowatthours)"]
) * 100


df_final["Wetland area（% of land area)"] = (
    df_final["Wetland area [km2]"] / df_final["Land area (sq. km)"]
) * 100

df_final["Grassland area（% of land area)"] = (
    df_final["Grassland area [km2]"] / df_final["Land area (sq. km)"]
) * 100

df_final[
    "Terrestrial barren land|1000 HA|ECCCT|Terrestrial Barren Land|Environment, Climate Change, Climate Indicators, Land Cover Accounts, Terrestrial Barren Land|Climate neutral(% of land area)"
] = (
    df_final[
        "Terrestrial barren land|1000 HA|ECCCT|Terrestrial Barren Land|Environment, Climate Change, Climate Indicators, Land Cover Accounts, Terrestrial Barren Land|Climate neutral"
    ]
    / df_final["Land area (sq. km)"]
) * 100


df_final["Total area of all Marine Protected Areas in a country(% of land area)"] = (
    df_final["Total area of all Marine Protected Areas in a country"]
    / df_final["Land area (sq. km)"]
) * 100


df_final["Labor force(% of total population)"] = (
    df_final["Labor force, total"] / df_final["Population, total"]
)

df_final[
    "Net official development assistance and official aid received (current US$)(% of GNI)"
] = (
    df_final[
        "Net official development assistance and official aid received (current US$)_x"
    ]
    / df_final["GNI (current US$)"]
)

df_final['Merchandise exports (% of GDP)'] = (df_final['Merchandise exports (current US$)']/df_final['GDP (current US$)_x'])

num_of_coastal = len(df_final.query("landlock == 0").index.get_level_values(0).unique())
avg_marine_protected = df_final.groupby("Year")["Total area of all Marine Protected Areas in a country(% of land area)"].sum().replace(0, np.nan).dropna() / num_of_coastal
for year, value in avg_marine_protected.items():
    df_final.loc[(df_final["landlock"] == 1, year), "Total area of all Marine Protected Areas in a country(% of land area)"] = value


  df_final[
  df_final["gini_pct_change"] = df_final["gini"].groupby("Numeric").pct_change()
  df_final["Scientific and technical journal articles_pct_change"] = (
  df_final["Life expectancy at birth, total (years)_pct_change"] = (
  df_final[
  df_final[
  df_final[
  df_final["Red List Index_pct_change"] = df_final.groupby("Numeric")[
  df_final["Methane emissions (kt of CO2 equivalent)_per_capita"] = (
  df_final["Black Carbon Emissions_per_capita"] = (
  df_final["NOx emissions [Gg]_per_capita"] = (
  df_final["N2O emissions [Gg]_per_capita"] = (
  df_final["Electricity distribution losses (billion kilowatthours)_percent"] = (
  df_final["Wetland area（% of land area)"] = (
  df_final["Grassland area（% of land area)"] = (
  df_final[
  df_final["Total area of all Marine Protected Areas in a country(% of land area)"] = (
  df_final["Labor force(% of total population)"] = (
  df_final['Merchandise exports (% of GDP)'] = (df_final['Merchandise exports (current US$)']/df_final['GDP (cu

In [3]:
###Filter the data
# Get variables and country names
VARIABLES = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="Sheet2",
    na_values="..",
)

country = pd.read_excel(
    route / "Variables Selection" / "Variables Chosen.xlsx",
    sheet_name="Countries",
    na_values="..",
)
dfgeo = pd.read_excel(
    Path.home() / "OneDrive" / "Rawdata" / "Country Classification" / "UN Classification_Natural resources_Geography.xlsx",
    sheet_name="Sheet1",
    na_values="..",
    usecols="D, E",
).query("Region!= 'N'")
country = country.merge(dfgeo, on=['Numeric'])

In [4]:
##Compute which variables can be used

variables = (
    df_final.reset_index()
    .query("Year in @years")
    .groupby("Numeric")
    .count()
    .sum()
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={0: "count"})
)
variables["count"] = variables["count"] / len(years)
variables = variables.query("count>0")


##Add the Chinese names for the variables
if (route / "Variables Selection" / "WDI_CN.xlsx").exists():
    variables = (
        pd.read_excel(route / "Variables Selection" / "WDI_CN.xlsx")[
            ["Indicator Name", "Chinese"]
        ]
        .rename(columns={"Indicator Name": "index"})
        .merge(variables, how="right", on=["index"])
        # .merge(df_varaiable_name, how ="left", on=['index'])
    )
variables.to_excel(route / "Variables Count.xlsx", index=False)

In [5]:
VARIABLES[["一级指标", "二级指标", "三级指标", "来源"]].set_index(["一级指标", "二级指标", "三级指标"]).to_excel(
    route / "Variables_name.xlsx", index=True
)

In [6]:
data_raw = (
    df_final[VARIABLES.query("变量类型=='指标体系'")["Variables"]]
    .reset_index()
    .astype({"Numeric": int, "Year": int})
    .merge(country[["Numeric", "Alpha-3 code", "CountryName_CN"]], on="Numeric")
    .rename(columns={"Alpha-3 code_y": "Alpha-3 code"})
)
data_filtered = (
    data_raw.copy()
    .query("Year in @years")
    .set_index(["Numeric", "Year"])
)

In [7]:
# 统计每个国家的变量的缺失值数量
missing_by_country = data_filtered.groupby('Numeric').apply(lambda x: x.isnull().sum())
selected_countries = missing_by_country.loc[(missing_by_country < 10).all(axis=1)].index
data_filtered = data_filtered.query("Numeric in @selected_countries")


In [8]:
from pandas.api.types import is_numeric_dtype
from sklearn.preprocessing import PolynomialFeatures

def interpolate_with_linear_regression(df, polynomial=False):
    df = df.copy()
    for col in df.columns:
        if df[col].isna().values.any():
            na_mask = df[col].isna()
            # if col in ("NOE", "SOE", "Lead exposure"):
            #     lin_reg = LinearRegression()
            #     lin_reg.fit(
            #         df.loc[
            #             ~na_mask, ("CO2 emissions (metric tons per capita)", "Year")
            #         ],
            #         df.loc[~na_mask, col],
            #     )
            #     pred = lin_reg.predict(
            #         df.loc[na_mask, ("CO2 emissions (metric tons per capita)", "Year")]
            #     )
            # else:
            if (~na_mask).sum() < 10:
                print(
                    f"Country {df.index[0][0]} '{col}' has less than 10 values, skip interpolation."
                )
                continue
            if not is_numeric_dtype(df[col]):
                print(
                    f"Country {df.index[0][0]} '{col}' is not numeric dtype, skip interpolation."
                )
                continue
            lin_reg = LinearRegression()
            not_na_years = df.loc[~na_mask].index.get_level_values(1).values.reshape((-1, 1))
            na_years = df.loc[na_mask].index.get_level_values(1).values.reshape((-1, 1))
            if polynomial:
                poly = PolynomialFeatures(2, include_bias=False)
                X = poly.fit_transform(not_na_years)
                lin_reg.fit(X, df.loc[~na_mask, col])
                pred = lin_reg.predict(poly.fit_transform(na_years))
            else:
                X = not_na_years
                lin_reg.fit(X, df.loc[~na_mask, col])
                pred = lin_reg.predict(na_years)
            df.loc[na_mask, col] = pred
    return df


data_filled = data_filtered.groupby("Numeric").apply(
    interpolate_with_linear_regression
).droplevel(0)


# data_filled.to_excel(route / "data_filled.xlsx", index=True)
data_filled

Unnamed: 0_level_0,Unnamed: 1_level_0,GDP per capita growth (annual %)_x,Merchandise exports (% of GDP),Final consumption expenditure (% of GDP),"Inflation, GDP deflator (annual %)","Agriculture, forestry, and fishing, value added per worker (constant 2015 US$)_x","Industry (including construction), value added per worker (constant 2015 US$)_x","Services, value added per worker (constant 2015 US$)_x","Current account balance, percent of GDP (Percent of GDP)(IMF)",Net official development assistance and official aid received (current US$)(% of GNI),"Employment to population ratio, 15+, total (%) (modeled ILO estimate)",...,Grassland area（% of land area),"Terrestrial barren land|1000 HA|ECCCT|Terrestrial Barren Land|Environment, Climate Change, Climate Indicators, Land Cover Accounts, Terrestrial Barren Land|Climate neutral(% of land area)",CO2 emissions (metric tons per capita)_x,PM2.5 exposure/Ambient particulate matter pollution,Lead exposure,Terrestrial biome protection (global weights),Species Protection Index,Total area of all Marine Protected Areas in a country(% of land area),Alpha-3 code,CountryName_CN
Numeric,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
24,2001,0.838227,0.731193,69.064688,106.352126,1494.697992,39255.040555,6390.539635,-13.1,0.038336,74.111,...,12.957053,0.223514,0.941818,786.047340,431.754245,6.219382,31.75,0.000112,AGO,安哥拉
24,2002,9.937307,0.544827,60.638427,196.984099,1471.605629,33387.651692,6164.292122,-1.0,0.030330,74.156,...,12.964688,0.223551,0.895717,778.402552,421.049537,6.219382,31.75,0.000112,AGO,安哥拉
24,2003,-0.466061,0.533776,64.807441,93.926567,1540.353428,31721.338480,6267.634440,-4.0,0.030678,74.137,...,12.947625,0.226451,0.924988,778.903737,418.034543,6.219382,31.75,0.000112,AGO,安哥拉
24,2004,7.127077,0.572137,55.925555,33.443595,1634.750461,34504.159974,6490.114102,2.9,0.054326,74.172,...,12.916447,0.230279,0.929811,797.490554,418.916453,6.219382,31.75,0.000112,AGO,安哥拉
24,2005,11.009566,0.652107,47.981847,42.374249,1565.000623,41292.012964,7017.500233,13.9,0.012589,74.198,...,12.911064,0.230565,0.812875,793.005335,406.639624,6.219382,31.75,0.000112,AGO,安哥拉
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,2016,0.561271,0.304031,67.483271,13.552485,360.610819,11931.994974,5463.243167,-3.3,0.047582,53.846,...,1.136406,0.000620,0.316995,1077.074576,272.061050,17.000000,90.38,14.859503,ZMB,赞比亚
894,2017,0.331284,0.309195,60.702045,10.095730,390.497616,12234.074948,5315.596781,-1.7,0.042079,53.642,...,1.132151,0.000593,0.393726,1069.492256,263.086573,17.000000,90.38,14.859503,ZMB,赞比亚
894,2018,0.897347,0.343348,57.676746,7.411571,302.050290,12506.683873,5431.540484,-1.3,0.038615,55.538,...,1.131198,0.000571,0.440527,1070.707346,254.626303,17.000000,90.38,15.362832,ZMB,赞比亚
894,2019,-1.564237,0.301991,59.933221,7.633470,316.674603,11755.633174,5348.956264,0.4,0.041383,56.241,...,1.132742,0.000594,0.414336,1084.600130,246.125887,17.000000,90.38,15.362832,ZMB,赞比亚


In [9]:
print(len(data_filled.index.get_level_values(0).unique()))

data_filled.loc[:, data_filled.isnull().any()].isnull().sum().sort_values()

33


Series([], dtype: float64)

In [10]:
### Calculate coefficient of variance
from functools import partial

from scipy.stats import variation

data_CV = data_filled.reset_index().set_index(["Alpha-3 code", "Numeric", "Year", "CountryName_CN"])
coefva = (
    pd.DataFrame(data_CV.apply(partial(variation, ddof=1)))
    .abs()
    .rename(columns={0: "coefva"})
)
small_coefva = coefva.query("coefva < 0.25").index
data_CV = data_filled[[col for col in data_filled.columns if col not in small_coefva]]
variables_post_cv = [
    col
    for col in data_filled.columns
    if col not in small_coefva
    if col
    not in [
        "Numeric",
        "Year",
        "Alpha-3 code",
        "CountryName_CN",
        "Region",
        "incomegroup",
    ]
]
(
    VARIABLES.query(
        '类型 in ["正向", "负向"] and 变量类型 == "指标体系" and Variables in @variables_post_cv'
    )
    .set_index(["一级指标", "二级指标", "三级指标"])
    .drop(columns=["变量类型"])
    .apply(lambda x: np.abs(variation(data_CV[x["Variables"]], ddof=1)), axis=1)
    .to_frame()
    .round(2)
).to_excel(route / "Variables_CV.xlsx", index=True)

variables_en_to_cn = {
    record["Variables"]: record["三级指标"]
    for record in VARIABLES[["三级指标", "Variables"]].to_dict("records")
}
"、".join(variables_en_to_cn[i] for i, v in coefva.query("coefva < 0.25").iterrows())
# + f"（{v['coefva']:.2f}）"

'最终消费支出占GDP百分比、就业率、基尼系数、劳动力比例、道路交通伤害造成的死亡率(每10万人)'

In [11]:
data_CV

Unnamed: 0_level_0,Unnamed: 1_level_0,GDP per capita growth (annual %)_x,Merchandise exports (% of GDP),"Inflation, GDP deflator (annual %)","Agriculture, forestry, and fishing, value added per worker (constant 2015 US$)_x","Industry (including construction), value added per worker (constant 2015 US$)_x","Services, value added per worker (constant 2015 US$)_x","Current account balance, percent of GDP (Percent of GDP)(IMF)",Net official development assistance and official aid received (current US$)(% of GNI),Proportion of seats held by women in national parliaments (%)_x,Prevalence of undernourishment (percent) (3-year average),...,Grassland area（% of land area),"Terrestrial barren land|1000 HA|ECCCT|Terrestrial Barren Land|Environment, Climate Change, Climate Indicators, Land Cover Accounts, Terrestrial Barren Land|Climate neutral(% of land area)",CO2 emissions (metric tons per capita)_x,PM2.5 exposure/Ambient particulate matter pollution,Lead exposure,Terrestrial biome protection (global weights),Species Protection Index,Total area of all Marine Protected Areas in a country(% of land area),Alpha-3 code,CountryName_CN
Numeric,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
24,2001,0.838227,0.731193,106.352126,1494.697992,39255.040555,6390.539635,-13.1,0.038336,15.454545,63.5,...,12.957053,0.223514,0.941818,786.047340,431.754245,6.219382,31.75,0.000112,AGO,安哥拉
24,2002,9.937307,0.544827,196.984099,1471.605629,33387.651692,6164.292122,-1.0,0.030330,15.454545,59.1,...,12.964688,0.223551,0.895717,778.402552,421.049537,6.219382,31.75,0.000112,AGO,安哥拉
24,2003,-0.466061,0.533776,93.926567,1540.353428,31721.338480,6267.634440,-4.0,0.030678,15.454545,55.4,...,12.947625,0.226451,0.924988,778.903737,418.034543,6.219382,31.75,0.000112,AGO,安哥拉
24,2004,7.127077,0.572137,33.443595,1634.750461,34504.159974,6490.114102,2.9,0.054326,15.000000,52.6,...,12.916447,0.230279,0.929811,797.490554,418.916453,6.219382,31.75,0.000112,AGO,安哥拉
24,2005,11.009566,0.652107,42.374249,1565.000623,41292.012964,7017.500233,13.9,0.012589,15.000000,49.8,...,12.911064,0.230565,0.812875,793.005335,406.639624,6.219382,31.75,0.000112,AGO,安哥拉
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,2016,0.561271,0.304031,13.552485,360.610819,11931.994974,5463.243167,-3.3,0.047582,17.964072,29.4,...,1.136406,0.000620,0.316995,1077.074576,272.061050,17.000000,90.38,14.859503,ZMB,赞比亚
894,2017,0.331284,0.309195,10.095730,390.497616,12234.074948,5315.596781,-1.7,0.042079,17.964072,28.8,...,1.132151,0.000593,0.393726,1069.492256,263.086573,17.000000,90.38,14.859503,ZMB,赞比亚
894,2018,0.897347,0.343348,7.411571,302.050290,12506.683873,5431.540484,-1.3,0.038615,17.964072,29.2,...,1.131198,0.000571,0.440527,1070.707346,254.626303,17.000000,90.38,15.362832,ZMB,赞比亚
894,2019,-1.564237,0.301991,7.633470,316.674603,11755.633174,5348.956264,0.4,0.041383,17.964072,30.0,...,1.132742,0.000594,0.414336,1084.600130,246.125887,17.000000,90.38,15.362832,ZMB,赞比亚


In [12]:
len(coefva.query("coefva < 0.25").reset_index()["index"].unique())  # .round(2)

5

In [13]:
data_CV.describe().to_excel(route / "data_CV.xlsx", index=True)

In [14]:
### Calculate VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

dropped_variables = []


def drop_according_VIF(variables):
    df = data_CV[list(set(data_CV.columns) & set(variables["Variables"]))]
    vif = pd.DataFrame()
    vif["Variables"] = df.columns
    vif["VIF"] = np.nan
    if df.shape[1] >= 2:
        vif["VIF"] = [
            variance_inflation_factor(df.values, i) for i in range(df.shape[1])
        ]
        vif["r_square"] = 1 - 1 / vif["VIF"]
        max_vif = vif.loc[vif["VIF"].idxmax()]
        if max_vif["VIF"] > 7.5:
            print(
                f"Drop variable '{max_vif['Variables']}' with VIF {max_vif['VIF']:.2f}"
            )
            dropped_variables.append(max_vif)
            variables = drop_according_VIF(
                variables[variables["Variables"] != max_vif["Variables"]]
            )
    return variables


def calculate_VIF(row):
    variables = VARIABLES[
        np.logical_and(
            VARIABLES["一级指标"] == row["一级指标"],
            VARIABLES["Variables"].isin(variables_post_VIF),
        )
    ]["Variables"].tolist()
    if len(variables) >= 7.5:
        var = row["Variables"]
        for v in dropped_variables:
            if var == v["Variables"]:
                return v["VIF"]
        var_index = variables.index(var)
        return variance_inflation_factor(data_filled[variables].values, var_index)
    else:
        return np.nan


variables_post_VIF = list(
    VARIABLES.query(
        '类型 in ["正向", "负向"] and 变量类型 == "指标体系" and Variables in @variables_post_cv'
    )
    .groupby("一级指标", group_keys=False)
    .apply(drop_according_VIF)["Variables"]
)

calculated_VIF = (
    VARIABLES.query("Variables in @variables_post_cv")
    .apply(calculate_VIF, axis=1)
    .to_frame()
)
VIF_RESULT = (
    pd.concat(
        [VARIABLES.query("Variables in @variables_post_cv"), calculated_VIF], axis=1
    )
    .rename(columns={0: "VIF"})[["一级指标", "二级指标", "三级指标", "VIF"]]
    .set_index(["一级指标", "二级指标", "三级指标"])
)  #

VIF_RESULT["R^2"] = 1 - 1 / VIF_RESULT["VIF"]

variables_en_to_cn = {
    record["Variables"]: record["三级指标"]
    for record in VARIABLES[["三级指标", "Variables"]].to_dict("records")
}

"、".join(variables_en_to_cn[v["Variables"]] for v in dropped_variables)
# + f"（{v['VIF']:.2f}）"

Drop variable 'Species Protection Index' with VIF 27.41
Drop variable 'PM2.5 exposure/Ambient particulate matter pollution' with VIF 8.64
Drop variable 'People using at least basic drinking water services (% of population)_x' with VIF 37.33
Drop variable 'Mean years schooling' with VIF 15.80
Drop variable 'Mobile cellular subscriptions (per 100 people)' with VIF 8.08
Drop variable 'Services, value added per worker (constant 2015 US$)_x' with VIF 9.70


'物种保护指数、PM2.5暴露量、使用基本饮用水服务的人口比例、成年人平均受教育年限、移动手机使用数、每工人服务业增加值'

In [15]:
calculated_VIF

Unnamed: 0,0
0,
1,
3,
4,
5,
6,
7,
8,
11,3.756935
12,5.4343


In [16]:
len(dropped_variables)  # .round(2)

6

In [17]:
v = VARIABLES.query("Variables in @variables_post_VIF")
v["VIF"] = v.apply(calculate_VIF, axis=1)
v["R^2"] = 1 - 1 / v["VIF"]
v[["一级指标", "二级指标", "三级指标", "VIF", "R^2"]].set_index(["一级指标", "二级指标", "三级指标"]).round(
    2
).to_excel(route / "Variables_VIF.xlsx", index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  v["VIF"] = v.apply(calculate_VIF, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  v["R^2"] = 1 - 1 / v["VIF"]


In [18]:
VARIABLES_POST_CV_VIF = VARIABLES.query(
    "Variables in @variables_post_VIF and 变量类型 == '指标体系'"
).reset_index()
VARIABLES_POST_CV_VIF[["一级指标", "二级指标", "三级指标", "类型", "来源"]].set_index(
    ["一级指标", "二级指标", "三级指标"]
).to_excel(route / "Variables_vif2.xlsx", index=True)

In [19]:
data1 = (
    data_raw.copy()
    .query("Year >= @years[0] and Year <= @years[-1]")
    .set_index(["Numeric", "Year", "Alpha-3 code", "CountryName_CN"])[
        VARIABLES_POST_CV_VIF["Variables"]
    ]
    .reset_index()
)
data1.to_excel(route / "data1.xlsx", index=True)
data1 = data1.set_index(["Numeric", "Year"])
data1

Unnamed: 0_level_0,Unnamed: 1_level_0,Alpha-3 code,CountryName_CN,GDP per capita growth (annual %)_x,Merchandise exports (% of GDP),"Inflation, GDP deflator (annual %)","Agriculture, forestry, and fishing, value added per worker (constant 2015 US$)_x","Industry (including construction), value added per worker (constant 2015 US$)_x","Current account balance, percent of GDP (Percent of GDP)(IMF)",Net official development assistance and official aid received (current US$)(% of GNI),Proportion of seats held by women in national parliaments (%)_x,...,"Solar, tide, wave, fuel cell electricity installed capacity (million kilowatts)_per_capita",Biomass and waste electricity net generation (million metric tons of oil equivalent)_per_capita,Forest area (% of land area)_x,Wetland area（% of land area),Grassland area（% of land area),"Terrestrial barren land|1000 HA|ECCCT|Terrestrial Barren Land|Environment, Climate Change, Climate Indicators, Land Cover Accounts, Terrestrial Barren Land|Climate neutral(% of land area)",CO2 emissions (metric tons per capita)_x,Lead exposure,Terrestrial biome protection (global weights),Total area of all Marine Protected Areas in a country(% of land area)
Numeric,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
24,2001,AGO,安哥拉,0.838227,0.731193,106.352126,,,-13.1,0.038336,15.454545,...,0.000000e+00,0.000000,61.886219,0.547848,12.957053,0.223514,0.941818,431.754245,6.219382,0.000112
24,2002,AGO,安哥拉,9.937307,0.544827,196.984099,1471.605629,33387.651692,-1.0,0.030330,15.454545,...,0.000000e+00,0.000000,61.440995,0.548009,12.964688,0.223551,0.895717,421.049537,6.219382,0.000112
24,2003,AGO,安哥拉,-0.466061,0.533776,93.926567,1540.353428,31721.338480,-4.0,0.030678,15.454545,...,0.000000e+00,0.000000,60.995770,0.547970,12.947625,0.226451,0.924988,418.034543,6.219382,0.000112
24,2004,AGO,安哥拉,7.127077,0.572137,33.443595,1634.750461,34504.159974,2.9,0.054326,15.000000,...,0.000000e+00,0.000000,60.550546,0.549276,12.916447,0.230279,0.929811,418.916453,6.219382,0.000112
24,2005,AGO,安哥拉,11.009566,0.652107,42.374249,1565.000623,41292.012964,13.9,0.012589,15.000000,...,0.000000e+00,0.000000,60.105322,0.549837,12.911064,0.230565,0.812875,406.639624,6.219382,0.000112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,2016,ZMB,赞比亚,0.561271,0.304031,13.552485,360.610819,11931.994974,-3.3,0.047582,17.964072,...,5.605996e-08,0.000011,61.295955,3.973007,1.136406,0.000620,0.316995,272.061050,17.000000,14.859503
894,2017,ZMB,赞比亚,0.331284,0.309195,10.095730,390.497616,12234.074948,-1.7,0.042079,17.964072,...,5.434137e-08,0.000010,61.042885,3.971147,1.132151,0.000593,0.393726,263.086573,17.000000,14.859503
894,2018,ZMB,赞比亚,0.897347,0.343348,7.411571,302.050290,12506.683873,-1.3,0.038615,17.964072,...,6.862566e-07,0.000010,60.789707,3.970154,1.131198,0.000571,0.440527,254.626303,17.000000,15.362832
894,2019,ZMB,赞比亚,-1.564237,0.301991,7.633470,316.674603,11755.633174,0.4,0.041383,17.964072,...,5.222933e-05,0.000010,60.536515,3.976468,1.132742,0.000594,0.414336,246.125887,17.000000,15.362832


In [20]:
data1.loc[:, data1.isnull().any()].isnull().sum().sort_values()

Energy consumption per capita (million Btu per person)                                                                                                                                          51
Total area of all Marine Protected Areas in a country(% of land area)                                                                                                                           60
CO2 emissions (metric tons per capita)_x                                                                                                                                                        60
Life expectancy at birth, total (years)_pct_change                                                                                                                                              60
Population growth (annual %)                                                                                                                                                                    60
Scientific and technical 

In [21]:
missing_by_country = data1.groupby('Numeric').apply(lambda x: x.isnull().sum())
selected_countries = missing_by_country.loc[(missing_by_country < 10).all(axis=1)].index
data1 = data1.query("Numeric in @selected_countries")

print(len(data1.index.get_level_values(0).unique()))

data1.loc[:, data1.isnull().any()].isnull().sum().sort_values()


data_filled = (
    data1.groupby("Numeric")
    .apply(interpolate_with_linear_regression)
    .droplevel(0)
)
data_filled.loc[:, data_filled.isnull().any()].isnull().sum().sort_values()

37


Series([], dtype: float64)

In [22]:
from sklearn.preprocessing import minmax_scale, robust_scale, scale

data_filled["Inflation, GDP deflator (annual %)"] = winsorize(
    data_filled["Inflation, GDP deflator (annual %)"], limits=[0.01, 0.01]
)
data_filled["Inflation, GDP deflator (annual %)"] = data_filled[
    "Inflation, GDP deflator (annual %)"
].abs()

# data_filled["External debt stocks (% of GNI)_x"] = winsorize(data_filled["External debt stocks (% of GNI)_x"], limits=[0, 0.01])
fea_posi = data_filled[
    np.intersect1d(
        data_filled.columns, VARIABLES_POST_CV_VIF.query('类型 == "正向"')["Variables"]
    )
]
fea_nega = data_filled[
    np.intersect1d(
        data_filled.columns, VARIABLES_POST_CV_VIF.query('类型 == "负向"')["Variables"]
    )
]
fea_posi.loc[:] = minmax_scale(fea_posi)
fea_nega.loc[:] = minmax_scale(-fea_nega)
scaled_data = (
    pd.merge(
        fea_posi,
        fea_nega,
        how="outer",
        left_index=True,
        right_index=True,
    ).query("Year>=@years[0]")
    .reset_index()
    .merge(
        country[["Numeric", "CountryName_CN"]], on="Numeric"
    )
    .set_index(["CountryName_CN", "Numeric", "Year"])
)

scaled_data.to_excel(route / "scaled_data.xlsx", index=True)
# Check whether all the data had been filled
null2 = scaled_data[scaled_data.columns[scaled_data.isnull().any()]].isnull().sum()
null2.sort_values()

Series([], dtype: float64)

In [23]:
def calculate_critic(current_year, scaled_data):
    d = (1 - scaled_data.corr().abs()).sum() * scaled_data.std()
    criticweight = d / d.sum()

    df_critic = (
        pd.DataFrame((scaled_data * criticweight * 100).sum(axis=1), columns=["index"])
        .reset_index()
        .sort_values(by=["Year", "index"], ascending=False)
    )  # [["Year", "CountryName_CN"]]
    df_critic["rank"] = (
        df_critic.groupby("Year")["index"].rank(ascending=False).astype("int64")
    )
    df_critic_current_year = (
        df_critic.query("Year == @current_year")[["CountryName_CN", "index", "rank"]]
        .set_index("CountryName_CN")
        .rename(columns={"index": "CRITIC", "rank": "CRITIC rank"})
    )
    return df_critic_current_year


# def calculate_entropy_weight(scaled_data):
#     m = scaled_data.shape[0]
#     pij = scaled_data / scaled_data.sum(axis=0)
#     test = np.nan_to_num(pij * np.log(pij))
#     ej = -1 / np.log(m) * test.sum(axis=0)
#     wi_entropy = (1 - ej) / np.sum(1 - ej)
#     df_entropy = (
#         pd.DataFrame((scaled_data * wi_entropy * 100).sum(axis=1), columns=["index"])
#         .reset_index()
#         .sort_values(by=["Year", "index"], ascending=False)
#     )
#     df_entropy["rank"] = (
#         df_entropy.groupby("Year")["index"].rank(ascending=False).astype("int64")
#     )
#     df_entropy_current_year = (
#         df_entropy.query("Year == @current_year")[["CountryName_CN", "index", "rank"]]
#         .set_index("CountryName_CN")
#         .rename(columns={"index": "Entropy", "rank": "Entropy rank"})
#     )
#     return df_entropy_current_year


current_year = years[-1]
index_counts = VARIABLES_POST_CV_VIF.groupby("一级指标")["Variables"].count()
level_1_index_count = len(VARIABLES_POST_CV_VIF["一级指标"].unique())
VARIABLES_POST_CV_VIF["weight"] = VARIABLES_POST_CV_VIF.apply(
    lambda row: 1 / index_counts[row["一级指标"]] / level_1_index_count, axis=1
)

equalweight = VARIABLES_POST_CV_VIF.set_index("Variables")["weight"]
df_equal = (
    pd.DataFrame((scaled_data * equalweight * 100).sum(axis=1), columns=["index"])
    .reset_index()
    .sort_values(by=["Year", "index"], ascending=False)
)  # [["Year", "CountryName_CN"]]
df_equal["rank"] = (
    df_equal.groupby("Year")["index"].rank(ascending=False).astype("int64")
)
df_equal = (
    df_equal.query("Year == @current_year")[["CountryName_CN", "index", "rank"]]
    .set_index("CountryName_CN")
    .rename(columns={"index": "Equal", "rank": "Equal rank"})
)
###averageweight
averageweight = 1 / len(scaled_data.columns)

df_averageweight = (
    pd.DataFrame((scaled_data * averageweight * 100).sum(axis=1), columns=["index"])
    .reset_index()
    .sort_values(by=["Year", "index"], ascending=False)
)
df_averageweight["rank"] = (
    df_averageweight.groupby("Year")["index"].rank(ascending=False).astype("int64")
)
df_average = (
    df_averageweight.query("Year == @current_year")[["CountryName_CN", "index", "rank"]]
    .set_index("CountryName_CN")
    .rename(columns={"index": "Average", "rank": "Average rank"})
)
###critic weight
d = (1 - scaled_data.corr().abs()).sum() * scaled_data.std()
criticweight = d / d.sum()

df_critic = (
    pd.DataFrame((scaled_data * criticweight * 100).sum(axis=1), columns=["index"])
    .reset_index()
    .sort_values(by=["Year", "index"], ascending=False)
)  # [["Year", "CountryName_CN"]]
df_critic["rank"] = (
    df_critic.groupby("Year")["index"].rank(ascending=False).astype("int64")
)
df_critic_current_year = (
    df_critic.query("Year == @current_year")[["CountryName_CN", "index", "rank"]]
    .set_index("CountryName_CN")
    .rename(columns={"index": "CRITIC", "rank": "CRITIC rank"})
)
(
    calculate_critic(current_year, scaled_data)
    .merge(df_equal, left_index=True, right_index=True)
    .merge(df_average, left_index=True, right_index=True)
    # .merge(calculate_critic(current_year, robust_scaled_data), left_index=True, right_index=True)
    # .merge(calculate_entropy_weight(scaled_data), left_index=True, right_index=True)
).round(2).to_excel(route / "sensitivity.xlsx", index=True)

In [24]:
variables_en_to_cn = {
    record["Variables"]: record["三级指标"]
    for record in VARIABLES.query("变量类型 == '指标体系'")[["三级指标", "Variables"]].to_dict(
        "records"
    )
}

In [25]:
", ".join("`" + VARIABLES.query("变量类型 == '指标体系'")["Variables"] + "`")

'`GDP per capita growth (annual %)_x`, `Merchandise exports (% of GDP)`, `Final consumption expenditure (% of GDP)`, `Inflation, GDP deflator (annual %)`, `Agriculture, forestry, and fishing, value added per worker (constant 2015 US$)_x`, `Industry (including construction), value added per worker (constant 2015 US$)_x`, `Services, value added per worker (constant 2015 US$)_x`, `Current account balance, percent of GDP (Percent of GDP)(IMF)`, `Net official development assistance and official aid received (current US$)(% of GNI)`, `Employment to population ratio, 15+, total (%) (modeled ILO estimate)`, `gini`, `Proportion of seats held by women in national parliaments (%)_x`, `Prevalence of undernourishment (percent) (3-year average)`, `Life expectancy at birth, total (years)_pct_change`, `Labor force(% of total population)`, `Population growth (annual %)`, `Mean years schooling`, `Scientific and technical journal articles`, `Prevalence of HIV, total (% of population ages 15-49)_x`, `Dome

In [26]:
weight = criticweight.to_frame()
weight.index = [variables_en_to_cn[i] for i in weight.index]
(
    weight.merge(
        VARIABLES.query("变量类型 == '指标体系'"), how="right", right_on="三级指标", left_index=True
    )[[*"一级指标	二级指标	三级指标 类型".split(), 0]]
    .dropna()
    .set_index("一级指标	二级指标	三级指标 类型".split())
    .round(4)
    .to_excel(route / "critic_weight.xlsx")
)

In [27]:
df = (
    calculate_critic(current_year, scaled_data)
    .merge(df_equal, left_index=True, right_index=True)
    .merge(df_average, left_index=True, right_index=True)
    # .merge(calculate_critic(current_year, robust_scaled_data), left_index=True, right_index=True)
    # .merge(calculate_entropy_weight(scaled_data), left_index=True, right_index=True)
)
(df["Average"].mean() - 54.18) / 54.18

-0.12297200505070262

In [28]:
d = pd.DataFrame(
    (
        df_critic.query("Year==@years[0] | Year==@years[-1]")
        .merge(df_final[["Region"]], on=["Numeric", "Year"], how="left")
        .set_index(["CountryName_CN", "Region"])
        .groupby(["Numeric"])["index"]
        .diff(-1)
        .sort_values(ascending=False)
        .dropna()
    )
)
d

Unnamed: 0_level_0,Unnamed: 1_level_0,index
CountryName_CN,Region,Unnamed: 2_level_1
塞拉利昂,W,12.477397
南非,S,11.307903
埃塞俄比亚,E,10.425483
加蓬,C,8.915627
冈比亚,W,8.176882
安哥拉,S,7.976072
喀麦隆,C,7.840843
多哥,W,7.589068
卢旺达,E,7.508206
纳米比亚,S,6.811482


In [29]:
df_critic.pivot(index="CountryName_CN", columns="Year", values="index").sort_values(
    years[19], ascending=False
).round(1).to_excel(route / "critic_index.xlsx")

In [30]:
df_merge = (
    df_critic_current_year.merge(df_equal, left_index=True, right_index=True)
    .merge(df_average, left_index=True, right_index=True)
    .round(2)
)

df_merge["dif1"] = df_merge["Equal"] - df_merge["CRITIC"]
df_merge["dif2"] = df_merge["Average"] - df_merge["CRITIC"]
df_merge.sort_values("dif2", ascending=False)

print(
    f'percent change between equal and CRITIC {(df_merge["Equal"].mean() - df_merge["CRITIC"].mean()) / df_merge["CRITIC"].mean():%}'
)

percent change between equal and CRITIC -4.676747%


###计算障碍因子
tmp = criticweight * (1 - scaled_data)

tmp["sum"] = tmp.sum(axis=1)
for col in tmp:
    tmp[col] /= tmp["sum"]
obstacle = tmp.drop("sum", axis=1) * 100

obstacle = (
        obstacle.reset_index()
        .melt(id_vars=["Alpha-3 code", "CountryName_CN", "Numeric", "Year", "Region"])
        .rename(columns={"variable": "Variables"})
        .merge(VARIABLES[["Variables", "Variablecode"]], how="left", on=["Variables"])
    .groupby(["Numeric", "Year"])
        .apply(lambda group: group.sort_values(by=["value"], ascending=False).head(3))
    .reset_index(drop=True)
    .drop(columns=["Alpha-3 code", "Numeric", "Variables"])
    .query("Year==2005 | Year==2010 | Year==2015 | Year ==2019")
)

次数 = (
    pd.DataFrame(obstacle.groupby(["Year"])[["Variablecode"]].value_counts())
    .reset_index()
    .merge(VARIABLES[["Variables", "Variablecode", "三级指标简写"]], how="left", on=["Variablecode"])
)

#obstacle.query('Year==2005 & Region=="C"')
次数2 = (
    pd.DataFrame(obstacle.groupby(["Region"])[["Variablecode"]].value_counts())
    .reset_index()
    .merge(VARIABLES[["Variables", "Variablecode", "三级指标简写"]], how="left", on=["Variablecode"])
)
次数2.query('Region=="C"')#[["Variablecode"]].value_counts().sum()

"、".join(次数2.query('Region=="C"')["三级指标简写"])
次数3 = 次数2.query('Region=="C"')
">".join(次数3["Variablecode"] + "（" + 次数3[0].astype(str) + "次）")

import docx
# open an existing document
doc = docx.Document()

# add a table to the end and create a reference variable
# extra row is so we can add the header row
number_of_countries = len(obstacle["CountryName_CN"].unique())
number_of_years = len(obstacle["Year"].unique())

t = doc.add_table(number_of_countries * 2 + 1, number_of_years * 3 + 1)

# add the header rows.
for j, year in enumerate(sorted(obstacle.Year.unique())):
    #t.cell(0, j * 3 + 1).merge(t.cell(0, j * 3 + 2))
    #t.cell(0, j * 3 + 1).merge(t.cell(0, j * 3 + 3))
    t.cell(0, j * 3 + 1).text = f"{year:.0f}"

# add the rest of the data frame
for i, CountryName_CN in enumerate(sorted(obstacle.CountryName_CN.unique())):
    t.cell(i * 2 + 1, 0).merge(t.cell(i * 2 + 2, 0))
    t.cell(i * 2 + 1, 0).text = CountryName_CN
    for j, year in enumerate(sorted(obstacle.Year.unique())):
        k = 0
        for _, row in obstacle.query(f"CountryName_CN == '{CountryName_CN}' & Year == {year:.0f}").iterrows():
            t.cell(i * 2 + 1, j * 3 + 1 + k).text = row["Variablecode"]
            t.cell(i * 2 + 2, j * 3 + 1 + k).text = "{:.2f}".format(row["value"])
            k += 1

#save the doc

#doc.save(r'C:\Users\thinkpad\OneDrive\PhD Dissertation\Regression Results\obstacle.docx')

In [31]:
# 将各个一级指标的变量分开
variables_by_level1 = {
    level1: VARIABLES_POST_CV_VIF.query(f'一级指标 == "{level1}"')["Variables"].to_list()
    for level1 in "经济 社会 资源 生态".split()
}
# 一级指标的中英文对照
level1_cn2en = {"经济": "Economy", "社会": "Society", "资源": "Resource", "生态": "Ecology"}
index_equal = pd.concat(
    [
        scaled_data[variables_by_level1[level1]]
        .mean(axis=1)
        .rename("SDI_" + level1_cn2en[level1] + "_Equal")
        for level1, variables in variables_by_level1.items()
    ],
    axis=1,
)
index_equal["SDI_Equal"] = index_equal.mean(axis=1)
index_equal["SDI_Average"] = scaled_data[
    [v for variables in variables_by_level1.values() for v in variables]
].mean(axis=1)
index_data = pd.concat(
    [
        (
            (scaled_data * criticweight[variables_by_level1[level1]]).sum(axis=1)
            / criticweight[variables_by_level1[level1]].sum()
        ).rename("SDI_" + level1_cn2en[level1])
        for level1, variables in variables_by_level1.items()
    ],
    axis=1,
)
index_data["SDI"] = (scaled_data * criticweight).sum(axis=1)
index_data = index_data.merge(index_equal, left_index=True, right_index=True)
index_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SDI_Economy,SDI_Society,SDI_Resource,SDI_Ecology,SDI,SDI_Economy_Equal,SDI_Society_Equal,SDI_Resource_Equal,SDI_Ecology_Equal,SDI_Equal,SDI_Average
CountryName_CN,Numeric,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
安哥拉,24,2001,0.337159,0.331649,0.440433,0.432934,0.396390,0.340888,0.313745,0.447757,0.445980,0.387093,0.391517
安哥拉,24,2002,0.313340,0.333582,0.462697,0.435028,0.400819,0.341541,0.317270,0.471757,0.448665,0.394808,0.400710
安哥拉,24,2003,0.295762,0.360326,0.467275,0.434672,0.406192,0.314906,0.353754,0.476343,0.448245,0.398312,0.406110
安哥拉,24,2004,0.442058,0.354592,0.448782,0.433687,0.419866,0.444158,0.336629,0.455394,0.447220,0.420850,0.420739
安哥拉,24,2005,0.457676,0.352866,0.446005,0.436732,0.421775,0.463711,0.334710,0.451548,0.451306,0.425319,0.423881
...,...,...,...,...,...,...,...,...,...,...,...,...,...
赞比亚,894,2016,0.366628,0.377141,0.439554,0.705805,0.495396,0.376784,0.364731,0.451644,0.686402,0.469890,0.467982
赞比亚,894,2017,0.375555,0.387937,0.427135,0.706605,0.495844,0.384116,0.372603,0.438506,0.687029,0.470564,0.467487
赞比亚,894,2018,0.388113,0.395561,0.415889,0.709499,0.497025,0.393998,0.378848,0.426503,0.689952,0.472325,0.467965
赞比亚,894,2019,0.376020,0.409032,0.430283,0.711352,0.503541,0.383885,0.392349,0.443946,0.692204,0.478096,0.475411


In [32]:
index_data_filled0 = index_data.reset_index().merge(
    data_filled,
    how="outer",
    on=["Year", "Numeric", "CountryName_CN"],
)

dfgeo = pd.read_excel(
    Path.home()
    / "OneDrive"
    / "Rawdata"
    / "Country Classification"
    / "UN Classification_Natural resources_Geography.xlsx",
    sheet_name="Sheet1",
    na_values="..",
    usecols="D, E",
)

index_data_filled = index_data_filled0.merge(dfgeo, how="left", on="Numeric").set_index(
    ["Alpha-3 code", "CountryName_CN", "Numeric", "Year"]
)  # .query("Year==2020").sort_values(["index"],ascending=False)

index_data_filled.to_excel(route / "index_data.xlsx", index=True)
index_data_filled.to_csv(route / "index_data.csv", index=True)

In [33]:
index_data_filled["SDI_Society"].describe()

count    740.000000
mean       0.438641
std        0.085332
min        0.267529
25%        0.382464
50%        0.422908
75%        0.480757
max        0.766186
Name: SDI_Society, dtype: float64

In [34]:
# for name, df in zip(("SDI", "SDI_Economy", "SDI_Society", "SDI_Resource", "SDI_Ecology"), df_index):
#     df.pivot(index="CountryName_CN", columns="Year", values=name).to_excel(pathlib.Path.home() / "Downloads" / f"{name}.xlsx")

In [35]:
d1 = index_data_filled[
    ["SDI", "SDI_Economy", "SDI_Society", "SDI_Resource", "SDI_Ecology"]
]
d1.groupby("Year").mean().round(2)

Unnamed: 0_level_0,SDI,SDI_Economy,SDI_Society,SDI_Resource,SDI_Ecology
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2001,0.43,0.38,0.38,0.5,0.44
2002,0.44,0.38,0.39,0.5,0.44
2003,0.44,0.38,0.4,0.49,0.45
2004,0.44,0.39,0.4,0.5,0.45
2005,0.45,0.39,0.4,0.5,0.46
2006,0.45,0.39,0.41,0.49,0.46
2007,0.45,0.4,0.41,0.49,0.46
2008,0.45,0.39,0.42,0.5,0.47
2009,0.46,0.38,0.43,0.5,0.47
2010,0.46,0.39,0.43,0.5,0.47


In [36]:
##计算五个次区域的差距
dif_region = (
    index_data_filled[
        [
            "SDI",
            "SDI_Economy",
            "SDI_Society",
            "SDI_Resource",
            "SDI_Ecology",
            "Region",
        ]
    ]
    .groupby(["Region", "Year"])
    .mean()
    .query("Year==@years[0] | Year==@years[-1]")
    .groupby(["Region"])
    .diff()
)

dif_region.query("Year==@years[-1]").sort_values(by=["SDI_Resource"], ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,SDI,SDI_Economy,SDI_Society,SDI_Resource,SDI_Ecology
Region,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
E,2020,0.057488,-0.018042,0.126287,0.042391,0.052144
S,2020,0.059118,0.008709,0.15751,0.024923,0.036968
W,2020,0.061023,0.032643,0.122499,0.022356,0.062747
C,2020,0.050926,-0.004839,0.086649,0.002974,0.095334


In [37]:
##计算区域增长率
def index_region_change():
    for idx, row in (
        index_data_filled.groupby(["Region", "Year"])
        .mean()
        .reset_index()[
            [
                "Year",
                "SDI",
                "SDI_Economy",
                "SDI_Society",
                "SDI_Resource",
                "SDI_Ecology",
                "Region",
            ]
        ]
        .query("Year == @years[0] | Year == @years[-1]")
        .groupby(["Region"])
    ):
        t = tuple(row["SDI"])
        name = row.iloc[0].Region
        yield {"Region": name, "change": ((t[1] / t[0]) ** (1 / 15) - 1) * 100}


index_region_change = pd.DataFrame(index_region_change()).sort_values(
    by=["change"], ascending=False
)
index_region_change

Unnamed: 0,Region,change
3,W,0.892434
1,E,0.823447
2,S,0.822173
0,C,0.780589


In [38]:
index_data_filled[
    ["SDI", "SDI_Economy", "SDI_Society", "SDI_Resource", "SDI_Ecology", "Region"]
].query("Year==@years[-1]").groupby("Region").mean()

Unnamed: 0_level_0,SDI,SDI_Economy,SDI_Society,SDI_Resource,SDI_Ecology
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C,0.46259,0.391615,0.451497,0.44393,0.523036
E,0.496669,0.331083,0.48623,0.547722,0.532212
S,0.511498,0.42504,0.542213,0.528702,0.510197
W,0.489078,0.383366,0.512014,0.532102,0.477646


# 计算各分区中有多少个国家
index_binning_df = pd.DataFrame()
for index in ["SDI", "SDI_Economy", "SDI_Society", "SDI_Environment", "SDI_Resource"]:
    for year in (2005, 2019):
        index_binning_df[f"{index}_{year}"] = index_data_filled.query(
            f"Year == {year}"
        )[index].value_counts(bins=[0, 0.45, 0.55, 0.65, 0.75, 1])
c = index_binning_df.T.rename(
    columns={
        pd.Interval(0.45, 0.55, closed="right"): "较低水平",
        pd.Interval(0.55, 0.65, closed="right"): "中等水平",
        pd.Interval(-0.001, 0.45, closed="right"): "低水平",
        pd.Interval(0.65, 0.75, closed="right"): "较高水平",
        pd.Interval(0.75, 1.0, closed="right"): "高水平",
    }
)[["低水平", "较低水平", "中等水平", "较高水平", "高水平"]]
c

In [39]:
# 计算国家变化率
def index_country_change(index):
    for idx, row in (
        index_data_filled.groupby(["CountryName_CN", "Year", "Region"])
        .mean()
        .reset_index()[
            [
                "Year",
                index,
                "CountryName_CN",
                "Region",
            ]
        ]
        .query("Year == @years[0] | Year == @years[-1]")
        .groupby(["CountryName_CN"])
    ):
        t = tuple(row[index])
        name = row.iloc[0].CountryName_CN
        region = row.iloc[0].Region
        yield {
            "CountryName_CN": name,
            "Region": region,
            "change": ((t[1] / t[0]) ** (1 / 19) - 1) * 100,
        }


index_country_change = pd.DataFrame(index_country_change("SDI_Ecology")).sort_values(
    by=["change"], ascending=False
)

index_country_change.query("change<1 & change>0").value_counts().sum()

23

In [40]:
index_data_filled.query("Year==2020").sort_values("SDI_Resource")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,SDI_Economy,SDI_Society,SDI_Resource,SDI_Ecology,SDI,SDI_Economy_Equal,SDI_Society_Equal,SDI_Resource_Equal,SDI_Ecology_Equal,SDI_Equal,...,Biomass and waste electricity net generation (million metric tons of oil equivalent)_per_capita,Forest area (% of land area)_x,Wetland area（% of land area),Grassland area（% of land area),"Terrestrial barren land|1000 HA|ECCCT|Terrestrial Barren Land|Environment, Climate Change, Climate Indicators, Land Cover Accounts, Terrestrial Barren Land|Climate neutral(% of land area)",CO2 emissions (metric tons per capita)_x,Lead exposure,Terrestrial biome protection (global weights),Total area of all Marine Protected Areas in a country(% of land area),Region
Alpha-3 code,CountryName_CN,Numeric,Year,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
COD,刚果（金）,180,2020,0.380921,0.386403,0.370369,0.551908,0.431592,0.387777,0.354373,0.392844,0.544364,0.41984,...,6.428482e-07,55.647313,2.190472,1.433078,0.001679,0.032585,325.602481,13.558196,0.0,C
LBR,利比里亚,430,2020,0.365857,0.433726,0.40058,0.484499,0.429573,0.374949,0.405087,0.420204,0.490293,0.422633,...,0.0,79.084718,0.030604,0.005606,0.0,0.231839,315.542896,9.476578,0.260838,W
MLI,马里,466,2020,0.397958,0.518995,0.418593,0.353775,0.420586,0.398905,0.468933,0.426932,0.377081,0.417963,...,6.249778e-06,10.896664,0.255045,11.750971,5.641164,0.195566,375.077274,10.539296,15.469962,W
ZWE,津巴布韦,716,2020,0.176219,0.431628,0.423101,0.565539,0.433541,0.220841,0.41291,0.424224,0.547409,0.401346,...,1.396744e-05,45.093912,0.067525,3.339239,0.023361,0.530484,381.039487,16.839024,15.469962,S
CAF,中非,140,2020,0.349233,0.331801,0.4249,0.513139,0.418095,0.382041,0.310967,0.458295,0.493927,0.411307,...,0.0,35.800507,0.03173,0.010931,0.000121,0.044282,529.244977,16.494695,15.469962,C
ZMB,赞比亚,894,2020,0.402119,0.413212,0.439281,0.71305,0.511558,0.404665,0.390483,0.453654,0.694196,0.485749,...,9.146031e-06,60.283337,3.977095,1.132581,0.000586,0.401903,238.986269,17.0,15.469962,S
GAB,加蓬,266,2020,0.492081,0.607328,0.443835,0.670596,0.560853,0.473754,0.561742,0.479752,0.649433,0.54117,...,1.060745e-05,91.320681,0.067127,1.436875,0.0,2.333274,146.069702,16.190271,20.185621,C
GNB,几内亚（比绍）,624,2020,0.363638,0.405276,0.448534,0.648988,0.487199,0.374318,0.374833,0.453644,0.630186,0.458245,...,0.0,70.412873,0.127023,0.22145,0.009536,0.163208,451.336567,17.0,32.335603,W
TCD,乍得,148,2020,0.359766,0.375049,0.467599,0.377318,0.401503,0.369974,0.344293,0.473141,0.3887,0.394027,...,1.328206e-06,3.425191,0.557403,10.048565,5.003641,0.094228,435.031019,13.156876,15.469962,C
COG,刚果（布）,178,2020,0.426852,0.434851,0.475231,0.564082,0.485578,0.4191,0.41539,0.498969,0.544564,0.469506,...,0.0,64.263543,0.249242,2.127858,0.000127,1.254592,212.011909,16.711669,0.0,C
