In [109]:
import pandas as pd
import requests
from io import StringIO
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import plotly.graph_objects as go

# 1. Data Collection

# pd.read_csv() cannot read data directly from a URL, 
# so I use the requests library to download the data
url_Global_Electricity_Statistics = 'https://github.com/xiaothua/dataset/raw/main/Global%20Electricity%20Statistics.csv'
response = requests.get(url_Global_Electricity_Statistics)
csv_GES = StringIO(response.text)
data_GES = pd.read_csv(csv_GES)

url_GDP = 'https://github.com/xiaothua/dataset/raw/main/gdp.csv'
response = requests.get(url_GDP)
csv_GDP = StringIO(response.text)
data_GDP = pd.read_csv(csv_GDP)

url_GDP_growth = 'https://github.com/xiaothua/dataset/raw/main/gdp_growth.csv'
response = requests.get(url_GDP_growth)
csv_GDP_growth = StringIO(response.text)
data_GDP_growth = pd.read_csv(csv_GDP_growth)

url_GDP_per_capita = 'https://github.com/xiaothua/dataset/raw/main/gdp_per_capita.csv'
response = requests.get(url_GDP_per_capita)
csv_GDP_per_capita = StringIO(response.text)
data_GDP_per_capita = pd.read_csv(csv_GDP_per_capita)

url_GDP_per_capita_growth = 'https://github.com/xiaothua/dataset/raw/main/gdp_per_capita_growth.csv'
response = requests.get(url_GDP_per_capita_growth)
csv_GDP_per_capita_growth = StringIO(response.text)
data_GDP_per_capita_growth = pd.read_csv(csv_GDP_per_capita_growth)

url_PPP = 'https://github.com/xiaothua/dataset/raw/main/gdp_ppp.csv'
response = requests.get(url_PPP)
csv_PPP = StringIO(response.text)
data_PPP = pd.read_csv(csv_PPP)

url_PPP_per_capita = 'https://github.com/xiaothua/dataset/raw/main/gdp_ppp_per_capita.csv'
response = requests.get(url_PPP_per_capita)
csv_PPP_per_capita = StringIO(response.text)
data_PPP_per_capita = pd.read_csv(csv_PPP_per_capita)

In [110]:
# 2. Data Cleaning and Classification

# electricity data
# Delete the columns from 1980 to 1991. Because too many new countries were added after 1991.
data_GES_cleaned = data_GES.drop(columns=[str(year) for year in range(1980, 1992)])

# Filter out the data with "ie", "NA", "--", null in the "Features" column
data_GES_cleaned.replace(['ie', 'NA', '--'], pd.NA, inplace=True)  # Replace 'ie' with NA
data_GES_cleaned = data_GES_cleaned.dropna()  # Drop rows with NA

# Check the number of unique Features for each country
features_count = data_GES_cleaned.groupby('Country')['Features'].nunique()

# Find the countries with 7 unique Features
valid_countries = features_count[features_count == 7].index
data_GES_cleaned = data_GES_cleaned[data_GES_cleaned['Country'].isin(valid_countries)]

# Remove leading and trailing whitespaces
data_GES_cleaned['Country'] = data_GES_cleaned['Country'].str.strip()
data_GES_cleaned['Features'] = data_GES_cleaned['Features'].str.strip()
data_GES_cleaned['Region'] = data_GES_cleaned['Region'].str.strip()

# Convert the data type of the columns from 1992 to 2018 to numeric
years = [str(year) for year in range(1992, 2022)]
data_GES_cleaned[years] = data_GES_cleaned[years].apply(pd.to_numeric, errors='coerce')

# GDP data
# Change the country names in the GDP data to match the country names in the electricity data
country_name_corrections = {
    'Bahamas, The': 'The Bahamas',
    'Turkey': 'Turkiye',
    'Congo, Dem. Rep.': 'Congo-Kinshasa',
    'Congo, Rep.': 'Congo-Brazzaville',
    'St. Vincent and the Grenadines': 'Saint Vincent/Grenadines',
    'St. Lucia': 'Saint Lucia',
    'Virgin Islands (U.S.)': 'U.S. Virgin Islands',
    'St. Kitts and Nevis': 'Saint Kitts and Nevis',
    'Egypt, Arab Rep.': 'Egypt',
    'Hong Kong SAR, China': 'Hong Kong',
    'Russian Federation': 'Russia',
    'Sub-Saharan Africa': 'Western Sahara',
    'Korea, Rep.': 'South Korea',
    'Iran, Islamic Rep.': 'Iran',
    'Kyrgyz Republic': 'Kyrgyzstan',
    'Syrian Arab Republic': 'Syria',
    'Yemen, Rep.': 'Yemen',
    'Brunei Darussalam': 'Brunei',
    'Macao SAR, China': 'Macau',
    'Lao PDR': 'Laos',
    'Pacific island small states': 'U.S. Pacific Islands',
    'Myanmar': 'Burma',
    'Venezuela, RB': 'Venezuela'
}

data_GDP['Country Name'] = data_GDP['Country Name'].replace(country_name_corrections)
data_GDP_growth['Country Name'] = data_GDP_growth['Country Name'].replace(country_name_corrections)
data_GDP_per_capita['Country Name'] = data_GDP_per_capita['Country Name'].replace(country_name_corrections)
data_GDP_per_capita_growth['Country Name'] = data_GDP_per_capita_growth['Country Name'].replace(country_name_corrections)
data_PPP['Country Name'] = data_PPP['Country Name'].replace(country_name_corrections)
data_PPP_per_capita['Country Name'] = data_PPP_per_capita['Country Name'].replace(country_name_corrections)

In [111]:
# 3. Variable Identification and Preprocessing

# set multi-index
data_total = data_GES_cleaned.set_index(['Country', 'Features'])

# # 假设 data_total 已经设置了 MultiIndex，且 `data_gdp_per_capita_worldbank` 包含如下列：
# # 'Entity' 对应国家名, 'Year' 对应年份, 'GDP per capita, PPP (constant 2017 international $)' 对应人均GDP数值

# # 1. 先从 data_gdp_per_capita_worldbank 中提取出相关的数据
# gdp_data = data_GDP[['Entity', 'Year', 'GDP per capita (output, multiple price benchmarks)']]

# # 2. 遍历每个国家和年份，逐步将 gdp per capita 数据添加到 data_total 中
# for country in data_total.index.get_level_values('Country').unique():
#     for year in range(1992, 2022):
#         # 从 gdp 数据中获取对应的国家和年份的人均GDP值
#         gdp_value = gdp_data[
#             (gdp_data['Entity'] == country) & (gdp_data['Year'] == year)
#         ]['GDP per capita (output, multiple price benchmarks)'].values
        
#         # 如果有对应的数据，添加到 data_total 中
#         if len(gdp_value) > 0:
#             # 在 'gdp per capita' 这个 feature 中插入值
#             data_total.loc[(country, 'gdp per capita'), str(year)] = gdp_value[0]
#             # 将 'Region' 列设置为与 'net generation' 相同的内容
#             region_value = data_total.loc[(country, 'net generation'), 'Region']
#             data_total.loc[(country, 'gdp per capita'), 'Region'] = region_value

# 假设 data_gdp 包含如下列：'Country Name' 对应国家名，年份列为1992到2021

# add gdp data to data_total
gdp_data = data_GDP.set_index('Country Name')

for country in data_total.index.get_level_values('Country').unique():
    for year in range(1992, 2022):

        try:
            gdp_value = gdp_data.loc[country, str(year)]
        except KeyError:
            gdp_value = None
        
        # if gdp_value is not None, add to data_total
        if gdp_value is not None:
    
            data_total.loc[(country, 'gdp'), str(year)] = gdp_value
            
            region_value = data_total.loc[(country, 'net generation'), 'Region'] # get region value from 'net generation'
            data_total.loc[(country, 'gdp'), 'Region'] = region_value

# add gdp growth data to data_total
gdp_growth_data = data_GDP_growth.set_index('Country Name')

for country in data_total.index.get_level_values('Country').unique():
    for year in range(1992, 2022):

        try:
            gdp_growth_value = gdp_growth_data.loc[country, str(year)]
        except KeyError:
            gdp_growth_value = None
        
        # if gdp_growth_value is not None, add to data_total
        if gdp_growth_value is not None:
    
            data_total.loc[(country, 'gdp growth'), str(year)] = gdp_growth_value
            
            region_value = data_total.loc[(country, 'net generation'), 'Region'] # get region value from 'net generation'
            data_total.loc[(country, 'gdp growth'), 'Region'] = region_value

# add gdp per capita data to data_total
gdp_per_capita_data = data_GDP_per_capita.set_index('Country Name')

for country in data_total.index.get_level_values('Country').unique():
    for year in range(1992, 2022):

        try:
            gdp_per_capita_value = gdp_per_capita_data.loc[country, str(year)]
        except KeyError:
            gdp_per_capita_value = None
        
        # if gdp_per_capita_value is not None, add to data_total
        if gdp_per_capita_value is not None:
    
            data_total.loc[(country, 'gdp per capita'), str(year)] = gdp_per_capita_value
            
            region_value = data_total.loc[(country, 'net generation'), 'Region'] # get region value from 'net generation'
            data_total.loc[(country, 'gdp per capita'), 'Region'] = region_value

# add gdp per capita growth data to data_total
gdp_per_capita_growth_data = data_GDP_per_capita_growth.set_index('Country Name')

for country in data_total.index.get_level_values('Country').unique():
    for year in range(1992, 2022):

        try:
            gdp_per_capita_growth_value = gdp_per_capita_growth_data.loc[country, str(year)]
        except KeyError:
            gdp_per_capita_growth_value = None
        
        # if gdp_per_capita_growth_value is not None, add to data_total
        if gdp_per_capita_growth_value is not None:
    
            data_total.loc[(country, 'gdp per capita growth'), str(year)] = gdp_per_capita_growth_value
            
            region_value = data_total.loc[(country, 'net generation'), 'Region'] # get region value from 'net generation'
            data_total.loc[(country, 'gdp per capita growth'), 'Region'] = region_value

# add gdp ppp data to data_total
gdp_ppp_data = data_PPP.set_index('Country Name')

for country in data_total.index.get_level_values('Country').unique():
    for year in range(1992, 2022):

        try:
            gdp_ppp_value = gdp_ppp_data.loc[country, str(year)]
        except KeyError:
            gdp_ppp_value = None
        
        # if gdp_ppp_value is not None, add to data_total
        if gdp_ppp_value is not None:
    
            data_total.loc[(country, 'gdp ppp'), str(year)] = gdp_ppp_value
            
            region_value = data_total.loc[(country, 'net generation'), 'Region'] # get region value from 'net generation'
            data_total.loc[(country, 'gdp ppp'), 'Region'] = region_value

# add gdp ppp per capita data to data_total
gdp_ppp_per_capita_data = data_PPP_per_capita.set_index('Country Name')

for country in data_total.index.get_level_values('Country').unique():
    for year in range(1992, 2022):

        try:
            gdp_ppp_per_capita_value = gdp_ppp_per_capita_data.loc[country, str(year)]
        except KeyError:
            gdp_ppp_per_capita_value = None
        
        # if gdp_ppp_per_capita_value is not None, add to data_total
        if gdp_ppp_per_capita_value is not None:
    
            data_total.loc[(country, 'gdp ppp per capita'), str(year)] = gdp_ppp_per_capita_value
            
            region_value = data_total.loc[(country, 'net generation'), 'Region'] # get region value from 'net generation'
            data_total.loc[(country, 'gdp ppp per capita'), 'Region'] = region_value

In [112]:
# keep the data with 13 features
features_count = data_total.index.get_level_values('Country').value_counts()
complete_countries = features_count[features_count == 13].index
data_total = data_total.loc[complete_countries]

# 1. 计算每个国家每个 feature 缺失值数量
missing_count = data_total.isnull().sum(axis=1)
countries_to_drop = missing_count[missing_count > 18].index.get_level_values('Country').unique()
data_total = data_total[~data_total.index.get_level_values('Country').isin(countries_to_drop)]

incomplete_countries = features_count[features_count != 13]
print(incomplete_countries)

data_total.info()

Country
Cook Islands                 7
Taiwan                       7
Montserrat                   7
Saint Pierre and Miquelon    7
Saint Helena                 7
Falkland Islands             7
Wake Island                  7
Niue                         7
Netherlands Antilles         7
North Korea                  7
Antarctica                   7
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2340 entries, ('Algeria', 'net generation') to ('Uzbekistan', 'gdp ppp per capita')
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Region  2340 non-null   object 
 1   1992    2230 non-null   float64
 2   1993    2239 non-null   float64
 3   1994    2249 non-null   float64
 4   1995    2285 non-null   float64
 5   1996    2304 non-null   float64
 6   1997    2306 non-null   float64
 7   1998    2308 non-null   float64
 8   1999    2310 non-null   float64
 9   2000    2318 non-null   float64
 10  2001    2