<a id="top"></a>
<div class="list-group" id="list-tab" role="tablist">
    <h3 style="text-align: center; background-color:rgb(18, 135, 198); font-family:newtimeroman; color: black; padding: 14px; line-height: 1; border-radius:10px"><b>Table of Contents </b></h3>
    
- I. Crawling Data and Merging Datasets
- II. Exploring and Pre-processing Data
    - [2.1. Basic information](#2.1)
        - [2.1.1. Shape of data](#2.1.1)
        - [2.1.2. Meaning of each row](#2.1.2)
        - [2.1.3. Meaning of each column](#2.1.3)
        - [2.1.4. Check duplicate](#2.1.4)
        - [2.1.5. Data type of each column (standardize if necessary)](#2.1.5)
        - [2.1.6. Missing values of each column](#2.1.6)
    - [2.2 Data distribution](#2.2)
        - [2.2.1. Numerical columns](#2.2.1)
        - [2.2.2. Categorical columns](#2.2.2)
    - [2.3. Abnormal values and outliers](#2.3)
    - [2.4. Correlation between variables](#2.4)
- III. Asking meaningful questions
    - [3.1. Question 1](#3.1)
        - [3.1.1. Preprocessing](#3.1.1)
        - [3.1.2. Analysis](#3.1.2)
        - [3.1.3. Conclusion](#3.1.3)
    - [3.2. Question 2](#3.2)
        - [3.2.1. Preprocessing](#3.2.1)
        - [3.2.2. Analysis](#3.2.2)
        - [3.2.3. Conclusion](#3.2.3)
    - [3.3. Question 3](#3.3)
        - [3.3.1. Preprocessing](#3.3.1)
        - [3.3.2. Analysis](#3.3.2)
        - [3.3.3. Conclusion](#3.3.3)

    - [3.4. Question 4](#3.4)
        - [3.4.1. Preprocessing](#3.4.1)
        - [3.4.2. Analysis](#3.4.2)
        - [3.4.3. Conclusion](#3.4.3)
    - [3.5. Question 5](#3.5)
        - [3.5.1. Preprocessing](#3.5.1)
        - [3.5.2. Analysis](#3.5.2)
        - [3.5.3. Conclusion](#3.5.3)
    - [3.6. Question 6](#3.6)
        - [3.6.1. Preprocessing](#3.6.1)
        - [3.6.2. Analysis](#3.6.2)
        - [3.6.3. Conclusion](#3.6.3)
- IV. References

In [35]:
from IPython.display import display
import requests
import numpy as np
import pandas as pd
from typing import List
import os

import urllib.robotparser

<a id="top"></a>
<div class="list-group" id="list-tab" role="tablist">
    <h3 style="text-align: center; background-color:rgb(2, 100, 153); font-family:newtimeroman; color: black; padding: 14px; line-height: 1; border-radius:10px"><b>1. Crawling Data and Merging Datasets </b></h3>

<a class="anchor" id="1.1."></a>
<h4 style="text-align: left; background-color:rgb(36, 138, 193); font-family:newtimeroman; color: black; padding: 14px; line-height: 1; border-radius:10px"><b>1.1. Helper functions </b></h4>

In [36]:
# Root of the World Bank API (v2); endpoint paths are appended to this prefix.
BASE_URL = 'http://api.worldbank.org/v2/'


# Single source of truth: World Bank indicator code -> column label used in the
# resulting DataFrames. Insertion order matters: it defines both the order in
# which indicators are requested and the order of the DataFrame columns.
features_mapping = {
    "SP.POP.TOTL": "Total Population",
    "SP.POP.TOTL.FE.IN": "Female Population",
    "SP.POP.TOTL.MA.IN": "Male Population",
    "SP.DYN.CBRT.IN": "Birth Rate",
    "SP.DYN.CDRT.IN": "Death Rate",
    "SE.COM.DURS": "Compulsory Education Dur.",
    "SL.IND.EMPL.ZS": "Employment in Industry(%)",
    "SL.AGR.EMPL.ZS": "Employment in Agriculture(%)",
    "SL.AGR.EMPL.FE.ZS": "Female Employment in Agriculture(%)",
    "SL.IND.EMPL.FE.ZS": "Female Employment in Industry(%)",
    "SL.UEM.TOTL.ZS": "Unemployment(%)",
    "NY.GDP.MKTP.CD": "GDP in USD",
    # NOTE(review): the ".ZG" suffix suggests this series is a growth rate, not
    # a level; the label is kept unchanged for downstream compatibility - confirm.
    "NY.ADJ.NNTY.PC.KD.ZG": "National Income per Capita",
    "NY.GSR.NFCY.CD": "Net income from Abroad",
    "NV.AGR.TOTL.CD": "Agriculture value added(in USD)",
    "EG.USE.ELEC.KH.PC": "Electric Power Consumption(kWH per capita)",
    "EG.FEC.RNEW.ZS": "Renewable Energy Consumption (%)",
    "EG.USE.COMM.FO.ZS": "Fossil Fuel Consumption (%)",
    "SP.DYN.LE00.MA.IN": "Male life expectancy",
    # The trailing space is part of the existing column name; it is preserved
    # because later cells may reference the column by this exact label.
    "SP.DYN.LE00.FE.IN": "Female life expectancy ",
    "SE.PRM.ENRR": "School enrollment, primary",
    "SE.TER.ENRR": "School enrollment, tertiary",
    "SE.PRM.CMPT.ZS": "Primary completion rate",
    "SE.ADT.1524.LT.ZS": "Literacy rate"
}


# Indicator codes in request order. Derived from `features_mapping` so the list
# of requested series can never drift out of sync with the column labels.
INDICATOR_CODES = list(features_mapping)

In [37]:
def loadData(country_code: str, format: str = "json", per_page: int = 100, year_interval: str = '2015:2022') -> List:
    """Fetch every indicator in ``INDICATOR_CODES`` for one country.

    Args:
        country_code: Country code inserted into the request path
            (the notebook passes ISO2 codes such as ``"VN"``).
        format: Accepted for interface compatibility only. JSON is always
            requested because the parsing below is JSON-specific.
        per_page: Page size sent to the API. Only the first page is read, so
            it should be at least the number of years requested.
        year_interval: Inclusive year range written as ``"start:end"``.

    Returns:
        A list of lists: one list of floats per indicator (in
        ``INDICATOR_CODES`` order) followed by a final list holding the years
        in descending order. Every inner list has the same length, and the
        value at position ``i`` belongs to the year at position ``i``. Years
        without a usable observation are ``np.nan``.
    """
    bounds = year_interval.split(":")
    year_start = int(bounds[0])
    year_end = int(bounds[1])
    years_desc = list(range(year_end, year_start - 1, -1))

    result_data = []
    for indicator in INDICATOR_CODES:
        url = f"{BASE_URL}country/{country_code}/indicator/{indicator}?format=json&per_page={per_page}&date={year_interval}"
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, timeout=30)
        print(url)

        values_by_year = {}
        if response.status_code == 200:
            payload = response.json()  # parse the body once, not per access
            rows = None
            if isinstance(payload, list) and payload:
                header = payload[0]
                # API-level errors come back with HTTP 200 and a "message" key.
                is_error = isinstance(header, dict) and "message" in header
                if not is_error and len(payload) > 1:
                    rows = payload[1]  # may be None when there is no data
            for item in rows or []:
                try:
                    year = int(item["date"])
                except (KeyError, TypeError, ValueError):
                    continue  # row without a usable year cannot be aligned
                value = item.get("value")
                values_by_year[year] = float(value) if value is not None else np.nan

        # Align by year instead of by position so a short or partial response
        # can never shift values onto the wrong year or yield a ragged column.
        result_data.append([values_by_year.get(year, np.nan) for year in years_desc])

    # Add the year range in descending order as integers
    result_data.append(years_desc)

    return result_data


In [38]:
def loadDataCountry(country_code: str, format: str = "json", per_page: int = 100, year_interval: str = '2015:2022', is_display: bool = False) -> pd.DataFrame:
    """Build a per-year DataFrame of all indicators for one country.

    Args:
        country_code: ISO2 code; must appear in the ``iso2Code`` column of
            ``data/all_countries.csv``.
        format, per_page, year_interval: Forwarded unchanged to ``loadData``.
        is_display: When true, show the first rows of the result.

    Returns:
        A DataFrame with one column per entry of ``features_mapping`` (in
        mapping order), a ``Year`` column and a ``Country`` column holding the
        country name. All numeric columns, including ``Year``, are floats.

    Raises:
        KeyError: If ``country_code`` is not in the country list.
    """
    df_country_code = pd.read_csv("data/all_countries.csv")
    # pandas parses the literal text "NA" as missing, which would drop the
    # iso2Code "NA"; restore it so that code stays a usable key.
    df_country_code = df_country_code.fillna("NA")
    COUNTRIES_MAPPING = dict(zip(df_country_code["iso2Code"], df_country_code["name"]))

    col_list = list(features_mapping.values()) + ['Year']
    country_name = COUNTRIES_MAPPING[country_code]
    print(f"Loading data for {country_name}")

    dataLst = loadData(country_code=country_code, format=format, per_page=per_page, year_interval=year_interval)

    # Ensure all lists have the same length: shorter ones are right-padded
    # with NaN. Padding cannot know which years are missing, so it only
    # guarantees a rectangular table, not per-year alignment.
    max_len = max(len(arr) for arr in dataLst)
    dataLst_padded = [
        np.pad(
            np.array([x if x is not None else np.nan for x in arr], dtype=float),
            (0, max_len - len(arr)),
            constant_values=np.nan
        )
        for arr in dataLst
    ]

    # Build the frame from the padded arrays; stacking the raw lists raises
    # as soon as one indicator returned fewer values than the others.
    df = pd.DataFrame(np.column_stack(dataLst_padded), columns=col_list)
    df['Country'] = country_name

    if is_display:
        display(df.head())

    return df


In [39]:
def replacer(s, newstring, index, nofail=False):
    """Replace the single character at ``index`` in ``s`` with ``newstring``.

    With ``nofail`` left false, an ``index`` that is not a valid position in
    ``s`` raises ``ValueError``. With ``nofail`` true, a negative index
    prepends ``newstring`` and an index beyond the end appends it.
    """
    if not nofail:
        if index not in range(len(s)):
            raise ValueError("index outside given string")

    if index < 0:
        # Left of the string: put the new text in front.
        result = newstring + s
    elif index > len(s):
        # Right of the string: put the new text at the back.
        result = s + newstring
    else:
        # Drop exactly one character at `index` and splice the new text in.
        prefix = s[:index]
        suffix = s[index + 1:]
        result = prefix + newstring + suffix
    return result

In [40]:
def loadDataContinent(format: str = "json", per_page: int = 100, year_interval: str = '2015:2022', is_display: bool = False) -> pd.DataFrame:
    """Load indicator data for every country in ``data/asian_countries.csv``.

    Args:
        format, per_page, year_interval: Forwarded unchanged to
            ``loadDataCountry`` for each country.
        is_display: When true, show the first rows of the combined frame.
            Per-country previews are always suppressed.

    Returns:
        The per-country frames stacked row-wise in file order. Each frame
        keeps its own index, so index values repeat across countries. An
        empty DataFrame is returned when the file lists no countries.
    """
    df_country_code = pd.read_csv("data/asian_countries.csv")
    lst_country_codes = df_country_code["iso2Code"].to_list()

    # Collect first and concatenate once: concatenating inside the loop copies
    # the accumulated frame on every iteration (quadratic in total rows).
    frames = [
        loadDataCountry(
            country_code=country_code,
            format=format,
            per_page=per_page,
            year_interval=year_interval,
            is_display=False)
        for country_code in lst_country_codes
    ]

    # pd.concat raises on an empty list, so keep the empty-frame result.
    df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    if is_display:
        display(df.head())

    return df

In [41]:
def saveDataFrame2CSV(df: pd.DataFrame, save_path: str, sep: str = ',', encoding: str = 'utf-8') -> bool:
    """Write ``df`` to ``save_path`` as CSV without the index column.

    Args:
        df: Frame to persist.
        save_path: Destination file path.
        sep: Field delimiter.
        encoding: Text encoding of the output file.

    Returns:
        ``True`` when the file was written.

    Raises:
        ModuleNotFoundError: If writing fails for any reason. The type is
            kept only for backward compatibility with existing callers; the
            real error is attached as ``__cause__`` and quoted in the message.
    """
    try:
        df.to_csv(save_path, sep=sep, encoding=encoding, index=False)
    except Exception as exc:
        # `Exception` rather than a bare `except`, so KeyboardInterrupt and
        # SystemExit pass through instead of being re-labelled as a save error.
        raise ModuleNotFoundError(
            f"Could not save DataFrame to {save_path!r}: {exc}"
        ) from exc
    return True

In [42]:
# Inclusive year range to crawl, written as "start:end"; passed to the loaders
# below as `year_interval`.
years = '2003:2023'

- First, we fetch the list of all countries (and regional aggregates) from the World Bank API.

In [43]:
# Fetch the World Bank country list in a single page of up to 500 entries.
# The list also contains regional aggregates (region == "Aggregates").
url_countries = f"{BASE_URL}country?format=json&per_page=500"
# Timeout so a stalled connection cannot hang the notebook indefinitely.
response = requests.get(url_countries, timeout=30)
countries = []
if response.status_code == 200:
    data = response.json()
    # A successful payload is [metadata, list_of_countries].
    if len(data) > 1 and isinstance(data[1], list):  # Check if data exists
        countries = [
            {
                "name": country["name"],
                "iso2Code": country["iso2Code"],
                "iso3Code": country["id"],
                "region": country["region"]["value"]
            }
            for country in data[1]
        ]
        print(f"Fetched {len(countries)} countries.")
    else:
        print("No country data found.")
else:
    print(f"Error fetching data: {response.status_code}")

# Preview the first entries; prints an empty list if the fetch failed.
print(countries[:5])


Fetched 296 countries.
[{'name': 'Aruba', 'iso2Code': 'AW', 'iso3Code': 'ABW', 'region': 'Latin America & Caribbean '}, {'name': 'Africa Eastern and Southern', 'iso2Code': 'ZH', 'iso3Code': 'AFE', 'region': 'Aggregates'}, {'name': 'Afghanistan', 'iso2Code': 'AF', 'iso3Code': 'AFG', 'region': 'South Asia'}, {'name': 'Africa', 'iso2Code': 'A9', 'iso3Code': 'AFR', 'region': 'Aggregates'}, {'name': 'Africa Western and Central', 'iso2Code': 'ZI', 'iso3Code': 'AFW', 'region': 'Aggregates'}]


In [44]:
# Persist the fetched country list so later cells (and the loader functions)
# can read it back from disk.
countries_df = pd.DataFrame(countries)
# Create the output directory first: to_csv fails on a fresh checkout if
# "data/" does not exist yet.
os.makedirs("data", exist_ok=True)
countries_df.to_csv("data/all_countries.csv", index=False)

- Second, we keep only the countries whose region name contains "Asia" and save them to a separate file.

In [45]:
countries_df = pd.read_csv("data/all_countries.csv")

# Get all asian countries
# `na=False` treats a missing region as "no match"; without it a single NaN
# in the column makes the boolean mask invalid and the selection raises.
# NOTE(review): this is a substring match, so every region whose name contains
# "Asia" is kept, including combined regions - confirm that is intended.
asian_countries = countries_df[countries_df["region"].str.contains("Asia", na=False)]
asian_countries.to_csv("data/asian_countries.csv", index=False)


In [46]:
# Try the loader on a single country (iso2Code "VN") and preview the first rows.
df_vn = loadDataCountry("VN", year_interval=years, is_display=True)

Loading data for Viet Nam
http://api.worldbank.org/v2/country/VN/indicator/SP.POP.TOTL?format=json&per_page=100&date=2003:2023
http://api.worldbank.org/v2/country/VN/indicator/SP.POP.TOTL.FE.IN?format=json&per_page=100&date=2003:2023
http://api.worldbank.org/v2/country/VN/indicator/SP.POP.TOTL.MA.IN?format=json&per_page=100&date=2003:2023
http://api.worldbank.org/v2/country/VN/indicator/SP.DYN.CBRT.IN?format=json&per_page=100&date=2003:2023
http://api.worldbank.org/v2/country/VN/indicator/SP.DYN.CDRT.IN?format=json&per_page=100&date=2003:2023
http://api.worldbank.org/v2/country/VN/indicator/SE.COM.DURS?format=json&per_page=100&date=2003:2023
http://api.worldbank.org/v2/country/VN/indicator/SL.IND.EMPL.ZS?format=json&per_page=100&date=2003:2023
http://api.worldbank.org/v2/country/VN/indicator/SL.AGR.EMPL.ZS?format=json&per_page=100&date=2003:2023
http://api.worldbank.org/v2/country/VN/indicator/SL.AGR.EMPL.FE.ZS?format=json&per_page=100&date=2003:2023
http://api.worldbank.org/v2/country

Unnamed: 0,Total Population,Female Population,Male Population,Birth Rate,Death Rate,Compulsory Education Dur.,Employment in Industry(%),Employment in Agriculture(%),Female Employment in Agriculture(%),Female Employment in Industry(%),...,Renewable Energy Consumption (%),Fossil Fuel Consumption (%),Male life expectancy,Female life expectancy,"School enrollment, primary","School enrollment, tertiary",Primary completion rate,Literacy rate,Year,Country
0,100352192.0,51201223.0,49150969.0,,,10.0,31.229898,32.984188,34.128237,27.295831,...,,,,,122.494152,,,,2023.0,Viet Nam
1,99680655.0,50859448.0,48821207.0,14.689,6.831,10.0,30.686275,33.454389,34.657039,26.712691,...,,,69.927,79.29,123.133724,42.224239,115.854256,99.0,2022.0,Viet Nam
2,98935098.0,50486603.0,48448495.0,15.008,7.317,10.0,33.134561,29.044346,28.763746,29.340316,...,24.2,,69.117,78.235,120.023386,38.873238,,,2021.0,Viet Nam
3,98079191.0,50061601.0,48017590.0,15.4,6.169,10.0,31.069891,32.612388,33.831752,26.805666,...,18.9,,70.787,79.92,119.024054,,,98.200523,2020.0,Viet Nam
4,97173776.0,49610246.0,47563530.0,15.663,6.706,10.0,28.954027,34.705397,35.857336,24.89349,...,20.4,,69.369,78.888,117.42335,28.506929,,99.0,2019.0,Viet Nam


In [None]:
# Crawl every country listed in data/asian_countries.csv; this issues one HTTP
# request per indicator per country, so it is the slow cell of this section.
df = loadDataContinent(year_interval=years, is_display=True)