In [1]:
import pandas as pd
import json
import os

In [2]:
selected_countries = [
    "United States of America",
    "China",
    "India",
    "Germany",
    "South Africa",
    "Australia",
    "Brazil",
]

years = range(2000, 2019)  


In [3]:
def get_iso(weo_filepath, selected_countries):
    weo = pd.read_csv(weo_filepath)
    filtered_weo = weo[weo["Country"].isin(selected_countries)]
    selected_iso = filtered_weo["ISO"].tolist()
    selected_iso = list(set(selected_iso))

    return selected_iso


weo_filepath = "./WEO_Data.csv"
selected_iso = get_iso(weo_filepath, selected_countries)
selected_iso

['DEU', 'BRA', 'CHN', 'IND', 'ZAF', 'AUS']

In [4]:
country_data = {iso: {} for iso in selected_iso}

for iso in selected_iso:
    for year in years:
        country_data[iso][year] = {}  

country_data

{'DEU': {2000: {},
  2001: {},
  2002: {},
  2003: {},
  2004: {},
  2005: {},
  2006: {},
  2007: {},
  2008: {},
  2009: {},
  2010: {},
  2011: {},
  2012: {},
  2013: {},
  2014: {},
  2015: {},
  2016: {},
  2017: {},
  2018: {}},
 'BRA': {2000: {},
  2001: {},
  2002: {},
  2003: {},
  2004: {},
  2005: {},
  2006: {},
  2007: {},
  2008: {},
  2009: {},
  2010: {},
  2011: {},
  2012: {},
  2013: {},
  2014: {},
  2015: {},
  2016: {},
  2017: {},
  2018: {}},
 'CHN': {2000: {},
  2001: {},
  2002: {},
  2003: {},
  2004: {},
  2005: {},
  2006: {},
  2007: {},
  2008: {},
  2009: {},
  2010: {},
  2011: {},
  2012: {},
  2013: {},
  2014: {},
  2015: {},
  2016: {},
  2017: {},
  2018: {}},
 'IND': {2000: {},
  2001: {},
  2002: {},
  2003: {},
  2004: {},
  2005: {},
  2006: {},
  2007: {},
  2008: {},
  2009: {},
  2010: {},
  2011: {},
  2012: {},
  2013: {},
  2014: {},
  2015: {},
  2016: {},
  2017: {},
  2018: {}},
 'ZAF': {2000: {},
  2001: {},
  2002: {},
  2003: {},
 

In [5]:
def processIndicator(ind_filepath, selected_iso):
    base_name = os.path.basename(ind_filepath)
    ind_name = base_name.split("_")[0]
    df = pd.read_csv(ind_filepath)

    # Drop iso and keep country code for joining purposes in the future
    df.drop(["code", "country"], axis=1, inplace=True)
    df = df.fillna(0)

    # Extract year info from original column names - "CDA.raw.YYYY"
    df.columns = ["iso"] + [col.split(".")[-1] for col in df.columns[1:]]

    filtered_df = df[df["iso"].isin(selected_iso)]

    for _, row in filtered_df.iterrows():
        iso = row["iso"]
        for year in years:
            if str(year) in row:
                country_data[iso][year][ind_name] = row[str(year)]
            else:
                country_data[iso][year][ind_name] = 0

In [6]:
def processAllIndicator(folder_path, selected_iso):
   
    for filename in os.listdir(folder_path):
        if filename.endswith("_raw_na.csv"):
            full_path = os.path.join(folder_path, filename)
            processIndicator(full_path, selected_iso)

folder_path = "./"

processAllIndicator(folder_path, selected_iso)

In [7]:
def processWEO(filename, selected_iso):
    weo_raw = pd.read_csv(filename)
    filtered_weo = (
        weo_raw
        # Filter out records with unit of national currency. Keep records with unit of U.S. dollars for meaningful comparison.
        .query("Units == 'U.S. dollars'")
        .drop(
            [
                "Country",
                "WEO Country Code",
                "Country/Series-specific Notes",
                "Estimates Start After",
                "Units",
            ],
            axis=1,
        )
        .rename(
            columns={
                "ISO": "iso",
                "Subject Descriptor": "type",
            }
        )
        .replace(
            {
                "Gross domestic product current prices": "GDP",
                "Gross domestic product per capita current prices": "GDPPC",
            }
        )
    )

    # Unify the currency scale
    filtered_weo.loc[
        filtered_weo["Scale"] == "Billions", filtered_weo.columns[4:]
    ] *= 1000
    filtered_weo = filtered_weo.drop("Scale", axis=1)

    filtered_weo = filtered_weo[filtered_weo["iso"].isin(selected_iso)]

    for _, row in filtered_weo.iterrows():
        iso = row["iso"]
        for year in years:
            if str(year) in row:
                if row["type"] == "GDP":
                    country_data[iso][year]["GDP"] = row[str(year)]
                else:
                    country_data[iso][year]["GDPPC"] = row[str(year)]
            else:
                if row["type"] == "GDP":
                    country_data[iso][year]["GDP"] = 0
                else:
                    country_data[iso][year]["GDPPC"] = 0


processWEO("./WEO_Data.csv", selected_iso)

In [8]:
def processPopulation(pop_filepath, selected_iso):
    pop = pd.read_csv(pop_filepath)

    filtered_pop = (
        pop.query('`Subject Descriptor` == "Population"')
        .drop(
            [
                "Country",
                "WEO Country Code",
                "Country/Series-specific Notes",
                "Estimates Start After",
                "Units",
                "Scale",
                "Subject Descriptor",
            ],
            axis=1,
        )
        .rename(
            columns={
                "ISO": "iso",
            }
        )
    )

    filtered_pop = filtered_pop[filtered_pop["iso"].isin(selected_iso)]
    for _, row in filtered_pop.iterrows():
        iso = row["iso"]
        for year in years:
            if str(year) in row:
                country_data[iso][year]["Population"] = row[str(year)]
             


processPopulation("./POP_Data.csv", selected_iso)


In [9]:
country_data

{'DEU': {2000: {'CDA': -0.0216615591723935,
   'NOE': 0,
   'VOE': 0,
   'HAD': 1.14953320977876,
   'COE': 0,
   'NDA': -0.0768934832370225,
   'PMD': 649.694532774264,
   'OZD': 16.3181819072254,
   'SOE': 0,
   'GDP': 1948840.0,
   'GDPPC': 23924.88,
   'Population': 81.457},
  2001: {'CDA': -0.0124658785504077,
   'NOE': 0,
   'VOE': 0,
   'HAD': 1.08237342687651,
   'COE': 0,
   'NDA': -0.0930209550988399,
   'PMD': 616.416177616548,
   'OZD': 15.193951329858,
   'SOE': 0,
   'GDP': 1945800.0,
   'GDPPC': 23869.77,
   'Population': 81.518},
  2002: {'CDA': -0.0110319787819451,
   'NOE': 0,
   'VOE': 0,
   'HAD': 1.02645224061181,
   'COE': 0,
   'NDA': -0.102983007259749,
   'PMD': 592.538019337306,
   'OZD': 20.1159015210953,
   'SOE': 0,
   'GDP': 2077020.0,
   'GDPPC': 25460.33,
   'Population': 81.579},
  2003: {'CDA': -0.0106828245620729,
   'NOE': 0.034738649,
   'VOE': 0.00728765,
   'HAD': 0.967360966252686,
   'COE': 0.254125896,
   'NDA': -0.111904663247403,
   'PMD': 56

In [10]:
country_data_json = json.dumps(country_data, indent=4)
output_filepath = "country_data_sample.json"
with open(output_filepath, "w") as file:
    file.write(country_data_json)