In [8]:
import requests
import pandas as pd
from datetime import datetime, timezone

In [9]:
# Fetch one World Bank indicator for every country, following the API's paging.
# Handy URLs for checking the payload / metadata by hand:
#   https://api.worldbank.org/v2/country/all/indicator/SL.UEM.TOTL.NE.ZS?date=2024:2024
#   https://api.worldbank.org/v2/countries/all/indicators/SL.UEM.TOTL.ZS
#   https://api.worldbank.org/v2/countries/all/indicator/SL.UEM.TOTL.ZS?date=2023:2024&format=json&page=6
indicator = "SL.UEM.TOTL.ZS"
date_range = "2023:2023"
# The query string is built by `requests` from `params`, so no trailing "?" is needed.
base_url = f"https://api.worldbank.org/v2/countries/all/indicators/{indicator}"
params = {
    "date": date_range,
    "format": "json",
    "page": 1,  # Start at page 1
}

all_data = []

while True:
    # A timeout keeps a stalled connection from hanging the kernel forever;
    # raise_for_status surfaces HTTP errors instead of trying to parse an error page.
    response = requests.get(base_url, params=params, timeout=30)
    response.raise_for_status()
    response_data = response.json()

    # A successful reply is a two-element list: [paging metadata, records].
    # Anything else (e.g. a one-element list carrying an error message) is a
    # failure and must not be mistaken for "no more pages".
    if not isinstance(response_data, list) or len(response_data) < 2:
        raise RuntimeError(
            f"Unexpected World Bank API response on page {params['page']}: {response_data!r}"
        )

    page_metadata, page_records = response_data[0], response_data[1]

    if not page_records:  # No data for this query / ran past the last page
        break

    all_data.extend(page_records)  # Add current page data to all_data

    # Stop on the last page reported by the API rather than issuing one extra
    # request just to receive an empty page. If the metadata is missing, fall
    # back to the empty-page check above.
    total_pages = page_metadata.get("pages") if isinstance(page_metadata, dict) else None
    if total_pages is not None and params["page"] >= int(total_pages):
        break

    # Update parameters for the next page
    params["page"] += 1

df_unemployment = pd.json_normalize(data=all_data)



In [10]:
# Number of records returned by the API (one per country / aggregate).
row_count = len(df_unemployment)
print(row_count)


266


In [11]:
# Column names after json_normalize: nested keys are flattened into dotted
# names such as "indicator.id" and "country.value".
df_unemployment.columns


Index(['countryiso3code', 'date', 'value', 'unit', 'obs_status', 'decimal',
       'indicator.id', 'indicator.value', 'country.id', 'country.value'],
      dtype='object')

In [20]:
# Inspect dtypes and per-column non-null counts of the raw extract to spot
# missing observations before transforming.
df_unemployment.info() 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   countryiso3code  266 non-null    object 
 1   date             266 non-null    object 
 2   value            233 non-null    float64
 3   unit             266 non-null    object 
 4   obs_status       266 non-null    object 
 5   decimal          266 non-null    int64  
 6   indicator.id     266 non-null    object 
 7   indicator.value  266 non-null    object 
 8   country.id       266 non-null    object 
 9   country.value    266 non-null    object 
dtypes: float64(1), int64(1), object(8)
memory usage: 20.9+ KB


In [21]:
# Spot-check a single country (Singapore) in the raw extract.
is_singapore = df_unemployment["countryiso3code"].eq("SGP")
df_unemployment.loc[is_singapore]


Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator.id,indicator.value,country.id,country.value
220,SGP,2023,3.472,,,1,SL.UEM.TOTL.ZS,"Unemployment, total (% of total labor force) (...",SG,Singapore


In [22]:
def transform(df: pd.DataFrame, region_file_path) -> pd.DataFrame:
    """Shape the raw World Bank extract and attach each country's region.

    Args:
        df: Flattened API extract. Must contain the columns ``date``,
            ``countryiso3code``, ``country.value``, ``indicator.id``,
            ``indicator.value`` and ``value``.
        region_file_path: Path to a CSV with at least ``Code`` and ``Region``
            columns (e.g. "data/CLASS_CSV.csv").

    Returns:
        A new DataFrame with columns ``year``, ``country_code``,
        ``country_name``, ``indicator_id``, ``indicator_value``, ``value`` and
        ``region``. Rows lacking a year or a value are dropped, and so are rows
        whose country code has no match in the region file (inner join).
        When ``df`` is empty it is handed back untransformed.
    """
    # Nothing to do for an empty extract: return it as-is.
    if df.empty:
        print("Incremental extract is empty. No data to transform.")
        return pd.DataFrame(df)

    print("Starting transform")

    # Source column -> output column, in output order. Doubles as the
    # selection list, so only these columns survive.
    column_map = {
        "date": "year",
        "countryiso3code": "country_code",
        "country.value": "country_name",
        "indicator.id": "indicator_id",
        "indicator.value": "indicator_value",
        "value": "value",
    }

    observations = (
        df[list(column_map)]
        .rename(columns=column_map)
        # drop rows with a missing year or a missing value
        .dropna(subset=["year", "value"])
        # the API delivers the year as text
        .astype({"year": int})
    )

    # Region classification lookup: country code -> region
    regions = pd.read_csv(region_file_path, usecols=["Code", "Region"]).rename(
        columns={"Region": "region"}
    )

    # Join on the ISO3 code, then discard the duplicate key column.
    enriched = observations.merge(
        regions,
        left_on="country_code",
        right_on="Code",
    ).drop(columns=["Code"])

    print("Completed transform")
    return pd.DataFrame(enriched)

# Run the transform against the raw extract and preview the first rows.
df = transform(
    region_file_path="data/CLASS_CSV.csv",
    df=df_unemployment,
)
print(df.head())



Starting transform
Completed transform
   year country_code                    country_name    indicator_id  \
0  2023          AFE     Africa Eastern and Southern  SL.UEM.TOTL.ZS   
1  2023          AFW      Africa Western and Central  SL.UEM.TOTL.ZS   
2  2023          ARB                      Arab World  SL.UEM.TOTL.ZS   
3  2023          CSS          Caribbean small states  SL.UEM.TOTL.ZS   
4  2023          CEB  Central Europe and the Baltics  SL.UEM.TOTL.ZS   

                                     indicator_value     value region  
0  Unemployment, total (% of total labor force) (...  7.472824    NaN  
1  Unemployment, total (% of total labor force) (...  3.397782    NaN  
2  Unemployment, total (% of total labor force) (...  9.878318    NaN  
3  Unemployment, total (% of total labor force) (...  9.913604    NaN  
4  Unemployment, total (% of total labor force) (...  4.081257    NaN  


In [24]:
# Spot-check Singapore again, this time in the transformed frame.
singapore_rows = df.loc[df["country_code"].eq("SGP")]
print(singapore_rows)


     year country_code country_name    indicator_id  \
190  2023          SGP    Singapore  SL.UEM.TOTL.ZS   

                                       indicator_value  value  \
190  Unemployment, total (% of total labor force) (...  3.472   

                  region  
190  East Asia & Pacific  


In [25]:
# Inspect dtypes and non-null counts after the transform; compare against the
# raw frame's info() to see how many rows were dropped and how many have no region.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 229 entries, 0 to 228
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   year             229 non-null    int32  
 1   country_code     229 non-null    object 
 2   country_name     229 non-null    object 
 3   indicator_id     229 non-null    object 
 4   indicator_value  229 non-null    object 
 5   value            229 non-null    float64
 6   region           185 non-null    object 
dtypes: float64(1), int32(1), object(5)
memory usage: 13.4+ KB


In [17]:

# Dump the raw (untransformed) API extract to an Excel file under data/ for
# manual inspection.
df_unemployment.to_excel("data/suph_export_unemp_from api.xlsx")

In [7]:
from dotenv import load_dotenv
import yaml
from pathlib import Path

# get config variables
yaml_file_path = "pipelines/gem.yaml"

# Fail fast when the pipeline definition is absent. FileNotFoundError is a
# subclass of Exception, so any existing `except Exception` still catches it.
if not Path(yaml_file_path).exists():
    raise FileNotFoundError(
        f"Missing {yaml_file_path} file! Please create the yaml file with at least a `name` key for the pipeline name."
    )

# Only the parse needs the file handle, so it is released before anything else runs.
with open(yaml_file_path, encoding="utf-8") as yaml_file:
    pipeline_config = yaml.safe_load(yaml_file)

# An empty YAML file parses to None; report that directly instead of letting
# the first `.get` fail with "'NoneType' object has no attribute 'get'".
if not isinstance(pipeline_config, dict):
    raise ValueError(f"{yaml_file_path} must contain a mapping at the top level.")

# Every section read below must be a mapping; name the ones that are not.
required_sections = ("config", "extract", "table_names")
missing_sections = [
    section
    for section in required_sections
    if not isinstance(pipeline_config.get(section), dict)
]
if missing_sections:
    raise KeyError(
        f"{yaml_file_path} is missing required section(s): {', '.join(missing_sections)}"
    )

config = pipeline_config["config"]
extract_config = pipeline_config["extract"]
PIPELINE_NAME = pipeline_config.get("name")

wb_indicator = config.get("indicator_export")
wb_daterange = config.get("date_range")
region_file_path = config.get("region_classification_path")

incremental_column = extract_config.get("incremental_column")
extract_type = extract_config.get("extract_type")

# Use the 'table_names' mapping in the yaml to set the table name dynamically
# from the configured indicator (None when the indicator has no entry).
extract_table_name = pipeline_config["table_names"].get(wb_indicator)

print(extract_table_name)

exports
