In [1]:
# import requests
import os

import pandas as pd 
import requests
import wbgapi as wb

In [None]:
# Show wbgapi's overview of the World Bank data sources that can be queried.
wb.source.info()

In [None]:
# Search the World Bank series catalogue for indicators matching "export".
# wb.series.list() produces its results lazily, so they are collected into a
# list here: printing then shows the actual matches (not a generator repr)
# and `data` can be iterated more than once by later cells.
data = list(wb.series.list(q='export'))
print(data)

In [None]:
# Print every series record found by the search above.
# The loop variable is deliberately not named `id`: at notebook (module) level
# that would shadow the built-in id() for the rest of the kernel session.
for series in data:
    print(series)

In [107]:
# read local data file
#filename = "data\\raw_data\Exports Merchandise, Customs, Price, US$, seas. adj..xlsx"
#df = pd.read_excel(filename)

In [36]:
# --- World Bank API request settings ---

# Indicator code to download (intended: merchandise exports).
# NOTE(review): TX.VAL.MRCH.XD.WD appears to be the export value *index*
# series rather than a US$ amount - confirm this is the series wanted.
indicator = "TX.VAL.MRCH.XD.WD"

# Range of years to request, written as "first:last".
date_range = "2010:2024"

# Endpoint serving the chosen indicator for all countries.
# To inspect the API response/metadata by hand, open for example:
# https://api.worldbank.org/v2/countries/all/indicator/SL.UEM.TOTL.ZS?date=2023:2024&format=json&page=6
api_url = "https://api.worldbank.org/v2/countries/all/indicators/" + indicator

# Query-string parameters sent with each request; paging begins at page 1.
params = dict(date=date_range, format="json", page=1)

In [37]:
# Smoke-test the World Bank endpoint before downloading every page.
# The query parameters are sent so the API answers in JSON for the requested
# years, a timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors immediately.
response = requests.get(api_url, params=params, timeout=30)
response.raise_for_status()

In [None]:
# Download every page of the indicator and flatten the records into a DataFrame.
all_data = []

# The page counter is local to this cell and the shared `params` dict is never
# mutated, so re-running the cell always starts again from page 1.
page = 1

while True:
    response = requests.get(api_url, params={**params, "page": page}, timeout=30)
    response.raise_for_status()  # fail loudly on HTTP errors
    response_data = response.json()

    # A normal payload is [metadata, records]; anything shorter (e.g. an API
    # error message) or an empty record list means there is nothing more to read.
    if len(response_data) < 2 or not response_data[1]:
        break

    all_data.extend(response_data[1])  # Add current page data to all_data

    # When the metadata reports the total page count, stop after the last page
    # instead of issuing one extra request just to receive an empty result.
    metadata = response_data[0]
    total_pages = metadata.get("pages") if isinstance(metadata, dict) else None
    if total_pages is not None and page >= int(total_pages):
        break

    page += 1

print(f"Fetched {len(all_data)} records from {page} page(s)")

df_export = pd.json_normalize(all_data)
df_export.head()

In [None]:
# Display the raw frame built from the API records.
df_export

### Transforming data

In [None]:
# Normalise the column labels by trimming any surrounding whitespace,
# then show the resulting labels.
df_export.columns = [label.strip() for label in df_export.columns]
df_export.columns

In [None]:
# From here on the 'date' column is called 'year'.
df_export = df_export.rename(columns={'date': 'year'})
df_export

In [None]:
# Keep only rows that have both a year and a value.
required_columns = ['year', 'value']
df_cleaned = df_export.dropna(subset=required_columns)
df_cleaned

In [None]:
# Keep only the columns needed downstream and give the country name a short label.
# Selecting and renaming in one expression avoids assigning a column onto a
# slice (SettingWithCopyWarning) and avoids depending on index alignment with
# df_export; the resulting column order is still year, value, country.
df_cleaned = (
    df_cleaned[['year', 'value', 'country.value']]
    .rename(columns={'country.value': 'country'})
)
df_cleaned

In [None]:
# Renumber the rows from 0 after the earlier row drops; the old index is discarded.
df_cleaned = df_cleaned.reset_index(drop=True)
df_cleaned

In [None]:
# Restrict the data to the three countries of interest.
countries_of_interest = ['Australia', 'New Zealand', 'Italy']
is_selected = df_cleaned['country'].isin(countries_of_interest)
df_cleaned = df_cleaned[is_selected]
df_cleaned

In [None]:
# Reshape to one row per year with one column per country.
pivoted_df = (
    df_cleaned
    .pivot(index='year', columns='country', values='value')
    .reset_index()                        # turn 'year' back into a regular column
    .rename_axis(None, axis="columns")    # drop the leftover 'country' axis label
)
pivoted_df

In [None]:
# Add a 1-based 'id' column as the first column.
# DataFrame.insert raises ValueError when the column already exists, so any
# 'id' left over from a previous run of this cell is dropped first; this keeps
# the cell safe to re-run.
pivoted_df = pivoted_df.drop(columns='id', errors='ignore')
pivoted_df.insert(0, 'id', range(1, len(pivoted_df) + 1))


In [None]:
# Display the pivoted frame, now including the 'id' column.
pivoted_df

### Load data to file (parquet)

In [88]:
# Write the pivoted data to a Parquet file, overwriting any existing file.
# Forward slashes are used because the backslash form contained the invalid
# escape sequences "\c" and "\e" (a SyntaxWarning on recent Python versions)
# and only resolved to the intended folders on Windows; forward slashes work
# on Windows, macOS and Linux alike.
pivoted_df.to_parquet('data/clean_data/exports.parquet')

### Load data to SQL 

In [82]:
from sqlalchemy import create_engine, Table, Column, Integer, String, MetaData, Float # https://www.tutorialspoint.com/sqlalchemy/sqlalchemy_core_creating_table.htm
from sqlalchemy.engine import URL
from sqlalchemy.dialects import postgresql
from sqlalchemy.schema import CreateTable 
#from secrets_config import db_user, db_password, db_server_name, db_database_name

In [83]:
# Database connection settings.
# secrets_config could not be imported, so the values are read from
# environment variables instead of being hardcoded in the notebook. The
# fallbacks match the previous local-development values, so an unconfigured
# local setup keeps working; set the variables to use real credentials.
db_user = os.environ.get('DB_USER', 'postgres')
db_password = os.environ.get('DB_PASSWORD', 'postgres')
db_server_name = os.environ.get('DB_SERVER_NAME', 'localhost')
db_database_name = os.environ.get('DB_DATABASE_NAME', 'project1')

In [84]:
# Assemble the PostgreSQL connection URL from the settings above and build
# the SQLAlchemy engine from it.
connection_settings = dict(
    drivername="postgresql+pg8000",  # PostgreSQL dialect using the pg8000 driver
    username=db_user,
    password=db_password,
    host=db_server_name,
    port=5432,  # PostgreSQL's default port
    database=db_database_name,
)
connection_url = URL.create(**connection_settings)

engine = create_engine(connection_url)


In [None]:
# Display the connection URL object to check the settings it was built from.
connection_url

Append

In [None]:
# Append the rows to the "export" table using pandas.
# index=False: the frame already carries an explicit 'id' column, and writing
# the default RangeIndex would add a spurious "index" column that does not
# belong to the "export" table layout used for the upsert.
pivoted_df.to_sql("export", engine, if_exists="append", index=False)
pivoted_df.head()

Upsert

In [98]:
# Upserts are not automatic with pandas: the table layout has to be declared
# explicitly so that SQLAlchemy knows the columns and the primary key.

meta = MetaData()

# (id, year) together form the primary key; one Float column per country.
export_columns = [
    Column("id", Integer, primary_key=True),
    Column("year", Integer, primary_key=True),
    Column("Australia", Float),
    Column("Italy", Float),
    Column("New Zealand", Float),
]
export_table = Table("export", meta, *export_columns)

# Create the table only if it does not exist yet.
meta.create_all(engine)

In [99]:
# Convert the DataFrame to a list of dictionaries, one dict per row keyed by column name
records = pivoted_df.to_dict(orient='records')

In [None]:
# Build an INSERT ... ON CONFLICT DO UPDATE (upsert) for every row of the frame.
insert_statement = postgresql.insert(export_table).values(pivoted_df.to_dict(orient='records'))

# On a primary-key clash (id, year), overwrite every non-key column with the
# incoming ("excluded") value.
upsert_statement = insert_statement.on_conflict_do_update(
    index_elements=["id", "year"],
    set_={c.key: c for c in insert_statement.excluded if c.key not in ["id", "year"]})

# Execute the upsert statement.
# engine.begin() opens a transaction that is committed when the block exits
# without error (and rolled back otherwise). With engine.connect() and no
# explicit commit, SQLAlchemy 1.4 "future" mode and 2.x roll the work back
# when the connection closes, so the rows would never be persisted.
with engine.begin() as connection:
    connection.execute(upsert_statement)