In [61]:
import pandas as pd
import numpy as np
import os
import requests
import openpyxl
from datetime import datetime, timedelta
from requests.adapters import HTTPAdapter, Retry

## Data analysis on `Excel` spreadsheets

In [62]:
# Location of the industrial-production workbook.
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
EXCEL_PATH = '/Users/aanwar/Desktop/dec_project_1/Global_Economic_Monitor/app/etl_project/data/Industrial Production, constant 2010 US$, seas. adj..xlsx'

# Build the frame in one non-mutating chain so re-running the cell always
# yields the same result.
df = (
    pd.read_excel(EXCEL_PATH)
    .iloc[1:]                                # discard the first row of the sheet
    .rename(columns={'Unnamed: 0': 'Year'})  # the unnamed leading column becomes 'Year'
    .ffill()                                 # forward-fill gaps; replaces the deprecated fillna(method='ffill')
)

In [64]:
# Show the column list, non-null counts and dtypes rather than rendering the
# whole wide frame; this matches the summary recorded as this cell's output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 1 to 30
Data columns (total 92 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Year                                              30 non-null     float64
 1   Advanced Economies                                30 non-null     float64
 2   Austria                                           30 non-null     float64
 3   Belgium                                           30 non-null     float64
 4   Bangladesh                                        30 non-null     float64
 5   Bulgaria                                          25 non-null     float64
 6   Bosnia and Herzegovina                            19 non-null     float64
 7   Belarus                                           17 non-null     float64
 8   Brazil                                            30 non-null     float64
 9   Barbados               

## Doing some Data Wrangling:

In [18]:
country_columns = df.columns[1:]  # Skip the first column (expected to be 'Year')

# Per-country frames keyed by a normalized name. Without this container each
# loop iteration would overwrite the previous result and only the last
# country would remain available after the loop.
country_frames = {}

for country in country_columns:
    # Two-column copy: the year plus this country's series.
    country_df = df[['Year', country]].copy()
    # NOTE(review): the value column is labelled 'gdp' although the workbook
    # loaded above is an industrial-production series; label kept as-is, confirm.
    country_df.columns = ['year', 'gdp']

    # Normalized identifier, e.g. 'Advanced Economies' -> 'advanced_economies'.
    # str() guards against non-string column labels.
    country_name = str(country).lower().replace(' ', '_')

    # No table is created or written here; the frame is only kept in memory.
    country_frames[country_name] = country_df

## Data Engineering via API Call

### Making API calls using incremental (page-by-page) loads:

In [69]:
class WorldBankDataLoader:
    """Download every page of one World Bank indicator into a pandas DataFrame."""

    def __init__(self, indicator, start_year, end_year, session=None, timeout=30):
        """
        Initialize the WorldBankDataLoader with the indicator, start year, and end year.

        Args:
        - indicator (str): The indicator code for the World Bank API (e.g., 'NV.IND.TOTL.KD.ZG').
        - start_year (str): The starting year for the data.
        - end_year (str): The ending year for the data.
        - session (optional): Object exposing ``get(url, params=..., timeout=...)``
          (for example a ``requests.Session``). When omitted, a session that retries
          transient failures is created.
        - timeout (float): Seconds to wait for each HTTP request before giving up.
        """
        self.indicator = indicator
        self.date_range = f"{start_year}:{end_year}"
        self.base_url = f"https://api.worldbank.org/v2/countries/all/indicators/{self.indicator}?"
        self.params = {
            "date": self.date_range,
            "format": "json",
            "page": 1  # Start at page 1
        }
        self.timeout = timeout
        self.session = session if session is not None else self._build_session()
        self.all_data = []  # To store paginated data

    @staticmethod
    def _build_session():
        """Create a requests session that retries throttled or failed GET requests."""
        retry_policy = Retry(
            total=3,
            backoff_factor=0.5,
            status_forcelist=(429, 500, 502, 503, 504),
        )
        session = requests.Session()
        session.mount("https://", HTTPAdapter(max_retries=retry_policy))
        return session

    def fetch_data(self):
        """
        Fetch data from the World Bank API, handling pagination until all data is retrieved.

        Every call starts again from page 1 with an empty buffer, so calling the
        method repeatedly does not duplicate rows.

        Returns:
        - pd.DataFrame: A pandas DataFrame containing all the data fetched from the API.
          It is empty when the API returns no data rows (for example an error payload).

        Raises:
        - Whatever the session raises for connection problems or timeouts, and the
          error raised by ``response.raise_for_status()`` for HTTP error statuses.
        """
        # Reset state left over from a previous call.
        self.params["page"] = 1
        self.all_data = []

        while True:
            response = self.session.get(self.base_url, params=self.params, timeout=self.timeout)
            response.raise_for_status()
            response_data = response.json()

            # A usable page is a two-element list: [metadata, rows].
            # Anything else (including a missing or empty rows element) ends the loop.
            if not isinstance(response_data, list) or len(response_data) < 2 or not response_data[1]:
                break

            # Extend the all_data list with the current page's data
            self.all_data.extend(response_data[1])

            # Stop as soon as the metadata says this was the last page; this avoids
            # one extra request whose only purpose is to receive an empty page.
            metadata = response_data[0]
            total_pages = metadata.get("pages") if isinstance(metadata, dict) else None
            if isinstance(total_pages, int) and self.params["page"] >= total_pages:
                break

            # Increment page number for the next API call
            self.params["page"] += 1

        # Normalize and return the data as a pandas DataFrame
        return pd.json_normalize(data=self.all_data)

# Demo run: pull the industry value-added growth indicator for 1990-2024
if __name__ == "__main__":
    # Loader configured with the indicator code and the first/last year
    loader = WorldBankDataLoader("NV.IND.TOTL.KD.ZG", "1990", "2024")

    # Walk through every result page and keep the combined frame for the
    # inspection cells that follow
    df_data = loader.fetch_data()

In [71]:
# Summarize the fetched frame: row count, column names, non-null counts and dtypes.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9044 entries, 0 to 9043
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   countryiso3code  9044 non-null   object 
 1   date             9044 non-null   object 
 2   value            7088 non-null   float64
 3   unit             9044 non-null   object 
 4   obs_status       9044 non-null   object 
 5   decimal          9044 non-null   int64  
 6   indicator.id     9044 non-null   object 
 7   indicator.value  9044 non-null   object 
 8   country.id       9044 non-null   object 
 9   country.value    9044 non-null   object 
dtypes: float64(1), int64(1), object(8)
memory usage: 706.7+ KB


In [72]:
# Preview the first five rows of the fetched frame.
df_data.head()

Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator.id,indicator.value,country.id,country.value
0,AFE,2023,1.9567,,,1,NV.IND.TOTL.KD.ZG,"Industry (including construction), value added...",ZH,Africa Eastern and Southern
1,AFE,2022,2.693175,,,1,NV.IND.TOTL.KD.ZG,"Industry (including construction), value added...",ZH,Africa Eastern and Southern
2,AFE,2021,4.33502,,,1,NV.IND.TOTL.KD.ZG,"Industry (including construction), value added...",ZH,Africa Eastern and Southern
3,AFE,2020,-4.734432,,,1,NV.IND.TOTL.KD.ZG,"Industry (including construction), value added...",ZH,Africa Eastern and Southern
4,AFE,2019,1.856316,,,1,NV.IND.TOTL.KD.ZG,"Industry (including construction), value added...",ZH,Africa Eastern and Southern
