In [2]:
# Code for ETL operations on the Country-GDP data

from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import sqlite3
from datetime import datetime

# Extract data from the web page into a DataFrame
def extract(url, table_attribs):
    """Scrape the country/GDP table found at *url* into a DataFrame.

    The page is downloaded, the third ``<tbody>`` element is selected, and
    every row is kept whose first cell contains a link and whose third cell
    is not the '—' placeholder.

    Args:
        url: Address of the page to download.
        table_attribs: Column names the result should start with. The rows
            themselves are always stored under "Country" and
            "GDP_USD_millions".

    Returns:
        A DataFrame with one row per accepted table row. Its columns are
        *table_attribs* followed by any of "Country"/"GDP_USD_millions"
        not already listed there. When no row qualifies, an empty
        DataFrame with *table_attribs* as columns is returned.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within 30 seconds.
        IndexError: If the page has fewer than three ``<tbody>`` elements.
    """
    # A timeout keeps the pipeline from hanging forever on a stalled server,
    # and raise_for_status() reports an error page as an HTTP error instead
    # of a confusing parsing failure further down.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect plain dicts and build the DataFrame once; concatenating inside
    # the loop copies the whole frame on every iteration.
    records = []
    for row in soup.find_all('tbody')[2].find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 3:
            # Header/spacer rows do not have the cells read below.
            continue
        if cells[0].find('a') is None or '—' in cells[2]:
            continue
        records.append({
            # str() detaches the text from the parse tree so the DataFrame
            # does not keep the whole document alive.
            "Country": str(cells[0].a.contents[0]),
            "GDP_USD_millions": str(cells[2].contents[0]),
        })

    if not records:
        return pd.DataFrame(columns=table_attribs)

    df = pd.DataFrame(records)
    extra = [name for name in df.columns if name not in table_attribs]
    return df.reindex(columns=list(table_attribs) + extra)

# Transform the data
def transform(df):
    """Convert the GDP column from millions to billions of USD, in place.

    Each "GDP_USD_millions" entry is a string that may contain thousands
    separators (","). The separators are removed, the value is divided by
    1000 and rounded to two decimals, and the result is stored in a new
    "GDP_USD_billions" column. The "GDP_USD_millions" column is then
    removed.

    Args:
        df: DataFrame with a "GDP_USD_millions" column of strings.

    Returns:
        The same DataFrame object that was passed in, after modification.
    """
    def _to_billions(raw):
        # Strip thousands separators before parsing the number.
        millions = float(raw.replace(',', ''))
        return np.round(millions / 1000, 2)

    billions = [_to_billions(raw) for raw in df["GDP_USD_millions"].tolist()]
    df["GDP_USD_billions"] = billions
    del df["GDP_USD_millions"]
    return df

# Save the DataFrame to a CSV file
def load_to_csv(df, csv_path):
    """Write *df* to the CSV file at *csv_path*.

    pandas' default CSV settings are used, so the row index is written as
    the first (unnamed) column.

    Args:
        df: DataFrame to save.
        csv_path: Destination path of the CSV file.
    """
    df.to_csv(path_or_buf=csv_path)
# Save the DataFrame to a SQLite database
def load_to_db(df, sql_connection, table_name):
    """Store *df* as the table *table_name* through *sql_connection*.

    An existing table with the same name is replaced, and the DataFrame
    index is not written.

    Args:
        df: DataFrame to store.
        sql_connection: Open database connection to write through.
        table_name: Name of the destination table.
    """
    df.to_sql(
        name=table_name,
        con=sql_connection,
        if_exists='replace',
        index=False,
    )

# Run a query against the database
def run_query(query_statement, sql_connection):
    """Execute *query_statement* and print the result set.

    Args:
        query_statement: SQL text to run.
        sql_connection: Open database connection to run the query on.
    """
    print(pd.read_sql(query_statement, sql_connection))

# Function for logging progress
def log_progress(message, log_file="./etl_project_log.txt"):
    """Append a timestamped *message* line to the log file.

    Each line has the form ``<timestamp> : <message>``, where the timestamp
    is the current local time formatted as year, abbreviated month name,
    day and time of day.

    Args:
        message: Text describing the step that was reached.
        log_file: Path of the log file to append to. Defaults to
            "./etl_project_log.txt" in the current working directory.
    """
    # %b is the documented directive for the abbreviated month name; %h is
    # a platform extension with the same meaning that not every C library
    # accepts.
    timestamp = datetime.now().strftime('%Y-%b-%d-%H:%M:%S')
    # An explicit encoding keeps the log readable regardless of the
    # platform's default locale.
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(f"{timestamp} : {message}\n")

# Initial configuration and ETL execution
url = 'https://web.archive.org/web/20230902185326/https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29'
table_attribs = ["Country", "GDP_USD_millions"]
csv_path = './Countries_by_GDP.csv'
db_name = 'World_Economies.db'
table_name = 'Countries_by_GDP'

log_progress('ETL process started')
df = extract(url, table_attribs)
log_progress('Data extraction complete')
df = transform(df)
log_progress('Data transformation complete')
load_to_csv(df, csv_path)
log_progress('Data saved to CSV')
sql_connection = sqlite3.connect(db_name)
try:
    load_to_db(df, sql_connection, table_name)
    log_progress('Data loaded to Database')
    # Show only the economies with a GDP of at least 100 billion USD.
    run_query(f"SELECT * from {table_name} WHERE GDP_USD_billions >= 100", sql_connection)
    log_progress('Process complete')
finally:
    # Close the connection even when the load or the query fails, so the
    # database file is never left open.
    sql_connection.close()


          Country  GDP_USD_billions
0   United States          26854.60
1           China          19373.59
2           Japan           4409.74
3         Germany           4308.85
4           India           3736.88
..            ...               ...
64          Kenya            118.13
65         Angola            117.88
66           Oman            104.90
67      Guatemala            102.31
68       Bulgaria            100.64

[69 rows x 2 columns]
