# CONVERT FILES WITH DUCK_DB

In [19]:
import pandas as pd
import duckdb
import os
import time as pytime
import numpy as np
import psutil
import sys
import glob
import json
sys.path.insert(0, os.path.abspath(".."))
from utils import time, data_processor, constants
import gc

persistent_conn = duckdb.connect()

In [20]:
TEMP_TABLE_OWNERSHIP_HISTORY_GUO = "..\\data_processed\\ownership_history\\guos\\"
TEMP_TABLE_FIRMOGRAPHICS = "..\\data_processed\\firmographics_processed\\firmographics_full\\"
TEMP_TABLE_KEY_FINANCIALS = "..\\data_processed\\key_financials_detailed_processed\\"
OUTPUT_PATH = "..\\zipped_files\\guos_eu\\"
os.makedirs(OUTPUT_PATH, exist_ok=True)
eu27_countries = constants.eu
asia = constants.asia
africa = constants.africa
oceania = constants.oceania
americas = constants.americas
balkans = constants.balkans

all_countries = asia + africa + oceania + americas + balkans 
# eu_countries, balkans


In [21]:
def log_memory():
    mem = psutil.virtual_memory()
    print(f"[MEMORY] Used: {mem.percent}%, Available: {round(mem.available / (1024 ** 3), 2)} GB")


In [22]:
# drop 'IT' from eu27_countries
# eu27_countries.remove('IT')
# eu27_countries.append('DE')

In [None]:
''' DONE
- EU
- BALKANS
= asia
=americas
- africa
= oceania
'''


' DONE\n- EU\n- BALKANS\n= asia\n'

In [27]:
for country in oceania:
    for year in range(2007, 2024):
        ownership_path = os.path.join(TEMP_TABLE_OWNERSHIP_HISTORY_GUO, f"{country}_{year}.parquet")
        firmo_path = os.path.join(TEMP_TABLE_FIRMOGRAPHICS, f"{country}.parquet")
        financials_path = os.path.join(TEMP_TABLE_KEY_FINANCIALS, f"key_financials_detailed_{country}.parquet")

        print(f"[INFO] Loading and joining data from:\n  - Ownership: {ownership_path}\n  - Firmographics: {firmo_path}")

        combined_query = f"""
        WITH
        firmographics_dedup AS (
            SELECT DISTINCT ON (bvd_id_number) *
            FROM parquet_scan('{firmo_path}')
        ),
        financials_dedup AS (
            SELECT DISTINCT ON (bvd_id_number) *
            FROM parquet_scan('{financials_path}')
        )
        SELECT o.*,
            f.nuts2,
            f.nace_rev_2_core_code_4_digits_,
            f.city_native_,
            f.type_of_entity,
            f.status,
            k.operating_revenue_turnover_,
            k.number_of_employees
        FROM parquet_scan('{ownership_path}') AS o
        LEFT JOIN firmographics_dedup AS f
        ON o.guo_25 = f.bvd_id_number
        LEFT JOIN financials_dedup AS k
        ON o.guo_25 = k.bvd_id_number
        WHERE o.type_of_relation = 'GUO 25'
        """

        # Execute query
        con = duckdb.connect()
        combined_df = con.execute(combined_query).fetchdf()
        # Output info
        print(f"[INFO] Combined rows (ownership only): {len(combined_df)}")
        print(f"[MEMORY] Used: {psutil.virtual_memory().percent}%, Available: {round(psutil.virtual_memory().available / 1e9, 2)} GB")

        final_path = os.path.join(OUTPUT_PATH, f"{country}_{year}.csv.gz")
        combined_df.to_csv(final_path, index=False, compression='gzip')
        print(f"Saved at {final_path}")

[INFO] Loading and joining data from:
  - Ownership: ..\data_processed\ownership_history\guos\AS_2007.parquet
  - Firmographics: ..\data_processed\firmographics_processed\firmographics_full\AS.parquet
[INFO] Combined rows (ownership only): 0
[MEMORY] Used: 25.8%, Available: 119.51 GB
Saved at ..\zipped_files\guos_eu\AS_2007.csv.gz
[INFO] Loading and joining data from:
  - Ownership: ..\data_processed\ownership_history\guos\AS_2008.parquet
  - Firmographics: ..\data_processed\firmographics_processed\firmographics_full\AS.parquet
[INFO] Combined rows (ownership only): 0
[MEMORY] Used: 25.8%, Available: 119.51 GB
Saved at ..\zipped_files\guos_eu\AS_2008.csv.gz
[INFO] Loading and joining data from:
  - Ownership: ..\data_processed\ownership_history\guos\AS_2009.parquet
  - Firmographics: ..\data_processed\firmographics_processed\firmographics_full\AS.parquet
[INFO] Combined rows (ownership only): 0
[MEMORY] Used: 25.8%, Available: 119.51 GB
Saved at ..\zipped_files\guos_eu\AS_2009.csv.gz
[