In [1]:
import pandas as pd
import numpy as np
import duckdb
import pyarrow.dataset as ds
import pyarrow.parquet as pq

import os
import sys
import sys
sys.path.insert(0, os.path.abspath("../.."))
from utils import time, data_processor, constants

In [2]:
# Union of the five regional lists in `constants`. The set() pass removes codes that
# occur in more than one list, so the order of the resulting list is arbitrary.
all_countries = list(
    set(
        constants.eu
        + constants.americas
        + constants.asia
        + constants.africa
        + constants.oceania
    )
)

# Set versions of two of the regional groupings.
eu = set(constants.eu)
balkans = set(constants.balkans)

## QUERY


In [3]:
# Destination folder for the per-country parquet files (relative, Windows-style path).
# The trailing separator is required: file names are appended by plain concatenation.
output_path = "..\\..\\data_processed\\firmographics_processed\\firmographics_full\\"

# Scratch directory, created up front if it does not exist yet.
temp_dir = "data_processed/temp"
os.makedirs(temp_dir, exist_ok=True)


def downloadCountryFirmographics(
    country,
    source_root="Z:/dati_moody/data_processed/firmographics_processed",
):
    """Build the combined firmographics extract for one country and save it as parquet.

    Four per-country parquet files under ``source_root`` (industry classification,
    contact info, legal info, addresses) are registered with DuckDB, joined on
    ``bvd_id_number``, filtered, and written to ``{output_path}{country}.parquet``
    with snappy compression. ``output_path`` is the module-level destination folder.

    Parameters
    ----------
    country : str
        Country code. Used both as the parquet file stem and as the value matched
        against ``addresses.country_iso_code``.
    source_root : str, optional
        Folder containing the ``industry_classification``, ``contact_info``,
        ``legal_info`` and ``all_addresses`` sub-folders. Defaults to the
        location that was previously hard-coded.

    Returns
    -------
    None
        The result is written to disk and a confirmation line is printed.

    Raises
    ------
    Exception
        Nothing is caught here; errors from pyarrow, DuckDB or the filesystem
        (e.g. a missing input file) propagate to the caller.
    """
    # DuckDB view name -> sub-folder holding that table's per-country parquet file.
    view_sources = {
        "nace": "industry_classification",
        "contact_info": "contact_info",
        "legal_info": "legal_info",
        "addresses": "all_addresses",
    }
    datasets = {
        view: ds.dataset(f"{source_root}/{folder}/{country}.parquet", format="parquet")
        for view, folder in view_sources.items()
    }

    # Values are spliced into the SQL text, so double any single quote to keep
    # the string literals well-formed.
    country_literal = str(country).replace("'", "''")

    # NOTE: the IS NOT NULL filters on nace.* and contact_info.nuts2 discard rows
    # without a match in those tables, so those two LEFT JOINs behave like inner
    # joins; only legal_info is genuinely optional.
    query = f"""
        SELECT 
            nace.bvd_id_number,
            nace.nace_rev_2_core_code_4_digits_,
            contact_info.nuts2,
            legal_info.status,
            legal_info.type_of_entity,
            addresses.postcode,
            addresses.country_iso_code,
            addresses.city_native_,
            addresses.city,
            addresses.region_in_country
        FROM addresses
        LEFT JOIN nace
            ON addresses.bvd_id_number = nace.bvd_id_number
        LEFT JOIN contact_info
            ON nace.bvd_id_number = contact_info.bvd_id_number
        LEFT JOIN legal_info
            ON nace.bvd_id_number = legal_info.bvd_id_number
        WHERE
            addresses.country_iso_code = '{country_literal}' AND 
            nace.bvd_id_number IS NOT NULL AND
            nace.nace_rev_2_core_code_4_digits_ IS NOT NULL AND 
            contact_info.nuts2 IS NOT NULL 
    """

    # output_path must end with a path separator; the file name is simply appended.
    output = f"{output_path}{country}.parquet"
    os.makedirs(output_path, exist_ok=True)
    output_literal = output.replace("'", "''")

    duckdb_query = f"""
        COPY (
            {query}
        ) TO '{output_literal}' (FORMAT 'parquet', COMPRESSION 'snappy');
    """

    # The context manager closes the in-memory connection even when the query
    # fails, so a batch run over many countries does not accumulate open handles.
    with duckdb.connect() as con:
        # Register Arrow datasets as tables (they act like views)
        for view, dataset in datasets.items():
            con.register(view, dataset)
        con.execute(duckdb_query)

    print(f"{country} downloaded at: {output}")

In [4]:
# Countries to process in this run; use `all_countries` instead for a full refresh.
countries_to_process = ['RU']

for country in countries_to_process:
    try:
        downloadCountryFirmographics(country)
    except Exception as exc:
        # Best-effort batch: one failing country must not stop the others.
        # Catch only Exception so Ctrl-C / kernel interrupt still stops the loop,
        # and report the cause instead of hiding it.
        print(f"Error downloading {country}: {exc!r}")

RU downloaded at: ..\..\data_processed\firmographics_processed\firmographics_full\RU.parquet
