In [2]:
import pandas as pd
import numpy as np
import duckdb
import os
import sys
# Put the parent of the working directory on the import path so that modules
# stored one level above this notebook can be imported (presumably where the
# project's `constants` module lives -- confirm).
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
# Guard the append: re-running this cell must not keep adding the same entry.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [3]:
import constants
# Region-level lookups, de-duplicated into sets.
eu = set(constants.eu)
balkans = set(constants.balkans)

# Every country code across the five continental groups in `constants`.
# Note that `constants.balkans` is not one of the groups merged here.
all_countries = set().union(
    constants.eu,
    constants.americas,
    constants.asia,
    constants.africa,
    constants.oceania,
)

In [4]:
# eu27_countries = [
#     "AT", "BE", "BG", "HR", "CY", "CZ", 
#     "DK", "EE", "FI", "FR", "DE", "GR", 
#     "HU", "IE", "IT", "LV", "LT", "LU", 
#     "MT", "NL", "PL", "PT", "RO", "SK", 
#     "SI", "ES", "SE"
# ]

# balkans = [
#     "BA", "XK", "MK", "CS", "AL"
# ]

# Glob patterns for the raw firmographics parquet files, one sub-folder per
# table. All three share the same root, so it is spelled out only once.
_FIRMOGRAPHICS_RAW_DIR = "../../data_raw/firmographics"

INDUSTRY_CLASSIFICATION_PATH = f"{_FIRMOGRAPHICS_RAW_DIR}/industry_classifications/*.parquet"
CONTACT_INFO_PATH = f"{_FIRMOGRAPHICS_RAW_DIR}/contact_info/*.parquet"
LEGAL_INFO_PATTERN = f"{_FIRMOGRAPHICS_RAW_DIR}/legal_info/*.parquet"


In [None]:
# def fetch_data(country_iso_code):
#     query = f"""
#     SELECT
#         *
#     FROM
#         '{CONTACT_INFO_PATH}' as nace
#     WHERE
#         nace."bvd_id_number" LIKE '{country_iso_code}%'

#     """
#     conn = duckdb.connect()
#     df = conn.execute(query).fetchdf()
    
#     return df

# # ita_industry_classification = fetch_data("IT")
# ita_contact_info = fetch_data("IT")
# # ita_legal_info = fetch_data("IT")

## NON-HISTORICAL DATA

In [11]:


def fetch_data_for_country(country_iso_code: str) -> pd.DataFrame:
    """Fetch the joined firmographics rows for a single country.

    Industry-classification rows are LEFT JOINed with the contact-info and
    legal-info parquet files on ``bvd_id_number``. Rows are kept when the id
    is present and at least one of the four attribute columns is non-NULL.

    Because the country filter is applied to the LEFT-JOINed contact-info
    table in the WHERE clause, ids without a matching contact-info row for
    the requested country are excluded from the result.

    NOTE(review): a later cell in this notebook defines
    ``fetch_data_for_country`` again (with de-duplication). When the cells
    run top to bottom, that later definition replaces this one.

    Args:
        country_iso_code: Value compared against ``country_iso_code`` in the
            contact-info data. It is interpolated into the SQL text as-is,
            so only pass trusted values.

    Returns:
        DataFrame with the columns ``bvd_id_number``,
        ``nace_rev_2_core_code_4_digits_``, ``nuts2``, ``status`` and
        ``type_of_entity``. No de-duplication is performed, so an id may
        appear on several rows if the joined files hold several rows for it.
    """
    query = f"""
        SELECT
            nace.bvd_id_number,
            nace.nace_rev_2_core_code_4_digits_,
            nuts.nuts2,
            legal_info.status,
            legal_info.type_of_entity
        FROM
            '{INDUSTRY_CLASSIFICATION_PATH}' AS nace
        LEFT JOIN
            '{CONTACT_INFO_PATH}' AS nuts
        ON
            nace.bvd_id_number = nuts.bvd_id_number
        LEFT JOIN
            '{LEGAL_INFO_PATTERN}' AS legal_info
        ON
            nace.bvd_id_number = legal_info.bvd_id_number
        WHERE
            nace.bvd_id_number IS NOT NULL
        AND
            (nace.nace_rev_2_core_code_4_digits_ IS NOT NULL
            OR nuts.nuts2 IS NOT NULL
            OR legal_info.status IS NOT NULL
            OR legal_info.type_of_entity IS NOT NULL)
        AND
            nuts.country_iso_code = '{country_iso_code}'
    """

    # Run the query on a throw-away connection. The context manager closes it
    # as soon as the result has been materialised as a DataFrame, so repeated
    # calls (e.g. one per country) do not accumulate open connections.
    with duckdb.connect() as conn:
        return conn.execute(query).fetchdf()



In [11]:
def fetch_data_for_country(country_iso_code: str) -> pd.DataFrame:
    """Fetch one firmographics row per company for a single country.

    Industry-classification rows are LEFT JOINed with the contact-info and
    legal-info parquet files on ``bvd_id_number``. When the join yields
    several rows for the same id, only the row with the fewest NULLs among
    the four attribute columns is kept. Remaining ties are broken by the
    attribute values themselves, so the chosen row is the same on every run.

    Because the country filter is applied to the LEFT-JOINed contact-info
    table in the WHERE clause, ids without a matching contact-info row for
    the requested country are excluded from the result.

    Args:
        country_iso_code: Value compared against ``country_iso_code`` in the
            contact-info data. It is interpolated into the SQL text as-is,
            so only pass trusted values.

    Returns:
        DataFrame with the columns ``bvd_id_number``,
        ``nace_rev_2_core_code_4_digits_``, ``nuts2``, ``status`` and
        ``type_of_entity``, holding at most one row per ``bvd_id_number``.
    """
    query = f"""
        WITH joined_data AS (
            SELECT
                nace.bvd_id_number,
                nace.nace_rev_2_core_code_4_digits_,
                nuts.nuts2,
                legal_info.status,
                legal_info.type_of_entity,
                -- Number of missing values among the four attribute columns
                (
                    (CASE WHEN nace.nace_rev_2_core_code_4_digits_ IS NULL THEN 1 ELSE 0 END) +
                    (CASE WHEN nuts.nuts2 IS NULL THEN 1 ELSE 0 END) +
                    (CASE WHEN legal_info.status IS NULL THEN 1 ELSE 0 END) +
                    (CASE WHEN legal_info.type_of_entity IS NULL THEN 1 ELSE 0 END)
                ) AS missing_count
            FROM
                '{INDUSTRY_CLASSIFICATION_PATH}' AS nace
            LEFT JOIN
                '{CONTACT_INFO_PATH}' AS nuts
                ON nace.bvd_id_number = nuts.bvd_id_number
            LEFT JOIN
                '{LEGAL_INFO_PATTERN}' AS legal_info
                ON nace.bvd_id_number = legal_info.bvd_id_number
            WHERE
                nace.bvd_id_number IS NOT NULL
                AND (
                    nace.nace_rev_2_core_code_4_digits_ IS NOT NULL
                    OR nuts.nuts2 IS NOT NULL
                    OR legal_info.status IS NOT NULL
                    OR legal_info.type_of_entity IS NOT NULL
                )
                AND nuts.country_iso_code = '{country_iso_code}'
        ),
        ranked_data AS (
            SELECT
                *,
                -- rn = 1 marks the most complete row of each company. The
                -- attribute columns act as tie-breakers: rows that still tie
                -- are identical in every selected column, so the output does
                -- not depend on the engine's row order.
                ROW_NUMBER() OVER (
                    PARTITION BY bvd_id_number
                    ORDER BY
                        missing_count ASC,
                        nace_rev_2_core_code_4_digits_,
                        nuts2,
                        status,
                        type_of_entity
                ) AS rn
            FROM joined_data
        )
        SELECT
            bvd_id_number,
            nace_rev_2_core_code_4_digits_,
            nuts2,
            status,
            type_of_entity
        FROM ranked_data
        WHERE rn = 1
    """

    # Run the query on a throw-away connection. The context manager closes it
    # as soon as the result has been materialised as a DataFrame, so repeated
    # calls (e.g. one per country) do not accumulate open connections.
    with duckdb.connect() as conn:
        return conn.execute(query).fetchdf()


In [None]:
# Smoke test: run the extraction for one country before the full export.
# NOTE(review): `conn` is not passed to fetch_data_for_country, which opens
# its own connection -- confirm whether this handle is still needed.
conn = duckdb.connect()
country = "IT"

df = fetch_data_for_country(country)


In [None]:
# Destination folder for the per-country extracts. os.path.join picks the
# separator of the host OS (the previous hard-coded backslashes only formed a
# directory path on Windows). The trailing "" keeps a trailing separator, so
# a file name can still be appended to `output_path` by plain concatenation.
output_path = os.path.join("..", "..", "data_processed", "firmographics_processed", "")
# Create the folder up front so writing the first file cannot fail on a
# missing directory.
os.makedirs(output_path, exist_ok=True)

# Write one parquet file per country and print the code as a progress marker.
for country in all_countries:
    df = fetch_data_for_country(country)
    df.to_parquet(f"{output_path}firmographics_{country}.parquet")
    print(f"{country}")

In [14]:
# TEMP_TABLE_KEY_FINANCIALS = "..\\..\\data_raw\\key_financials\\key_financials_eur\\*.parquet"

# def fetch_data_for_country_historical(country_iso_code):
#     query = f"""
#     SELECT 
#         number_of_employees, 
#         closing_date,
#         bvd_id_number,
#         total_assets,
#         operating_revenue_turnover_,
#         EXTRACT(YEAR FROM closing_date) AS year  
#     FROM 
#         '{TEMP_TABLE_KEY_FINANCIALS}' 
#     WHERE
#         bvd_id_number LIKE '{country_iso_code}%'
#         """
    
#     # Connect to DuckDB and execute the query   
#     conn = duckdb.connect()
#     df = conn.execute(query).fetchdf()
#     return df

In [None]:
# ## Test
# # country = "IT"
# OUTPUT_PATH = "..\\..\\data_processed\\key_financials_processed\\"

# for country in all_countries:
#     df = fetch_data_for_country_historical(country)
#     os.makedirs(OUTPUT_PATH, exist_ok=True)
#     df.to_parquet(f"{OUTPUT_PATH}\\key_financials_processed_{country}.parquet")
#     print(f"{country}")
