In [12]:
import pandas as pd
import numpy as np
import duckdb
import os

In [3]:
# Two-letter country codes of the 27 EU member states.
eu27_countries = [
    "AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI",
    "FR", "DE", "GR", "HU", "IE", "IT", "LV", "LT", "LU",
    "MT", "NL", "PL", "PT", "RO", "SK", "SI", "ES", "SE",
]

# Country codes exported by the loops further down in this notebook.
# NOTE(review): "CS" looks like the legacy Serbia-and-Montenegro code and
# "XK" the user-assigned Kosovo code -- confirm they match the raw data.
balkans = ["BA", "XK", "MK", "CS", "AL"]

# Glob patterns of the raw firmographics parquet files, read through DuckDB.
INDUSTRY_CLASSIFICATION_PATH = (
    "../data_raw/firmographics/industry_classifications/*.parquet"
)
CONTACT_INFO_PATH = (
    "../data_raw/firmographics/contact_info/*.parquet"
)
LEGAL_INFO_PATTERN = (
    "../data_raw/firmographics/legal_info/*.parquet"
)


## NON-HISTORICAL DATA

In [4]:


def fetch_data_for_country(country_iso_code):
    """Fetch the non-historical firmographics of one country.

    Joins the industry-classification, contact-info and legal-info parquet
    datasets on ``bvd_id_number`` and keeps the rows whose contact-info
    record has ``country_iso_code`` equal to the requested code.

    Parameters
    ----------
    country_iso_code : str
        Country code matched against ``country_iso_code`` in the
        contact-info data (e.g. ``"IT"``).

    Returns
    -------
    pandas.DataFrame
        Columns ``bvd_id_number``, ``nace_rev_2_core_code_4_digits_``,
        ``nuts2``, ``status`` and ``type_of_entity``.

    Notes
    -----
    The WHERE clause filters on a column of the left-joined contact-info
    table, so firms without a contact-info row never appear in the result
    (the first LEFT JOIN effectively behaves like an inner join).
    """
    # The parquet globs are module constants and are interpolated into the
    # FROM clauses; the caller-supplied country code is bound as a prepared
    # statement parameter (`?`) instead of being pasted into the SQL text.
    query = f"""
        SELECT 
            nace.bvd_id_number,
            nace.nace_rev_2_core_code_4_digits_,
            nuts.nuts2,
            legal_info.status,
            legal_info.type_of_entity
        FROM
            '{INDUSTRY_CLASSIFICATION_PATH}' AS nace
        LEFT JOIN
            '{CONTACT_INFO_PATH}' AS nuts
        ON
            nace.bvd_id_number = nuts.bvd_id_number
        LEFT JOIN
            '{LEGAL_INFO_PATTERN}' AS legal_info
        ON
            nace.bvd_id_number = legal_info.bvd_id_number
        WHERE
            nace.bvd_id_number IS NOT NULL
        AND 
            (nace.nace_rev_2_core_code_4_digits_ IS NOT NULL 
            OR nuts.nuts2 IS NOT NULL 
            OR legal_info.status IS NOT NULL 
            OR legal_info.type_of_entity IS NOT NULL)
        AND
            nuts.country_iso_code = ?
    """

    # Use the connection as a context manager so it is closed even when the
    # query fails; the DataFrame is fully materialised before the close.
    with duckdb.connect() as conn:
        return conn.execute(query, [country_iso_code]).fetchdf()



In [5]:
# TEST

# country = "IT"
# conn = duckdb.connect()

# df = fetch_data_for_country(country)


In [6]:
# Build the output directory with the platform's own separator instead of
# hard-coded backslashes. The trailing "" keeps a trailing separator, so the
# value stays usable as a plain string prefix.
output_path = os.path.join("..", "data_processed", "firmographics_processed", "")

# Create the target directory up front so the first write cannot fail on a
# fresh checkout.
os.makedirs(output_path, exist_ok=True)

# Write one parquet file per country and print the code as a progress marker.
for country in balkans:
    df = fetch_data_for_country(country)
    df.to_parquet(os.path.join(output_path, f"firmographics_{country}.parquet"))
    print(f"{country}")

BA
XK
MK
CS
AL


## HISTORICAL DATA

In [10]:
# Glob of the key-financials parquet files. Forward slashes are used, as for
# the other input globs in this notebook, so the pattern is not tied to
# Windows path separators.
TEMP_TABLE_KEY_FINANCIALS = "../data_raw/key_financials/key_financials_eur/*.parquet"

def fetch_data_for_country_historical(country_iso_code):
    """Fetch the historical key financials of one country.

    Selects the rows of the key-financials parquet files whose
    ``bvd_id_number`` starts with ``country_iso_code``.

    Parameters
    ----------
    country_iso_code : str
        Prefix matched against ``bvd_id_number`` with SQL ``LIKE``
        (presumably BvD ids begin with the country code -- confirm).

    Returns
    -------
    pandas.DataFrame
        Columns ``number_of_employees``, ``closing_date``,
        ``bvd_id_number``, ``total_assets``,
        ``operating_revenue_turnover_`` and ``year`` (the year extracted
        from ``closing_date``).
    """
    # The LIKE pattern is bound as a prepared-statement parameter rather than
    # being pasted into the SQL text.
    query = f"""
    SELECT 
        number_of_employees, 
        closing_date,
        bvd_id_number,
        total_assets,
        operating_revenue_turnover_,
        EXTRACT(YEAR FROM closing_date) AS year  
    FROM 
        '{TEMP_TABLE_KEY_FINANCIALS}' 
    WHERE
        bvd_id_number LIKE ?
        """

    # Use the connection as a context manager so it is closed even when the
    # query fails; the DataFrame is fully materialised before the close.
    with duckdb.connect() as conn:
        return conn.execute(query, [f"{country_iso_code}%"]).fetchdf()

In [13]:
## Test
# country = "IT"
# Build the output directory with the platform's own separator instead of
# hard-coded backslashes.
OUTPUT_PATH = os.path.join("..", "data_processed", "firmographics_historicals_processed")

# Creating the directory does not depend on the country, so do it once
# before the loop.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Write one parquet file per country and print the code as a progress marker.
for country in balkans:
    df = fetch_data_for_country_historical(country)
    df.to_parquet(os.path.join(OUTPUT_PATH, f"firmographics_historicals_{country}.parquet"))
    print(f"{country}")


BA
XK
MK
CS
AL
