# CONVERT FILES WITH DUCK_DB

In [1]:
import functools
import os
import time

import duckdb
import numpy as np
import pandas as pd

In [2]:
# Path and folder name presumably intended for zipping the exported files.
# NOTE(review): neither constant is read by the active code in this notebook;
# only the commented-out archive cell at the end redefines and uses them.
# ZIP_PATH is a raw string, so each `\\` stays as two literal backslashes.
ZIP_PATH = r"..\data_processed\\guo_subs_europee\\"
ZIP_FOLDER = "guos_eu"

In [3]:
def time_it(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function is called with the original arguments and its
    return value is passed through unchanged. The elapsed time is printed
    only when the call returns normally; if ``func`` raises, the exception
    propagates and nothing is printed.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same signature and metadata as ``func``.
    """
    # functools.wraps keeps func's __name__/__doc__ on the wrapper, so decorated
    # functions stay recognisable in tracebacks, help() and logs.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # system clock adjustments during a long-running call.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - start_time
        print(f"Execution time: {elapsed_time:.4f} seconds")
        return result
    return wrapper

## Ownership history

In [4]:
# path = "E:\dati_moody\ownership_history\links_2022\part-00000-8f9cac6d-cf88-4461-91b4-13c784cdf6a9-c000.snappy.parquet" 

In [5]:
# Two-letter codes of the 27 countries handled by this notebook. The order is
# the order in which countries are processed when the list is iterated.
eu27_countries = [
    "AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI",
    "FR", "DE", "GR", "HU", "IE", "IT", "LV", "LT", "LU",
    "MT", "NL", "PL", "PT", "RO", "SK", "SI", "ES", "SE",
]

In [6]:

# TEMP_TABLE_FIRMOGRAPHICS = "..\\data_processed\\firmographics_processed\\*.parquet" 
# TEMP_TABLE_KEY_FINANCIALS = "..\\data_raw\\key_financials\\key_financials_eur\\*.parquet"


# def get_ownership_data(year, country, path=None):
#     print(f"{country} - {year}...")
            
#     query = f"""
#         SELECT
#             main.subsidiary_bvd_id,
#             main.guo_25,
#             firmographics_sub.nuts2 AS subsidiary_nuts2,
#             firmographics_sub.nace_rev_2_core_code_4_digits_ AS subsidiary_nace4,
#             firmographics_guo.nuts2 AS guo_nuts2,
#             firmographics_guo.nace_rev_2_core_code_4_digits_ AS guo_nace4,
#             firmographics_guo.type_of_entity AS guo_type_of_entity,
#             firmographics_guo.status AS guo_status,
#             {year} AS year,
#             key_financials.number_of_employees AS guo_number_of_employees,
#             key_financials.closing_date AS guo_closing_date,
#         FROM 
#             '{path}' AS main
#         LEFT JOIN 
#             '{TEMP_TABLE_FIRMOGRAPHICS}' AS firmographics_sub
#         ON 
#             main.subsidiary_bvd_id = firmographics_sub.bvd_id_number
#         LEFT JOIN
#             '{TEMP_TABLE_FIRMOGRAPHICS}' AS firmographics_guo
#         ON
#             main.guo_25 = firmographics_guo.bvd_id_number
#         LEFT JOIN (
#             SELECT 
#                 number_of_employees, 
#                 closing_date,
#                 bvd_id_number,
#                 EXTRACT(YEAR FROM closing_date) AS financial_year  
#             FROM 
#                 '{TEMP_TABLE_KEY_FINANCIALS}' 
#         ) AS key_financials
#         ON 
#             main.guo_25 = key_financials.bvd_id_number   -- CHANGE HERE FOR SUBS
#         AND 
#             EXTRACT(YEAR FROM key_financials.closing_date) = {year}  
#         WHERE 
#             main."type_of_relation" = 'GUO 25'
#         AND 
#             main."guo_25" LIKE '{country}%'
#     """

#     conn = duckdb.connect()
#     df = conn.execute(query).fetchdf()
#     return df



In [7]:

# Glob over the processed firmographics parquet files. get_ownership_data
# joins it twice: once on the subsidiary id and once on the GUO id.
TEMP_TABLE_FIRMOGRAPHICS = "..\\data_processed\\firmographics_processed\\*.parquet" 
# Folder expected to hold one `firmographics_historicals_<country>.parquet`
# file per country (the file name is appended in get_ownership_data).
# NOTE: despite the name, this points at the historical firmographics folder,
# not at the raw key-financials files used by the earlier, commented-out
# version of the query.
TEMP_TABLE_KEY_FINANCIALS = "..\\data_processed\\firmographics_historicals_processed"


def get_ownership_data(year, country, path=None):
    """Query the 'GUO 25' ownership links of one country for one year.

    Builds a DuckDB query over the ownership-links parquet file(s) at
    ``path``. Rows are kept when ``type_of_relation`` is ``'GUO 25'`` and the
    ``guo_25`` identifier starts with ``country``. The query left-joins the
    firmographics files twice (subsidiary and GUO) and the country's
    historical firmographics file, filtered to ``year``, for the GUO's
    number of employees and closing date.

    Args:
        year: Reference year as an int or a digit string. It is written into
            the SQL text, so it is coerced with ``int()`` first.
        country: Prefix used both to filter ``guo_25`` and to select the
            ``firmographics_historicals_<country>.parquet`` file.
        path: Parquet file or glob holding the ownership links. Required,
            even though the signature keeps the historical ``None`` default.

    Returns:
        The query result as a pandas DataFrame.

    Raises:
        ValueError: If ``path`` is ``None`` or ``year`` cannot be parsed as
            an integer.
    """
    # Fail early with a clear message instead of letting DuckDB try to read
    # a file literally called 'None'.
    if path is None:
        raise ValueError("path to the ownership-links parquet file(s) is required")
    # The year ends up verbatim in the SQL text; int() guarantees that it is
    # a plain numeric literal.
    year_literal = int(year)

    print(f"{country} - {year}...")
    country_key_financials = (
        TEMP_TABLE_KEY_FINANCIALS + f"\\firmographics_historicals_{country}.parquet"
    )

    query = f"""
        SELECT
            main.subsidiary_bvd_id,
            main.guo_25,
            firmographics_sub.nuts2 AS subsidiary_nuts2,
            firmographics_sub.nace_rev_2_core_code_4_digits_ AS subsidiary_nace4,
            firmographics_guo.nuts2 AS guo_nuts2,
            firmographics_guo.nace_rev_2_core_code_4_digits_ AS guo_nace4,
            firmographics_guo.type_of_entity AS guo_type_of_entity,
            firmographics_guo.status AS guo_status,
            {year_literal} AS year,
            key_financials.number_of_employees AS guo_number_of_employees,
            key_financials.closing_date AS guo_closing_date
        FROM 
            '{path}' AS main
        LEFT JOIN 
            '{TEMP_TABLE_FIRMOGRAPHICS}' AS firmographics_sub
        ON 
            main.subsidiary_bvd_id = firmographics_sub.bvd_id_number
        LEFT JOIN
            '{TEMP_TABLE_FIRMOGRAPHICS}' AS firmographics_guo
        ON
            main.guo_25 = firmographics_guo.bvd_id_number
        LEFT JOIN 
            (SELECT * 
             FROM '{country_key_financials}'
             WHERE 
                year = {year_literal}
            ) AS key_financials
        ON
            main.guo_25 = key_financials.bvd_id_number   -- CHANGE HERE FOR SUBS
        AND
            key_financials.year = {year_literal}

        WHERE 
            main."type_of_relation" = 'GUO 25'
        AND 
            main."guo_25" LIKE '{country}%'
    """

    # The context manager closes the in-memory connection even if the query
    # fails; fetchdf() materialises the result before the connection closes.
    with duckdb.connect() as conn:
        return conn.execute(query).fetchdf()



In [8]:
@time_it
def fetch_and_convert_to_excel(year, country, path, output_path, chunks=5):
    """Query one country/year and write the result as CSV chunk files.

    Despite its name, the function writes CSV files, not Excel or Stata (a
    Stata export via ``to_stata`` was tried earlier and abandoned). The
    files are named ``<country>_<year>_chunk_<i>.csv`` with ``i`` starting
    at 1, and are written inside ``output_path``, which is created if
    missing.

    Args:
        year: Reference year, forwarded to ``get_ownership_data``.
        country: Country prefix, forwarded to ``get_ownership_data``.
        path: Ownership-links parquet file or glob, forwarded to
            ``get_ownership_data``.
        output_path: Directory that receives the CSV files.
        chunks: Number of roughly equal parts the result is split into.
            Defaults to 5, the previously hard-coded value.

    Returns:
        None. The files are the only output.
    """
    df = get_ownership_data(year, country, path)
    os.makedirs(output_path, exist_ok=True)

    # Split row *positions*, not index labels: the parts are selected with
    # .iloc, which is positional, so this is correct for any index.
    row_positions = np.array_split(np.arange(len(df)), chunks)

    for chunk_no, positions in enumerate(row_positions, start=1):
        # os.path.join places the file inside output_path whether or not the
        # directory string already ends with a separator.
        chunk_file = os.path.join(
            output_path, f"{country}_{year}_chunk_{chunk_no}.csv"
        )
        df.iloc[positions].to_csv(chunk_file, index=False)
        print(f"Data for {country} - {year} - slice_{chunk_no} has been converted.")
        

### Test

In [None]:
# Smoke test: run the query once for a single country and keep the result in
# `df` for inspection.
# NOTE(review): year 2009 is queried against the links_2007 files, whereas the
# batch run pairs links_<year> with the same year — confirm this is intended.
year, country = "2009", "DE"
path = "..\\data_raw\\ownership_history\\links_2007\\*.parquet"

df = get_ownership_data(year, country, path=path)
# fetch_and_convert_to_excel(country=country, year=year, path=path, output_path=output_path)

In [None]:
# Show the result of the test query as the cell output.
df

In [13]:
# year = 2021
# country = "IT" 
# errors = {}

# for country in eu27_countries:
#     for year in range(2007, 2022):
#         try:
#             path = f"..\\data_raw\\ownership_history\\links_{year}\\*.parquet"
#             OUTPUT_PATH = r"..\data_processed\\guo_subs_europee\\guos_eu_\\" 
#             fetch_and_convert_to_stata(year, country, path, OUTPUT_PATH)
#             print(f"{country} - {year} done.")
#         except Exception as e:
#             print(f"Error in {country} - {year}")
#             print(e)
#             continue

In [None]:
# Show `df` again; the cell in between is fully commented out, so this is
# still whatever `df` was last assigned.
df

In [None]:
import time
import json

# Batch run: export every country/year combination to CSV chunk files and
# collect failures instead of aborting, so one bad input does not stop the run.
errors = {}
OUTPUT_PATH = r"..\data_processed\\guo_subs_europee\\guos_eu\\"

total_start = time.time()

# Redefined here, one or few codes per line, so that individual countries can
# be commented out when re-running only part of the batch.
eu27_countries = [
    "AT", "BE", "BG", "HR", "CY", "CZ", 
    "DK", "EE", "FI", "FR", "DE",
    "GR", "HU", "IE", 
    "IT", 
    "LV", "LT", "LU", 
    "MT", "NL", "PL", "PT", "RO", "SK", 
    "SI", "ES", "SE"
]

for country in eu27_countries:
    # Years 2007-2021, passed on as strings as before.
    for year in map(str, range(2007, 2022)):
        # Restart the clock for every file so the message below reports the
        # time of this conversion, not the time since the batch started.
        start_time = time.time()
        path = f"..\\data_raw\\ownership_history\\links_{year}\\*.parquet"
        try:
            fetch_and_convert_to_excel(year, country, path, OUTPUT_PATH)
        except Exception as e:
            # Deliberately broad: this is the top-level boundary of a
            # best-effort batch. The error is recorded and the run continues.
            errors[f"{country} - {year}"] = str(e)
            print(f"Error {country} - {year}: {e}")
        else:
            duration = time.time() - start_time
            print(f"Data for {country} - {year} has been converted to CSV in {duration:.2f} seconds.")

total_duration = time.time() - total_start
print(f"Total execution time: {total_duration:.2f} seconds.")

# Persist the failures so they can be inspected after the run.
with open("errors_guos.json", "w") as f:
    json.dump(errors, f)


In [None]:
import os

def delete_non_chunk_files(folder_path, keep_marker="chunk"):
    """Delete the regular files in ``folder_path`` whose name lacks ``keep_marker``.

    Only direct entries of ``folder_path`` are considered. Sub-directories
    are never removed or descended into. A file that cannot be deleted is
    reported and skipped, so the remaining files are still processed.

    Args:
        folder_path: Directory to clean up.
        keep_marker: Substring that protects a file from deletion. Defaults
            to ``"chunk"``, the previously hard-coded value. An empty string
            matches every name, so nothing is deleted.

    Returns:
        None.

    Raises:
        OSError: If ``folder_path`` itself cannot be listed.
    """
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        # Keep anything carrying the marker and leave directories alone.
        if keep_marker in file_name or not os.path.isfile(file_path):
            continue

        print(f"Deleting: {file_name}")
        try:
            os.remove(file_path)
        except OSError as e:
            # Filesystem failures (permissions, file in use, ...) are reported
            # and skipped; other exceptions would be bugs and are not hidden.
            print(f"Error deleting {file_name}: {e}")
        else:
            print(f"Deleted: {file_name}")

# Folder to clean up with delete_non_chunk_files. Written as a raw string so
# the backslashes are not parsed as (invalid) escape sequences such as `\d`.
folder_path = r"..\data_processed\guo_subs_europee\guos_eu"
# The call is left commented out on purpose because it deletes files:
# delete_non_chunk_files(folder_path)


In [None]:
# open errors
import json 

# Reload the error log written by the batch run and show it as the cell output.
with open("errors_guos.json", "r") as error_log:
    errors = json.loads(error_log.read())

errors

In [None]:
# import shutil

# ZIP_PATH = r"..\data_processed\\guo_subs_europee\\guos_eu"
# ZIP_FOLDER = r"..\data_processed\\guo_subs_europee"

# shutil.make_archive(
#     base_name=fr"{ZIP_FOLDER}\guos_eu_zipped",  
#     format="zip",
#     base_dir=ZIP_PATH  
# )
