# CONVERT FILES WITH DUCK_DB

In [1]:
import pandas as pd
import duckdb
import os
import time
import numpy as np
from utils import time, data_processor, constants

In [3]:
OUTPUT_PATH = r"..\data_processed\\guo_subs_europee\\subs_eu\\" 

eu27_countries = constants.eu

## Ownership history

In [5]:

TEMP_TABLE_FIRMOGRAPHICS = "..\\data_processed\\firmographics_processed\\*.parquet" 
TEMP_TABLE_KEY_FINANCIALS = "..\\data_processed\\firmographics_historicals_processed"


def get_ownership_data(year, country, path=None):
    print(f"{country} - {year}...")
    COUNTRY_KEY_FINANCIALS = TEMP_TABLE_KEY_FINANCIALS + f"\\firmographics_historicals_{country}.parquet"
            
            
    query = f"""
        SELECT
            main.subsidiary_bvd_id,
            main.guo_25,
            firmographics_sub.nuts2 AS subsidiary_nuts2,
            firmographics_sub.nace_rev_2_core_code_4_digits_ AS subsidiary_nace4,
            firmographics_guo.nuts2 AS guo_nuts2,
            firmographics_guo.nace_rev_2_core_code_4_digits_ AS guo_nace4,
            firmographics_guo.type_of_entity AS guo_type_of_entity,
            firmographics_guo.status AS guo_status,
            {year} AS year,
            key_financials.number_of_employees AS guo_number_of_employees,
            key_financials.closing_date AS guo_closing_date
        FROM 
            '{path}' AS main
        LEFT JOIN 
            '{TEMP_TABLE_FIRMOGRAPHICS}' AS firmographics_sub
        ON 
            main.subsidiary_bvd_id = firmographics_sub.bvd_id_number
        LEFT JOIN
            '{TEMP_TABLE_FIRMOGRAPHICS}' AS firmographics_guo
        ON
            main.guo_25 = firmographics_guo.bvd_id_number
        LEFT JOIN 
            (SELECT * 
             FROM '{COUNTRY_KEY_FINANCIALS}'
             WHERE 
                year = {year}
            ) AS key_financials
        ON
            main.subsidiary_bvd_id = key_financials.bvd_id_number   -- CHANGE HERE FOR SUBS
        AND
            key_financials.year = {year}

        WHERE 
            main."type_of_relation" = 'GUO 25'
        AND 
            main."subsidiary_bvd_id" LIKE '{country}%'
    """

    conn = duckdb.connect()
    df = conn.execute(query).fetchdf()
    return df



In [6]:
@time.time_it
def fetch_and_convert_to_excel(year, country, path, output_path):
    df = get_ownership_data(year, country, path)
    os.makedirs(output_path, exist_ok=True)
    slices = data_processor.split_in_chunks(df, 5)

    for i, s in enumerate(slices):
        s.to_csv(f"{output_path}\\{country}_{year}_chunk_{i+1}.csv", index=False)
        print(f"Data for {country} - {year} - slice_{i+1} has been converted.")
        

### Test

In [7]:
# path = "..\\data_raw\\ownership_history\\links_2007\\*.parquet"
country = "DE"
year = "2009"
path = "..\\data_raw\\ownership_history\\links_2007\\*.parquet"

df = get_ownership_data(
    path=path,
    year=year,
    country=country,
)
# fetch_and_convert_to_excel(country=country, year=year, path=path, output_path=output_path)

DE - 2009...


## Query data

In [8]:
import time
import json

errors = {}

total_start = time.time()
start_time = time.time()


for country in eu27_countries:
    for year in range(2022, 2023):
        year = str(year)
        try:
            path = f"..\\data_raw\\ownership_history\\links_{year}\\*.parquet"
            fetch_and_convert_to_excel(year, country, path, OUTPUT_PATH)  
            end_time = time.time()
            duration = end_time - start_time
            print(f"Data for {country} - {year} has been converted to Stata in {duration:.2f} seconds.")
        except Exception as e:
            errors[f"{country} - {year}"] = str(e)
            print(f"Error {country} - {year}")
            continue


end_time = time.time()
duration = end_time - start_time
total_end = time.time()
total_duration = total_end - total_start
print(f"Total execution time: {total_duration:.2f} seconds.")

with open("errors_guos.json", "w") as f:
    json.dump(errors, f)


AT - 2022...
Data for AT - 2022 - slice_1 has been converted.
Data for AT - 2022 - slice_2 has been converted.
Data for AT - 2022 - slice_3 has been converted.
Data for AT - 2022 - slice_4 has been converted.
Data for AT - 2022 - slice_5 has been converted.
Execution time: 95.8373 seconds
Data for AT - 2022 has been converted to Stata in 95.84 seconds.
BE - 2022...
Data for BE - 2022 - slice_1 has been converted.
Data for BE - 2022 - slice_2 has been converted.
Data for BE - 2022 - slice_3 has been converted.
Data for BE - 2022 - slice_4 has been converted.
Data for BE - 2022 - slice_5 has been converted.
Execution time: 396.7448 seconds
Data for BE - 2022 has been converted to Stata in 492.58 seconds.
BG - 2022...
Data for BG - 2022 - slice_1 has been converted.
Data for BG - 2022 - slice_2 has been converted.
Data for BG - 2022 - slice_3 has been converted.
Data for BG - 2022 - slice_4 has been converted.
Data for BG - 2022 - slice_5 has been converted.
Execution time: 151.8585 secon

In [9]:
# open errors
import json 

with open("errors/errors_subs.json", "r") as f:
    errors = json.load(f)

errors

{}

## Zip the files

In [None]:
ZIP_OUTPOUT = "..\\zipped_files\\subs_eu"

data_processor.compress_csv_files(folder_to_zip=OUTPUT_PATH, zip_output=ZIP_OUTPOUT)

Compressed ..\data_processed\\guo_subs_europee\\subs_eu\\EE_2012_chunk_3.csv to ..\zipped_files\subs_eu\EE_2012_chunk_3.csv.gz
Compressed ..\data_processed\\guo_subs_europee\\subs_eu\\GR_2022_chunk_4.csv to ..\zipped_files\subs_eu\GR_2022_chunk_4.csv.gz
Compressed ..\data_processed\\guo_subs_europee\\subs_eu\\CY_2014_chunk_1.csv to ..\zipped_files\subs_eu\CY_2014_chunk_1.csv.gz
Compressed ..\data_processed\\guo_subs_europee\\subs_eu\\MT_2014_chunk_2.csv to ..\zipped_files\subs_eu\MT_2014_chunk_2.csv.gz
Compressed ..\data_processed\\guo_subs_europee\\subs_eu\\SK_2009_chunk_2.csv to ..\zipped_files\subs_eu\SK_2009_chunk_2.csv.gz
Compressed ..\data_processed\\guo_subs_europee\\subs_eu\\LU_2017_chunk_3.csv to ..\zipped_files\subs_eu\LU_2017_chunk_3.csv.gz
Compressed ..\data_processed\\guo_subs_europee\\subs_eu\\IT_2018_chunk_5.csv to ..\zipped_files\subs_eu\IT_2018_chunk_5.csv.gz
Compressed ..\data_processed\\guo_subs_europee\\subs_eu\\AT_2018_chunk_4.csv to ..\zipped_files\subs_eu\AT_2018