# Convert files with DuckDB

In [1]:
import functools
import os
import time

import duckdb
import numpy as np
import pandas as pd

In [None]:
# Folder that receives the exported CSV chunks.
# NOTE(review): this is a raw string, so every `\\` is two literal backslashes
# and the value contains doubled separators; presumably Windows tolerates
# that -- confirm before reusing the value on another platform.
OUTPUT_PATH = r"..\\data_processed\\balkans\\" 

In [3]:
def time_it(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function's return value is passed through unchanged. The
    elapsed time is printed only when the call returns normally; if ``func``
    raises, the exception propagates and nothing is printed.
    """
    # Preserve func's __name__, __doc__, etc. on the wrapper.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is not affected by
        # system clock adjustments the way time.time() is.
        start = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print(f"Execution time: {elapsed:.4f} seconds")
        return result
    return wrapper

## Ownership history

In [4]:
# Two-letter prefixes iterated by the export loop; get_ownership_data keeps
# the rows whose subsidiary_bvd_id starts with the given prefix.
balkans = ["BA", "XK", "MK", "CS", "AL"]
##

In [5]:

# Folders holding the per-country parquet files that get_ownership_data joins
# against: firmographics_<country>.parquet and
# key_financials_detailed_<country>.parquet respectively.
TEMP_TABLE_FIRMOGRAPHICS = "..\\data_processed\\firmographics_processed" 
TEMP_TABLE_KEY_FINANCIALS_DETAILED = "..\\data_processed\\key_financials_detailed_processed"


def get_ownership_data(year, country, path=None):
    """Return the 'GUO 25' ownership links of one country for one year.

    Rows are read from the parquet file(s) at ``path`` and kept when
    ``type_of_relation`` is ``'GUO 25'`` and ``subsidiary_bvd_id`` starts with
    ``country``. Each row is enriched through LEFT JOINs with:

    * the subsidiary's key financials for ``year``,
    * the subsidiary's firmographics (NUTS2, 4-digit NACE code),
    * the firmographics of the ``guo_25`` entity (NUTS2, NACE, entity type,
      status).

    Args:
        year: Year of the links snapshot. It is interpolated unquoted into the
            SQL, so it must be numeric text or an int (e.g. ``"2019"``).
        country: Two-letter prefix used both to select the per-country
            parquet files and to filter ``subsidiary_bvd_id``.
        path: Path or glob of the ownership-links parquet file(s). Required
            despite the ``None`` default, which is kept for compatibility.

    Returns:
        pandas.DataFrame with one row per matching link.

    Raises:
        ValueError: If ``path`` is not provided.
    """
    if path is None:
        # Without this check the literal file name 'None' would be queried.
        raise ValueError("path to the ownership-links parquet file(s) is required")

    print(f"{country} - {year}...")

    key_financials_file = TEMP_TABLE_KEY_FINANCIALS_DETAILED + f"\\key_financials_detailed_{country}.parquet"
    firmographics_file = TEMP_TABLE_FIRMOGRAPHICS + f"\\firmographics_{country}.parquet"

    # NOTE: the guo_25 attributes come from the *same country's* firmographics
    # file, so a guo_25 entity that is not listed there yields NULL columns.
    # NOTE(review): values are interpolated into the SQL text; only call this
    # with trusted arguments.
    query = f"""
        SELECT
            main.subsidiary_bvd_id,
            main.guo_25,
            {year} AS year,

            firmographics_sub.nuts2 AS subsidiary_nuts2,
            firmographics_sub.nace_rev_2_core_code_4_digits_ AS subsidiary_nace4,
            firmographics_guo.nuts2 AS guo_nuts2,
            firmographics_guo.nace_rev_2_core_code_4_digits_ AS guo_nace4,
            firmographics_guo.type_of_entity AS guo_type_of_entity,
            firmographics_guo.status AS guo_status,
            
            key_financials_detailed.operating_revenue_turnover_,
            key_financials_detailed.number_of_employees,
            key_financials_detailed.costs_of_goods_sold,
            key_financials_detailed.material_costs,
            key_financials_detailed.added_value
        FROM 
            '{path}' AS main

        LEFT JOIN 
            '{key_financials_file}' AS key_financials_detailed
        ON 
            main.subsidiary_bvd_id = key_financials_detailed.bvd_id_number
        AND
            key_financials_detailed.year = {year}

        LEFT JOIN 
            '{firmographics_file}' AS firmographics_sub
        ON 
            main.subsidiary_bvd_id = firmographics_sub.bvd_id_number

        LEFT JOIN 
            '{firmographics_file}' AS firmographics_guo
        ON 
            main.guo_25 = firmographics_guo.bvd_id_number

        WHERE 
            main."type_of_relation" = 'GUO 25'
        AND 
            main."subsidiary_bvd_id" LIKE '{country}%'

    """

    # The context manager closes the in-memory connection once the result has
    # been materialised, instead of leaking one connection per call.
    with duckdb.connect() as conn:
        return conn.execute(query).fetchdf()

In [None]:
# Single-run check of get_ownership_data for prefix "AL" and year 2019.
# Point `path` at another links_<year> folder (e.g. links_2007) to try a
# different snapshot.
country, year = "AL", "2019"
path = "..\\data_raw\\ownership_history\\links_2019\\*.parquet"

df_al = get_ownership_data(year, country, path=path)
df_al

In [7]:
@time_it
def fetch_and_convert_to_excel(year, country, path, output_path):
    """Fetch one country/year of ownership data and write it as 5 CSV chunks.

    Despite the name, the output is CSV, not Excel: the files are named
    ``<country>_<year>_chunk_<n>.csv`` (n = 1..5) inside ``output_path``,
    which is created if missing. Chunk sizes differ by at most one row; an
    empty result still produces five header-only files.

    Args:
        year: Year passed to ``get_ownership_data`` and used in file names.
        country: Country prefix passed to ``get_ownership_data`` and used in
            file names.
        path: Path or glob of the ownership-links parquet file(s).
        output_path: Folder that receives the CSV chunks.

    Returns:
        None.
    """
    df = get_ownership_data(year, country, path)
    os.makedirs(output_path, exist_ok=True)

    n_chunks = 5
    # Split by row position rather than by index label, so .iloc stays
    # correct even if the frame does not carry a default RangeIndex.
    position_groups = np.array_split(np.arange(len(df)), n_chunks)

    for i, positions in enumerate(position_groups, start=1):
        df.iloc[positions].to_csv(f"{output_path}\\{country}_{year}_chunk_{i}.csv", index=False)

        print(f"Data for {country} - {year} - slice_{i} has been converted.")

In [None]:
import time
import json

# Export every country in `balkans` for each year 2007-2022. A failing
# country/year is recorded in `errors` and the loop moves on, so one bad
# snapshot does not abort the whole run.
errors = {}
OUTPUT_PATH = r"..\\data_processed\\balkans\\"

total_start = time.perf_counter()

for country in balkans:
    for year in range(2007, 2023):
        year = str(year)
        path = f"..\\data_raw\\ownership_history\\links_{year}\\*.parquet"
        # Restart the timer for each item so the printed duration is for this
        # country/year only, not cumulative since the start of the run.
        start_time = time.perf_counter()
        try:
            fetch_and_convert_to_excel(year, country, path, OUTPUT_PATH)
        except Exception as e:
            # Deliberate best-effort: keep the message and continue.
            errors[f"{country} - {year}"] = str(e)
            print(f"Error {country} - {year}")
        else:
            duration = time.perf_counter() - start_time
            print(f"Data for {country} - {year} has been converted to excel in {duration:.2f} seconds.")

total_duration = time.perf_counter() - total_start
print(f"Total execution time: {total_duration:.2f} seconds.")

# Make sure the target folder exists so the error log is not lost after a
# long run.
os.makedirs("errors", exist_ok=True)
with open("errors\\errors_balkans.json", "w") as f:
    json.dump(errors, f)

In [None]:
# Reload the error log written by the export loop and display it.
import json

with open("errors/errors_balkans.json", "r") as error_file:
    errors = json.load(error_file)

errors

## Zip the files

In [2]:
# Import the project's zip helper under an alias so it does not shadow the
# builtin zip() for the rest of the session.
from utils import zip as zip_utils

# Archive the folder that holds the exported CSV chunks.
OUTPUT_PATH = r"..\\data_processed\\balkans\\"
zipper = zip_utils.FileZipper(OUTPUT_PATH)
zipper.zip_folder("balkans.zip")

Archive created at z:\dati_moody\zipped_files\balkans_zipped.zip


In [8]:
import os
# List the entries of the zipped_files folder (shown as the cell output).
os.listdir("..\\zipped_files")

['.git',
 '.gitignore',
 '.python-version',
 '.venv',
 'constants.py',
 'data_processed',
 'data_raw',
 'dati_moody.egg-info',
 'pyproject.toml',
 'queries',
 'README.md',
 'setup.py',
 'utils',
 'uv.lock',
 'zipped_files',
 '__pycache__']

In [1]:
import gzip
import shutil
import os

# Paths: the CSV is read from INPUT_PATH and the gzip archive is written to
# OUTPUT_PATH.
INPUT_PATH = "..\\data_processed\\balkans"
OUTPUT_PATH = "..\\zipped_files"

# NOTE(review): the export loop in this notebook only writes
# <country>_<year>_chunk_<n>.csv files; balkans.csv has to be produced
# separately, otherwise open() below raises FileNotFoundError.
input_csv = os.path.join(INPUT_PATH, "balkans.csv")
# Write the archive to OUTPUT_PATH, which was previously defined but unused.
output_csv_gz = os.path.join(OUTPUT_PATH, "balkans.csv.gz")

os.makedirs(OUTPUT_PATH, exist_ok=True)

# Stream the file through gzip so it is never loaded into memory at once.
with open(input_csv, "rb") as f_in, gzip.open(output_csv_gz, "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)

print(f"Compressed {input_csv} to {output_csv_gz}")



FileNotFoundError: [Errno 2] No such file or directory: '..\\data_processed\\balkans\\balkans.csv'