# CONVERT FILES WITH DUCK_DB

In [1]:
import pandas as pd
import duckdb
import glob
import time

In [2]:
def time_it(func):
    """Decorator to measure the execution time of a function."""
    def wrapper(*args, **kwargs):
        start_time = time.time()  
        result = func(*args, **kwargs) 
        end_time = time.time() 
        elapsed_time = end_time - start_time  
        print(f"Execution time: {elapsed_time:.4f} seconds")  
        return result
    return wrapper

In [12]:

# !pip install duckdb

## Ownership history

In [13]:
# path = "E:\dati_moody\ownership_history\links_2022\part-00000-8f9cac6d-cf88-4461-91b4-13c784cdf6a9-c000.snappy.parquet" 


In [3]:
eu27_countries = [
    "AT", "BE", "BG", "HR", "CY", "CZ", 
    "DK", "EE", "FI", "FR", "DE", "GR", 
    "HU", "IE", "IT", "LV", "LT", "LU", 
    "MT", "NL", "PL", "PT", "RO", "SK", 
    "SI", "ES", "SE"
]

In [15]:
# PATH = r"E:\dati_moody\ownership_history\links_"
# additional_data_path = "E:/dati_moody/data_processed/nuts2.parquet"


# conn = duckdb.connect()
# conn.execute(f"""
#     CREATE TABLE temp_data AS 
#     SELECT 
#         main.subsidiary_bvd_id,
#         main.guo_25,
#         main.type_of_relation,
#         firmographics_sub.nuts2 AS subsidiary_nuts2,
#         firmographics_guo.nuts2 AS guo_nuts2,
#     FROM 
#         '{PATH}*/*.parquet' AS main
#     LEFT JOIN 
#         '{additional_data_path}' AS firmographics_sub
#     ON 
#         main.subsidiary_bvd_id = firmographics_sub.bvd_id_number
#     LEFT JOIN
#         '{additional_data_path}' AS firmographics_guo
#     ON
#         main.guo_25 = firmographics_guo.bvd_id_number
#     WHERE 
#         main.type_of_relation = 'GUO 25'
# """)

# def get_ownership_data_from_temp(year, country, conn):
#     query = f"""
#         SELECT 
#             subsidiary_bvd_id,
#             guo_25,
#             subsidiary_nuts2,
#             subsidiary_nace4,
#             guo_nuts2,
#             guo_nace4,
#             {year} AS year,
#             '{country}' AS country
#         FROM 
#             temp_data
#         WHERE 
#             guo_25 LIKE '{country}%'
#     """
#     return conn.execute(query).fetchdf()


# df = get_ownership_data_from_temp(2019, "IT", conn)
# df

In [7]:
# path = r"E:\dati_moody\ownership_history\links_2022\*.parquet"
# parquet_files = glob.glob(path)
# file_list = ", ".join(f"'{file}'" for file in parquet_files)


PATH = r"E:\dati_moody\ownership_history\links_"
additional_data_path = "E:/dati_moody/data_processed/additional_data.parquet"


def get_ownership_data(year, country, filter_type="guo_25"):
    # path = f"E:\dati_moody\ownership_history\links_{year}\*.parquet"
    print(f"{country} - {year}...")
    path = f"{PATH}{year}\*.parquet"
        
    query = f"""
        SELECT
            main.subsidiary_bvd_id,
            main.guo_25,
            firmographics_sub.nuts2 AS subsidiary_nuts2,
            firmographics_sub.nace_rev_2_core_code_4_digits_ AS subsidiary_nace4,
            firmographics_guo.nuts2 AS guo_nuts2,
            firmographics_guo.nace_rev_2_core_code_4_digits_ AS guo_nace4,
            {year} AS year
        FROM 
            '{path}' AS main
        LEFT JOIN 
            '{additional_data_path}' AS firmographics_sub
        ON 
            main.subsidiary_bvd_id = firmographics_sub.bvd_id_number
        LEFT JOIN
            '{additional_data_path}' AS firmographics_guo
        ON
            main.guo_25 = firmographics_guo.bvd_id_number
        WHERE 
            main."type_of_relation" = 'GUO 25'
            AND main."{filter_type}" LIKE '{country}%'
    """

    conn = duckdb.connect()
    df = conn.execute(query).fetchdf()
    return df


def convert_to_stata(df, output_path, country, year):
    print(f"Converting  to Stata...")
    df.to_stata(f"{output_path}/{country}_{year}.dta", write_index=False)

@time_it
def get_data_and_save_to_stata(year, country, output_path, filter_type):
    df = get_ownership_data(year, country, filter_type="guo_25")
    convert_to_stata(df, output_path, country, year)
    return df


year = '2007'
country = "IT"
output_path = r"E:\dati_moody\stata_files\subs_eu"


df = get_data_and_save_to_stata(
    year=year,
    country=country,
    output_path=output_path,
    filter_type="subsidiary_bvd_id"
)

df

  path = f"{PATH}{year}\*.parquet"


IT - 2007...
Converting  to Stata...
Execution time: 43.2184 seconds


Unnamed: 0,subsidiary_bvd_id,guo_25,subsidiary_nuts2,subsidiary_nace4,guo_nuts2,guo_nace4,year
0,IT04104040284,IT04104040284,ITH3 - Veneto,6820,ITH3 - Veneto,6820,2007
1,IT02402480343,IT02402480343,ITH5 - Emilia-Romagna,7010,ITH5 - Emilia-Romagna,7010,2007
2,IT01899940173,IT01899940173,ITC4 - Lombardia,2910,ITC4 - Lombardia,2910,2007
3,IT04394120960,IT04394120960,ITC4 - Lombardia,6630,ITC4 - Lombardia,6630,2007
4,ESG60126679,IT00811720580,ES51 - Cataluna,3511,ITI4 - Lazio,3511,2007
...,...,...,...,...,...,...,...
133330,GB02318452,ITP003777431,,,,,2007
133331,GB02209267,ITPPGBRPRB50B13Z114A,,,,,2007
133332,PL*220214441,ITPPFDNVGN46D04L682Z,,,,,2007
133333,PL*220376884,IT*220521948,,,,,2007


In [8]:
errors = {}

output_path = r"E:\\dati_moody\stata_files\\guos_eu"
filter_type = "guo_25"

for country in eu27_countries:
    for year in range(2007, 2023):
        year = str(year)
        try:
            get_data_and_save_to_stata(
                year=year, 
                country=country,
                filter_type=filter_type,
                output_path=output_path
                )
            print(f"Done {country} - {year}...")
        except Exception as e:
            errors[f"{country} - {year}"] = str(e)
            print(f"Error {country} - {year}...")

# save errors in json format
import json
with open("errors.json", "w") as f:
    json.dump(errors, f)

AT - 2007...
Converting  to Stata...
Execution time: 21.7697 seconds
Done AT - 2007...
AT - 2008...
Converting  to Stata...
Execution time: 45.6040 seconds
Done AT - 2008...
AT - 2009...
Converting  to Stata...
Execution time: 48.0494 seconds
Done AT - 2009...
AT - 2010...
Converting  to Stata...
Execution time: 60.0391 seconds
Done AT - 2010...
AT - 2011...
Converting  to Stata...
Execution time: 53.9097 seconds
Done AT - 2011...
AT - 2012...
Converting  to Stata...
Execution time: 64.4313 seconds
Done AT - 2012...
AT - 2013...
Converting  to Stata...
Execution time: 68.5903 seconds
Done AT - 2013...
AT - 2014...
Converting  to Stata...
Execution time: 82.1772 seconds
Done AT - 2014...
AT - 2015...
Converting  to Stata...
Execution time: 97.0278 seconds
Done AT - 2015...
AT - 2016...
Converting  to Stata...
Execution time: 100.6338 seconds
Done AT - 2016...
AT - 2017...
Converting  to Stata...
Execution time: 113.6821 seconds
Done AT - 2017...
AT - 2018...
Converting  to Stata...
Exec

In [None]:
# open errors
with open("errors.json", "r") as f:
    errors = json.load(f)

errors

In [None]:
# path = r"E:\dati_moody\ownership_history\links_2022\*.parquet"

# columns = '"subsidiary_bvd_id", "guo_25", "bvd_id_number", "country_iso_code", "nuts2"'

# country = "IT"

# query = f"""
#     SELECT
#         {columns},
#         2022 AS year
#     FROM 
#         '{path}' AS main
#     INNER JOIN 
#         '{nuts_path}' AS nuts
#     ON 
#         main.subsidiary_bvd_id = nuts.bvd_id_number
#     WHERE 
#         main."type_of_relation" = 'GUO 25'
#         AND main."subsidiary_bvd_id" LIKE '{country}%'
# """


# # Execute the query
# conn = duckdb.connect()
# df = conn.execute(query).fetchdf()

# df

## Firmographics

In [92]:

query = f"""
    SELECT *
    FROM 
        '{nuts_path}'
"""

# Execute the query
conn = duckdb.connect()
df = conn.execute(query).fetchdf()

In [None]:
df

In [None]:
query = f"""
    SELECT *
    FROM 
        '{nuts_path}'
"""

# Execute the query
conn = duckdb.connect()
df = conn.execute(query).fetchdf()

## Key Financials

In [12]:
key = pd.read_parquet("key_financials\key_financials_eur\part-00000-d5e63738-ec0f-45f1-9e6e-e90c27a22308-c000.snappy.parquet")

  key = pd.read_parquet("key_financials\key_financials_eur\part-00000-d5e63738-ec0f-45f1-9e6e-e90c27a22308-c000.snappy.parquet")


In [48]:
#key[['bvd_id_number', 'number_of_employees']]
key['year'] = key['bvd_id_number'] + key['consolidation_code'] + key['filing_type'] + str(key['closing_date'])

In [None]:
# look for duplicates

key

Unnamed: 0,bvd_id_number,consolidation_code,filing_type,closing_date,number_of_months,audit_status,accounting_practice,source_for_publicly_quoted_companies_,original_units,original_currency,...,shareholders_funds,current_ratio_x_,profit_margin_,roe_using_p_l_before_tax_,roce_using_p_l_before_tax_,solvency_ratio_asset_based_,price_earning_ratio_x_,number_of_employees,market_capitalisation_mil_,year
0,CN9463468227,LF,Local registry filing,2021-12-31,12,,,,units,CNY,...,,,,,,,,1.0,,CN9463468227LFLocal registry filing0 2...
1,CN9463468187,LF,Local registry filing,2022-12-31,12,,,,units,CNY,...,,,,,,,,2.0,,CN9463468187LFLocal registry filing0 2...
2,CN9463468049,LF,Local registry filing,2018-12-31,12,,,,units,CNY,...,-765.0,,,,,-17.65,,1.0,,CN9463468049LFLocal registry filing0 2...
3,CN9463467866,LF,Local registry filing,2018-12-31,12,,,,units,CNY,...,,,,,,,,27.0,,CN9463467866LFLocal registry filing0 2...
4,CN9463467865,LF,Local registry filing,2018-12-31,12,,,,units,CNY,...,,,,,,,,85.0,,CN9463467865LFLocal registry filing0 2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5012475,RU56826278,U1,Local registry filing,2016-12-31,12,,Local GAAP,,thousands,RUB,...,163564.0,,,-0.71,0.0,58.63,,2.0,,RU56826278U1Local registry filing0 202...
5012476,RU56826278,U1,Local registry filing,2015-12-31,12,,Local GAAP,,thousands,RUB,...,128183.0,,,-0.43,0.0,57.01,,2.0,,RU56826278U1Local registry filing0 202...
5012477,RU56826278,U1,Local registry filing,2014-12-31,12,,Local GAAP,,thousands,RUB,...,177693.0,,,-0.21,0.0,67.94,,2.0,,RU56826278U1Local registry filing0 202...
5012478,RU56849919N,LF,Local registry filing,2015-12-31,12,,,,units,RUB,...,113.0,1.01,2.44,411.11,,0.77,,2.0,,RU56849919NLFLocal registry filing0 20...


In [None]:
#str(key[['year']].iloc[0])[-50:]

'gistry filing0         2...\nName: 0, dtype: object'

In [50]:
key[['closing_date']]

Unnamed: 0,closing_date
0,2021-12-31
1,2022-12-31
2,2018-12-31
3,2018-12-31
4,2018-12-31
...,...
5012475,2016-12-31
5012476,2015-12-31
5012477,2014-12-31
5012478,2015-12-31


In [None]:
"""
e ricorda che: A financial year of accounts  is identified by the key: BVDID + Consolidation code + Filing type + Closing date. Up to 4 accounts can be delivered by company by year
Financials are available for a maximum period of 10 years

There are 3 tables (all with the same structure) in the key_financials database i.e.
1) key_financials (values are denominated in Original currency)
2) key_financials_usd (values are denominated in US dollar)
3) key_financials_eur (values are denominated in Euro)
"""