# ITALY

In [2]:
import pandas as pd
import duckdb
import os
import time
import numpy as np
import sys
sys.path.insert(0, os.path.abspath(".."))
from utils import time, data_processor, constants

In [3]:
# Destination folder for the gzipped per-year exports.
# Built with os.path.join so the separator matches the host OS (the previous raw
# string contained literal doubled backslashes). The trailing "" keeps a trailing
# separator, so plain string concatenation with a file name keeps working too.
OUTPUT_PATH = os.path.join("..", "zipped_files", "italy", "")
# exist_ok=True removes the check-then-create race and keeps the cell re-runnable.
os.makedirs(OUTPUT_PATH, exist_ok=True)

## Ownership history

In [4]:
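# NOTE: this whole cell is a disabled (commented-out) variant of get_ownership_data
# that started from a `main` parquet at `path` and joined on guo_25. It is kept for
# reference only; the active definition is in the following cell.
# import duckdb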

# TEMP_TABLE_FIRMOGRAPHICS = "..\\data_processed\\firmographics_processed\\firmographics_full\\" 
# TEMP_TABLE_KEY_FINANCIALS = "..\\data_processed\\key_financials_detailed_processed\\"
# TEMP_TABLE_LISTED_FINANCIALS = "..\\data_processed\\listed_financials_processed\\"

# def get_ownership_data(year, country, path=None, debug=False):
#     print(f"{country} - {year}...")

#     COUNTRY_KEY_FINANCIALS = TEMP_TABLE_KEY_FINANCIALS + f"key_financials_detailed_{country}.parquet"
#     COUNTRY_FIRMOGRAPHICS = TEMP_TABLE_FIRMOGRAPHICS + f"{country}.parquet"
#     LISTED_FINANCIALS = TEMP_TABLE_LISTED_FINANCIALS + f"listed_financials_{country}.parquet"

#     conn = duckdb.connect()

#     query = f"""
#     SELECT
#         main.*,
#         firmographics.*,
#         key_financials_detailed.*,
#         listed_financials.*
#     FROM
#         read_parquet('{path}') AS main
#     LEFT JOIN
#         read_parquet('{COUNTRY_FIRMOGRAPHICS}') AS firmographics
#     ON
#         main.guo_25 = firmographics.bvd_id_number
#     LEFT JOIN
#         read_parquet('{COUNTRY_KEY_FINANCIALS}') AS key_financials_detailed
#     ON
#         main.guo_25 = key_financials_detailed.bvd_id_number
#         AND key_financials_detailed.year = {year}
#     LEFT JOIN
#         read_parquet('{LISTED_FINANCIALS}') AS listed_financials
#     ON
#         main.guo_25 = listed_financials.bvd_id_number
#     WHERE
#         main.type_of_relation = 'GUO 25'
#         AND main.guo_25 LIKE '{country}%'
#     """

#     df = conn.execute(query).fetchdf()
#     conn.close()
#     return df


In [5]:
import duckdb

# Folders holding the processed parquet inputs, all relative to the notebook
# directory and all ending with a path separator so a file name can be appended.
_DATA_PROCESSED_DIR = "..\\data_processed\\"

TEMP_TABLE_FIRMOGRAPHICS = _DATA_PROCESSED_DIR + "firmographics_processed\\firmographics_full\\"
TEMP_TABLE_KEY_FINANCIALS = _DATA_PROCESSED_DIR + "key_financials_detailed_processed\\"
TEMP_TABLE_LISTED_FINANCIALS = _DATA_PROCESSED_DIR + "listed_financials_processed\\"


def get_ownership_data(year, country, debug=False):
    """Load one year of listed-company financials for a country, joined with
    firmographics and detailed key financials.

    The listed-financials parquet is the driving table. Firmographics are
    left-joined on ``bvd_id_number``; detailed key financials are left-joined on
    ``bvd_id_number`` *and* the requested year.

    The year predicate for key financials lives in the join condition rather
    than in ``WHERE``. With the predicate in ``WHERE``, a listed company that had
    key-financials rows only for *other* years was removed from the result
    altogether; now it is kept with NULL key-financials columns.

    Args:
        year: Reporting year to select. Interpolated unquoted into the SQL, so
            ``2007`` and ``"2007"`` produce the same query.
        country: Prefix used both to pick the per-country parquet files and to
            filter ``bvd_id_number`` (``LIKE '<country>%'``).
        debug: When true, print the generated SQL before running it.

    Returns:
        The query result as returned by DuckDB's ``fetchdf()``. All columns of
        the three tables are selected, so column names shared between tables
        occur more than once in the result.

    Note:
        ``year`` and ``country`` are formatted directly into the SQL text; pass
        trusted values only.
    """
    print(f"{country} - {year}...")

    COUNTRY_KEY_FINANCIALS = TEMP_TABLE_KEY_FINANCIALS + f"key_financials_detailed_{country}.parquet"
    COUNTRY_FIRMOGRAPHICS = TEMP_TABLE_FIRMOGRAPHICS + f"{country}.parquet"
    LISTED_FINANCIALS = TEMP_TABLE_LISTED_FINANCIALS + f"listed_financials_{country}.parquet"

    query = f"""
    SELECT
        listed_financials.*,
        firmographics.*,
        key_financials_detailed.*
    FROM
        read_parquet('{LISTED_FINANCIALS}') AS listed_financials
    LEFT JOIN
        read_parquet('{COUNTRY_FIRMOGRAPHICS}') AS firmographics
        ON firmographics.bvd_id_number = listed_financials.bvd_id_number
    LEFT JOIN
        read_parquet('{COUNTRY_KEY_FINANCIALS}') AS key_financials_detailed
        ON key_financials_detailed.bvd_id_number = listed_financials.bvd_id_number
        AND key_financials_detailed.year = {year}
    WHERE
        listed_financials.year = {year}
        AND listed_financials.bvd_id_number LIKE '{country}%'
    """

    if debug:
        print(query)

    conn = duckdb.connect()
    try:
        df = conn.execute(query).fetchdf()
    finally:
        # Release the connection even when the query fails (e.g. missing parquet file).
        conn.close()
    return df


### Query

In [6]:
# Smoke test: pull a single year for Italy before running the full export loop.
df = get_ownership_data(year="2007", country="IT")


IT - 2007...


In [8]:
# Inspect the schema of the joined result. Names ending in _1/_2 (e.g.
# bvd_id_number_1, total_assets_1, year_1) presumably mark columns whose name is
# selected from more than one of the joined tables -- confirm before relying on them.
df.columns

Index(['bvd_id_number', 'fixed_assets', 'net_assets', 'share_capital',
       'total_assets', 'total_current_assets', 'total_current_liabilities',
       'non_current_liabilities', 'number_of_employees_lf', 'total_revenues',
       'enterprise_value', 'year', 'bvd_id_number_1',
       'nace_rev_2_core_code_4_digits_', 'nuts2', 'nuts3', 'city_native_',
       'status', 'type_of_entity', 'national_legal_form', 'postcode',
       'country_iso_code', 'city', 'region_in_country', 'bvd_id_number_2',
       'operating_revenue_turnover_', 'number_of_employees', 'total_assets_1',
       'profit_margin_', 'costs_of_goods_sold', 'gross_profit',
       'material_costs', 'cash_flow', 'added_value', 'p_l_before_tax',
       'roe_using_p_l_before_tax_', 'roce_using_p_l_before_tax_', 'year_1'],
      dtype='object')

In [12]:
import os

# Export window: one gzipped CSV per year, both bounds inclusive.
country = "IT"
start_year, end_year = 2007, 2024

for year in range(start_year, end_year + 1):
    print(f"Processing {country} {year}...")

    # Year is handed over as a string, same as in the single-year test call.
    df = get_ownership_data(year=str(year), country=country)

    # File name pattern: <country>_<year>.csv.gz inside OUTPUT_PATH.
    output_path = os.path.join(OUTPUT_PATH, f"{country}_{year}.csv.gz")
    df.to_csv(output_path, compression="gzip", index=False)

    print(f"Saved to {output_path}")


Processing IT 2007...
IT - 2007...
Saved to ..\data_processed\\italy\\IT_2007.csv.gz
Processing IT 2008...
IT - 2008...
Saved to ..\data_processed\\italy\\IT_2008.csv.gz
Processing IT 2009...
IT - 2009...
Saved to ..\data_processed\\italy\\IT_2009.csv.gz
Processing IT 2010...
IT - 2010...
Saved to ..\data_processed\\italy\\IT_2010.csv.gz
Processing IT 2011...
IT - 2011...
Saved to ..\data_processed\\italy\\IT_2011.csv.gz
Processing IT 2012...
IT - 2012...
Saved to ..\data_processed\\italy\\IT_2012.csv.gz
Processing IT 2013...
IT - 2013...
Saved to ..\data_processed\\italy\\IT_2013.csv.gz
Processing IT 2014...
IT - 2014...
Saved to ..\data_processed\\italy\\IT_2014.csv.gz
Processing IT 2015...
IT - 2015...
Saved to ..\data_processed\\italy\\IT_2015.csv.gz
Processing IT 2016...
IT - 2016...
Saved to ..\data_processed\\italy\\IT_2016.csv.gz
Processing IT 2017...
IT - 2017...
Saved to ..\data_processed\\italy\\IT_2017.csv.gz
Processing IT 2018...
IT - 2018...
Saved to ..\data_processed\\it

## Explore 

In [None]:
# Order rows by company id and then by year for inspection.
df = df.sort_values(by=['bvd_id_number', 'year'])
# End the cell with the frame itself so the sorted result is rendered. The
# previous bare `df.drop` only referenced the method without calling it, so
# nothing was dropped.
# NOTE(review): if de-duplication or column removal was intended here, add that
# call explicitly.
df

Unnamed: 0,bvd_id_number,fixed_assets,net_assets,share_capital,total_assets,total_current_assets,total_current_liabilities,non_current_liabilities,number_of_employees_lf,total_revenues,...,profit_margin_,costs_of_goods_sold,gross_profit,material_costs,cash_flow,added_value,p_l_before_tax,roe_using_p_l_before_tax_,roce_using_p_l_before_tax_,year_1
3171,IT00050540384,150103000.0,114548000.0,5794000.0,156169000.0,6066000.0,2852000.0,38769000.0,80.0,9461000.0,...,31.02,2608000.0,6853000.0,1848000.0,6782000.0,6589000.0,2935000.0,2.56,2.00,2007
3173,IT00050540384,150103000.0,114548000.0,5794000.0,156169000.0,6066000.0,2852000.0,38769000.0,80.0,9461000.0,...,31.02,2608000.0,6853000.0,1848000.0,6782000.0,6589000.0,2935000.0,2.56,2.00,2007
3175,IT00050540384,150103000.0,114548000.0,5794000.0,156169000.0,6066000.0,2852000.0,38769000.0,80.0,9461000.0,...,31.02,2608000.0,6853000.0,1848000.0,6782000.0,6589000.0,2935000.0,2.56,2.00,2007
3177,IT00050540384,150103000.0,114548000.0,5794000.0,156169000.0,6066000.0,2852000.0,38769000.0,80.0,9461000.0,...,31.02,2608000.0,6853000.0,1848000.0,6782000.0,6589000.0,2935000.0,2.56,2.00,2007
3179,IT00050540384,150103000.0,114548000.0,5794000.0,156169000.0,6066000.0,2852000.0,38769000.0,80.0,9461000.0,...,31.02,2608000.0,6853000.0,1848000.0,6782000.0,6589000.0,2935000.0,2.56,2.00,2007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5774,IT97579210010,59483000.0,107206000.0,4722000.0,235140000.0,175657000.0,75169000.0,65478000.0,2272.0,279076000.0,...,13.55,101370000.0,177706000.0,6955000.0,19965000.0,174565000.0,37820000.0,40.02,23.78,2007
6176,IT97579210010,59483000.0,107206000.0,4722000.0,235140000.0,175657000.0,75169000.0,65478000.0,2272.0,279076000.0,...,13.55,101370000.0,177706000.0,6955000.0,19965000.0,174565000.0,37820000.0,40.02,23.78,2007
6281,IT97579210010,59483000.0,107206000.0,4722000.0,235140000.0,175657000.0,75169000.0,65478000.0,2272.0,279076000.0,...,13.55,101370000.0,177706000.0,6955000.0,19965000.0,174565000.0,37820000.0,40.02,23.78,2007
6530,IT97579210010,59483000.0,107206000.0,4722000.0,235140000.0,175657000.0,75169000.0,65478000.0,2272.0,279076000.0,...,13.55,101370000.0,177706000.0,6955000.0,19965000.0,174565000.0,37820000.0,40.02,23.78,2007


In [9]:
# Show the three company-id columns of the joined result side by side.
df.loc[:, ['bvd_id_number', 'bvd_id_number_1', 'bvd_id_number_2']]

Unnamed: 0,bvd_id_number,bvd_id_number_1,bvd_id_number_2
0,IT01487430280,IT01487430280,IT01487430280
1,IT00796400158,IT00796400158,IT00796400158
2,IT00811720580,IT00811720580,IT00811720580
3,IT04245520376,IT04245520376,IT04245520376
4,IT06722600019,IT06722600019,IT06722600019
...,...,...,...
31797,IT31524EI,,IT31524EI
31798,IT31570EI,,IT31570EI
31799,IT31482EI,,IT31482EI
31800,IT31561EI,,IT31561EI


In [11]:

# Write the current frame to a scratch CSV first ...
# NOTE(review): the file name says 2023, but the content is whatever `df`
# currently holds -- confirm which year was loaded before using the file.
df.to_csv("test_ita_2023.csv", index=False)
# ... then end the cell with the column subset so it is actually rendered: only
# a cell's last expression is displayed, so the subset was previously discarded.
df[['bvd_id_number', 'total_assets', 'total_assets_1', 'year']]

In [18]:
# Alphabetically ordered list of the column names, displayed as the cell output.
cols = sorted(df.columns.tolist())
cols

['added_value',
 'bvd_id_number',
 'bvd_id_number_1',
 'bvd_id_number_2',
 'cash_flow',
 'city',
 'city_native_',
 'costs_of_goods_sold',
 'country_iso_code',
 'enterprise_value',
 'fixed_assets',
 'gross_profit',
 'material_costs',
 'nace_rev_2_core_code_4_digits_',
 'national_legal_form',
 'net_assets',
 'non_current_liabilities',
 'number_of_employees',
 'number_of_employees_lf',
 'nuts2',
 'nuts3',
 'operating_revenue_turnover_',
 'p_l_before_tax',
 'postcode',
 'profit_margin_',
 'region_in_country',
 'roce_using_p_l_before_tax_',
 'roe_using_p_l_before_tax_',
 'share_capital',
 'status',
 'total_assets',
 'total_assets_1',
 'total_current_assets',
 'total_current_liabilities',
 'total_revenues',
 'type_of_entity',
 'year',
 'year_1']