In [1]:
import pandas as pd
import numpy as np
import duckdb
import os
import gc
import psutil

import sys
sys.path.insert(0, os.path.abspath("../.."))

from utils import time, data_processor, constants

In [2]:
# Handle to the process running this kernel; reused by print_memory_usage.
process = psutil.Process(os.getpid())


def print_memory_usage(prefix=""):
    """Print the resident set size (RSS) of the current process.

    The value is the RSS in bytes divided by 1024**2 and is labelled "MB".

    Args:
        prefix: Text written immediately before the "Memory usage" label.
    """
    rss_bytes = process.memory_info().rss
    mem = rss_bytes / (1024 ** 2)  # bytes -> MB
    print(f"{prefix}Memory usage: {mem:.2f} MB")


In [3]:
# Every country code from all regional groups in utils.constants, de-duplicated.
all_countries = set().union(
    constants.eu,
    constants.americas,
    constants.asia,
    constants.africa,
    constants.oceania,
    constants.balkans,
)

# Individual regional groups, also held as sets.
eu = set(constants.eu)
balkans = set(constants.balkans)

In [4]:
import os
import duckdb

def query_ownership_history_subs(country, year):
    """Export one year's 'GUO 25' ownership links for a country's subsidiaries.

    Reads every parquet file in the raw ``links_{year}`` folder, keeps the rows
    whose ``subsidiary_bvd_id`` starts with ``country`` and whose
    ``type_of_relation`` equals ``'GUO 25'``, and writes the three selected
    columns to ``subs/{country}_{year}.parquet`` under the processed-data
    folder. The output directory is created if it does not exist.

    Args:
        country: Prefix matched against the start of ``subsidiary_bvd_id``
            (presumably a two-letter country code -- confirm against the data).
        year: Year used to pick the input folder and to name the output file.

    Returns:
        None. The result is written to disk and a confirmation line is printed.
    """
    input_path = f"Z:\\dati_moody\\data_raw\\ownership_history\\links_{year}\\*.parquet"
    output_path = f"Z:\\dati_moody\\data_processed\\ownership_history\\subs\\{country}_{year}.parquet"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Fix: the original SELECT list had no comma after ``main.guo_25``, which
    # made the statement invalid instead of selecting three separate columns.
    query = f"""
        SELECT
            main.subsidiary_bvd_id,
            main.guo_25,
            main.type_of_relation
        FROM
            '{input_path}' AS main
        WHERE 
            main.subsidiary_bvd_id LIKE '{country}%'
        AND
            main."type_of_relation" = 'GUO 25'
        AND
            main.subsidiary_bvd_id IS NOT NULL
    """

    conn = duckdb.connect()
    try:
        # COPY streams the query result straight to parquet, so the rows are
        # never materialised as a Python object.
        conn.execute(f"""
            COPY ({query})
            TO '{output_path}' (FORMAT PARQUET)
        """)
        print(f"Saved {country} - {year} to {output_path}")
    finally:
        # Always release the connection, even when the query fails.
        conn.close()


In [5]:
def query_ownership_history_guos(country, year):
    """Export one year's 'GUO 25' ownership links owned from a given country.

    Scans all parquet files in the raw ``links_{year}`` folder, keeps the rows
    whose ``guo_25`` starts with ``country`` and whose ``type_of_relation`` is
    ``'GUO 25'``, and saves them as ``guos/{country}_{year}.parquet`` in the
    processed-data folder (the folder is created when missing).

    Args:
        country: Prefix matched against the start of the ``guo_25`` column.
        year: Year selecting the input folder and naming the output file.

    Returns:
        None. The rows are written to disk and a confirmation is printed.
    """
    input_path = f"Z:\\dati_moody\\data_raw\\ownership_history\\links_{year}\\*.parquet"
    output_path = f"Z:\\dati_moody\\data_processed\\ownership_history\\guos\\{country}_{year}.parquet"

    # Make sure the destination folder exists before DuckDB writes into it.
    target_dir = os.path.dirname(output_path)
    os.makedirs(target_dir, exist_ok=True)

    query = f"""
        SELECT
            main.subsidiary_bvd_id,
            main.guo_25,
            main.type_of_relation
        FROM
            '{input_path}' AS main
        WHERE 
            main.guo_25 LIKE '{country}%'
        AND
            main."type_of_relation" = 'GUO 25'
        AND
            main.guo_25 IS NOT NULL
    """

    # The connection is closed on leaving the block, whether or not COPY fails.
    with duckdb.connect() as conn:
        conn.execute(f"""
            COPY ({query})
            TO '{output_path}' (FORMAT PARQUET)
        """)
        print(f"Saved {country} - {year} to {output_path}")


In [6]:
# Two-letter country codes on the to-do list.
todo = [
    'IT', 'SI', 'DE', 'HR', 'AT', 'CY', 'GR', 'BG', 'HU', 'NL',
    'CN', 'US', 'TR', 'CH', 'RU', 'GB',
    'BA', 'RS', 'ME', 'MK', 'KV', 'AL',
]

# China, USA, Turkey, Switzerland, Russia, UK

In [10]:
# Export the GUO links for Russia, one parquet file per year.
# To run every EU country instead, iterate over the `eu` set: for country in eu:
for country in ['RU']:
    # range() excludes its end value, so this covers 2007 through 2023.
    for year in range(2007, 2024):
        print(f"Processing {country} for year {year}")
        query_ownership_history_guos(country, year)


Processing RU for year 2007
Saved RU - 2007 to Z:\dati_moody\data_processed\ownership_history\guos\RU_2007.parquet
Processing RU for year 2008
Saved RU - 2008 to Z:\dati_moody\data_processed\ownership_history\guos\RU_2008.parquet
Processing RU for year 2009
Saved RU - 2009 to Z:\dati_moody\data_processed\ownership_history\guos\RU_2009.parquet
Processing RU for year 2010
Saved RU - 2010 to Z:\dati_moody\data_processed\ownership_history\guos\RU_2010.parquet
Processing RU for year 2011
Saved RU - 2011 to Z:\dati_moody\data_processed\ownership_history\guos\RU_2011.parquet
Processing RU for year 2012
Saved RU - 2012 to Z:\dati_moody\data_processed\ownership_history\guos\RU_2012.parquet
Processing RU for year 2013
Saved RU - 2013 to Z:\dati_moody\data_processed\ownership_history\guos\RU_2013.parquet
Processing RU for year 2014
Saved RU - 2014 to Z:\dati_moody\data_processed\ownership_history\guos\RU_2014.parquet
Processing RU for year 2015
Saved RU - 2015 to Z:\dati_moody\data_processed\owne

In [None]:
# Export the GUO links for Switzerland, Serbia and Great Britain, one file per year.
# To run every EU country instead, iterate over the `eu` set: for country in eu:
for country in ['CH', 'RS', 'GB']:
    # range() excludes its end value, so this covers 2007 through 2023.
    for year in range(2007, 2024):
        print(f"Processing {country} for year {year}")
        query_ownership_history_guos(country, year)


In [None]:
def query_ownership_history_guos(country, year):
    """Load one year's 'GUO 25' ownership links owned from a country.

    Scans all parquet files in the raw ``links_{year}`` folder and returns the
    rows whose ``guo_25`` starts with ``country`` and whose
    ``type_of_relation`` equals ``'GUO 25'``.

    NOTE: this definition reuses the name of the earlier function that writes
    the same selection to parquet. Once this cell has run, that exporter is
    replaced in the kernel, and loops that call it will load DataFrames and
    write nothing to disk.

    Args:
        country: Prefix matched against the start of the ``guo_25`` column.
        year: Year selecting the input folder.

    Returns:
        pandas.DataFrame with the columns ``subsidiary_bvd_id``, ``guo_25``
        and ``type_of_relation``.
    """
    input_path = f"Z:\\dati_moody\\data_raw\\ownership_history\\links_{year}\\*.parquet"
    # Fix: the copy-pasted ``output_path`` / ``os.makedirs`` lines were removed.
    # This function writes nothing, yet they created the ``subs`` output
    # folder as a side effect of a GUO query.

    query = f"""
        SELECT
            main.subsidiary_bvd_id,
            main.guo_25,
            main.type_of_relation
        FROM
            '{input_path}' AS main
        WHERE 
            main.guo_25 LIKE '{country}%'
        AND
            main."type_of_relation" = 'GUO 25'
        AND
            main.guo_25 IS NOT NULL
    """

    conn = duckdb.connect()
    try:
        # Materialise the result before the connection is released.
        return conn.execute(query).df()
    finally:
        # Fix: the original never closed the connection, leaking one per call.
        conn.close()

In [None]:
# One country/year pair loaded into memory for interactive inspection.
country, year = 'CN', 2020

df = query_ownership_history_guos(country, year)


In [None]:
# Preview the query result; the notebook renders a truncated view of the frame.
df

Unnamed: 0,subsidiary_bvd_id,guo_25,type_of_relation
0,CN9413257419,CN*110276856089,GUO 25
1,CN9413257420,CN*110226613182,GUO 25
2,CN9413257424,CN*110276856093,GUO 25
3,CN9413257431,CN*110252115078,GUO 25
4,CN9413257432,CN*110267714678,GUO 25
...,...,...,...
35706470,CN9461098764,CN*110254431053,GUO 25
35706471,CN9461098772,CN*110277417136,GUO 25
35706472,CN9461098772,CN*110277417136,GUO 25
35706473,CN9461098780,CN9461098780,GUO 25


In [None]:
# Two-letter prefixes counted as Balkan countries for this check.
balkans = ['BA', 'KV', 'AL', 'RS', 'ME', 'MK']

# The first two characters of the subsidiary id are used as its country code.
df['subsidiary_country'] = df['subsidiary_bvd_id'].str.slice(stop=2)
# True where that code is one of the Balkan prefixes listed above.
df['is_balkan'] = df['subsidiary_country'].isin(balkans)

In [None]:
# Preview the frame again, now including subsidiary_country and is_balkan.
df

Unnamed: 0,subsidiary_bvd_id,guo_25,type_of_relation,subsidiary_country,is_balkan
0,CN9413257419,CN*110276856089,GUO 25,CN,False
1,CN9413257420,CN*110226613182,GUO 25,CN,False
2,CN9413257424,CN*110276856093,GUO 25,CN,False
3,CN9413257431,CN*110252115078,GUO 25,CN,False
4,CN9413257432,CN*110267714678,GUO 25,CN,False
...,...,...,...,...,...
35706470,CN9461098764,CN*110254431053,GUO 25,CN,False
35706471,CN9461098772,CN*110277417136,GUO 25,CN,False
35706472,CN9461098772,CN*110277417136,GUO 25,CN,False
35706473,CN9461098780,CN9461098780,GUO 25,CN,False


In [None]:
# Count how many links have a Balkan subsidiary and how many do not.
df['is_balkan'].value_counts()

is_balkan
False    35701923
True         4552
Name: count, dtype: int64