# Convert files with DuckDB

In [1]:
import pandas as pd
import duckdb
import os
import time
import numpy as np
import sys
sys.path.insert(0, os.path.abspath(".."))
from utils import time, data_processor, constants

In [2]:
# Folder that receives the per-year CSV chunks.
# Written as a regular (non-raw) string: combining the r-prefix with
# already-escaped backslashes produced two literal backslashes between
# every path component.
OUTPUT_PATH = "..\\data_processed\\guo_subs_world\\guos_china\\"
# Country code prefix matched against guo_25 in the ownership query.
country = 'CN'

## Define query

In [3]:
# Folders holding the per-country parquet files joined onto the ownership links.
TEMP_TABLE_FIRMOGRAPHICS = "..\\data_processed\\firmographics_processed"
TEMP_TABLE_KEY_FINANCIALS_DETAILED = "..\\data_processed\\key_financials_detailed_processed"

def get_ownership_data(year, country, path=None):
    """Return the 'GUO 25' ownership links of one country, enriched with firm data.

    The query reads the ownership-links parquet file(s) at ``path``, keeps rows
    whose ``type_of_relation`` is ``'GUO 25'`` and whose ``guo_25`` id starts
    with ``country``, and LEFT JOINs:

    * the country's firmographics file twice (once on the subsidiary id, once
      on the GUO id), and
    * the country's detailed key financials for ``year`` (GUO employee count).

    Both lookup files are the ``{country}`` files, so ids not present in them
    (for instance subsidiaries not listed in that file) come back as NULLs.

    Parameters
    ----------
    year : int or str
        Year written to the ``year`` column and used to filter the financials.
    country : str
        Country code; selects the lookup files and the ``guo_25`` prefix.
    path : str
        Parquet file or glob with the ownership links. Required, despite the
        ``None`` default kept for signature compatibility.

    Returns
    -------
    pandas.DataFrame
        One row per matching link; see the SELECT list for the columns.

    Raises
    ------
    ValueError
        If ``path`` is not provided.

    Notes
    -----
    ``year``, ``country`` and ``path`` are interpolated directly into the SQL
    text, so only pass trusted values.
    The GUO ``country_iso_code`` column is the only GUO column returned without
    a ``guo_`` prefix; it is left as is so existing consumers keep working.
    """
    if path is None:
        # Without this guard the query would try to read a file literally named 'None'.
        raise ValueError("path to the ownership-links parquet file(s) is required")

    print(f"{country} - {year}...")

    COUNTRY_FIRMOGRAPHICS = TEMP_TABLE_FIRMOGRAPHICS + f"\\firmographics_{country}.parquet"
    COUNTRY_KEY_FINANCIALS = TEMP_TABLE_KEY_FINANCIALS_DETAILED + f"\\key_financials_detailed_{country}.parquet"

    query = f"""
        SELECT
            main.subsidiary_bvd_id,
            main.guo_25,
            {year} AS year,

            firmographics_guo.nuts2 AS guo_nuts2,
            firmographics_guo.nace_rev_2_core_code_4_digits_ AS guo_nace4,
            firmographics_guo.type_of_entity AS guo_type_of_entity,
            firmographics_guo.status AS guo_status,
            firmographics_guo.postcode AS guo_postcode,
            firmographics_guo.country_iso_code,
            firmographics_guo.city as guo_city,
            firmographics_guo.region_in_country as guo_region_in_country,

            firmographics_sub.nuts2 AS subsidiary_nuts2,
            firmographics_sub.nace_rev_2_core_code_4_digits_ AS subsidiary_nace4,
            firmographics_sub.type_of_entity AS subsidiary_type_of_entity,
            firmographics_sub.postcode AS subsidiary_postcode,
            firmographics_sub.country_iso_code AS subsidiary_country_iso_code,
            firmographics_sub.city AS subsidiary_city,
            firmographics_sub.region_in_country AS subsidiary_region_in_country,

            key_financials_detailed.number_of_employees as guo_number_of_employees,
            
        FROM 
            '{path}' AS main

        LEFT JOIN 
            '{COUNTRY_KEY_FINANCIALS}' AS key_financials_detailed
        ON 
            main.guo_25 = key_financials_detailed.bvd_id_number
        AND 
            key_financials_detailed.year = {year}

        LEFT JOIN 
            '{COUNTRY_FIRMOGRAPHICS}' AS firmographics_sub
        ON 
            main.subsidiary_bvd_id = firmographics_sub.bvd_id_number

        LEFT JOIN 
            '{COUNTRY_FIRMOGRAPHICS}' AS firmographics_guo
        ON 
            main.guo_25 = firmographics_guo.bvd_id_number

        WHERE 
            main."type_of_relation" = 'GUO 25'
        AND 
            main."guo_25" LIKE '{country}%'
    """

    conn = duckdb.connect()
    try:
        return conn.execute(query).fetchdf()
    finally:
        # Release the in-memory database even when the query fails; the
        # function is called once per country/year in a long-running loop.
        conn.close()


In [4]:
def split_in_chunks(df, n_chunks):
    """Split a DataFrame into ``n_chunks`` consecutive, near-equal row slices.

    Rows are split by position, so the result does not depend on the frame's
    index labels (splitting the labels and passing them to ``.iloc`` only
    works when the index happens to be the default 0..n-1 range).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is not modified.
    n_chunks : int
        Number of slices to return. Must be positive (``np.array_split``
        raises ``ValueError`` otherwise).

    Returns
    -------
    list of pandas.DataFrame
        Exactly ``n_chunks`` frames in row order; trailing ones are empty when
        the frame has fewer rows than ``n_chunks``.
    """
    positions = np.array_split(np.arange(len(df)), n_chunks)
    return [df.iloc[pos] for pos in positions]

@time.time_it
def fetch_and_convert_to_excel(year, country, path, output_path, n_chunks=5):
    """Query one country/year of ownership data and write it as CSV chunks.

    Despite the name, the output is CSV, not Excel: the query result is split
    with ``data_processor.split_in_chunks`` and every slice is saved as
    ``{country}_{year}_chunk_{k}.csv`` (k starting at 1) inside ``output_path``.

    Parameters
    ----------
    year : int or str
        Year passed to ``get_ownership_data`` and used in the file names.
    country : str
        Country code passed to ``get_ownership_data`` and used in the file names.
    path : str
        Parquet file or glob with the ownership links.
    output_path : str
        Destination folder; created if it does not exist.
    n_chunks : int, optional
        Number of CSV files to split the result into (default 5).
    """
    df = get_ownership_data(year, country, path)
    os.makedirs(output_path, exist_ok=True)
    slices = data_processor.split_in_chunks(df, n_chunks)

    for chunk_number, chunk in enumerate(slices, start=1):
        # os.path.join avoids doubling the separator when output_path already
        # ends with one, and does not hard-code the Windows separator.
        target = os.path.join(output_path, f"{country}_{year}_chunk_{chunk_number}.csv")
        chunk.to_csv(target, index=False)
        print(f"Data for {country} - {year} - slice_{chunk_number} has been converted.")

### Test

In [5]:
# Smoke test: load a single country/year into `df` for the inspection cells below.
# NOTE(review): the links folder is the 2017 one while `year` is 2012 - confirm
# that this mismatch is intended.
country, year = "CN", "2012"
path = "..\\data_raw\\ownership_history\\links_2017\\*.parquet"

df = get_ownership_data(year, country, path=path)

CN - 2012...


In [6]:
# Show the columns returned by the ownership query.
df.columns

Index(['subsidiary_bvd_id', 'guo_25', 'year', 'guo_nuts2', 'guo_nace4',
       'guo_type_of_entity', 'guo_status', 'guo_postcode', 'country_iso_code',
       'guo_city', 'guo_region_in_country', 'subsidiary_nuts2',
       'subsidiary_nace4', 'subsidiary_type_of_entity', 'subsidiary_postcode',
       'subsidiary_country_iso_code', 'subsidiary_city',
       'subsidiary_region_in_country', 'guo_number_of_employees'],
      dtype='object')

In [7]:
# Percentage of rows in which the GUO has no NACE code.
n_missing = df["guo_nace4"].isna().sum()
n_missing / len(df) * 100

np.float64(86.68060690120716)

In [8]:
# Save the test extract to the working directory, without the index column.
df.to_csv('test_china.csv', index=False)

In [12]:
import json
# Imported by name on purpose: `import time` here would rebind `time` from
# utils.time to the stdlib module and break `@time.time_it` on a re-run.
from time import perf_counter

# Maps "<country> - <year>" to the error message of every failed conversion.
errors = {}

if not os.path.exists(OUTPUT_PATH):
    print(f"Folder does not exist: {OUTPUT_PATH}")
    os.makedirs(OUTPUT_PATH, exist_ok=True)

total_start = perf_counter()

# Extend this list to process more countries.
for country in ['CN']:
    for year in range(2018, 2019):
        year = str(year)
        # Restart the clock for every country/year so the reported duration
        # is per item instead of cumulative.
        start_time = perf_counter()
        try:
            path = f"..\\data_raw\\ownership_history\\links_{year}\\*.parquet"
            fetch_and_convert_to_excel(year, country, path, OUTPUT_PATH)
        except Exception as e:
            # Best effort: record the failure and move on to the next item.
            errors[f"{country} - {year}"] = str(e)
            print(f"Error {country} - {year}: {e}")
        else:
            duration = perf_counter() - start_time
            print(f"Data for {country} - {year} has been converted to CSV in {duration:.2f} seconds.")

total_duration = perf_counter() - total_start
print(f"Total execution time: {total_duration:.2f} seconds.")

# Create the log folder first so a missing directory cannot lose the error
# report at the very end of a long run.
errors_file = "errors/errors_guos_world.json"
os.makedirs(os.path.dirname(errors_file), exist_ok=True)
with open(errors_file, "w") as f:
    json.dump(errors, f)

CN - 2018...
Data for CN - 2018 - slice_1 has been converted.
Data for CN - 2018 - slice_2 has been converted.
Data for CN - 2018 - slice_3 has been converted.
Data for CN - 2018 - slice_4 has been converted.
Data for CN - 2018 - slice_5 has been converted.
Execution time: 1044.1248 seconds
Data for CN - 2018 has been converted to Stata in 1044.14 seconds.
Total execution time: 1044.14 seconds.


In [10]:
# Reload the error log written by the conversion loop and display it.
import json

with open("errors/errors_guos_world.json", "r") as f:
    raw_errors = f.read()

errors = json.loads(raw_errors)
errors

{}

## Zip the files

In [11]:
# Destination folder for the compressed CSV files.
ZIP_OUTPUT = "..\\zipped_files\\guos_china"
# Misspelled original name, kept as an alias for any cell that still uses it.
ZIP_OUTPOUT = ZIP_OUTPUT

data_processor.compress_csv_files(folder_to_zip=OUTPUT_PATH, zip_output=ZIP_OUTPUT)

Compressed ..\\data_processed\\guo_subs_world\\guos_china\\CN_2023_chunk_4.csv to ..\zipped_files\guos_china\CN_2023_chunk_4.csv.gz
Compressed ..\\data_processed\\guo_subs_world\\guos_china\\CN_2014_chunk_3.csv to ..\zipped_files\guos_china\CN_2014_chunk_3.csv.gz
Compressed ..\\data_processed\\guo_subs_world\\guos_china\\CN_2014_chunk_5.csv to ..\zipped_files\guos_china\CN_2014_chunk_5.csv.gz
Compressed ..\\data_processed\\guo_subs_world\\guos_china\\CN_2014_chunk_1.csv to ..\zipped_files\guos_china\CN_2014_chunk_1.csv.gz
Compressed ..\\data_processed\\guo_subs_world\\guos_china\\CN_2010_chunk_5.csv to ..\zipped_files\guos_china\CN_2010_chunk_5.csv.gz
Compressed ..\\data_processed\\guo_subs_world\\guos_china\\CN_2017_chunk_1.csv to ..\zipped_files\guos_china\CN_2017_chunk_1.csv.gz
Compressed ..\\data_processed\\guo_subs_world\\guos_china\\CN_2022_chunk_2.csv to ..\zipped_files\guos_china\CN_2022_chunk_2.csv.gz
Compressed ..\\data_processed\\guo_subs_world\\guos_china\\CN_2023_chunk_2.c