# Merge ZIP CSV Files and Split Them into CSV Files by `measure_name`, `metric_name`, and `sex_name`

## Dependencies
- Python 3.10.8
- Libraries: `os`, `zipfile`, `pandas`, `multiprocessing`, `requests`

In [1]:
import os
import zipfile
import pandas as pd
from multiprocessing import Pool, cpu_count

# Relative locations of the generated output and of the downloaded archives
output_folder = "../data/database"
zip_folder = os.path.join("../data", "download")

# Function to process a single ZIP file and return its DataFrame
def process_zip_file(zip_file_name, zip_dir=None):
    """Load the CSV stored inside one ZIP archive into a cleaned DataFrame.

    The archive is expected to contain a member named like the archive itself
    but with a ``.csv`` extension (``foo.zip`` -> ``foo.csv``).

    Args:
        zip_file_name: File name of the archive, relative to ``zip_dir``.
        zip_dir: Folder containing the archive. Defaults to the module-level
            ``zip_folder`` so single-argument callers such as ``Pool.map``
            keep working.

    Returns:
        The DataFrame without any column whose name contains ``_id`` and
        without ``cause_name`` (when present), or ``None`` when the expected
        CSV is missing or the archive cannot be read. A message is printed
        in both ``None`` cases.
    """
    if zip_dir is None:
        zip_dir = zip_folder

    csv_file_name = os.path.splitext(zip_file_name)[0] + ".csv"
    try:
        zip_file_path = os.path.join(zip_dir, zip_file_name)
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            if csv_file_name not in zip_ref.namelist():
                print(f"Warning: No CSV found in {zip_file_name}")
                return None
            # Read straight from the archive: nothing is extracted to disk, so
            # a failed parse cannot leave a stray file behind.
            with zip_ref.open(csv_file_name) as csv_file:
                df = pd.read_csv(csv_file)
    except Exception as e:
        # Best effort: one unreadable archive must not abort the whole batch.
        print(f"Error processing {zip_file_name}: {e}")
        return None

    # Drop every `_id` column plus `cause_name`; a file that lacks
    # `cause_name` is kept instead of being discarded.
    unwanted = [col for col in df.columns if "_id" in col or col == "cause_name"]
    return df.drop(columns=unwanted)
    
# Turn a label into a lower-case, underscore-separated file-name fragment
def _clean_name(label):
    return label.lower().replace(' ', '_').replace('-', '_')


# Function to write a single measure's data to a CSV file
def write_measure_to_db(measure_data):
    """Write one (measure, metric, sex) slice to its own CSV file.

    Args:
        measure_data: Tuple ``(measure_name, metric_name, sex_name,
            measure_df, output_folder)``. A single tuple is used so the
            function can be handed directly to ``Pool.map``.

    Returns:
        The name (not the full path) of the CSV file that was written, in the
        form ``<measure>_<metric>_<sex>.csv``.
    """
    measure_name, metric_name, sex_name, measure_df, output_folder = measure_data

    # Keep only the text before the first "(" of the measure name
    measure_name_cleaned = _clean_name(measure_name.split('(')[0].strip())

    # Clean metric_name and sex_name
    metric_name_cleaned = _clean_name(metric_name)
    sex_name_cleaned = _clean_name(sex_name)

    # Generate the CSV file name
    file_name = f"{measure_name_cleaned}_{metric_name_cleaned}_{sex_name_cleaned}.csv"
    file_path = os.path.join(output_folder, file_name)

    # Make sure the target folder exists; exist_ok keeps this safe when
    # several worker processes reach this line at the same time.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Write the data to csv file
    measure_df.to_csv(file_path, index=False)

    return file_name

In [2]:
# Create the scratch folder that process_zip_file may extract into
os.makedirs("temp", exist_ok=True)

# Collect the archives; sorted because os.listdir returns them in arbitrary
# order and the row order of combined_df should be reproducible.
zip_files = sorted(f for f in os.listdir(zip_folder) if f.endswith(".zip"))
if not zip_files:
    # Fail with a clear message instead of the ValueError raised by Pool(0)
    raise FileNotFoundError(f"No ZIP files found in {zip_folder}")

# Process all ZIP files in parallel
pool_size = min(cpu_count(), len(zip_files))
print(f"Using {pool_size} processes...")

try:
    with Pool(pool_size) as pool:
        results = pool.map(process_zip_file, zip_files)
finally:
    # Remove the scratch folder even when a worker failed. os.rmdir only
    # removes an empty folder, so unrelated content is never deleted.
    try:
        os.rmdir("temp")
    except OSError as e:
        print(f"Warning: could not remove temp folder: {e}")

# Archives that could not be processed come back as None and are skipped
frames = [df for df in results if df is not None]
if not frames:
    raise ValueError(f"None of the {len(zip_files)} ZIP files produced any data")

combined_df = pd.concat(frames, ignore_index=True)

Using 49 processes...


In [3]:
# Split the combined DataFrame by `measure_name`, `metric_name`, and `sex_name`
split_columns = ["measure_name", "metric_name", "sex_name"]

# Distinct values, kept as notebook-level names for later cells.
# unique_metrics covers every measure and skips missing values.
unique_measures = combined_df["measure_name"].unique()
unique_metrics = combined_df["metric_name"].dropna().unique()

# One groupby pass yields every existing (measure, metric, sex) combination.
# Rows with a missing key are left out, so the writer never receives a
# non-string name. sort=False keeps the groups in order of first appearance.
measure_data_list = [
    (measure, metric, sex, group_df, output_folder)
    for (measure, metric, sex), group_df in combined_df.groupby(split_columns, sort=False)
]

if measure_data_list:
    # Set the pool size (at most one process per CPU core and per file)
    pool_size = min(cpu_count(), len(measure_data_list))
    print(f"Using {pool_size} processes to write csv files...")

    # Create a process pool and write csv files in parallel
    with Pool(pool_size) as pool:
        pool.map(write_measure_to_db, measure_data_list)

    print("All data written to individual csv files.")
else:
    # Pool(0) would raise; report the empty input instead
    print("No data to write: no complete measure/metric/sex rows found.")

Using 54 processes to write csv files...
All data written to individual csv files.


# Get global and regional data

In [4]:
import requests

# Endpoint that serves the GBD results metadata (English labels)
setting_url = (
    "https://vizhub.healthdata.org/gbd-results/php/metadata/"
    "?language=en"
)

# Function to fetch and process the data
def fetch_settings(url, timeout=30):
    """Download a JSON document and return it decoded.

    Args:
        url: Address to send the GET request to.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on an unresponsive host.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails, the
        server answers with an error status, or the body is not valid JSON.
        A message is printed in each ``None`` case.
    """
    try:
        # Send a GET request to the URL
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from {url}: {e}")
    except ValueError as e:
        # Older requests releases raise a plain ValueError for a non-JSON body
        print(f"Error decoding JSON from {url}: {e}")
    return None

# Function to extract names from setting
def get_regional_names(url, settings=None):
    """Build a table of the location groups listed in the settings payload.

    Args:
        url: Address the settings are downloaded from when ``settings`` is
            not supplied.
        settings: Optional, already-decoded settings payload. When given, no
            request is made and ``url`` is not used.

    Returns:
        A DataFrame with one row per entry of ``settings['data']['groups']``
        and the columns ``group_id``, ``group_name``, ``group_type``,
        ``disabled`` and ``locations``.

    Raises:
        RuntimeError: If the settings had to be downloaded and the download
            failed.
    """
    if settings is None:
        settings = fetch_settings(url)
        if settings is None:
            # fetch_settings already printed the cause; stop here with a clear
            # message instead of failing on None['data'] below.
            raise RuntimeError(f"Could not load settings from {url}")

    # Extract group names from the settings payload
    groups = settings['data']['groups']

    # One flat record per group, pulling the relevant fields out of the
    # nested dictionary and falling back to a default when a field is absent
    records = [
        {
            "group_id": group_id,
            "group_name": info.get('name', None),
            "group_type": info.get('type', None),
            "disabled": info.get('disabled', False),
            "locations": info.get('locations', []),
        }
        for group_id, info in groups.items()
    ]

    # Explicit columns keep the layout stable even when there are no groups
    return pd.DataFrame(
        records,
        columns=["group_id", "group_name", "group_type", "disabled", "locations"],
    )

# Build the table of location groups from the settings endpoint
regional_names = get_regional_names(url=setting_url)

# Preview the first five rows
regional_names.head(5)

Unnamed: 0,group_id,group_name,group_type,disabled,locations
0,g-1,GBD super regions,superregion,False,[]
1,g-2,GBD regions,region,False,[]
2,g-3,Subnational,admin0,True,[]
3,custom,Custom groups,,True,[]
4,44587,African Union,,False,"[94365, 248, 254, 261, 263]"


In [11]:
# Classify every row by its location name
location = combined_df["location_name"]
is_global = location == "Global"
is_regional = location.isin(regional_names['group_name'])
# na=False: rows without a location name simply do not match
is_sdi = location.str.contains("SDI", case=False, na=False)

# get global data
global_df = combined_df[is_global]

# get regional data: location_name in regional_names
regional_df = combined_df[is_regional]

# get SDI data: location_name contains "SDI"
sdi_df = combined_df[is_sdi]

# combine global, regional, and SDI data (in that order); a row matching more
# than one selection is taken only once, by the first selection it matches.
# NOTE: this replaces combined_df, so the full dataset is no longer available
# under that name after this cell has run.
combined_df = pd.concat(
    [
        global_df,
        combined_df[is_regional & ~is_global],
        combined_df[is_sdi & ~is_regional & ~is_global],
    ],
    ignore_index=True,
)

# Make sure the target folder exists before writing
os.makedirs(output_folder, exist_ok=True)

# Split the combined DataFrame by `metric_name`; the metrics come from the
# filtered data itself rather than from a variable left over by an earlier cell
for metric, metric_df in combined_df.groupby("metric_name", sort=False):
    file_name = f"global_regional_{metric.lower().replace(' ', '_').replace('-', '_')}.csv"
    file_path = os.path.join(output_folder, file_name)
    metric_df.to_csv(file_path, index=False)