# Merge ZIP CSV Files into SQLite Database and Split by `measure_name`

This script processes multiple ZIP files, each containing a CSV file with the same name as the ZIP file.
It extracts the CSV files, merges their data into a single DataFrame, and writes the combined data into an SQLite database.
The data is split into multiple tables based on the unique values in the `measure_name` column.

## Dependencies
- Python 3.10.8
- Libraries: os, zipfile, pandas, sqlite3, multiprocessing

## Usage
1. Place this notebook in a directory that sits next to the `data` folder, so that the ZIP files are reachable at `../data/download` relative to the notebook's working directory.
2. Run all cells in the notebook.
3. A SQLite database named `merged_data.db` will be created in the current directory.

## Output
- A SQLite database file (`merged_data.db`) containing multiple tables, one for each unique value in the `measure_name` column.
- Each table is named `measure_<name>`, where `<name>` is the corresponding `measure_name` value in lowercase with spaces and hyphens replaced by underscores.

In [None]:
import os
import zipfile
import pandas as pd
import sqlite3
from multiprocessing import Pool, cpu_count

# Input and output locations. Both are relative paths, so they resolve against
# the working directory of the process that runs this notebook.
zip_folder = os.path.join("../data", "download")  # Folder containing ZIP files
db_path = "merged_data.db"  # SQLite database file path (created in the current directory)

def process_zip_file(zip_file_name, source_folder=None):
    """Load the CSV stored inside one ZIP archive into a DataFrame.

    The archive is expected to contain a member named like the archive itself
    but with a ``.csv`` extension (``foo.zip`` -> ``foo.csv``). The CSV is read
    straight from the archive, so no temporary file is written and nothing is
    left on disk when parsing fails.

    Args:
        zip_file_name: File name of the archive inside the source folder.
        source_folder: Folder that contains the archive. When omitted, the
            module-level ``zip_folder`` is used.

    Returns:
        The CSV contents with every column whose name contains ``_id``
        removed, or ``None`` when the expected CSV member is missing or any
        error occurs. Problems are printed instead of raised so that callers
        mapping this function over many archives can skip the bad ones.
    """
    try:
        folder = zip_folder if source_folder is None else source_folder
        zip_file_path = os.path.join(folder, zip_file_name)
        csv_file_name = os.path.splitext(zip_file_name)[0] + ".csv"

        with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
            if csv_file_name not in zip_ref.namelist():
                print(f"Warning: No CSV found in {zip_file_name}")
                return None
            # Stream the member directly into pandas instead of extracting it
            # to a shared folder first.
            with zip_ref.open(csv_file_name) as csv_file:
                df = pd.read_csv(csv_file)

        print(f"Processed: {zip_file_name}")
        # Drop columns with `_id` in their names
        id_columns = [col for col in df.columns if "_id" in col]
        return df.drop(columns=id_columns)
    except Exception as e:
        # Deliberately broad: this is the per-archive boundary, and a single
        # unreadable archive must not take down the whole batch.
        print(f"Error processing {zip_file_name}: {e}")
        return None

In [None]:
# Local import: shutil is only needed for the cleanup step in this cell.
import shutil

# The ZIP worker may extract CSVs into ./temp, so make sure the folder exists
# before any worker starts. Remember whether this cell created it so that a
# pre-existing folder with unrelated content is never wiped.
temp_dir = "temp"
temp_dir_created_here = not os.path.exists(temp_dir)
os.makedirs(temp_dir, exist_ok=True)

try:
    # Process all ZIP files in parallel
    zip_files = [f for f in os.listdir(zip_folder) if f.endswith(".zip")]
    if zip_files:
        pool_size = min(cpu_count(), len(zip_files))
        print(f"Using {pool_size} processes...")
        with Pool(pool_size) as pool:
            results = pool.map(process_zip_file, zip_files)
    else:
        # Pool(0) raises ValueError, so skip the pool when there is no work.
        print(f"No ZIP files found in {zip_folder}")
        results = []
finally:
    # Clean up even when the pool fails part-way through.
    if temp_dir_created_here:
        # Files may be left behind by failed extractions, and os.rmdir
        # refuses to remove a non-empty folder.
        shutil.rmtree(temp_dir, ignore_errors=True)
    else:
        try:
            os.rmdir(temp_dir)
        except OSError:
            pass  # folder holds pre-existing content; leave it alone

frames = [df for df in results if df is not None]

if not frames:
    # pd.concat raises ValueError on an empty list, so report and stop here.
    print("No data was loaded; the database was left untouched.")
else:
    combined_df = pd.concat(frames, ignore_index=True)
    measure_column = combined_df["measure_name"]

    # Rows without a measure name cannot be assigned to any table.
    missing_measure_rows = int(measure_column.isna().sum())
    if missing_measure_rows:
        print(f"Skipping {missing_measure_rows} rows without a measure_name")
    unique_measures = measure_column.dropna().unique()

    # Distinct measure names can normalise to the same table name
    # (e.g. "A B" and "a-b"). Track what this run has written so later ones
    # are appended instead of silently replacing the earlier data.
    written_tables = set()

    conn = sqlite3.connect(db_path)
    try:
        for measure in unique_measures:
            measure_df = combined_df[measure_column == measure]
            # str() keeps non-text measure values from breaking .lower()
            normalised = str(measure).lower().replace(" ", "_").replace("-", "_")
            table_name = f"measure_{normalised}"

            if table_name in written_tables:
                measure_df.to_sql(table_name, conn, if_exists="append", index=False)
                print(f"Appended to table: {table_name}")
            else:
                measure_df.to_sql(table_name, conn, if_exists="replace", index=False)
                written_tables.add(table_name)
                print(f"Created table: {table_name}")
    finally:
        # Release the database file even if a write fails.
        conn.close()

    print("All data written to the database.")