# Merge ZIP CSV Files into SQLite Database and Split by `measure_name`

This script processes multiple ZIP files, each containing a CSV file with the same name as the ZIP file.
It extracts the CSV files, merges their data into a single DataFrame, and writes the combined data into an SQLite database.
The data is split into multiple tables based on the unique values in the `measure_name` column.

## Dependencies
- Python 3.10.8
- Libraries: os, zipfile, sqlite3, multiprocessing (standard library) and pandas (third-party; install it first, e.g. `pip install pandas`)

## Usage
1. Place this notebook in a directory that sits next to the `data` folder, so that the ZIP files are reachable at `../data/download` relative to the notebook.
2. Run all cells in the notebook.
3. A SQLite database named `merged_data.db` will be created in the current directory.

## Output
- A SQLite database file (`merged_data.db`) containing multiple tables, one for each unique value in the `measure_name` column.
- Each table is named `measure_<name>`, where `<name>` is the corresponding `measure_name` value in lowercase with spaces and hyphens replaced by underscores.

In [1]:
import os
import zipfile
import pandas as pd
import sqlite3
from multiprocessing import Pool, cpu_count

# Input and output locations (both are relative paths)
db_path = "merged_data.db"  # Output: SQLite database file, created in the current directory
zip_folder = os.path.join("../data", "download")  # Input: folder holding the ZIP files

# Function to process a single ZIP file and return its DataFrame
def process_zip_file(zip_file_name):
    """Load the CSV stored inside one ZIP archive into a DataFrame.

    The archive is looked up inside ``zip_folder`` and is expected to contain
    a member named after the archive itself (``foo.zip`` -> ``foo.csv``).
    The member is streamed straight out of the archive, so nothing is written
    to disk and no temporary file can be left behind when parsing fails.

    Args:
        zip_file_name: File name (not a full path) of the ZIP archive.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the expected CSV
        member is missing or an error occurs while opening or parsing.
    """
    try:
        zip_file_path = os.path.join(zip_folder, zip_file_name)

        # The CSV is assumed to carry the same base name as the ZIP file
        csv_file_name = os.path.splitext(zip_file_name)[0] + ".csv"

        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            if csv_file_name not in zip_ref.namelist():
                print(f"Warning: No corresponding CSV file found in {zip_file_name}")
                return None

            # Read directly from the archive member instead of extracting it
            with zip_ref.open(csv_file_name) as csv_stream:
                df = pd.read_csv(csv_stream)

        print(f"Processed: {zip_file_name}")
        return df
    except Exception as e:
        # Deliberately broad: one unreadable archive must not abort the
        # whole batch; the failure is reported and the file is skipped.
        print(f"Error processing {zip_file_name}: {e}")
        return None

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Get all ZIP files; sorted so the row order of the combined data is
# reproducible (os.listdir returns entries in arbitrary order)
zip_files = sorted(f for f in os.listdir(zip_folder) if f.endswith(".zip"))
if not zip_files:
    # Without this check Pool(0) fails with an unhelpful ValueError
    raise FileNotFoundError(f"No ZIP files found in {zip_folder}")

# Ensure the 'temp' scratch directory exists before the workers start
os.makedirs("temp", exist_ok=True)

# Set the pool size (default is the number of CPU cores, but not exceeding the number of ZIP files)
pool_size = min(cpu_count(), len(zip_files))
print(f"Using {pool_size} processes for parallel processing...")

# Create a process pool and process files in parallel
with Pool(pool_size) as pool:
    results = pool.map(process_zip_file, zip_files)

# Clean up the temporary directory. This is best-effort: a leftover file in
# 'temp' must not discard the data that was just loaded, and the directory is
# never force-deleted in case it holds files that do not belong to this run.
try:
    os.rmdir("temp")
except FileNotFoundError:
    pass
except OSError as e:
    print(f"Warning: could not remove temporary directory 'temp': {e}")

# Combine all DataFrames into one (files that failed to load returned None)
frames = [df for df in results if df is not None]
if not frames:
    # pd.concat([]) would raise "No objects to concatenate"
    raise ValueError("None of the ZIP files produced a DataFrame; nothing to combine.")
combined_df = pd.concat(frames, ignore_index=True)

print("All ZIP files have been processed. Combined data ready for splitting.")

# %%
# Split the combined DataFrame by `measure_name` and write to SQLite
conn = sqlite3.connect(db_path)
try:
    # Rows without a `measure_name` cannot be assigned to a table; groupby
    # skips them, so report how many are left out instead of dropping silently.
    missing_count = int(combined_df["measure_name"].isna().sum())
    if missing_count:
        print(f"Warning: skipping {missing_count} rows with no measure_name")

    # Table names already written during this run. Distinct measures can map
    # to the same table name (e.g. "A B" and "a-b"); later ones are appended
    # so they do not overwrite the rows written first.
    written_tables = set()

    # One group per unique `measure_name`, in order of first appearance
    for measure, measure_df in combined_df.groupby("measure_name", sort=False):
        # Build the table name: lowercase, spaces and hyphens -> underscores.
        # str() guards against non-string measure values such as numbers.
        table_name = f"measure_{str(measure).lower().replace(' ', '_').replace('-', '_')}"

        if table_name in written_tables:
            measure_df.to_sql(table_name, conn, if_exists="append", index=False)
            print(f"Appended to existing table: {table_name}")
        else:
            # First write in this run replaces any table left by a previous run
            measure_df.to_sql(table_name, conn, if_exists="replace", index=False)
            written_tables.add(table_name)
            print(f"Created table: {table_name}")
finally:
    # Close the database connection even if a write fails
    conn.close()

print("All data has been split and written to the database.")