# Part 3: Parquet Export

This notebook exports SQL Server data to Parquet files for data sharing and archival. This step is optional: it is only needed if you want to share the data without giving others access to the database where it is stored.

## Overview
- Export all SQL tables and views to Parquet format
- Handle large datasets by splitting into chunks
- Create ZIP archive of selected data for sharing

## Output
- Parquet files for each table/view
- ZIP archive with selected views and tables


## 1. Setup

In [None]:
import pandas as pd
import os
import zipfile
import time
from sqlalchemy import create_engine, inspect
from tqdm import tqdm

In [None]:
# SQLAlchemy engine shared by the inspection and export cells below.
# NOTE(review): "username", "password" and the server host look like
# placeholders — replace them before running, and prefer loading credentials
# from the environment over hard-coding them in the notebook.
engine = create_engine(
    "mssql+pyodbc://username:password@server.database.windows.net/NexusModsDB?driver=ODBC+Driver+17+for+SQL+Server&Connect Timeout=60"
)

In [None]:
# Configuration
# Directory that receives the exported Parquet files; created here if missing.
OUTPUT_DIR = r"C:\Mod_data"  
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Objects to include in ZIP file for sharing.
# Each entry is matched against exported file names of the form "<name>.parquet".
ZIP_OBJECTS = {'Authors_di', 'Mods_di', 'Games'}

# Maximum rows per parquet file (for splitting large tables).
# Used as the default row_limit of export_and_split; that default is bound when
# the function is defined, so re-run its cell after changing this value.
ROW_LIMIT = 1_000_000

## 2. List Available Tables and Views

In [None]:
# Get list of tables and views
# No schema argument is passed, so presumably only the connection's default
# schema is listed — confirm if objects live in other schemas.
inspector = inspect(engine)
tables = inspector.get_table_names()
views = inspector.get_view_names()

# Print both lists so the object names can be checked before exporting.
print(f"Tables ({len(tables)}):")
for t in tables:
    print(f"  - {t}")

print(f"\nViews ({len(views)}):")
for v in views:
    print(f"  - {v}")

## 3. Export Functions

In [None]:
def export_and_split(df, name, output_dir, row_limit=ROW_LIMIT):
    '''
    Export DataFrame to Parquet, splitting into multiple files if needed.

    A DataFrame with at most ``row_limit`` rows is written to a single
    ``<name>.parquet``. A larger one is written as ``<name>_1.parquet``,
    ``<name>_2.parquet``, ... where each file holds the next consecutive
    block of rows. An empty DataFrame produces no file.

    Args:
        df: DataFrame to export
        name: Base name for output files
        output_dir: Directory to save files
        row_limit: Maximum rows per file; must be a positive integer

    Returns:
        list: List of created filenames (base names, without the directory)

    Raises:
        ValueError: If row_limit is zero or negative.
    '''
    # A negative limit would otherwise yield no chunks at all and silently
    # export nothing; zero would fail inside range() with an unrelated message.
    if row_limit <= 0:
        raise ValueError(f"row_limit must be a positive integer, got {row_limit!r}")

    total_rows = len(df)
    if total_rows == 0:
        return []

    starts = range(0, total_rows, row_limit)
    is_split = len(starts) > 1
    filenames = []

    for idx, start in enumerate(starts, start=1):
        # The numeric suffix is only used when more than one file is needed.
        suffix = f"_{idx}" if is_split else ""
        filename = f"{name}{suffix}.parquet"
        path = os.path.join(output_dir, filename)
        # iloc slices strictly by position, independent of the index labels.
        df.iloc[start:start + row_limit].to_parquet(path, index=False)
        filenames.append(filename)

    return filenames

In [None]:
def export_all_objects(tables, views, output_dir):
    '''
    Export all tables and views to Parquet files.

    Each object is read in full with ``SELECT *`` through the module-level
    ``engine`` and handed to ``export_and_split``. An object whose read or
    write raises is reported with a printed message and skipped; the
    remaining objects are still exported.

    Args:
        tables: List of table names
        views: List of view names
        output_dir: Output directory

    Returns:
        list: One tuple per successfully exported object:
            (name, row count, number of files written, seconds rounded to
            two decimals). Objects that failed are not included.
    '''
    all_objects = tables + views
    print(f"Found {len(tables)} tables and {len(views)} views. Starting export...")

    export_times = []

    for name in tqdm(all_objects, desc="Exporting SQL objects", unit="object"):
        try:
            # perf_counter is monotonic, so the measured duration cannot be
            # distorted by system clock adjustments.
            start_time = time.perf_counter()

            # Inside a [bracketed] identifier a closing bracket must be
            # doubled; otherwise a name containing "]" ends the identifier
            # early and the statement is malformed.
            quoted_name = name.replace("]", "]]")

            # Read data from SQL
            df = pd.read_sql_query(f"SELECT * FROM [{quoted_name}]", engine)

            # Export to parquet
            files = export_and_split(df, name, output_dir)

            duration = time.perf_counter() - start_time
            export_times.append((name, len(df), len(files), round(duration, 2)))

        except Exception as e:
            # Best effort: one failing object must not abort the whole export.
            print(f"Error exporting '{name}': {e}")

    return export_times

In [None]:
def create_zip_archive(output_dir, zip_objects, zip_filename):
    '''
    Create ZIP archive with selected objects.

    For every object the archive receives ``<obj>.parquet`` when that file
    exists, plus the split parts ``<obj>_1.parquet``, ``<obj>_2.parquet``, ...
    up to the first missing number. This matches the names produced by
    ``export_and_split``, which numbers split files starting at 1.
    Files are stored flat in the archive under their base names.

    Args:
        output_dir: Directory containing parquet files
        zip_objects: Set of object names to include
        zip_filename: Name for the ZIP file
    '''
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Sorted so the archive layout does not depend on set iteration order.
        for obj in sorted(zip_objects):
            filenames = []

            # Unsplit export: a single file without a numeric suffix.
            single = f"{obj}.parquet"
            if os.path.exists(os.path.join(output_dir, single)):
                filenames.append(single)

            # Split export: parts are numbered consecutively from 1.
            part_idx = 1
            while True:
                part = f"{obj}_{part_idx}.parquet"
                if not os.path.exists(os.path.join(output_dir, part)):
                    break
                filenames.append(part)
                part_idx += 1

            if not filenames:
                # Report instead of silently producing an incomplete archive.
                print(f"Warning: no parquet files found for '{obj}' in {output_dir}")
                continue

            for filename in filenames:
                zipf.write(os.path.join(output_dir, filename), arcname=filename)

    print(f"ZIP archive created: {zip_filename}")

## 4. Run Export

In [None]:
# Export all objects
export_times = export_all_objects(tables, views, OUTPUT_DIR)
# One summary line per exported object: name | row count | files written | seconds.
# Objects whose export raised are not in export_times, so they get no line here.
for name, rows, parts, duration in export_times:
    print(f"{name:<30} | {rows:>10,} | {parts:>5} | {duration:>10}")

In [None]:
# Bundle the Parquet files of the objects listed in ZIP_OBJECTS into one archive.
# The name has no directory part, so the archive is written relative to the
# current working directory rather than into OUTPUT_DIR.
zip_filename = "NexusModsDB_selected_views_and_table.zip"
create_zip_archive(OUTPUT_DIR, ZIP_OBJECTS, zip_filename)

print(f"\nExport complete!")
print(f"All Parquet files saved in: {OUTPUT_DIR}")
print(f"Selected objects zipped to: {zip_filename}")

## 5. Verify Export

In [None]:
# Read and display sample from exported data
# NOTE: only the unsplit file name is checked. If Authors_di had more rows than
# the row limit, export_and_split wrote Authors_di_1.parquet, ... instead and
# this cell reports the sample file as not found.
sample_file = os.path.join(OUTPUT_DIR, "Authors_di.parquet")

if os.path.exists(sample_file):
    df_sample = pd.read_parquet(sample_file)
    print(f"Sample from Authors_di.parquet ({len(df_sample)} rows):")
    # display() is not imported in this file — presumably the IPython/Jupyter built-in.
    display(df_sample.head())
else:
    print(f"Sample file not found: {sample_file}")