In [4]:
import os
import sqlite3
from datetime import datetime

import polars as pl

# --- Configuration ---
# IMPORTANT: point _project_dir at the directory that holds your data.
# NOTE: the space after "Dissertation" is reproduced exactly from the original
# path; do not strip it unless the directory on disk is renamed as well.
_project_dir = "/Users/tusharjoshi/Desktop/ProjectWorkAll/Dissertation "

# Folder that is scanned for input *.parquet files.
parquet_folder = f"{_project_dir}/processed_float"

# Location of the SQLite database file produced by the export.
output_sqlite_db = f"{_project_dir}/printer_data.sqlite"

# Name of the table created inside the SQLite database.
table_name_in_sqlite = "PrinterData"

# Number of rows handed to SQLite per insert batch.
batch_size = 100_000

# --- Target schema for Polars (keeps every input file consistent) ---
# Columns are declared once, grouped by dtype; the schema mapping and the
# final column order are both derived from these two tuples.
_text_columns = ("check", "date", "id", "state")
_float_columns = (
    "tempBed", "targetBed", "tempNozzle", "targetNozzle",
    "axisZ", "axisX", "axisY",
    "flow", "speed",
    "fanHotend", "fanPrint",
)

# Column name -> Polars dtype: text columns are pl.String, the rest pl.Float64.
target_schema = {
    **{name: pl.String for name in _text_columns},
    **{name: pl.Float64 for name in _float_columns},
}

# Exact column order of the final DataFrame: text columns first, then floats.
final_column_order = [*_text_columns, *_float_columns]

# --- Main Export Process to SQLite ---
# Pipeline: discover Parquet files -> coerce each file to the target schema ->
# concatenate -> parse 'date' -> recreate the SQLite table -> insert in batches.


def _to_sqlite_row(row_values):
    """Return *row_values* as a tuple with datetimes rendered as ISO-8601 text.

    DataFrame.iter_rows() yields Python-native values, so a Datetime column
    arrives as datetime.datetime objects. pl.Datetime is a dtype, not a value
    type, which is why an isinstance check against it never matches.
    """
    return tuple(
        value.isoformat() if isinstance(value, datetime) else value
        for value in row_values
    )


try:
    # Build the sorted list of Parquet files once. It drives both the loop and
    # the progress counter, so non-Parquet directory entries cannot skew the
    # "x/N" output and the directory is not re-listed on every iteration.
    all_parquet_files = sorted(
        os.path.join(parquet_folder, f)
        for f in os.listdir(parquet_folder)
        if f.endswith(".parquet")
    )
    print(f"Found {len(all_parquet_files)} parquet files to export to SQLite.")

    if not all_parquet_files:
        print("No parquet files found in the specified folder. Nothing to export.")
    else:
        processed_dfs = []
        total_files = len(all_parquet_files)
        print("Processing each parquet file individually, enforcing schema and order...")

        for file_number, parquet_file_path in enumerate(all_parquet_files, start=1):
            parquet_file_name = os.path.basename(parquet_file_path)
            print(f"  Reading and processing batch {file_number}/{total_files}: {parquet_file_name}")

            df_batch = pl.read_parquet(parquet_file_path)

            # Cast columns that exist to the target dtype; add missing ones as
            # typed all-null columns so every frame has an identical schema.
            present_columns = set(df_batch.columns)
            schema_exprs = [
                pl.col(col_name).cast(target_schema[col_name])
                if col_name in present_columns
                else pl.lit(None, dtype=target_schema[col_name]).alias(col_name)
                for col_name in final_column_order
            ]
            df_batch = df_batch.with_columns(schema_exprs).select(final_column_order)
            processed_dfs.append(df_batch)

        print("Concatenating all processed DataFrames...")
        df_combined = pl.concat(processed_dfs)
        total_rows = df_combined.shape[0]
        print(f"Combined DataFrame loaded successfully with {total_rows} rows.")

        # Re-parse the 'date' column to actual datetime objects. strict=False
        # turns unparseable values into nulls instead of raising, so compare
        # null counts before/after and report any values that were lost.
        nulls_before_parse = df_combined["date"].null_count()
        df_combined = df_combined.with_columns(
            pl.col("date").str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%.f%Z", strict=False).alias("date")
        )
        unparsed_dates = df_combined["date"].null_count() - nulls_before_parse
        if unparsed_dates > 0:
            print(f"Warning: {unparsed_dates} 'date' values did not match the expected format and will be stored as NULL.")

        # Export the DataFrame to SQLite in batches
        print(f"Exporting data to SQLite database: {output_sqlite_db} in batches...")
        # sqlite3's connection context manager only commits/rolls back; it does
        # not close the connection, so close it explicitly in `finally`.
        conn = sqlite3.connect(output_sqlite_db)
        try:
            cursor = conn.cursor()
            # Manually drop the table if it exists to simulate if_exists="replace".
            # The table name is quoted the same way the column names are.
            cursor.execute(f'DROP TABLE IF EXISTS "{table_name_in_sqlite}";')

            # Map Polars dtypes to SQLite column types; anything that is not
            # matched here (including the parsed datetime column) falls back
            # to TEXT via the .get() default below.
            type_mapping = {
                pl.String: "TEXT",
                pl.Float64: "REAL",
                pl.Datetime: "TEXT",
            }
            columns_and_types = ", ".join(
                f'"{name}" {type_mapping.get(dtype, "TEXT")}'
                for name, dtype in df_combined.schema.items()
            )
            cursor.execute(f'CREATE TABLE "{table_name_in_sqlite}" ({columns_and_types});')
            print(f"Created table '{table_name_in_sqlite}' in SQLite.")

            # Prepare the INSERT statement with an explicit column list so the
            # values cannot be bound to the wrong columns.
            column_list = ", ".join(f'"{name}"' for name in final_column_order)
            placeholders = ", ".join("?" for _ in final_column_order)
            insert_sql = (
                f'INSERT INTO "{table_name_in_sqlite}" ({column_list}) '
                f"VALUES ({placeholders});"
            )

            # Iterate over the Polars DataFrame in batches and insert
            total_inserted_rows = 0
            for batch_start in range(0, total_rows, batch_size):
                # slice() clips at the end of the frame, so the final batch
                # may simply be shorter than batch_size.
                df_batch_to_insert = df_combined.slice(batch_start, batch_size)
                rows_to_insert = [
                    _to_sqlite_row(row_data)
                    for row_data in df_batch_to_insert.iter_rows()
                ]

                cursor.executemany(insert_sql, rows_to_insert)
                conn.commit()  # Commit after each batch
                total_inserted_rows += len(rows_to_insert)
                print(f"  Inserted {len(rows_to_insert)} rows. Total inserted: {total_inserted_rows}")
        finally:
            conn.close()

except Exception as e:
    # Top-level boundary of the script: report the failure with
    # troubleshooting hints instead of surfacing a raw traceback.
    print(f"\nAn error occurred during the SQLite export process: {e}")
    print("Please check:")
    print(f"- The 'parquet_folder' path: {parquet_folder}")
    print(f"- The 'output_sqlite_db' path: {output_sqlite_db}")
    print("- That you have enough disk space for the SQLite file.")
    print("- That the column names in your Parquet files match the 'target_schema' defined.")

Found 34 parquet files to export to SQLite.
Processing each parquet file individually, enforcing schema and order...
  Reading and processing batch 1/34: batch_001.parquet
  Reading and processing batch 2/34: batch_002.parquet
  Reading and processing batch 3/34: batch_003.parquet
  Reading and processing batch 4/34: batch_004.parquet
  Reading and processing batch 5/34: batch_005.parquet
  Reading and processing batch 6/34: batch_006.parquet
  Reading and processing batch 7/34: batch_007.parquet
  Reading and processing batch 8/34: batch_008.parquet
  Reading and processing batch 9/34: batch_009.parquet
  Reading and processing batch 10/34: batch_010.parquet
  Reading and processing batch 11/34: batch_011.parquet
  Reading and processing batch 12/34: batch_012.parquet
  Reading and processing batch 13/34: batch_013.parquet
  Reading and processing batch 14/34: batch_014.parquet
  Reading and processing batch 15/34: batch_015.parquet
  Reading and processing batch 16/34: batch_016.parq