In [None]:
import os
import polars as pl
from tableauhyperapi import HyperProcess, Connection, TableDefinition, SqlType, Inserter, CreateMode, TableName

# --- Configuration ---
# IMPORTANT: Update this path to the actual folder containing your Parquet files.
parquet_folder = "/Users/tusharjoshi/Desktop/ProjectWorkAll/Dissertation /processed_float" 

# This is where your new .hyper file will be created.
output_hyper_file = "/Users/tusharjoshi/Desktop/ProjectWorkAll/Dissertation /new_printer_hyper_file.hyper" 



Setup complete. Ready to define schema.


In [None]:
# --- Define the Table Schema and Name ---
# The table will be named "PrinterData" within the "Extract" schema.

table_name = TableName("Extract", "PrinterData")

# Define each column's name and its data type for the Hyper file.
# We're explicitly setting 'date' to SqlType.timestamp() for proper date/time handling.

printer_table_definition = TableDefinition(table_name)
printer_table_definition.add_column("check", SqlType.text())
printer_table_definition.add_column("date", SqlType.timestamp()) 
printer_table_definition.add_column("id", SqlType.text())
printer_table_definition.add_column("state", SqlType.text())
printer_table_definition.add_column("tempBed", SqlType.double())
printer_table_definition.add_column("targetBed", SqlType.double())
printer_table_definition.add_column("tempNozzle", SqlType.double())
printer_table_definition.add_column("targetNozzle", SqlType.double())
printer_table_definition.add_column("axisZ", SqlType.double())
printer_table_definition.add_column("axisX", SqlType.double())
printer_table_definition.add_column("axisY", SqlType.double())
printer_table_definition.add_column("flow", SqlType.double())
printer_table_definition.add_column("speed", SqlType.double())
printer_table_definition.add_column("fanHotend", SqlType.double())
printer_table_definition.add_column("fanPrint", SqlType.double())

print("Table schema defined. Ready to list Parquet files.")

Table schema defined. Ready to list Parquet files.


In [3]:
# --- List Parquet Files ---
all_parquet_files = sorted([
    os.path.join(parquet_folder, f)
    for f in os.listdir(parquet_folder)
    if f.endswith(".parquet")
])
print(f"Found {len(all_parquet_files)} parquet files to export.")

if not all_parquet_files:
    print("No parquet files found in the specified folder. Nothing to export.")

Found 34 parquet files to export.


In [4]:
# --- Start Hyper Process and Create File ---
# This block will only run if parquet files were found.
if all_parquet_files:
    # Start the Hyper process. (Using telemetry=0 for compatibility with your version)
    hyper_process = HyperProcess(telemetry=0)
    connection = None # Initialize connection to None

    try:
        # Connect to the Hyper file. CreateMode.CREATE_AND_REPLACE will create a new file
        # or overwrite an existing one, ensuring a clean start.
        connection = Connection(hyper_process.endpoint, output_hyper_file, CreateMode.CREATE_AND_REPLACE)
        
        # Create the schema (e.g., "Extract") if it doesn't exist
        connection.catalog.create_schema(table_name.schema_name)
        # Create the table using the defined schema
        connection.catalog.create_table(printer_table_definition)
        print(f"Created table '{table_name}' in '{output_hyper_file}'. Starting data insertion...")

    except Exception as e:
        print(f"Error during Hyper process or connection setup: {e}")
        # Ensure resources are closed even if an error occurs here
        if connection:
            connection.close()
        hyper_process.close()
        # Re-raise the exception to stop execution if setup failed
        raise

Created table '"Extract"."PrinterData"' in '/Users/tusharjoshi/Desktop/ProjectWorkAll/Dissertation /new_printer_hyper_file.hyper'. Starting data insertion...


In [5]:
# --- Insert Data from Parquet Files ---
# This block assumes the previous block ran successfully and 'connection' is open.
if all_parquet_files:
    total_rows_inserted = 0
    for i, parquet_file in enumerate(all_parquet_files):
        print(f"  Processing batch {i+1}/{len(all_parquet_files)}: {os.path.basename(parquet_file)}")

        # Read a single Parquet file into a Polars DataFrame
        df = pl.read_parquet(parquet_file)

        # Prepare rows for insertion.
        # It's important that the order of data in 'row_data' matches the column order
        # defined in 'printer_table_definition'.
        rows_to_insert = []
        column_names_in_order = [col.name for col in printer_table_definition.columns]
        for row_dict in df.to_dicts():
            ordered_row = tuple(row_dict.get(col_name) for col_name in column_names_in_order)
            rows_to_insert.append(ordered_row)

        # Use Inserter to add rows efficiently.
        # The 'with Inserter' block ensures data is flushed.
        with Inserter(connection, printer_table_definition) as inserter:
            inserter.add_rows(rows_to_insert)
        
        total_rows_inserted += len(rows_to_insert)
        print(f"  Inserted {len(rows_to_insert)} rows. Total inserted: {total_rows_inserted}")

    print(f"\nSuccessfully inserted {total_rows_inserted} rows to '{output_hyper_file}'.")

  Processing batch 1/34: batch_001.parquet
  Inserted 3331416 rows. Total inserted: 3331416
  Processing batch 2/34: batch_002.parquet
  Inserted 2805908 rows. Total inserted: 6137324
  Processing batch 3/34: batch_003.parquet
  Inserted 3005163 rows. Total inserted: 9142487
  Processing batch 4/34: batch_004.parquet
  Inserted 3005002 rows. Total inserted: 12147489
  Processing batch 5/34: batch_005.parquet
  Inserted 2223424 rows. Total inserted: 14370913
  Processing batch 6/34: batch_006.parquet
  Inserted 2292976 rows. Total inserted: 16663889
  Processing batch 7/34: batch_007.parquet
  Inserted 3004799 rows. Total inserted: 19668688
  Processing batch 8/34: batch_008.parquet
  Inserted 3004960 rows. Total inserted: 22673648
  Processing batch 9/34: batch_009.parquet
  Inserted 3005002 rows. Total inserted: 25678650
  Processing batch 10/34: batch_010.parquet
  Inserted 3005135 rows. Total inserted: 28683785
  Processing batch 11/34: batch_011.parquet
  Inserted 2150302 rows. Tot

In [6]:
# --- Close Connection and Hyper Process ---
# This ensures the .hyper file is properly finalized.
if connection:
    connection.close()
    print("Hyper connection closed.")
if hyper_process:
    hyper_process.close()
    print("Hyper process closed.")

print("\nExport process finished.")

Hyper connection closed.
Hyper process closed.

Export process finished.


In [1]:
import os
import polars as pl
from tableauhyperapi import HyperProcess, Connection, TableDefinition, SqlType, Inserter, CreateMode, TableName

# --- Configuration ---
# IMPORTANT: Update this path to the actual folder containing your Parquet files.
parquet_folder = "/Users/tusharjoshi/Desktop/ProjectWorkAll/Dissertation /processed_float" 

# This is where your new SMALL .hyper file will be created.
output_small_hyper_file = "/Users/tusharjoshi/Desktop/ProjectWorkAll/Dissertation /test_small_printer_data.hyper" 

# --- Define the Table Schema and Name (same as before) ---
table_name = TableName("Extract", "PrinterData")

printer_table_definition = TableDefinition(table_name)
printer_table_definition.add_column("check", SqlType.text())
printer_table_definition.add_column("date", SqlType.timestamp()) 
printer_table_definition.add_column("id", SqlType.text())
printer_table_definition.add_column("state", SqlType.text())
printer_table_definition.add_column("tempBed", SqlType.double())
printer_table_definition.add_column("targetBed", SqlType.double())
printer_table_definition.add_column("tempNozzle", SqlType.double())
printer_table_definition.add_column("targetNozzle", SqlType.double())
printer_table_definition.add_column("axisZ", SqlType.double())
printer_table_definition.add_column("axisX", SqlType.double())
printer_table_definition.add_column("axisY", SqlType.double())
printer_table_definition.add_column("flow", SqlType.double())
printer_table_definition.add_column("speed", SqlType.double())
printer_table_definition.add_column("fanHotend", SqlType.double())
printer_table_definition.add_column("fanPrint", SqlType.double())

# --- Main Export Process for a small test file ---
hyper_process = None
connection = None
try:
    # Get just the first parquet file for a small test
    all_parquet_files = sorted([
        os.path.join(parquet_folder, f)
        for f in os.listdir(parquet_folder)
        if f.endswith(".parquet")
    ])
    
    if not all_parquet_files:
        print("No parquet files found. Cannot perform test export.")
    else:
        first_parquet_file = all_parquet_files[0]
        print(f"Attempting to export a small sample from: {os.path.basename(first_parquet_file)}")

        # Start the Hyper process
        hyper_process = HyperProcess(telemetry=0) 

        # Connect to the Hyper file (create/replace)
        connection = Connection(hyper_process.endpoint, output_small_hyper_file, CreateMode.CREATE_AND_REPLACE)
        
        # Create schema and table
        connection.catalog.create_schema(table_name.schema_name)
        connection.catalog.create_table(printer_table_definition)
        print(f"Created table '{table_name}' in '{output_small_hyper_file}'. Starting data insertion...")

        # Read only a small portion of the first parquet file
        df_small = pl.read_parquet(first_parquet_file).head(100) # Read first 100 rows
        
        rows_to_insert = []
        column_names_in_order = [col.name for col in printer_table_definition.columns]
        for row_dict in df_small.to_dicts():
            ordered_row = tuple(row_dict.get(col_name) for col_name in column_names_in_order)
            rows_to_insert.append(ordered_row)

        with Inserter(connection, printer_table_definition) as inserter:
            inserter.add_rows(rows_to_insert)
        
        print(f"  Inserted {len(rows_to_insert)} rows into '{output_small_hyper_file}'.")

finally:
    # Ensure connection and hyper process are closed, even if errors occur
    if connection:
        connection.close()
        print("Hyper connection closed.")
    if hyper_process:
        hyper_process.close()
        print("Hyper process closed.")

print("\nSmall test export process finished.")

Attempting to export a small sample from: batch_001.parquet
Created table '"Extract"."PrinterData"' in '/Users/tusharjoshi/Desktop/ProjectWorkAll/Dissertation /test_small_printer_data.hyper'. Starting data insertion...
  Inserted 100 rows into '/Users/tusharjoshi/Desktop/ProjectWorkAll/Dissertation /test_small_printer_data.hyper'.
Hyper connection closed.
Hyper process closed.

Small test export process finished.


In [5]:
import os
import polars as pl

# --- Configuration ---
# IMPORTANT: Update this path to your actual folder containing Parquet files.
parquet_folder = "/Users/tusharjoshi/Desktop/ProjectWorkAll/Dissertation /processed_float" 

# This is where your new CSV file will be created.
output_csv_file = "/Users/tusharjoshi/Desktop/ProjectWorkAll/Dissertation /printer_data_full.csv" 

# --- Define the Target Schema and Final Column Order ---
# This schema defines the data type for each column.
target_schema = {
    "check": pl.String,
    "date": pl.String, # Read as string, then parse to datetime
    "id": pl.String,    # Ensure ID can be null or string
    "state": pl.String, # Ensure State can be null or string
    "tempBed": pl.Float64,
    "targetBed": pl.Float64,
    "tempNozzle": pl.Float64,
    "targetNozzle": pl.Float64,
    "axisZ": pl.Float64,
    "axisX": pl.Float64,
    "axisY": pl.Float64,
    "flow": pl.Float64,
    "speed": pl.Float64,
    "fanHotend": pl.Float64,
    "fanPrint": pl.Float64,
}

# This defines the exact order of columns for the final DataFrame.
# It's crucial that all columns in target_schema are also listed here.
final_column_order = [
    "check", "date", "id", "state", "tempBed", "targetBed", "tempNozzle", 
    "targetNozzle", "axisZ", "axisX", "axisY", "flow", "speed", 
    "fanHotend", "fanPrint"
]

# --- Main Export Process ---
try:
    # List all parquet files in the folder and sort them
    all_parquet_files = sorted([
        os.path.join(parquet_folder, f)
        for f in os.listdir(parquet_folder)
        if f.endswith(".parquet")
    ])
    print(f"Found {len(all_parquet_files)} parquet files to export to CSV.")

    if not all_parquet_files:
        print("No parquet files found in the specified folder. Nothing to export.")
    else:
        processed_dfs = []
        print("Processing each parquet file individually, enforcing schema and order...")
        
        for i, parquet_file in enumerate(all_parquet_files):
            print(f"  Reading and processing batch {i+1}/{len(all_parquet_files)}: {os.path.basename(parquet_file)}")
            
            # Read the parquet file
            df_batch = pl.read_parquet(parquet_file)
            
            # Ensure all columns from final_column_order are present and in the correct type
            # Add missing columns with nulls, and cast existing ones
            for col_name in final_column_order:
                if col_name not in df_batch.columns:
                    # Add missing column with nulls and correct type
                    df_batch = df_batch.with_columns(pl.lit(None, dtype=target_schema[col_name]).alias(col_name))
                else:
                    # Cast existing column to the target type
                    df_batch = df_batch.with_columns(pl.col(col_name).cast(target_schema[col_name]))
            
            # Select and reorder columns to match final_column_order
            df_batch = df_batch.select(final_column_order)
            
            processed_dfs.append(df_batch)
        
        # Concatenate all processed DataFrames into one
        print("Concatenating all processed DataFrames...")
        df_combined = pl.concat(processed_dfs)
        print(f"Combined DataFrame loaded successfully with {df_combined.shape[0]} rows.")

        # Re-parse the 'date' column to actual datetime objects, handling errors by setting to null
        df_combined = df_combined.with_columns(
            pl.col("date").str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%.f%Z", strict=False).alias("date")
        )

        # Export the DataFrame to a single CSV file
        print(f"Exporting data to CSV: {output_csv_file}...")
        df_combined.write_csv(output_csv_file)
        print(f"Successfully exported all data to '{output_csv_file}'.")

except Exception as e:
    print(f"\nAn error occurred during the CSV export process: {e}")
    print("Please check:")
    print(f"- The 'parquet_folder' path: {parquet_folder}")
    print(f"- The 'output_csv_file' path: {output_csv_file}")
    print("- That you have enough disk space for the CSV file.")
    print("- That the column names in your Parquet files are consistent enough to be mapped to the 'target_schema'.")

Found 34 parquet files to export to CSV.
Processing each parquet file individually, enforcing schema and order...
  Reading and processing batch 1/34: batch_001.parquet
  Reading and processing batch 2/34: batch_002.parquet
  Reading and processing batch 3/34: batch_003.parquet
  Reading and processing batch 4/34: batch_004.parquet
  Reading and processing batch 5/34: batch_005.parquet
  Reading and processing batch 6/34: batch_006.parquet
  Reading and processing batch 7/34: batch_007.parquet
  Reading and processing batch 8/34: batch_008.parquet
  Reading and processing batch 9/34: batch_009.parquet
  Reading and processing batch 10/34: batch_010.parquet
  Reading and processing batch 11/34: batch_011.parquet
  Reading and processing batch 12/34: batch_012.parquet
  Reading and processing batch 13/34: batch_013.parquet
  Reading and processing batch 14/34: batch_014.parquet
  Reading and processing batch 15/34: batch_015.parquet
  Reading and processing batch 16/34: batch_016.parquet