In [2]:
import os
import polars as pl
import glob # Import glob to find multiple files

# --- Configuration ---
# Folder that holds the job list .txt files, e.g. '2024-prusa-job-log.txt' and
# '2025-prusa-job-log.txt'.
# Set the JOB_LIST_FOLDER environment variable to point the notebook at another
# location without editing this cell; when it is unset, the original local path
# below is used.
job_list_folder_path = os.environ.get(
    "JOB_LIST_FOLDER",
    "/Users/tusharjoshi/Desktop/ProjectWorkAll/Dissertation /DATA/",
)

# Final column order of the job list DataFrame.
job_list_final_order = ["date", "id", "name", "size", "mTimestamp"]

# --- Main Processing Logic ---
# Collect every .txt file in the configured folder. Sorting keeps the load
# order deterministic across runs.
all_job_list_files = sorted(glob.glob(os.path.join(job_list_folder_path, "*.txt")))

if not all_job_list_files:
    print(f"No .txt files found in {job_list_folder_path}. Please check the path and file extensions. Exiting.")
else:
    print(f"Found {len(all_job_list_files)} job list files to process.")
    for file_path in all_job_list_files:
        print(f"  - {os.path.basename(file_path)}")

    # Read all NDJSON files into one frame. Column types are inferred by the
    # reader; 'date' is parsed explicitly below.
    job_list_raw = pl.read_ndjson(all_job_list_files)
    raw_null_dates = job_list_raw["date"].null_count()

    # Build the cleaned frame from the raw one in a single chain, so re-running
    # the cell always starts from the freshly loaded data.
    df_job_list_processed = (
        job_list_raw
        .with_columns(
            # strict=False turns values that do not match the format into nulls
            # instead of raising; the count of such values is reported below.
            pl.col("date")
            .str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%.f%Z", strict=False)
            .alias("date")
        )
        # Keep only the expected columns, in the configured order. select() does
        # NOT create missing columns: if one is absent, Polars raises here.
        .select(job_list_final_order)
    )

    # Nulls introduced by parsing = nulls after parsing minus nulls already present.
    unparsed_dates = df_job_list_processed["date"].null_count() - raw_null_dates
    if unparsed_dates:
        print(f"Warning: {unparsed_dates:,} 'date' value(s) could not be parsed and were set to null.")

    print(f"\nFinished processing job list files. Combined DataFrame has {df_job_list_processed.shape[0]:,} rows and {len(df_job_list_processed.columns)} columns.")
    # 'df_job_list_processed' now holds the cleaned job list data used by the
    # MySQL export cell.


Found 2 job list files to process.
  - 2024-prusa-job-log.txt
  - 2025-prusa-job-log.txt

Finished processing job list files. Combined DataFrame has 1,408 rows and 5 columns.


In [3]:

import pymysql
from sqlalchemy import create_engine, text # Used for robust connection

# This cell needs 'df_job_list_processed' from the job list loading cell.
# In a fresh kernel, run that cell first.
import getpass
import os

# --- Configuration ---
# MySQL connection details.
DB_HOST = "localhost"
DB_USER = "root"  # MySQL username
# The password is deliberately not stored in the notebook: it is read from the
# MYSQL_PASSWORD environment variable, with an interactive prompt as fallback.
DB_PASSWORD = os.environ.get("MYSQL_PASSWORD") or getpass.getpass(f"MySQL password for '{DB_USER}': ")
DB_NAME = "printer_data_db"  # Target database
JOB_LIST_TABLE_NAME = "JobList"  # Table that receives the job list data


def _mysql_type_for(dtype):
    """Return the MySQL column type to use for a Polars dtype.

    Dtypes are matched with ``==`` rather than a class-keyed dict lookup:
    the schema entry for a Datetime column carries a time unit / time zone,
    and such an instance is not guaranteed to hash like the bare
    ``pl.Datetime`` class, so a dict lookup can miss it and silently fall
    back to TEXT.
    """
    if dtype == pl.Datetime:
        return "DATETIME(6)"  # DATETIME(6) keeps microsecond precision
    if dtype == pl.Int64:
        return "BIGINT"  # large integers such as size and mTimestamp
    return "TEXT"  # pl.String and any dtype without an explicit mapping


# --- Main Export Process to MySQL ---
if 'df_job_list_processed' not in locals():
    print("Error: 'df_job_list_processed' DataFrame not found. Please run Step 1 first.")
else:
    # Start from None so the cleanup below never touches a stale 'conn' left
    # over from an earlier cell run when connect() itself fails.
    conn = None
    try:
        conn = pymysql.connect(host=DB_HOST, user=DB_USER, password=DB_PASSWORD, database=DB_NAME)

        # The context manager closes the cursor even when a statement fails.
        with conn.cursor() as cursor:
            print(f"\nExporting job list data to MySQL database '{DB_NAME}' table '{JOB_LIST_TABLE_NAME}'...")

            # Drop and recreate the table so every export starts clean.
            cursor.execute(f"DROP TABLE IF EXISTS `{JOB_LIST_TABLE_NAME}`;")

            column_names = list(df_job_list_processed.columns)
            columns_sql_defs = [
                f'`{col_name}` {_mysql_type_for(dtype)}'
                for col_name, dtype in df_job_list_processed.schema.items()
            ]
            create_table_sql = f"CREATE TABLE `{JOB_LIST_TABLE_NAME}` ({', '.join(columns_sql_defs)});"
            cursor.execute(create_table_sql)
            print(f"Created table '{JOB_LIST_TABLE_NAME}' in MySQL.")

            # Name the target columns explicitly so the insert does not depend
            # on the physical column order of the table. %s is the pymysql
            # placeholder style.
            column_list = ", ".join(f"`{col_name}`" for col_name in column_names)
            placeholders = ", ".join(["%s"] * len(column_names))
            insert_sql = f"INSERT INTO `{JOB_LIST_TABLE_NAME}` ({column_list}) VALUES ({placeholders});"

            # Insert in batches, committing after each one.
            batch_size = 1000
            total_rows = df_job_list_processed.shape[0]
            total_inserted_rows = 0
            for batch_start in range(0, total_rows, batch_size):
                batch_length = min(batch_size, total_rows - batch_start)
                df_batch_to_insert = df_job_list_processed.slice(batch_start, batch_length)

                # iter_rows() already yields tuples of plain Python values
                # (datetime.datetime for Datetime columns), so the rows can be
                # handed to executemany without per-value conversion.
                rows_to_insert = list(df_batch_to_insert.iter_rows())

                cursor.executemany(insert_sql, rows_to_insert)
                conn.commit()
                total_inserted_rows += len(rows_to_insert)
                print(f"  Inserted {len(rows_to_insert)} rows. Total inserted: {total_inserted_rows}")

            print(f"Successfully exported all job list data to MySQL table '{JOB_LIST_TABLE_NAME}'.")

    except Exception as e:
        # Deliberately broad: the cell reports the problem and keeps the
        # notebook usable instead of raising.
        print(f"\nAn error occurred during the MySQL export process: {e}")
        print("Please check:")
        print("- MySQL server is running and accessible.")
        print("- Database name, username, and password are correct.")
        print("- You have sufficient privileges to create/write to the table.")

    finally:
        if conn is not None and conn.open:
            conn.close()
            print("MySQL connection closed.")


Exporting job list data to MySQL database 'printer_data_db' table 'JobList'...
Created table 'JobList' in MySQL.
  Inserted 1000 rows. Total inserted: 1000
  Inserted 408 rows. Total inserted: 1408
Successfully exported all job list data to MySQL table 'JobList'.
MySQL connection closed.


In [2]:
import pymysql

# --- Configuration ---
import getpass
import os

# MySQL connection details.
DB_HOST = "localhost"
DB_USER = "root"  # MySQL username
# The password is deliberately not stored in the notebook: it is read from the
# MYSQL_PASSWORD environment variable, with an interactive prompt as fallback.
DB_PASSWORD = os.environ.get("MYSQL_PASSWORD") or getpass.getpass(f"MySQL password for '{DB_USER}': ")
DB_NAME = "printer_data_db"  # Target database

MAIN_PRINTER_DATA_TABLE = "PrinterData"  # Large sensor table (described as ~94M rows)
JOB_LIST_TABLE = "JobList"               # Small job list table (~1.4K rows)
JOINED_TABLE_NAME = "PrinterData_with_Jobs"  # Table created by this cell

# --- SQL Query to Create the Joined Table ---
# Links each sensor record to the job with the latest 'date' for the same id
# that is at or before the sensor record's 'date'. Sensor rows without such a
# job are kept (LEFT JOIN) with NULL job columns.
# NOTE(review): the query is written to benefit from an index named
# 'idx_joblist_id_date' on JobList. Nothing in this notebook creates it, and
# the export cell drops and recreates JobList without indexes, so confirm the
# index exists before running this on the full table.
# NOTE(review): if two JobList rows share the same id and date, every matching
# sensor row is duplicated by the join. Verify that (id, date) is unique.
create_joined_table_sql = f"""
CREATE TABLE `{JOINED_TABLE_NAME}` AS
SELECT
    P.*, -- Selects all columns from the PrinterData table
    J.name AS JobName, -- Selects the job name and renames it to JobName
    J.size AS JobSize, -- Selects the job size and renames it to JobSize
    J.mTimestamp AS JobmTimestamp -- Selects the job mTimestamp and renames it to JobmTimestamp
FROM
    `{MAIN_PRINTER_DATA_TABLE}` AS P -- Start with the main sensor data table, aliased as P
LEFT JOIN `{JOB_LIST_TABLE}` AS J -- Join with the job list table, aliased as J
    ON P.id = J.id -- Condition 1: Join only if the printer IDs match
    AND J.date = ( -- Condition 2: Join if the job date is the MAX date that meets criteria
        SELECT MAX(J2.date) -- Find the latest job date (MAX)
        FROM `{JOB_LIST_TABLE}` AS J2 -- From the JobList table, aliased as J2
        WHERE J2.id = P.id -- Where the job is for the same printer as the sensor record
          AND J2.date <= P.date -- AND the job started at or before the sensor record's date
    );
"""

# --- Main Process ---
conn = pymysql.connect(host=DB_HOST, user=DB_USER, password=DB_PASSWORD, database=DB_NAME)
try:
    # The context manager closes the cursor even when the long-running
    # statement fails (e.g. with 'Lost connection to MySQL server').
    with conn.cursor() as cursor:
        print(f"Creating joined table '{JOINED_TABLE_NAME}' in '{DB_NAME}'...")

        # Drop the joined table if it exists (ensures a clean start).
        cursor.execute(f"DROP TABLE IF EXISTS `{JOINED_TABLE_NAME}`;")

        # Create and populate the joined table. This is the statement that
        # takes a long time to run on the MySQL server.
        cursor.execute(create_joined_table_sql)
        conn.commit()

        print(f"Successfully created and populated joined table '{JOINED_TABLE_NAME}'.")

        # Verify the row count of the new joined table.
        cursor.execute(f"SELECT COUNT(*) FROM `{JOINED_TABLE_NAME}`;")
        joined_row_count = cursor.fetchone()[0]
        print(f"Total rows in '{JOINED_TABLE_NAME}': {joined_row_count:,}")
finally:
    # Always release the connection; errors from the statements above still
    # propagate to the notebook. 'open' is False once the link is already
    # gone, in which case there is nothing left to close.
    if conn.open:
        conn.close()
        print("MySQL connection closed.")


Creating joined table 'PrinterData_with_Jobs' in 'printer_data_db'...


OperationalError: (2013, 'Lost connection to MySQL server during query')