<h3>Finding files</h3>

In [1]:
from pathlib import Path

# Your data directories
data_dirs = [
    "/Users/tusharjoshi/Desktop/ProjectWorkAll/Dissertation /DATA/2024",
    "/Users/tusharjoshi/Desktop/ProjectWorkAll/Dissertation /DATA/2025"
]

# Find all .gz files and get their sizes
files_info = []

for directory in data_dirs:
    for file_path in Path(directory).glob("*.gz"):
        size_mb = file_path.stat().st_size / (1024 * 1024)
        files_info.append({
            "file_name": file_path.name,
            "full_path": str(file_path),
            "size_mb": round(size_mb, 2)
        })

# Sort by size (smallest first)
files_info.sort(key=lambda x: x["size_mb"])

# Show results
print(f"Total files found: {len(files_info)}")
print("\n5 smallest files:")
for i, file_info in enumerate(files_info[:5]):
    print(f"{i+1}. {file_info['file_name']} - {file_info['size_mb']} MB")

print("\n5 largest files:")
for i, file_info in enumerate(files_info[-5:]):
    print(f"{i+1}. {file_info['file_name']} - {file_info['size_mb']} MB")

Total files found: 331

5 smallest files:
1. 2024-08-04-prusa.gz - 0.0 MB
2. 2024-08-02-prusa.gz - 0.0 MB
3. 2024-08-03-prusa.gz - 0.0 MB
4. 2024-06-02-prusa.gz - 0.0 MB
5. 2024-06-03-prusa.gz - 0.0 MB

5 largest files:
1. 2025-03-06-prusa.gz - 2.67 MB
2. 2024-09-26-prusa.gz - 2.74 MB
3. 2024-09-25-prusa.gz - 2.78 MB
4. 2024-04-05-prusa.gz - 2.88 MB
5. 2024-04-04-prusa.gz - 3.0 MB


<h3>Processing one test file</h3>

In [5]:
%pip install polars

import polars as pl
import gzip
import json

# Let's use a medium-sized file for testing
test_file = "/Users/tusharjoshi/Desktop/ProjectWorkAll/Dissertation /DATA/2025/2025-03-06-prusa.gz"

print(f"Testing file: 2025-03-06-prusa.gz (2.67 MB)")

# Simple approach: read a few lines first to see structure
print("\nFirst 3 lines of the file:")
with gzip.open(test_file, 'rt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= 3:  # Only show first 3 lines
            break
        if line.strip():
            try:
                parsed = json.loads(line.strip())
                print(f"Line {i+1}: {parsed}")
            except:
                print(f"Line {i+1}: Error parsing JSON")

Collecting polars
  Using cached polars-1.31.0-cp39-abi3-macosx_10_12_x86_64.whl.metadata (14 kB)
Using cached polars-1.31.0-cp39-abi3-macosx_10_12_x86_64.whl (34.5 MB)
Installing collected packages: polars
Successfully installed polars-1.31.0
Note: you may need to restart the kernel to use updated packages.



The following required CPU features were not detected:
    avx, avx2, fma, bmi1, bmi2, lzcnt, movbe
Continuing to use this version of Polars on this processor will likely result in a crash.
Install the `polars-lts-cpu` package instead of `polars` to run Polars with better compatibility.

Hint: If you are on an Apple ARM machine (e.g. M1) this is likely due to running Python under Rosetta.
It is recommended to install a native version of Python that does not run under Rosetta x86-64 emulation.




Testing file: 2025-03-06-prusa.gz (2.67 MB)

First 3 lines of the file:
Line 1: {'date': '2025-03-06T00:00:00.769Z', 'id': 'CZPX1522X017XC78087', 'check': 'success', 'data': {'state': 'IDLE', 'tempBed': 25.4, 'targetBed': 0, 'tempNozzle': 25.1, 'targetNozzle': 0, 'axisZ': 20.2, 'axisX': 170, 'axisY': 170, 'flow': 95, 'speed': 100, 'fanHotend': 0, 'fanPrint': 0}}
Line 2: {'date': '2025-03-06T00:00:00.771Z', 'id': 'CZPX1622X017XC78491', 'check': 'success', 'data': {'state': 'IDLE', 'tempBed': 24, 'targetBed': 0, 'tempNozzle': 23.5, 'targetNozzle': 0, 'axisZ': 20.4, 'axisX': 170, 'axisY': 170, 'flow': 95, 'speed': 100, 'fanHotend': 0, 'fanPrint': 0}}
Line 3: {'date': '2025-03-06T00:00:00.772Z', 'id': 'CZPX1622X017XC78384', 'check': 'success', 'data': {'state': 'PRINTING', 'tempBed': 27.2, 'targetBed': 0, 'tempNozzle': 214.7, 'targetNozzle': 215, 'axisZ': 39.5, 'flow': 100, 'speed': 100, 'fanHotend': 4811, 'fanPrint': 0}}
