In [1]:
# List all .gz files in 2024 and 2025 directories and collect metadata

import os
from pathlib import Path
import pandas as pd

# Directories that hold the daily gzip dumps (one directory per year).
# NOTE(review): these are absolute, machine-specific paths; the space after
# "Dissertation" is part of the path as written. Consider a configurable DATA_DIR.
dirs = [
    "/Users/tusharjoshi/Desktop/ProjectWorkAll/Dissertation /DATA/2024",
    "/Users/tusharjoshi/Desktop/ProjectWorkAll/Dissertation /DATA/2025"
]

# Build one metadata record per .gz file found directly inside each directory.
files = []
for d in dirs:
    for f in Path(d).glob("*.gz"):
        files.append({
            "filename": f.name,
            "path": str(f),
            # File size converted from bytes to mebibytes, rounded to 2 decimals.
            "size_mb": round(f.stat().st_size / (1024**2), 2),
            "date": f.name.split('-prusa.gz')[0]  # Assumes filename format YYYY-MM-DD-prusa.gz
        })

# Pinning the columns keeps the schema stable even when no files are found;
# without it an empty listing yields a column-less frame and the lookups on
# 'date' / 'path' below (and in later cells) fail with KeyError.
df_files = pd.DataFrame(files, columns=["filename", "path", "size_mb", "date"])
print(f"Total files found: {len(df_files)}")
if df_files.empty:
    print("Date range: n/a (no .gz files found - check the paths in `dirs`)")
else:
    # 'date' holds strings; min/max is a lexicographic comparison, which matches
    # chronological order only for the assumed YYYY-MM-DD prefix.
    print(f"Date range: {df_files['date'].min()} to {df_files['date'].max()}")
print(df_files.head())

Total files found: 331
Date range: 2024-04-03 to 2025-04-29
              filename                                               path  \
0  2024-11-18-prusa.gz  /Users/tusharjoshi/Desktop/ProjectWorkAll/Diss...   
1  2024-11-19-prusa.gz  /Users/tusharjoshi/Desktop/ProjectWorkAll/Diss...   
2  2024-06-22-prusa.gz  /Users/tusharjoshi/Desktop/ProjectWorkAll/Diss...   
3  2024-06-23-prusa.gz  /Users/tusharjoshi/Desktop/ProjectWorkAll/Diss...   
4  2024-05-11-prusa.gz  /Users/tusharjoshi/Desktop/ProjectWorkAll/Diss...   

   size_mb        date  
0     2.08  2024-11-18  
1     2.21  2024-11-19  
2     1.69  2024-06-22  
3     1.68  2024-06-23  
4     1.71  2024-05-11  


<h3>Batch processing and sampling</h3>

In [None]:
import pandas as pd
import gzip
import json

def process_file(filepath):
    """Load one gzip-compressed, line-delimited JSON file into a DataFrame.

    Each non-blank line is parsed as a JSON object. The keys 'date', 'id' and
    'check' are copied to the columns 'timestamp', 'printer_id' and 'status'
    (None when a key is absent). If the object's 'data' value is a dict, its
    key/value pairs are merged into the same row; a 'data' key that matches
    one of the three base column names therefore overwrites that column.

    Lines that are not valid JSON, or whose top-level JSON value is not an
    object, are skipped.

    Parameters
    ----------
    filepath : str or path-like
        Path of the .gz file to read (opened as UTF-8 text).

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed line; empty if nothing parsed.
    """
    records = []
    with gzip.open(filepath, 'rt', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except (json.JSONDecodeError, RecursionError):
                # Only parse failures are skipped. A bare `except:` would also
                # swallow KeyboardInterrupt, making a long load uninterruptible.
                continue
            if not isinstance(record, dict):
                # Valid JSON but not an object (e.g. a list or number): no fields to extract.
                continue
            flat = {
                'timestamp': record.get('date'),
                'printer_id': record.get('id'),
                'status': record.get('check')
            }
            payload = record.get('data')
            if isinstance(payload, dict):
                flat.update(payload)
            records.append(flat)
    return pd.DataFrame(records)

# Get file paths from your file listing DataFrame
file_paths = df_files['path'].tolist()

# Files are walked in groups of `batch_size` purely to pace the progress
# output: every per-file frame is still kept in `dfs` until the final concat,
# so batching does not reduce peak memory.
batch_size = 10
dfs = []
for i in range(0, len(file_paths), batch_size):
    batch_files = file_paths[i:i+batch_size]
    for f in batch_files:
        dfs.append(process_file(f))
    print(f"Processed batch {i//batch_size + 1}")

# Concatenate all batches
if dfs:
    df_all = pd.concat(dfs, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty frame
    # that still carries the base columns so later cells can reference them.
    df_all = pd.DataFrame(columns=['timestamp', 'printer_id', 'status'])
print(df_all.shape)
print(df_all.head())
print(df_all.tail())

Processed batch 1
Processed batch 2
Processed batch 3
Processed batch 4
Processed batch 5
Processed batch 6
Processed batch 7
Processed batch 8
Processed batch 9
Processed batch 10
Processed batch 11
Processed batch 12
Processed batch 13
Processed batch 14
Processed batch 15
Processed batch 16
Processed batch 17
Processed batch 18
Processed batch 19
Processed batch 20
Processed batch 21
Processed batch 22
Processed batch 23
Processed batch 24
Processed batch 25
Processed batch 26
Processed batch 27
Processed batch 28
Processed batch 29
Processed batch 30
Processed batch 31
Processed batch 32
Processed batch 33
Processed batch 34
(94869699, 16)
                  timestamp           printer_id   status data     state  \
0  2024-11-18T00:00:00.590Z  CZPX1522X017XC78087  success   {}  FINISHED   
1  2024-11-18T00:00:00.594Z  CZPX4521X017XC64043  success   {}      IDLE   
2  2024-11-18T00:00:00.597Z  CZPX1622X017XC78384  success   {}      IDLE   
3  2024-11-18T00:00:00.600Z  CZPX1522X017XC7

: 

In [None]:
# df_all.to_csv("all_prusa_data.csv", index=False)

In [None]:
# List every distinct date string parsed from the file names, in ascending
# (lexicographic) order.
print(
    "Unique dates in files:",
    sorted(df_files['date'].drop_duplicates()),
)

In [None]:
# Report the smallest and largest raw 'timestamp' values. The comparison is
# done on the column as stored, before any datetime parsing.
raw_timestamps = df_all['timestamp']
earliest, latest = raw_timestamps.min(), raw_timestamps.max()
print("Date range in extracted data:", earliest, "to", latest)

# Add a calendar-date column derived from the parsed timestamps, then list
# the distinct dates that actually occur in the records.
parsed_timestamps = pd.to_datetime(raw_timestamps)
df_all['date_only'] = parsed_timestamps.dt.date
extracted_dates = sorted(df_all['date_only'].dropna().unique())
print("Unique dates in extracted data:", extracted_dates)

<h3>Data cleaning and validation</h3>

In [None]:
# Check for missing values
print(df_all.isnull().sum())

# Check data types
print(df_all.dtypes)

# Check for duplicate rows
# DataFrame.duplicated() hashes every cell, so a column holding dicts/lists/sets
# raises "TypeError: unhashable type". The load cell's output shows the `data`
# column containing `{}` values, so such columns are compared via their string
# form here. This is for the duplicate check only; df_all itself is not modified.
# NOTE(review): two dicts with equal content but different key order stringify
# differently and would not be counted as duplicates.
object_cols = df_all.select_dtypes(include='object').columns
unhashable_cols = [
    col for col in object_cols
    if df_all[col].map(lambda v: isinstance(v, (dict, list, set))).any()
]
if unhashable_cols:
    dup_view = df_all.astype({col: str for col in unhashable_cols})
else:
    dup_view = df_all
print("Duplicate rows:", dup_view.duplicated().sum())

# Show a sample of rows with missing values
print(df_all[df_all.isnull().any(axis=1)].head())