In [12]:
import gzip
import re
import pandas as pd
import matplotlib.pyplot as plt

In [45]:
log_file_path = "log.txt.gz"
output_file = "processed_logs.gz"

# Regular expression to extract fields
log_pattern = re.compile(
    r"Struct\{series_id=([\w]+\s?[\w]+),location_id=(\d+),meter_id=(\d+),time=([\d-]+\s[\d:.]+),value=(\d+),value_status=([\w]+),insert_time=([\d-]+\s[\d:.]+),extract_time=([\d-]+\s[\d:.]+)\}"
)

chunk_size = 1_000_000
data = []

In [46]:
with gzip.open(log_file_path, "rt") as file, gzip.open(output_file, "wt") as out:
    out.write(
        "series_id,location_id,meter_id,time,value,value_status,insert_time,extract_time\n"
    )

    for i, line in enumerate(file):
        match = log_pattern.search(line)
        if match:
            (
                series_id,
                location_id,
                meter_id,
                time,
                value,
                value_status,
                insert_time,
                extract_time,
            ) = match.groups()
            print(
                f"{series_id},{location_id},{meter_id},{time},{value},{value_status},{insert_time},{extract_time}"
            )
            data.append(
                f"{series_id},{location_id},{meter_id},{time},{value},{value_status},{insert_time},{extract_time}\n"
            )
        else:
            print(f"match not found {line}")
        if (i + 1) % chunk_size == 0:
            out.writelines(data)
            data.clear()
            print(f"Processed {i+1} lines...")

    if data:
        out.writelines(data)

print(f"Processing complete. Results saved to {output_file}")

voltage 1,11,101,2023-11-25 12:00:00.0,230,Correct,2023-11-25 12:00:00.0,2023-11-25 12:00:00.0
voltage 2,11,101,2023-11-25 12:00:00.0,220,Correct,2023-11-25 12:00:00.0,2023-11-25 12:00:00.0
voltage 3,11,101,2023-11-25 12:00:00.0,330,Correct,2023-11-25 12:00:00.0,2023-11-25 12:00:00.0
p14 l1,11,101,2023-11-25 12:00:00.0,20,Correct,2023-11-25 12:00:00.0,2023-11-25 12:00:00.0
a14,11,101,2023-11-25 12:00:00.0,1000,Correct,2023-11-25 12:00:00.0,2023-11-25 12:00:00.0
p14 l2,11,101,2023-11-25 12:00:00.0,30,Correct,2023-11-25 12:00:00.0,2023-11-25 12:00:00.0
p14 l3,11,101,2023-11-25 12:00:00.0,25,Correct,2023-11-25 12:00:00.0,2023-11-25 12:00:00.0
p14 l1,11,211,2023-11-25 12:00:00.0,10,Correct,2023-11-25 12:00:00.0,2023-11-25 12:00:00.0
p14 l2,11,211,2023-11-25 12:00:00.0,15,Correct,2023-11-25 12:00:00.0,2023-11-25 12:00:00.0
p14 l3,11,211,2023-11-25 12:00:00.0,17,Correct,2023-11-25 12:00:00.0,2023-11-25 12:00:00.0
Processing complete. Results saved to processed_logs.gz


In [53]:
#proc_file = "processed_logs.gz"
proc_file = "data/generated_event.csv.gz"

name_counts = {}
name_stats = {}

for chunk in pd.read_csv(
    proc_file, compression="gzip", parse_dates=["time"], chunksize=chunk_size
):

    chunk["value"] = pd.to_numeric(chunk["value"], errors="coerce")
    chunk = chunk.dropna(subset=["value"])

    for name, count in chunk["series_id"].value_counts().items():
        name_counts[name] = name_counts.get(name, 0) + count

    for name, group in chunk.groupby("series_id")["value"]:
        if name not in name_stats:
            name_stats[name] = {
                "min": float("inf"),
                "max": float("-inf"),
                "sum": 0,
                "count": 0,
            }
        name_stats[name]["min"] = min(name_stats[name]["min"], group.min())
        name_stats[name]["max"] = max(name_stats[name]["max"], group.max())
        name_stats[name]["sum"] += group.sum()
        name_stats[name]["count"] += len(group)

# Convert aggregated stats to a DataFrame
stats_df = pd.DataFrame.from_dict(name_stats, orient="index")
stats_df["mean"] = stats_df["sum"] / stats_df["count"]
stats_df.drop(columns=["sum", "count"], inplace=True)

# Convert name counts to DataFrame
name_counts_df = pd.DataFrame(name_counts.items(), columns=["series_id", "count"])

# Save results
stats_df.to_csv("value_trends_by_name.csv")
name_counts_df.to_csv("name_frequencies.csv")

print("Processing complete. Results saved to:")
print("  - value_trends_by_name.csv")
print("  - name_frequencies.csv")

Processing complete. Results saved to:
  - value_trends_by_name.csv
  - name_frequencies.csv
