In [4]:
import re
import pandas as pd
from pathlib import Path

# Regex patterns
# Numeric literal as printed in the logs: optional sign, plain decimal or
# scientific notation. Python prints small floats such as 0.00001 as "1e-05";
# a bare "[\d\.]+" would capture only the leading "1" (or fail to match at all
# when more text must follow the number).
_NUMBER = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"

patterns = {
    "mpi_world_size": re.compile(r"MPI world size:\s+(\d+)"),
    "train_size": re.compile(r"Global train size:\s+(\d+), local sizes:\s+(\d+)"),
    "train_loop_time": re.compile(rf"Training Loop===== took ({_NUMBER}) sec"),
    "epochs": re.compile(r"Epochs:\s+(\d+), total iters:\s+(\d+)"),
    "train_time": re.compile(rf"Training time \(s\):\s+({_NUMBER})"),
    "rmse_train": re.compile(rf"RMSE train:\s+({_NUMBER})"),
    "rmse_test": re.compile(rf"RMSE test:\s+({_NUMBER})"),
    "run_training_time": re.compile(rf"run training took ({_NUMBER}) sec"),
    "namespace": re.compile(
        rf"batch_size=(\d+), hidden=(\d+), lr=({_NUMBER}), activation='(\w+)'"
    ),
}

# Patterns with several capture groups, mapped to the result key that each
# group (in order) is stored under.
_MULTI_GROUP_FIELDS = {
    "namespace": ("batch_size", "hidden", "lr", "activation"),
    "epochs": ("epochs", "total_iters"),
    "train_size": ("global_train_size", "local_sizes"),
}

def parse_log_file(file_path, encoding="utf-8"):
    """Extract the run summary fields from one training log.

    Every line of the file is tested against every entry of ``patterns``.
    When a pattern matches on more than one line, the last match wins.

    Args:
        file_path: Path (``str`` or ``pathlib.Path``) of the log file.
        encoding: Text encoding used to read the file. Given explicitly
            because the history lines of these logs contain non-ASCII text
            ("θ") and the platform default encoding is not always UTF-8.

    Returns:
        dict mapping field name to the captured text. Values are the raw
        matched strings; no numeric conversion is done here. Fields whose
        pattern never matched are absent from the dict.
    """
    results = {}
    # errors="replace": a stray undecodable byte must not abort the parse,
    # since every summary pattern is pure ASCII anyway.
    with open(file_path, "r", encoding=encoding, errors="replace") as f:
        for line in f:
            for key, pattern in patterns.items():
                match = pattern.search(line)
                if match is None:
                    continue
                groups = match.groups()
                field_names = _MULTI_GROUP_FIELDS.get(key)
                if field_names is not None:
                    # Spread the capture groups over their individual keys.
                    results.update(zip(field_names, groups))
                elif len(groups) == 1:
                    results[key] = groups[0]
                else:
                    results[key] = groups
    return results

# Read all log files
log_dir = Path("logs/")  # change to your log folder
report_dir = Path("report")  # folder the summary CSV is written to
data = []

# Sorted so the row order of the summary does not depend on the order in
# which the filesystem happens to list the files.
for file in sorted(log_dir.glob("*.log")):
    parsed = parse_log_file(file)
    parsed["file"] = file.name  # remember which log each row came from
    data.append(parsed)

# Convert to DataFrame (one row per log file)
df = pd.DataFrame(data)
print(df)

# Save to CSV; create the target folder first, since to_csv does not create
# missing directories and would raise on a fresh checkout.
report_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(report_dir / "log_summary.csv", index=False)


   mpi_world_size global_train_size local_sizes train_loop_time batch_size  \
0               8          25923545     3240444         67.9855       2048   
1               8          25923545     3240444         63.4918        128   
2               8          25923545     3240444         85.5563        512   
3               8          25923545     3240444         93.5340        256   
4               8          25923545     3240444         89.9634       2048   
5               8          25923545     3240444        143.4612       2048   
6               8          25923545     3240444        146.8688        512   
7               8          25923545     3240444        146.8923        256   
8               8          25923545     3240444        142.0473       1024   
9               8          25923545     3240444         96.9161       1024   
10              8          25923545     3240444        145.8775        128   
11              8          25923545     3240444        484.7968 

In [5]:
import re
import pandas as pd
from pathlib import Path

# Regex to match history log lines.
# The loss group accepts plain decimals and scientific notation: with a bare
# "[\d\.]+" a value such as "5e-05" would be captured as "5", and a value
# followed by a sentence-ending dot ("7.47.") would make float() raise.
history_pattern = re.compile(
    r"Iter\s+(\d+), epoch\s+(\d+), local R\(θ\)="
    r"([-+]?(?:\d+(?:\.\d+)?|\.\d+)(?:[eE][-+]?\d+)?)"
)

def parse_history(file_path, encoding="utf-8"):
    """Collect the per-iteration training history from one log file.

    Args:
        file_path: Path of the log file (``str`` or ``pathlib.Path``).
        encoding: Text encoding used to read the file. Given explicitly
            because ``history_pattern`` contains the non-ASCII character
            "θ"; under a non-UTF-8 platform default the pattern would
            silently never match.

    Returns:
        list of dicts, one per matching line in file order, with keys
        ``filename`` (base name of the file), ``epoch`` (int), ``Iter`` (int)
        and ``local_R_theta`` (float). Empty list if no line matches.
    """
    file_path = Path(file_path)  # accept plain strings as well as Path objects
    rows = []
    with open(file_path, "r", encoding=encoding, errors="replace") as f:
        for line in f:
            match = history_pattern.search(line)
            if match is None:
                continue
            iteration, epoch, local_r = match.groups()
            rows.append({
                "filename": file_path.name,
                "epoch": int(epoch),
                "Iter": int(iteration),
                "local_R_theta": float(local_r)
            })
    return rows

def main(log_dir="logs/", output_csv="report/log_history.csv"):
    """Build the training-history table from every ``*.log`` file and save it.

    Args:
        log_dir: Folder that is searched (non-recursively) for ``*.log`` files.
        output_csv: Destination of the CSV; its parent folder is created if
            it does not exist yet.

    Side effects:
        Prints the first 20 rows and writes ``output_csv``.
    """
    log_dir = Path(log_dir)
    output_csv = Path(output_csv)

    all_rows = []
    # Sorted so processing order does not depend on filesystem listing order.
    for file in sorted(log_dir.glob("*.log")):
        all_rows.extend(parse_history(file))

    # Convert to DataFrame. The columns are named explicitly so the frame
    # still has them when no history line was found; otherwise the sort
    # below would raise KeyError on an empty, column-less frame.
    columns = ["filename", "epoch", "Iter", "local_R_theta"]
    df = pd.DataFrame(all_rows, columns=columns)

    # Order by filename, epoch, Iter
    df = df.sort_values(by=["filename", "epoch", "Iter"]).reset_index(drop=True)

    print(df.head(20))  # preview

    # to_csv does not create missing directories.
    output_csv.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_csv, index=False)

if __name__ == "__main__":
    main()


                     filename  epoch  Iter  local_R_theta
0   rank0_relu1024_epoch1.log      0   310      12.377288
1   rank0_relu1024_epoch1.log      0   620      10.239742
2   rank0_relu1024_epoch1.log      0   930       9.263220
3   rank0_relu1024_epoch1.log      0  1240       8.698991
4   rank0_relu1024_epoch1.log      0  1550       8.326568
5   rank0_relu1024_epoch1.log      0  1860       8.045514
6   rank0_relu1024_epoch1.log      0  2170       7.869802
7   rank0_relu1024_epoch1.log      0  2480       7.696306
8   rank0_relu1024_epoch1.log      0  2790       7.571086
9   rank0_relu1024_epoch1.log      0  3100       7.472262
10  rank0_relu1024_epoch5.log      0   625      10.217946
11  rank0_relu1024_epoch5.log      0  1250       8.691088
12  rank0_relu1024_epoch5.log      0  1875       8.034294
13  rank0_relu1024_epoch5.log      0  2500       7.682695
14  rank0_relu1024_epoch5.log      0  3125       7.472123
15  rank0_relu1024_epoch5.log      1  3750       8.789116
16  rank0_relu

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Load the extracted CSV from previous script
df = pd.read_csv("report/log_history.csv")

# Create the output folder once, up front, instead of on every loop iteration.
history_dir = os.path.join("charts", "history")
os.makedirs(history_dir, exist_ok=True)

# Plot for each file separately: one figure per log file, loss against iteration.
for filename, group in df.groupby("filename"):
    # Explicit figure/axes objects avoid relying on pyplot's implicit
    # "current figure" state between iterations.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(group["Iter"], group["local_R_theta"], marker="o", linestyle="-")

    ax.set_title(filename)  # chart title
    ax.set_xlabel("Iter")
    ax.set_ylabel("local R(θ)")
    ax.grid(True, linestyle="--", alpha=0.6)

    # Save each chart, then release the figure so memory does not grow
    # with the number of log files.
    fig.savefig(os.path.join(history_dir, f"{filename}_history.png"))
    plt.close(fig)

print("✅ Plots saved as <filename>_history.png")


✅ Plots saved as <filename>_history.png


In [7]:
import pandas as pd
import matplotlib.pyplot as plt

# Load summary data
df = pd.read_csv("report/log_summary.csv")

def plot_metric(df, metric, epoch_filter, out_dir=None):
    """Plot one metric against batch size for runs with a given epoch count.

    One line is drawn per value of the ``activation`` column. The chart is
    saved as ``<out_dir>/<metric>_epochs<epoch_filter>.png``.

    Args:
        df: Summary frame; the columns ``epochs``, ``activation``,
            ``batch_size`` and ``metric`` are read.
        metric: Name of the column plotted on the y axis.
        epoch_filter: Only rows whose ``epochs`` equals this value are plotted.
        out_dir: Folder the PNG is written to; defaults to ``charts/result``.
            Created if missing.
    """
    # Imported locally because this cell's own import block does not bring
    # in `os`; without this the function only works if an earlier cell
    # happened to import it.
    import os

    if out_dir is None:
        out_dir = os.path.join("charts", "result")

    # Filter by epoch
    data = df[df["epochs"] == epoch_filter]

    fig, ax = plt.subplots(figsize=(10, 6))

    # Plot each activation as separate line
    for activation, group in data.groupby("activation"):
        # Sort by batch_size so the line runs left to right
        group = group.sort_values("batch_size")
        ax.plot(
            group["batch_size"],
            group[metric],
            marker="o",
            linestyle="-",
            label=activation,
        )

    ax.set_title(f"{metric} vs batch_size (epochs={epoch_filter})")
    ax.set_xlabel("Batch Size")
    ax.set_ylabel(metric.replace("_", " ").title())
    # Only add a legend when something was drawn; with no matching rows
    # matplotlib would otherwise warn about a legend without entries.
    if ax.lines:
        ax.legend(title="Activation")
    ax.grid(True, linestyle="--", alpha=0.6)

    os.makedirs(out_dir, exist_ok=True)
    fig.savefig(os.path.join(out_dir, f"{metric}_epochs{epoch_filter}.png"))
    plt.close(fig)

# Example usage:
for metric in ["rmse_train", "rmse_test", "run_training_time"]:
    plot_metric(df, metric, epoch_filter=1)   # change epoch value as needed


In [8]:
import pandas as pd
import matplotlib.pyplot as plt

# Load summary data
df = pd.read_csv("report/log_summary.csv")

def plot_metric_by_batch(df, metric, batch_size_filter=1024, out_dir=None):
    """Plot one metric against epoch count for runs with a given batch size.

    One line is drawn per value of the ``activation`` column. The chart is
    saved as ``<out_dir>/<metric>_batch<batch_size_filter>.png``.

    Args:
        df: Summary frame; the columns ``batch_size``, ``activation``,
            ``epochs`` and ``metric`` are read.
        metric: Name of the column plotted on the y axis.
        batch_size_filter: Only rows whose ``batch_size`` equals this value
            are plotted.
        out_dir: Folder the PNG is written to; defaults to ``charts/result``.
            Created if missing.
    """
    # Imported locally because this cell's own import block does not bring
    # in `os`; without this the function only works if an earlier cell
    # happened to import it.
    import os

    if out_dir is None:
        out_dir = os.path.join("charts", "result")

    # Filter by batch_size
    data = df[df["batch_size"] == batch_size_filter]

    fig, ax = plt.subplots(figsize=(10, 6))

    # Plot each activation as separate line
    for activation, group in data.groupby("activation"):
        # Sort by epochs so the line runs left to right
        group = group.sort_values("epochs")
        ax.plot(
            group["epochs"],
            group[metric],
            marker="o",
            linestyle="-",
            label=activation,
        )

    ax.set_title(f"{metric} vs epochs (batch_size={batch_size_filter})")
    ax.set_xlabel("Epochs")
    ax.set_ylabel(metric.replace("_", " ").title())
    # Only add a legend when something was drawn; with no matching rows
    # matplotlib would otherwise warn about a legend without entries.
    if ax.lines:
        ax.legend(title="Activation")
    ax.grid(True, linestyle="--", alpha=0.6)

    os.makedirs(out_dir, exist_ok=True)
    fig.savefig(os.path.join(out_dir, f"{metric}_batch{batch_size_filter}.png"))
    plt.close(fig)

# Example usage:
for metric in ["rmse_train", "rmse_test", "run_training_time"]:
    plot_metric_by_batch(df, metric, batch_size_filter=1024)  # default batch_size
