# Analysis of Solana Validator Memory Consumption

## Preliminary Preparations

In [31]:
from pathlib import Path
import os
import pandas as pd
import re
import pygwalker as pyg
from datetime import datetime

# Cap DataFrame previews at 100 rows; longer frames are shown truncated
pd.options.display.max_rows = 100

In [38]:
def find_project_root(start: "os.PathLike | str | None" = None) -> Path:
    """Locate the project root: the nearest directory containing ``pyproject.toml``.

    The search begins at ``start`` and walks upward through each parent
    directory until a ``pyproject.toml`` is found.

    Args:
        start: Directory to begin searching from. Defaults to the current
            working directory when omitted.

    Returns:
        The first directory, from ``start`` upward, that holds a
        ``pyproject.toml``.

    Raises:
        FileNotFoundError: If neither ``start`` nor any of its ancestors
            contains a ``pyproject.toml``.
    """
    origin = Path.cwd() if start is None else Path(start).resolve()
    for candidate in (origin, *origin.parents):
        if (candidate / "pyproject.toml").exists():
            return candidate
    # Fail loudly here rather than returning None, which would only surface
    # later as an opaque TypeError when the caller joins a path onto it.
    raise FileNotFoundError(
        f"No pyproject.toml found in {origin} or any of its parent directories"
    )

# Both input logs are read from the project root directory
project_root = find_project_root()
pidstat_log_path = project_root.joinpath('mainnet-beta-memory.log')
solana_log_path = project_root.joinpath('mainnet-beta-concise.log')

## Preparing Logs for Visualization

### Memory

In [40]:
# Function to process the memory log file with the provided date
def process_pidstat_log_with_date(log_path, year, month, day):
    """Parse a pidstat memory report into a DataFrame with full timestamps.

    Each data line of the log carries only a time of day (``HH:MM:SS``), so
    the calendar date of the first sample is supplied by the caller. When the
    time of day moves backwards from one sample to the next, the capture is
    taken to have crossed midnight and the date is advanced by one day, which
    keeps the timestamps monotonic for captures spanning several days.

    Lines that do not look like data rows (banner, column headers, blank
    lines, ...) are skipped.

    Args:
        log_path: Path of the pidstat log file to read.
        year: Calendar year of the first sample.
        month: Calendar month (1-12) of the first sample.
        day: Day of the month of the first sample.

    Returns:
        A DataFrame with the columns ``timestamp``, ``UID``, ``PID``,
        ``minflt/s``, ``majflt/s``, ``VSZ``, ``RSS``, ``%MEM`` and
        ``Command``, one row per matching log line, in file order.

    Raises:
        ValueError: If ``year``/``month``/``day`` do not form a valid date, or
            a matching line carries an invalid time of day.
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timedelta

    data = {
        "timestamp": [],
        "UID": [],
        "PID": [],
        "minflt/s": [],
        "majflt/s": [],
        "VSZ": [],
        "RSS": [],
        "%MEM": [],
        "Command": []
    }

    # Regular expression to match the data lines
    data_pattern = re.compile(r'(\d{2}:\d{2}:\d{2})\s+(\d+)\s+(\d+)\s+([\d\.]+)\s+([\d\.]+)\s+(\d+)\s+(\d+)\s+([\d\.]+)\s+(\S+)')

    first_day = datetime(year, month, day)
    days_elapsed = 0
    previous_time = None

    # Stream the file line by line instead of loading it whole.
    with open(log_path, 'r') as f:
        for raw_line in f:
            match = data_pattern.match(raw_line.strip())
            if match is None:
                continue

            time_of_day = datetime.strptime(match.group(1), "%H:%M:%S").time()
            # Several PIDs share one sample time, so only a strictly earlier
            # time of day signals that the capture rolled over midnight.
            if previous_time is not None and time_of_day < previous_time:
                days_elapsed += 1
            previous_time = time_of_day

            sample_day = first_day + timedelta(days=days_elapsed)
            data["timestamp"].append(datetime.combine(sample_day, time_of_day))
            data["UID"].append(int(match.group(2)))
            data["PID"].append(int(match.group(3)))
            data["minflt/s"].append(float(match.group(4)))
            data["majflt/s"].append(float(match.group(5)))
            data["VSZ"].append(int(match.group(6)))
            data["RSS"].append(int(match.group(7)))
            data["%MEM"].append(float(match.group(8)))
            data["Command"].append(match.group(9))

    return pd.DataFrame(data)

# Calendar date of the pidstat capture (the log lines themselves only record a time of day)
year, month, day = 2024, 8, 12

df_pidstat_log = process_pidstat_log_with_date(pidstat_log_path, year, month, day)

# Bare expression: renders a preview of the parsed memory samples as the cell output
df_pidstat_log

Unnamed: 0,timestamp,UID,PID,minflt/s,majflt/s,VSZ,RSS,%MEM,Command
0,2024-08-12 02:34:48,1001,9414,0.2,0.0,487464,20684,0.00,agave-validator
1,2024-08-12 02:34:48,1001,25702,40.4,0.0,4883500,280224,0.05,agave-validator
2,2024-08-12 02:34:53,1001,9414,0.2,0.0,487464,20684,0.00,agave-validator
3,2024-08-12 02:34:53,1001,25702,52.0,0.0,4883500,279988,0.05,agave-validator
4,2024-08-12 02:34:58,1001,9414,0.2,0.0,487464,20684,0.00,agave-validator
...,...,...,...,...,...,...,...,...,...
2195,2024-08-12 04:46:48,1001,25702,368751.0,715.4,585685364,374477644,72.18,agave-validator
2196,2024-08-12 04:46:53,1001,25702,308153.8,1.2,585685324,377546728,72.77,agave-validator
2197,2024-08-12 04:46:58,1001,25702,307113.8,98.6,585548136,380454464,73.33,agave-validator
2198,2024-08-12 04:47:03,1001,25702,275615.2,39.8,585549748,382533796,73.73,agave-validator


### Program Cache

In [36]:
# Function to remove ANSI escape codes
# Compiled once, not on every call. Two alternatives:
#   * a two-byte escape: ESC followed by one byte in @-Z or \-_ (i.e. not '[')
#   * a CSI sequence: ESC [ + parameter bytes + intermediate bytes + final byte
_ANSI_ESCAPE_RE = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')


def remove_ansi_escape_codes(text):
    """Return ``text`` with ANSI escape sequences stripped out.

    Handles both CSI sequences (e.g. the ``ESC[32m`` / ``ESC[0m`` colour codes
    found in terminal logs) and two-byte escapes. A two-byte escape removes
    exactly those two bytes, so the ordinary character that follows it is
    preserved.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without escape sequences; text containing none is
        returned unchanged.
    """
    return _ANSI_ESCAPE_RE.sub('', text)

# Helper: turn a log timestamp such as '2024-08-12T03:01:52.951977123Z' into a datetime
def _parse_solana_log_timestamp(raw):
    """Parse a ``YYYY-MM-DDTHH:MM:SS[.fraction]Z`` string into a naive datetime.

    ``strptime``'s ``%f`` accepts at most six fractional digits, so the
    fraction is truncated to microseconds whatever its original precision;
    a timestamp without a fractional part is accepted too. The ``Z`` suffix
    is dropped, so the result carries no timezone information.
    """
    seconds, dot, fraction = raw.replace('Z', '').partition('.')
    if not dot:
        return datetime.strptime(seconds, "%Y-%m-%dT%H:%M:%S")
    return datetime.strptime(f"{seconds}.{fraction[:6]}", "%Y-%m-%dT%H:%M:%S.%f")


# Function to process the Solana log file with cleaned timestamps
def process_solana_log(log_path):
    """Extract ``loaded-programs-cache-stats`` datapoints from a validator log.

    Every line is stripped of ANSI escape codes and searched for a
    ``datapoint: loaded-programs-cache-stats`` record; lines without one are
    ignored. The timestamp is taken from the first whitespace-separated token
    inside the leading ``[...]`` section of the line.

    Args:
        log_path: Path of the log file to read.

    Returns:
        A DataFrame with a ``timestamp`` column followed by one integer column
        per counter (``slot``, ``hits``, ``misses``, ``evictions``,
        ``reloads``, ``insertions``, ``lost_insertions``, ``replace_entry``,
        ``one_hit_wonders``, ``prunes_orphan``, ``prunes_environment``,
        ``empty_entries``), one row per matching line, in file order.

    Raises:
        ValueError: If a matching line carries a timestamp that cannot be
            parsed.
    """
    # Counter names, in the order their capture groups appear in the pattern.
    counter_fields = [
        "slot",
        "hits",
        "misses",
        "evictions",
        "reloads",
        "insertions",
        "lost_insertions",
        "replace_entry",
        "one_hit_wonders",
        "prunes_orphan",
        "prunes_environment",
        "empty_entries",
    ]
    data = {"timestamp": [], **{name: [] for name in counter_fields}}

    data_pattern = re.compile(
        r'\[(.*?)\]\s+datapoint: loaded-programs-cache-stats '
        r'slot=(\d+)i hits=(\d+)i misses=(\d+)i evictions=(\d+)i '
        r'reloads=(\d+)i insertions=(\d+)i lost_insertions=(\d+)i '
        r'replace_entry=(\d+)i one_hit_wonders=(\d+)i prunes_orphan=(\d+)i '
        r'prunes_environment=(\d+)i empty_entries=(\d+)i'
    )

    # Stream the file line by line instead of loading it whole.
    with open(log_path, 'r') as f:
        for raw_line in f:
            match = data_pattern.search(remove_ansi_escape_codes(raw_line.strip()))
            if not match:
                continue

            # Group 1 is the whole bracketed header; its first token is the timestamp.
            header, *counters = match.groups()
            data["timestamp"].append(_parse_solana_log_timestamp(header.split()[0]))
            for name, value in zip(counter_fields, counters):
                data[name].append(int(value))

    return pd.DataFrame(data)

# Parse the validator log at solana_log_path into a DataFrame of program-cache statistics
df_solana_log = process_solana_log(solana_log_path)

# Bare expression: renders the DataFrame (not just its first rows) as the cell output
df_solana_log

Unnamed: 0,timestamp,slot,hits,misses,evictions,reloads,insertions,lost_insertions,replace_entry,one_hit_wonders,prunes_orphan,prunes_environment,empty_entries
0,2024-08-12 03:01:52.951977,283058530,0,0,0,0,9,0,0,0,0,0,0
1,2024-08-12 03:01:54.771757,283058531,3896,33,0,0,33,0,0,0,0,0,0
2,2024-08-12 03:01:55.250839,283058532,2319,48,0,0,48,0,0,0,0,0,0
3,2024-08-12 03:01:55.743699,283058533,4961,16,0,0,16,0,0,0,0,0,0
4,2024-08-12 03:01:56.113265,283058534,3733,2,0,0,2,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55924,2024-08-12 09:22:30.410411,283114854,4646,2,2,2,0,0,0,0,0,0,0
55925,2024-08-12 09:22:30.875129,283114855,4194,2,2,2,0,0,0,0,0,0,0
55926,2024-08-12 09:22:31.066838,283114856,1808,0,0,0,0,0,0,0,0,0,0
55927,2024-08-12 09:22:31.354771,283114857,2331,2,2,2,0,0,0,0,0,0,0


## Visualize

In [41]:
# Visualize the data using Pygwalker
# One pyg.walk call per DataFrame: program-cache statistics first, then the pidstat memory samples
pyg.walk(df_solana_log)
pyg.walk(df_pidstat_log)

Box(children=(HTML(value='\n<div id="ifr-pyg-0006212c4f75e0e8M8iErm0BulWfVKAP" style="height: auto">\n    <hea…

Box(children=(HTML(value='\n<div id="ifr-pyg-0006212c4f7b75a3STAcymq1KhC0Nwnp" style="height: auto">\n    <hea…

<pygwalker.api.pygwalker.PygWalker at 0x16c0c0710>