## Preliminary preparations

In [1]:
from datetime import datetime
import os
from pathlib import Path

import pandas as pd
import numpy as np
import re
import plotly.graph_objects as go

pd.set_option('display.max_rows', 100)

In [2]:
def find_project_root() -> Path:
    current_dir = Path(os.getcwd())
    if (current_dir / "pyproject.toml").exists():
        return current_dir
    for parent in current_dir.parents:
        if (parent / "pyproject.toml").exists():
            return parent

In [3]:
# project_root = find_project_root()
# devnet_log_path = project_root / 'resource' / 'task' / 'solana_node_tps' /'transaction-only-256-GB-2024-10-05-06-47-36-mainnet-beta.log'

## Preparing Data Source

In [4]:
data = []

with open("../logs/256G/transaction-only-256-GB-2024-10-05-06-47-36-mainnet-beta.log", 'r') as file:
    for line in file:
        if "datapoint: replay-slot-stats" in line:

            timestamp_match = re.search(r"\[(.*?)Z", line)
            transactions_match = re.search(r"total_transactions=(\d+)i", line)
            execute_us_match = re.search(r"execute_us=(\d+)i", line)
            
            if timestamp_match and transactions_match and execute_us_match:
                timestamp = timestamp_match.group(1)
                total_transactions_executed = int(transactions_match.group(1))
                execute_us = int(execute_us_match.group(1))

                data.append([timestamp, total_transactions_executed, execute_us])

df = pd.DataFrame(data, columns=['timestamp', 'total_transactions_executed', 'execute_us'])
df

Unnamed: 0,timestamp,total_transactions_executed,execute_us
0,2024-10-05T08:39:10.918597354,1272,759070
1,2024-10-05T08:39:13.603789519,1775,647401
2,2024-10-05T08:39:15.928884370,483,809528
3,2024-10-05T08:39:17.953903404,544,728594
4,2024-10-05T08:39:19.405022172,661,792794
...,...,...,...
11232,2024-10-05T11:23:16.406028395,620,849752
11233,2024-10-05T11:23:32.273212099,1258,4832707
11234,2024-10-05T11:23:49.760967447,1302,17468517
11235,2024-10-05T11:24:21.534058598,1934,27342407


In [5]:
def convert_to_unixtime(timestamp):
    base_time, microseconds = timestamp.split('.')
    microseconds = microseconds[:6]
    dt = datetime.strptime(base_time, '%Y-%m-%dT%H:%M:%S')
    return int(dt.timestamp() * 1_000_000) + int(microseconds)

In [6]:
df['end'] = df['timestamp'].apply(convert_to_unixtime)
df['start'] = df['end'] - df['execute_us']
df['tps'] = df['total_transactions_executed'] * 1000000 / df['execute_us']
df


Unnamed: 0,timestamp,total_transactions_executed,execute_us,end,start,tps
0,2024-10-05T08:39:10.918597354,1272,759070,1728142750918597,1728142750159527,1675.734781
1,2024-10-05T08:39:13.603789519,1775,647401,1728142753603789,1728142752956388,2741.731940
2,2024-10-05T08:39:15.928884370,483,809528,1728142755928884,1728142755119356,596.643970
3,2024-10-05T08:39:17.953903404,544,728594,1728142757953903,1728142757225309,746.643535
4,2024-10-05T08:39:19.405022172,661,792794,1728142759405022,1728142758612228,833.760094
...,...,...,...,...,...,...
11232,2024-10-05T11:23:16.406028395,620,849752,1728152596406028,1728152595556276,729.624643
11233,2024-10-05T11:23:32.273212099,1258,4832707,1728152612273212,1728152607440505,260.309595
11234,2024-10-05T11:23:49.760967447,1302,17468517,1728152629760967,1728152612292450,74.534089
11235,2024-10-05T11:24:21.534058598,1934,27342407,1728152661534058,1728152634191651,70.732617


In [8]:
start_time = df.start.min() // 1000000 * 1000000
end_time = (df.start.max() + 1000000) // 1000000 * 1000000
time_step = 100000

time_series = pd.DataFrame({
    't': np.arange(start_time, end_time + time_step, time_step)
})

def aggregate_tps(row, df):
    mask = (df['start'] <= row['t']) & (df['end'] >= row['t'])
    return df.loc[mask, 'tps'].sum()

time_series['sum_tps'] = time_series.apply(aggregate_tps, axis=1, df=df)
# window=10 means creating a moving average with the past 1 second's data. If you change it to 100, it would be the past 10 seconds
time_series['moving_average'] = time_series['sum_tps'].rolling(window=10, min_periods=1).mean()
time_series.to_csv('tps_256gb.csv', index=False)


In [9]:
# Display every 10 seconds
filtered_time_series = time_series[time_series['t'] % 10000000 == 0]
# from micro sec to sec
filtered_time_series['t'] = np.floor(filtered_time_series['t'] / 1000000)
filtered_time_series

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_time_series['t'] = np.floor(filtered_time_series['t'] / 1000000)


Unnamed: 0,t,sum_tps,moving_average
0,1.728143e+09,0.000000,0.000000
100,1.728143e+09,0.000000,333.504038
200,1.728143e+09,1122.047580,826.354692
300,1.728143e+09,0.000000,0.000000
400,1.728143e+09,0.000000,965.122685
...,...,...,...
98700,1.728153e+09,74.534089,74.534089
98800,1.728153e+09,0.000000,52.173862
98900,1.728153e+09,70.732617,70.732617
99000,1.728153e+09,70.732617,70.732617


In [10]:
fig = go.Figure()


fig.add_trace(go.Scatter(x=filtered_time_series['t'], y=filtered_time_series['sum_tps'], mode='markers', name='TPS', marker=dict(size=2)))
fig.add_trace(go.Scatter(x=filtered_time_series['t'], y=filtered_time_series['moving_average'], mode='lines', name='TPS Moving Average(1sec)', line=dict(width=0.5)))

fig.update_layout(
    title="TPS Over Time",
    xaxis_title="Epoch Time(sec)",
    yaxis_title="TPS",
    xaxis=dict(tickformat='%d'),
    yaxis=dict(range=[0, 15000])
)

fig.show()


# ## Notes
# - If all data points are plotted on the graph, the points and lines become indistinguishable, so I plot them every 10 seconds.
# - TPS values above 15000 are excluded as outliers.
# - Upon reviewing tps_512gb.csv, it is observed that there are a significant number of ranges where tps is 0.
