## Preliminary preparations

In [1]:
from datetime import datetime
import os
from pathlib import Path

import pandas as pd
import numpy as np
import re
import plotly.graph_objects as go

pd.set_option('display.max_rows', 100)

In [2]:
def find_project_root() -> Path:
    current_dir = Path(os.getcwd())
    if (current_dir / "pyproject.toml").exists():
        return current_dir
    for parent in current_dir.parents:
        if (parent / "pyproject.toml").exists():
            return parent

In [3]:
project_root = find_project_root()
devnet_log_path = project_root / 'resource' / 'task' / 'solana_node_tps' /'transaction-only-512GB-2024-10-03-01-13-56-mainnet-beta.log'

## Preparing Data Source

In [4]:
data = []

with open(devnet_log_path, 'r') as file:
    for line in file:
        if "datapoint: replay-slot-stats" in line:

            timestamp_match = re.search(r"\[(.*?)Z", line)
            transactions_match = re.search(r"total_transactions=(\d+)i", line)
            execute_us_match = re.search(r"execute_us=(\d+)i", line)
            
            if timestamp_match and transactions_match and execute_us_match:
                timestamp = timestamp_match.group(1)
                total_transactions_executed = int(transactions_match.group(1))
                execute_us = int(execute_us_match.group(1))

                data.append([timestamp, total_transactions_executed, execute_us])

df = pd.DataFrame(data, columns=['timestamp', 'total_transactions_executed', 'execute_us'])
df

Unnamed: 0,timestamp,total_transactions_executed,execute_us
0,2024-10-03T01:54:13.341674791,1678,752062
1,2024-10-03T01:54:14.354422787,1363,400092
2,2024-10-03T01:54:17.071737710,1429,760567
3,2024-10-03T01:54:18.092171846,1653,710464
4,2024-10-03T01:54:19.128107820,1072,775704
...,...,...,...
75659,2024-10-03T13:13:54.272361862,1208,696691
75660,2024-10-03T13:13:54.692741070,1032,679144
75661,2024-10-03T13:13:54.968330530,1876,249994
75662,2024-10-03T13:13:55.303605062,1655,420106


In [5]:
def convert_to_unixtime(timestamp):
    base_time, microseconds = timestamp.split('.')
    microseconds = microseconds[:6]
    dt = datetime.strptime(base_time, '%Y-%m-%dT%H:%M:%S')
    return int(dt.timestamp() * 1_000_000) + int(microseconds)

In [6]:
df['end'] = df['timestamp'].apply(convert_to_unixtime)
df['start'] = df['end'] - df['execute_us']
df['tps'] = df['total_transactions_executed'] * 1000000 / df['execute_us']
df


Unnamed: 0,timestamp,total_transactions_executed,execute_us,end,start,tps
0,2024-10-03T01:54:13.341674791,1678,752062,1727888053341674,1727888052589612,2231.199023
1,2024-10-03T01:54:14.354422787,1363,400092,1727888054354422,1727888053954330,3406.716455
2,2024-10-03T01:54:17.071737710,1429,760567,1727888057071737,1727888056311170,1878.861428
3,2024-10-03T01:54:18.092171846,1653,710464,1727888058092171,1727888057381707,2326.648500
4,2024-10-03T01:54:19.128107820,1072,775704,1727888059128107,1727888058352403,1381.970442
...,...,...,...,...,...,...
75659,2024-10-03T13:13:54.272361862,1208,696691,1727928834272361,1727928833575670,1733.910729
75660,2024-10-03T13:13:54.692741070,1032,679144,1727928834692741,1727928834013597,1519.559917
75661,2024-10-03T13:13:54.968330530,1876,249994,1727928834968330,1727928834718336,7504.180100
75662,2024-10-03T13:13:55.303605062,1655,420106,1727928835303605,1727928834883499,3939.481940


In [33]:
start_time = df.start.min() // 1000000 * 1000000
end_time = (df.start.max() + 1000000) // 1000000 * 1000000
time_step = 100000

time_series = pd.DataFrame({
    't': np.arange(start_time, end_time + time_step, time_step)
})

def aggregate_tps(row, df):
    mask = (df['start'] <= row['t']) & (df['end'] >= row['t'])
    return df.loc[mask, 'tps'].sum()

time_series['sum_tps'] = time_series.apply(aggregate_tps, axis=1, df=df)
# window=10 means creating a moving average with the past 1 second's data. If you change it to 100, it would be the past 10 seconds
time_series['moving_average'] = time_series['sum_tps'].rolling(window=10, min_periods=1).mean()
time_series.to_csv('tps_512gb.csv', index=False)


In [34]:
# Display every 10 seconds
filtered_time_series = time_series[time_series['t'] % 10000000 == 0]
# from micro sec to sec
filtered_time_series['t'] = np.floor(filtered_time_series['t'] / 1000000)
filtered_time_series



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,t,sum_tps,moving_average
80,1.727888e+09,1188.275010,1794.771317
180,1.727888e+09,8287.097902,4850.751297
280,1.727888e+09,1616.248654,1082.740602
380,1.727888e+09,2704.963536,2290.201788
480,1.727888e+09,2636.993569,1318.496784
...,...,...,...
407380,1.727929e+09,5631.727022,4714.706313
407480,1.727929e+09,5528.554517,3502.560315
407580,1.727929e+09,1571.731639,3224.549585
407680,1.727929e+09,3267.847100,2874.703518


In [40]:
fig = go.Figure()


fig.add_trace(go.Scatter(x=filtered_time_series['t'], y=filtered_time_series['sum_tps'], mode='markers', name='TPS', marker=dict(size=2)))
fig.add_trace(go.Scatter(x=filtered_time_series['t'], y=filtered_time_series['moving_average'], mode='lines', name='TPS Moving Average(1sec)', line=dict(width=0.5)))

fig.update_layout(
    title="TPS Over Time",
    xaxis_title="Epoch Time(sec)",
    yaxis_title="TPS",
    xaxis=dict(tickformat='%d'),
    yaxis=dict(range=[0, 15000])
)

fig.show()


# ## Notes
# - If all data points are plotted on the graph, the points and lines become indistinguishable, so I plot them every 10 seconds.
# - TPS values above 15000 are excluded as outliers.
