## Preliminary preparations

In [1]:
from datetime import datetime
import os
from pathlib import Path

import pandas as pd
import numpy as np
import re
import plotly.graph_objects as go

pd.set_option('display.max_rows', 100)

In [2]:
def find_project_root() -> Path:
    current_dir = Path(os.getcwd())
    if (current_dir / "pyproject.toml").exists():
        return current_dir
    for parent in current_dir.parents:
        if (parent / "pyproject.toml").exists():
            return parent

In [3]:
project_root = find_project_root()
devnet_log_path = project_root / 'resource' / 'task' / 'solana_node_tps' /'transaction-only-128GB-2024-10-07-06-55-38-mainnet-beta.log'

## Preparing Data Source

In [4]:
data = []

with open(devnet_log_path, 'r') as file:
    for line in file:
        if "datapoint: replay-slot-stats" in line:

            timestamp_match = re.search(r"\[(.*?)Z", line)
            transactions_match = re.search(r"total_transactions=(\d+)i", line)
            execute_us_match = re.search(r"execute_us=(\d+)i", line)
            
            if timestamp_match and transactions_match and execute_us_match:
                timestamp = timestamp_match.group(1)
                total_transactions_executed = int(transactions_match.group(1))
                execute_us = int(execute_us_match.group(1))

                data.append([timestamp, total_transactions_executed, execute_us])

df = pd.DataFrame(data, columns=['timestamp', 'total_transactions_executed', 'execute_us'])
df

Unnamed: 0,timestamp,total_transactions_executed,execute_us
0,2024-10-07T09:32:51.345610800,1019,753235
1,2024-10-07T09:32:56.351082564,1072,587296
2,2024-10-07T09:32:57.397677861,2215,643038
3,2024-10-07T09:32:59.756236618,1684,667559
4,2024-10-07T09:33:01.285146452,1389,466778
...,...,...,...
8676,2024-10-07T12:08:12.617945334,1050,4712362
8677,2024-10-07T12:08:22.825841644,1153,10593918
8678,2024-10-07T12:08:24.258412322,1685,2843369
8679,2024-10-07T12:08:26.165700311,1039,1372793


In [5]:
def convert_to_unixtime(timestamp):
    base_time, microseconds = timestamp.split('.')
    microseconds = microseconds[:6]
    dt = datetime.strptime(base_time, '%Y-%m-%dT%H:%M:%S')
    return int(dt.timestamp() * 1_000_000) + int(microseconds)

In [6]:
df['end'] = df['timestamp'].apply(convert_to_unixtime)
df['start'] = df['end'] - df['execute_us']
df['tps'] = df['total_transactions_executed'] * 1000000 / df['execute_us']
df


Unnamed: 0,timestamp,total_transactions_executed,execute_us,end,start,tps
0,2024-10-07T09:32:51.345610800,1019,753235,1728261171345610,1728261170592375,1352.831454
1,2024-10-07T09:32:56.351082564,1072,587296,1728261176351082,1728261175763786,1825.314662
2,2024-10-07T09:32:57.397677861,2215,643038,1728261177397677,1728261176754639,3444.586479
3,2024-10-07T09:32:59.756236618,1684,667559,1728261179756236,1728261179088677,2522.623468
4,2024-10-07T09:33:01.285146452,1389,466778,1728261181285146,1728261180818368,2975.718650
...,...,...,...,...,...,...
8676,2024-10-07T12:08:12.617945334,1050,4712362,1728270492617945,1728270487905583,222.818196
8677,2024-10-07T12:08:22.825841644,1153,10593918,1728270502825841,1728270492231923,108.836032
8678,2024-10-07T12:08:24.258412322,1685,2843369,1728270504258412,1728270501415043,592.606869
8679,2024-10-07T12:08:26.165700311,1039,1372793,1728270506165700,1728270504792907,756.851179


In [7]:
start_time = df.start.min() // 1000000 * 1000000
end_time = (df.start.max() + 1000000) // 1000000 * 1000000
time_step = 100000

time_series = pd.DataFrame({
    't': np.arange(start_time, end_time + time_step, time_step)
})

def aggregate_tps(row, df):
    mask = (df['start'] <= row['t']) & (df['end'] >= row['t'])
    return df.loc[mask, 'tps'].sum()

time_series['sum_tps'] = time_series.apply(aggregate_tps, axis=1, df=df)
# window=10 means creating a moving average with the past 1 second's data. If you change it to 100, it would be the past 10 seconds
time_series['moving_average'] = time_series['sum_tps'].rolling(window=10, min_periods=1).mean()
time_series.to_csv('tps_128gb.csv', index=False)


In [10]:
# Display every 1 seconds
filtered_time_series = time_series[time_series['t'] % 1000000 == 0]
# from micro sec to sec
filtered_time_series['t'] = np.floor(filtered_time_series['t'] / 1000000)
filtered_time_series



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,t,sum_tps,moving_average
0,1.728261e+09,0.000000,0.000000
10,1.728261e+09,1352.831454,676.415727
20,1.728261e+09,0.000000,405.849436
30,1.728261e+09,0.000000,0.000000
40,1.728261e+09,0.000000,0.000000
...,...,...,...
93320,1.728271e+09,701.442901,464.400153
93330,1.728271e+09,592.606869,679.675695
93340,1.728271e+09,592.606869,592.606869
93350,1.728271e+09,756.851179,345.576727


In [12]:
fig = go.Figure()


fig.add_trace(go.Scatter(x=filtered_time_series['t'], y=filtered_time_series['sum_tps'], mode='markers', name='TPS', marker=dict(size=2)))
fig.add_trace(go.Scatter(x=filtered_time_series['t'], y=filtered_time_series['moving_average'], mode='lines', name='TPS Moving Average(1sec)', line=dict(width=0.5)))

fig.update_layout(
    title="TPS Over Time",
    xaxis_title="Epoch Time(sec)",
    yaxis_title="TPS",
    xaxis=dict(tickformat='%d'),
    yaxis=dict(range=[0, 10000])
)

fig.show()


## Notes
- If all data points are plotted on the graph, the points and lines become indistinguishable, so I plot them every 1 seconds.
- TPS values above 10000 are excluded as outliers.
- Upon reviewing tps_128gb.csv, it was found that there are quite a few areas where tps is 0. As a result, the moving average is also fluctuating.
