In [28]:
import pandas as pd
from statsmodels.tsa.stattools import adfuller
import numpy as np
import warnings

def adf_test(series):
    """
    Run an Augmented Dickey-Fuller test on `series` with AIC lag selection.

    Returns a 3-tuple ``(adf_statistic, critical_value_5_percent, p_value)``.
    Every RuntimeWarning raised while the test runs is silenced. If the test
    fails with ValueError, ZeroDivisionError or numpy's LinAlgError, the
    fallback ``(nan, nan, 1.0)`` is returned instead of propagating the error.
    """
    # Fallback used when the test cannot be computed for this series.
    failure = (np.nan, np.nan, 1.0)

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        try:
            outcome = adfuller(series, autolag='AIC')
            # Position 0 is the test statistic, 1 the p-value and 4 the
            # mapping of critical values keyed by significance level.
            statistic, p_val, critical_values = outcome[0], outcome[1], outcome[4]
            return statistic, critical_values['5%'], p_val
        except (ValueError, ZeroDivisionError, np.linalg.LinAlgError):
            return failure

def calculate_stability(data, window_size=10):
    """
    Run the ADF test on every sliding window of every type's series.

    Parameters
    ----------
    data : sequence of sequences
        ``data[t]`` is the series for type ``t``. The number of timesteps is
        taken from ``data[0]``; the other series are not length-checked.
    window_size : int, default 10
        Number of consecutive timesteps in each window. Must be at least 1.

    Returns
    -------
    tuple of three pandas.DataFrame
        ``(adf_statistics, critical_values_5_percent, p_values)``. In each
        frame, row ``i`` describes the window starting at timestep ``i`` and
        column ``t`` describes type ``t``. All three frames are empty when
        `data` is empty or when no full window fits in the series.

    Raises
    ------
    ValueError
        If `window_size` is smaller than 1.
    """
    # A non-positive window would produce empty or nonsensical slices.
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1, got {window_size}")

    num_types = len(data)  # Number of types
    if num_types == 0:
        # Nothing to test; avoids indexing data[0] on an empty input.
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    num_timesteps = len(data[0])  # Number of timesteps

    adf_values = []
    t_scores_5_percent = []
    p_values = []

    for start in range(num_timesteps - window_size + 1):
        # One (statistic, critical value, p-value) triple per type for this window.
        window_results = [
            adf_test(data[type_index][start:start + window_size])
            for type_index in range(num_types)
        ]
        # Transpose the triples into one row per output frame.
        adf_row, t_score_row, p_value_row = zip(*window_results)
        adf_values.append(list(adf_row))
        t_scores_5_percent.append(list(t_score_row))
        p_values.append(list(p_value_row))

    return pd.DataFrame(adf_values), pd.DataFrame(t_scores_5_percent), pd.DataFrame(p_values)

def find_most_stable_timestep(p_value_df, top_n=5):
    """
    Rank timesteps by how stable the types are and return the best candidates.

    The stability score of a timestep is the median of the ADF p-values
    across all types (columns) in that row; a lower median is treated as
    more stable.

    Parameters
    ----------
    p_value_df : pandas.DataFrame
        One row per timestep and one column per type, holding ADF p-values.
        A 'total_stability' column with the per-row median is added to (or
        refreshed on) this frame in place.
    top_n : int, default 5
        How many of the lowest-scoring timesteps to return.

    Returns
    -------
    pandas.Series
        Up to `top_n` of the smallest 'total_stability' values in ascending
        order, indexed by the row label (timestep) they belong to.
    """
    # Leave out a 'total_stability' column left by an earlier call so the
    # median is always taken over the per-type p-values only.
    per_type_p_values = p_value_df.drop(columns='total_stability', errors='ignore')
    p_value_df['total_stability'] = per_type_p_values.median(axis=1)

    # Lowest median p-value first.
    return p_value_df['total_stability'].nsmallest(top_n)

In [29]:
# Simulation output to analyse, read from <path><filename>.csv.
filename = 'random_1'
path = "../blossom/hpc/outputs/"
df = pd.read_csv(path + filename + ".csv")

# Number of rows for every (type, tick) combination, indexed by (type, tick).
counts_per_type = df.value_counts(['type', 'tick'])

# Types are looked up by the labels 0..NUM_TYPES-1.
# NOTE(review): a type with no rows at all raises KeyError below — confirm every type appears in the file.
NUM_TYPES = 9

# Computed once rather than once per type.
# Assumes ticks are labelled 0..n-1 with no tick missing from the file — TODO confirm.
tick_index = range(len(df['tick'].unique()))

# data[t] is the per-tick row count for type t, with 0 where the type has no rows at that tick.
data = [
    counts_per_type[t].reindex(tick_index, fill_value=0).sort_index().to_list()
    for t in range(NUM_TYPES)
]

In [30]:
# Run the sliding-window ADF test (18-step windows) on every type's count series.
# Row i of each returned frame describes the window starting at timestep i; columns are types.
adf_df, t_score_5_percent_df, p_value_df = calculate_stability(data, window_size=18)

# Rank window starts by their median p-value across types (lower = more stable).
# This call also adds a 'total_stability' column to p_value_df.
most_stable_timestep = find_most_stable_timestep(p_value_df)
# Print the start timesteps of the five most stable windows, most stable first.
print(most_stable_timestep.index)

Index([83, 8, 129, 76, 10], dtype='int64')
