In [None]:
from utils import validate_repeated_blocks, group_by_repeated_blocks, merge_named_columns, csv_to_tensor
import pandas as pd
import os

: 

In [None]:
# When need_filter is False, every result CSV found in folder_path is loaded.
need_filter = False
# Folder containing the result CSV files.
folder_path = "result/deepseek_r1_distill_qwen_7b/base_expressions"

# Paths of the files to keep; **only consulted when need_filter is True**.
# The loading cell compares os.path.join(folder_path, file_name) against this
# collection, so entries must be written in that joined form.
# NOTE(review): `{}` creates an empty dict, not a set; the `in` test checks its keys.
filter_paths = {}

# Block size used for grouping and aggregation (number of generations).
k = 5
# Minimum number of rows a result file must have to be considered.
len_filter = 200
# Aggregation function to apply to each listed column.
agg_funcs = {"Num of thought Tokens": "min"}


# Columns that get aggregated and plotted: the keys of agg_funcs.
filter_columns = list(agg_funcs.keys())
# Name of the column passed to the validation and grouping helpers.
EXPRESSION_COLUMN = "Expression"

In [None]:



# Collect the result CSVs in a deterministic order. os.listdir() returns
# entries in arbitrary order, which would make the order of `dfs` (and of
# everything derived from it) vary between machines and runs.
file_names = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))

dfs = []
for file_name in file_names:
    file_path = os.path.join(folder_path, file_name)
    # With filtering enabled, keep only the files listed in filter_paths.
    if need_filter and (file_path not in filter_paths):
        continue
    df = pd.read_csv(file_path)
    # Skip files that have fewer than len_filter rows.
    if len(df) < len_filter:
        continue
    dfs.append((file_name, df))
    print(f"{file_name}:", len(df))

In [None]:
# Validate every loaded frame with the project helper before grouping.
# `k` and EXPRESSION_COLUMN are taken from the configuration cell rather than
# being redefined here, so changing the configuration cannot be silently
# overridden by this cell.
# NOTE(review): presumably the helper checks that each expression occupies a
# block of k consecutive rows - confirm in utils.validate_repeated_blocks.
for name, df in dfs:
    validate_repeated_blocks(df, column=EXPRESSION_COLUMN, k=k)

In [None]:
# Keep only the columns being aggregated plus the expression column, then
# run the grouping helper on each loaded file, pairing the result with its name.
remain_cols = [*filter_columns, EXPRESSION_COLUMN]
eval_df = [
    (
        file_name,
        group_by_repeated_blocks(frame[remain_cols], k, EXPRESSION_COLUMN, agg_funcs),
    )
    for file_name, frame in dfs
]

In [None]:
import numpy as np

import matplotlib.pyplot as plt

def plot_column_values(df, column_name, show_trend=True, trend_degree=1):
    """Draw a bar chart of one DataFrame column, optionally with a trend line.

    The bars are placed at positions 0..len(df)-1 and labelled with the
    DataFrame index. When a trend is requested and there are more rows than
    ``trend_degree``, a least-squares polynomial of that degree is fitted to
    the values, drawn in red, and its R² is annotated in the top-right corner.

    Args:
        df: DataFrame holding the values to plot.
        column_name: Name of the column to plot; values are cast to float.
        show_trend: Whether to fit and draw the polynomial trend line.
        trend_degree: Degree of the fitted polynomial.

    Raises:
        ValueError: If ``column_name`` is not a column of ``df``.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame")

    x_pos = np.arange(len(df))
    y = df[column_name].astype(float).values

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.bar(x_pos, y, alpha=0.7)

    trend_drawn = False
    if show_trend and len(df) > trend_degree:
        coeffs = np.polyfit(x_pos, y, trend_degree)
        trend = np.polyval(coeffs, x_pos)
        ax.plot(x_pos, trend, color='red', linewidth=2, label=f"Trend (deg {trend_degree})")
        trend_drawn = True

        # Coefficient of determination; undefined (NaN) when all values are equal.
        ss_res = np.sum((y - trend) ** 2)
        ss_tot = np.sum((y - y.mean()) ** 2)
        r2 = 1 - ss_res / ss_tot if ss_tot != 0 else float("nan")
        # "\u00b2" is the superscript-two character, written as an escape so the
        # label cannot be corrupted by a file re-encoding.
        ax.annotate(f"R\u00b2 = {r2:.3f}", xy=(0.98, 0.98), xycoords="axes fraction",
                    ha="right", va="top", fontsize=9,
                    bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="0.8"))

    ax.set_title(f"{column_name} values by index")
    ax.set_xlabel("Index")
    ax.set_ylabel(column_name)
    ax.set_xticks(x_pos)
    ax.set_xticklabels(df.index.astype(str), rotation=45, ha='right')
    # Only the trend line carries a label; calling legend() without it would
    # emit a "no artists with labels" warning and draw nothing useful.
    if trend_drawn:
        ax.legend()
    fig.tight_layout()
    plt.show()


In [None]:
# Draw one chart per (aggregated column, result file) pair: the outer loop walks
# the aggregated columns, the inner loop walks the grouped frames in eval_df.
for column_name in filter_columns:
    for name, df in eval_df:
        # Print the file name so each figure can be matched to its source file.
        print(f"Plotting for {name}")
        plot_column_values(df, column_name)

In [None]:
# Combine the per-file grouped frames in eval_df into one object via the project
# helper, passing the names of the aggregated columns.
# NOTE(review): the layout of the result is defined by utils.merge_named_columns - confirm there.
general_df = merge_named_columns(eval_df, filter_columns)
# Preview the result with head(5).
general_df.head(5)

: 

: 

In [None]:
# Convert the merged table into a tensor using the project helper.
# NOTE(review): the later `.mean(dim=1)` call suggests a torch tensor - confirm in utils.csv_to_tensor.
tensor_data = csv_to_tensor(general_df)
# Show the tensor's shape as the cell output.
tensor_data.shape

In [None]:
# Average tensor_data over dim 1.
row_mean = tensor_data.mean(dim=1)
# Difference between consecutive entries of row_mean (entry i+1 minus entry i),
# so diffs has one fewer entry than row_mean.
diffs = row_mean[1:] - row_mean[:-1]
print(len(diffs))
diffs