In [1]:
import pandas as pd
import numpy as np
import time
import os

def generate_count_df(rows, cols, min_val=0, max_val=100, seed=42):
    """Build a reproducible DataFrame filled with random integer counts.

    Parameters
    ----------
    rows, cols : int
        Number of rows and columns of the generated frame.
    min_val, max_val : int
        Inclusive lower and upper bounds of the sampled integers.
    seed : int
        Seed for the NumPy generator; identical arguments yield identical data.

    Returns
    -------
    pandas.DataFrame
        Frame of shape ``(rows, cols)`` with default integer labels.
    """
    generator = np.random.default_rng(seed)
    # The generator's upper bound is exclusive, so add 1 to keep max_val reachable.
    counts = generator.integers(low=min_val, high=max_val + 1, size=(rows, cols))
    return pd.DataFrame(counts)

def write_df_to_txt(df, filename="temp_df.txt"):
    """Dump ``df`` to ``filename`` as tab-separated text.

    Neither the column header nor the index is written, so the file holds
    only the cell values, one DataFrame row per line.
    """
    export_options = {"sep": "\t", "header": False, "index": False}
    df.to_csv(filename, **export_options)

def read_df_from_txt(filename="temp_df.txt"):
    """Load a headerless, tab-separated text file into a DataFrame.

    Every line of the file is treated as data (no header row), so the
    resulting columns get default integer labels.
    """
    parse_options = {"sep": "\t", "header": None}
    loaded = pd.read_csv(filename, **parse_options)
    return loaded

def profile_pandas_operations(df_row, df_col, label="DataFrame", n_iter=100, filename_prefix="temp_df",
                              io_iter=10, reshape_width=50):
    """Time a fixed set of pandas operations on two DataFrames and print the results.

    ``df_row`` is reduced along ``axis=1`` and ``df_col`` along ``axis=0``; both
    are also transposed, reshaped, written to a tab-separated file and read
    back. For each operation the mean and standard deviation of the wall-clock
    time are printed.

    Parameters
    ----------
    df_row, df_col : pandas.DataFrame
        The two frames to benchmark.
    label : str
        Name shown in the report header.
    n_iter : int
        Number of timed repetitions for the in-memory operations.
    filename_prefix : str
        Prefix of the temporary files ``<prefix>_row.txt`` and ``<prefix>_col.txt``.
    io_iter : int
        Number of timed repetitions for the file write/read operations
        (kept lower than ``n_iter`` by default because file I/O is slow).
    reshape_width : int
        Column count used by the reshape benchmark.

    Raises
    ------
    ValueError
        From NumPy, if a frame's element count is not divisible by
        ``reshape_width``.

    Notes
    -----
    The temporary files are removed even when one of the operations raises.
    """
    print(f"--- {label} Test ({df_row.shape[0]} x {df_row.shape[1]}) ---")
    print("DataFrame shapes:")
    print(f"- row-major: {df_row.shape}")
    print(f"- col-major: {df_col.shape}")

    row_file = f"{filename_prefix}_row.txt"
    col_file = f"{filename_prefix}_col.txt"

    # Dict order is execution order: the "write" entries must precede the
    # "read" entries so the files exist by the time they are read back.
    operations = {
        "row-major sum": lambda: df_row.sum(axis=1),
        "col-major sum": lambda: df_col.sum(axis=0),
        "row-major mean": lambda: df_row.mean(axis=1),
        "col-major mean": lambda: df_col.mean(axis=0),
        "row-major std": lambda: df_row.std(axis=1),
        "col-major std": lambda: df_col.std(axis=0),
        # NOTE(review): .T and .values.reshape may return views rather than
        # copies, in which case these timings mostly reflect object creation.
        "row-major transpose": lambda: df_row.T,
        "col-major transpose": lambda: df_col.T,
        "row-major reshape": lambda: pd.DataFrame(df_row.values.reshape(-1, reshape_width)),
        "col-major reshape": lambda: pd.DataFrame(df_col.values.reshape(-1, reshape_width)),
        "row-major write to txt": lambda: write_df_to_txt(df_row, row_file),
        "col-major write to txt": lambda: write_df_to_txt(df_col, col_file),
        "row-major read from txt": lambda: read_df_from_txt(row_file),
        "col-major read from txt": lambda: read_df_from_txt(col_file),
    }

    print("\n=== Multiple Operations Profiling ===")
    try:
        for op_name, operation in operations.items():
            is_file_io = "read" in op_name or "write" in op_name
            current_iter = io_iter if is_file_io else n_iter
            times = []
            for _ in range(current_iter):
                start = time.perf_counter()
                operation()
                end = time.perf_counter()
                times.append(end - start)
            print(f"{op_name:30s}: {np.mean(times):.6f} ± {np.std(times):.6f} seconds (mean ± std over {current_iter} runs)")
    finally:
        # Always clean up, so a failing operation does not leave large
        # temporary files behind.
        for file in (row_file, col_file):
            if os.path.exists(file):
                os.remove(file)

def run_all_profiles(sizes=None):
    """Run the pandas profiling suite for a series of DataFrame sizes.

    Parameters
    ----------
    sizes : dict[str, tuple[int, int]] or None
        Maps a label to ``(n_row, n_col)``. For every entry a ``n_row x n_col``
        frame and its ``n_col x n_row`` counterpart are generated and profiled.
        When ``None`` the built-in Small/Medium/Large table is used. Pass a
        smaller table for a quick run: the "Large" case creates two frames of
        500 million integers each (roughly 4 GB apiece if the integers are
        64-bit).
    """
    if sizes is None:
        sizes = {
            "Small": (10_000, 100),
            "Medium": (100_000, 500),
            "Large": (1_000_000, 500),
        }
    for label, (n_row, n_col) in sizes.items():
        df_row = generate_count_df(n_row, n_col)
        df_col = generate_count_df(n_col, n_row)
        profile_pandas_operations(df_row, df_col, label=label, filename_prefix=f"temp_{label.lower()}_df")
        # Drop the references now so this pair can be freed before the next,
        # larger pair is allocated.
        del df_row, df_col
        print("\n\n")

# Run the benchmark for every configured size; timings are printed to stdout.
run_all_profiles()

--- Small Test (10000 x 100) ---
DataFrame shapes:
- row-major: (10000, 100)
- col-major: (100, 10000)

=== Multiple Operations Profiling ===
row-major sum                 : 0.001716 ± 0.001589 seconds (mean ± std over 100 runs)
col-major sum                 : 0.001296 ± 0.000115 seconds (mean ± std over 100 runs)
row-major mean                : 0.002397 ± 0.000142 seconds (mean ± std over 100 runs)
col-major mean                : 0.002023 ± 0.000114 seconds (mean ± std over 100 runs)
row-major std                 : 0.023837 ± 0.053554 seconds (mean ± std over 100 runs)
col-major std                 : 0.016518 ± 0.027746 seconds (mean ± std over 100 runs)
row-major transpose           : 0.000074 ± 0.000050 seconds (mean ± std over 100 runs)
col-major transpose           : 0.000758 ± 0.000048 seconds (mean ± std over 100 runs)
row-major reshape             : 0.000019 ± 0.000008 seconds (mean ± std over 100 runs)
col-major reshape             : 0.000017 ± 0.000002 seconds (mean ± std ove

# legacy code

In [None]:
# One-off timing of a row-wise standard deviation on a square random matrix.
n_col = 10000
n_row = 10000
# NOTE(review): n_col is passed as the row count and n_row as the column
# count; this is harmless only while both values are equal.
data_rows = pd.DataFrame(np.random.rand(n_col, n_row))

# perf_counter is the clock intended for measuring short intervals.
start_time = time.perf_counter()
# Compute std for each feature (row-wise)
std_rows = data_rows.std(axis=1)
# Stop the clock before printing so console I/O is not part of the measurement.
elapsed_rows = time.perf_counter() - start_time
print(std_rows.shape)

print(f"Case 1 (features as rows): {elapsed_rows:.4f} seconds")