In [1]:
import pandas as pd
import numpy as np
import time

def generate_count_df(rows, cols, min_val=0, max_val=100, seed=42):
    """
    Build a reproducible ``rows`` x ``cols`` DataFrame of random integers.

    Values are drawn with ``numpy.random.Generator.integers`` from the
    inclusive range ``[min_val, max_val]`` using a generator seeded with
    ``seed``, so identical arguments always yield an identical frame.
    """
    shape = (rows, cols)
    # integers() excludes its upper bound, hence the +1 to make max_val reachable.
    upper_exclusive = max_val + 1
    values = np.random.default_rng(seed).integers(min_val, upper_exclusive, size=shape)
    return pd.DataFrame(values)

# small matrix

In [2]:
# Small benchmark inputs: the same number of elements laid out tall
# (n_row x n_col) and wide (n_col x n_row).
n_row = 10_000
n_col = 500
df1_row = generate_count_df(n_row, n_col)
df1_col = generate_count_df(n_col, n_row)

# Report which frames are about to be timed.
print(f"--- Small Matrix Test ({n_row} x {n_col}) ---")
print("DataFrame shapes:")
print(f"- df1_row: {df1_row.shape}")
print(f"- df1_col: {df1_col.shape}")

# Label -> zero-argument callable. The tall frame is reduced along axis=1 and
# the wide frame along axis=0, so both reductions run over the n_col-long
# dimension and yield n_row results.
operations = {
    # reductions
    "df1_row std": lambda: df1_row.std(axis=1),
    "df1_col std": lambda: df1_col.std(axis=0),
    "df1_row mean": lambda: df1_row.mean(axis=1),
    "df1_col mean": lambda: df1_col.mean(axis=0),
    "df1_row sum": lambda: df1_row.sum(axis=1),
    "df1_col sum": lambda: df1_col.sum(axis=0),
    # layout changes
    "df1_row transpose": lambda: df1_row.T,
    "df1_col transpose": lambda: df1_col.T,
    "df1_row reshape": lambda: pd.DataFrame(df1_row.values.reshape(-1, 50)),
    "df1_col reshape": lambda: pd.DataFrame(df1_col.values.reshape(-1, 50)),
}

print("\n=== Multiple Operations Profiling ===")
# Time every operation call by call, then summarise the per-call durations.
for label, run_op in operations.items():
    durations = []
    for _ in range(1000):
        t0 = time.perf_counter()
        run_op()
        durations.append(time.perf_counter() - t0)

    durations = np.asarray(durations)
    mean_time = durations.mean()
    std_time = durations.std()
    print(f"{label:17s}: {mean_time:.6f} ± {std_time:.6f} seconds (mean ± std over 1000 runs)")

--- Small Matrix Test (10000 x 500) ---
DataFrame shapes:
- df1_row: (10000, 500)
- df1_col: (500, 10000)

=== Multiple Operations Profiling ===
df1_row std      : 0.051562 ± 0.000987 seconds (mean ± std over 1000 runs)
df1_col std      : 0.048520 ± 0.001471 seconds (mean ± std over 1000 runs)
df1_row mean     : 0.006009 ± 0.000179 seconds (mean ± std over 1000 runs)
df1_col mean     : 0.005399 ± 0.000371 seconds (mean ± std over 1000 runs)
df1_row sum      : 0.003168 ± 0.000127 seconds (mean ± std over 1000 runs)
df1_col sum      : 0.003037 ± 0.000126 seconds (mean ± std over 1000 runs)
df1_row transpose: 0.000083 ± 0.000006 seconds (mean ± std over 1000 runs)
df1_col transpose: 0.000638 ± 0.000022 seconds (mean ± std over 1000 runs)
df1_row reshape  : 0.000014 ± 0.000006 seconds (mean ± std over 1000 runs)
df1_col reshape  : 0.000014 ± 0.000003 seconds (mean ± std over 1000 runs)


# large matrix

In [3]:
# Large benchmark inputs: ten times more rows than the small test, again laid
# out tall (n_row x n_col) and wide (n_col x n_row).
n_row = 100_000
n_col = 500

df2_row = generate_count_df(n_row, n_col)
df2_col = generate_count_df(n_col, n_row)


# Report the shapes of the frames being timed
print(f"--- Large Matrix Test ({n_row} x {n_col}) ---")
print("DataFrame shapes:")
print(f"- df2_row: {df2_row.shape}")
print(f"- df2_col: {df2_col.shape}")


# Label -> zero-argument callable. The tall frame is reduced along axis=1 and
# the wide frame along axis=0, so both reductions run over the n_col-long
# dimension and yield n_row results.
operations = {
    "df2_row std": lambda: df2_row.std(axis=1),
    "df2_col std": lambda: df2_col.std(axis=0),
    "df2_row mean": lambda: df2_row.mean(axis=1),
    "df2_col mean": lambda: df2_col.mean(axis=0),
    "df2_row sum": lambda: df2_row.sum(axis=1),
    "df2_col sum": lambda: df2_col.sum(axis=0),
    "df2_row transpose": lambda: df2_row.T,
    "df2_col transpose": lambda: df2_col.T,
    "df2_row reshape": lambda: pd.DataFrame(df2_row.values.reshape(-1, 50)),
    "df2_col reshape": lambda: pd.DataFrame(df2_col.values.reshape(-1, 50)),
}

# Single source of truth for the repeat count, so the loop and the printed
# "over N runs" text cannot drift apart.
n_runs = 1000
# Pad every label to the longest one; a fixed width of 15 was shorter than
# "df2_row transpose" and left the results column misaligned.
label_width = max(len(name) for name in operations)

print("\n=== Multiple Operations Profiling ===")
# profile multiple DataFrame operations
for op_name, operation in operations.items():
    # Measure individual run times
    times = []
    for _ in range(n_runs):
        start_time = time.perf_counter()
        operation()
        end_time = time.perf_counter()
        times.append(end_time - start_time)

    mean_time = np.mean(times)
    std_time = np.std(times)
    print(f"{op_name:{label_width}s}: {mean_time:.6f} ± {std_time:.6f} seconds (mean ± std over {n_runs} runs)")

--- Large Matrix Test (100000 x 500) ---
DataFrame shapes:
- df2_row: (100000, 500)
- df2_col: (500, 100000)

=== Multiple Operations Profiling ===
df2_row std    : 1.079865 ± 0.524880 seconds (mean ± std over 1000 runs)
df2_col std    : 0.600393 ± 0.177012 seconds (mean ± std over 1000 runs)
df2_row mean   : 0.067530 ± 0.000698 seconds (mean ± std over 1000 runs)
df2_col mean   : 0.070661 ± 0.007745 seconds (mean ± std over 1000 runs)
df2_row sum    : 0.046184 ± 0.000347 seconds (mean ± std over 1000 runs)
df2_col sum    : 0.052997 ± 0.000224 seconds (mean ± std over 1000 runs)
df2_row transpose: 0.000084 ± 0.000011 seconds (mean ± std over 1000 runs)
df2_col transpose: 0.005900 ± 0.000114 seconds (mean ± std over 1000 runs)
df2_row reshape: 0.000014 ± 0.000011 seconds (mean ± std over 1000 runs)
df2_col reshape: 0.000014 ± 0.000002 seconds (mean ± std over 1000 runs)


# legacy code

In [None]:
# One-shot timing of a row-wise std on random floats in [0, 1).
# n_col and n_row are not set here: they come from whichever benchmark cell
# above was executed last, so run one of those cells first.
# A seeded generator keeps the input reproducible, like the other benchmarks.
data_rows = pd.DataFrame(np.random.default_rng(42).random((n_col, n_row)))

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock and can be adjusted.
start_time = time.perf_counter()
# Compute std for each feature (row-wise)
std_rows = data_rows.std(axis=1)
# Stop the clock before printing so console I/O is not counted as compute time.
elapsed_rows = time.perf_counter() - start_time
print(std_rows.shape)

print(f"Case 1 (features as rows): {elapsed_rows:.4f} seconds")