In [9]:
import os
import pandas as pd
import numpy as np
from scipy import stats
import warnings
from pathlib import Path

# --- Configuration ---

# 1. Project root: all inputs and outputs live under ~/kohv04/backtesting_final.
HOME_DIR = Path.home()
BASE_DIR = HOME_DIR.joinpath("kohv04", "backtesting_final")

# 2. Input locations. These are kept as plain strings (not Path objects)
#    because the functions below pass them to os.path / os.scandir helpers.
SIMULATION_RESULTS_DIR = str(BASE_DIR / "simulation_results")
METADATA_DIR = str(BASE_DIR / "metadata")
REGIME_FILE = os.path.join(METADATA_DIR, "all_regimes.csv")

# 3. Output locations for the Hypothesis 1 analysis.
OUTPUT_DIR = str(BASE_DIR / "hypothesis_testing")
AGGREGATED_TRADES_FILE = os.path.join(OUTPUT_DIR, "trade_level_regime_analysis.csv")
HYPOTHESIS_RESULTS_FILE = os.path.join(OUTPUT_DIR, "hypothesis_1.csv")

# 4. Strategy folder names grouped by the category each hypothesis refers to.
#    Any strategy name not listed here is labelled 'other' by test_hypotheses().
STRATEGY_MAP = {
    'breakout': ['Baseline_Breakout', 'Volume-Enhanced_Breakout', 'Deep_Learning_Breakout'],
    'mean_reversion': [
        'Baseline_Bollinger_Bands',
        'Volume-Enhanced_VWAP_Reversion',
        'Deep_Learning_VWAP_Reversion',
    ],
    'momentum': ['Baseline_Momentum', 'Volume-Enhanced_Momentum', 'Deep_Learning_Momentum'],
}

# Echo the resolved paths, one per line, so a reader can verify them at a glance.
print(
    HOME_DIR,
    BASE_DIR,
    SIMULATION_RESULTS_DIR,
    METADATA_DIR,
    REGIME_FILE,
    OUTPUT_DIR,
    AGGREGATED_TRADES_FILE,
    sep="\n",
)

/home/jupyter-kohv04@vse.cz
/home/jupyter-kohv04@vse.cz/kohv04/backtesting_final
/home/jupyter-kohv04@vse.cz/kohv04/backtesting_final/simulation_results
/home/jupyter-kohv04@vse.cz/kohv04/backtesting_final/metadata
/home/jupyter-kohv04@vse.cz/kohv04/backtesting_final/metadata/all_regimes.csv
/home/jupyter-kohv04@vse.cz/kohv04/backtesting_final/hypothesis_testing
/home/jupyter-kohv04@vse.cz/kohv04/backtesting_final/hypothesis_testing/trade_level_regime_analysis.csv


# Hypothesis 1 - Regime-Specific Performance 
Intraday trading strategies exhibit distinct performance profiles across market regimes.

In [18]:
def aggregate_trade_data(force_refresh=False):
    """
    Part 1: Aggregates all trades from all simulations and merges them with
    daily market regime data. It also cleans the data for robustness.

    The directory layout read here is
    ``SIMULATION_RESULTS_DIR/<ticker>/<strategy>/trades.csv``; the ticker and
    strategy names are taken from the folder names.

    Parameters
    ----------
    force_refresh : bool, default False
        When False (the original behaviour) an existing
        ``AGGREGATED_TRADES_FILE`` is loaded and returned as-is. Pass True to
        ignore that cache and rebuild it from the per-strategy trade files.

    Returns
    -------
    pandas.DataFrame or None
        The aggregated trade-level frame, or None when the regime file, the
        simulation directory, any trade data, or the 'Return' column is
        missing. Note that a frame loaded from the cache has its 'date'
        column as strings, while a freshly built one holds ``datetime.date``
        objects.
    """
    print("--- Part 1: Starting Data Aggregation ---")

    if not force_refresh and os.path.exists(AGGREGATED_TRADES_FILE):
        print(f"Aggregated file already exists at {AGGREGATED_TRADES_FILE}. Loading it.")
        try:
            return pd.read_csv(AGGREGATED_TRADES_FILE)
        except Exception as e:
            # Best effort: a corrupt cache is not fatal, we simply rebuild it.
            print(f"Could not load existing file, will re-aggregate. Error: {e}")

    try:
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        print(f"Output directory ensured at: {OUTPUT_DIR}")
    except Exception as e:
        print(f"FATAL ERROR: Could not create directory: {e}")
        return None

    if not os.path.exists(REGIME_FILE):
        print(f"ERROR: Regime file not found at {REGIME_FILE}")
        return None

    print("Loading regime data...")
    regime_df = pd.read_csv(REGIME_FILE)
    regime_df['date'] = pd.to_datetime(regime_df['date']).dt.date
    # Build the merge-ready lookup once instead of once per strategy folder.
    # The set_index/reset_index round trip moves 'ticker' and 'date' to the
    # front, keeping the merged column order identical to earlier outputs.
    regime_lookup = regime_df.set_index(['ticker', 'date']).reset_index()
    print("Regime data loaded successfully.")

    all_trades_list = []

    try:
        for ticker_entry in _sorted_subdirectories(SIMULATION_RESULTS_DIR):
            for strategy_entry in _sorted_subdirectories(ticker_entry.path):
                trades_file = os.path.join(strategy_entry.path, "trades.csv")
                if not os.path.exists(trades_file):
                    continue
                try:
                    merged_df = _load_trades_with_regimes(
                        trades_file, ticker_entry.name, strategy_entry.name, regime_lookup
                    )
                except Exception as e:
                    # One unreadable file must not abort the whole aggregation.
                    print(f"Could not process {trades_file}. Error: {e}")
                    continue
                if merged_df is not None:
                    all_trades_list.append(merged_df)
    except FileNotFoundError:
        print(f"FATAL ERROR: The directory {SIMULATION_RESULTS_DIR} was not found.")
        return None

    if not all_trades_list:
        print("ERROR: No trade data could be aggregated.")
        return None

    print("\nConcatenating all trades into a single DataFrame...")
    final_df = pd.concat(all_trades_list, ignore_index=True)

    if 'Return' not in final_df.columns:
        # Without this guard the dropna below would raise a bare KeyError.
        print("ERROR: Aggregated trades have no 'Return' column. Cannot clean or save the data.")
        return None

    print("Cleaning data: Checking for infinite values in 'Return' column...")
    initial_rows = len(final_df)
    # Infinities are replaced frame-wide (as before); only rows whose 'Return'
    # ends up missing are dropped.
    final_df = final_df.replace([np.inf, -np.inf], np.nan).dropna(subset=['Return'])
    cleaned_rows = len(final_df)
    print(f"Removed {initial_rows - cleaned_rows} rows with invalid 'Return' values.")

    # The left merge keeps trades that have no regime row; for those the
    # right-hand 'ticker' key is NaN. Surface the count so that shrinking
    # sample sizes in the later regime filters are not a surprise.
    if 'ticker' in final_df.columns:
        unmatched_rows = int(final_df['ticker'].isna().sum())
        if unmatched_rows:
            print(f"WARNING: {unmatched_rows} trades have no matching regime row "
                  "and carry missing regime labels.")

    final_df.to_csv(AGGREGATED_TRADES_FILE, index=False)
    print(f"Success! Aggregated data saved to: {AGGREGATED_TRADES_FILE}")
    return final_df


def _sorted_subdirectories(parent_dir):
    """Return the immediate sub-directories of ``parent_dir`` sorted by name.

    ``os.scandir`` yields entries in arbitrary order; sorting makes the row
    order of the aggregated CSV reproducible between runs.
    """
    with os.scandir(parent_dir) as entries:
        return sorted((entry for entry in entries if entry.is_dir()),
                      key=lambda entry: entry.name)


def _load_trades_with_regimes(trades_file, ticker_name, strategy_name, regime_lookup):
    """Read one ``trades.csv``, tag it, and left-join the daily regime rows.

    Returns None when the file contains no trades. The trade date is derived
    from the 'Entry Timestamp' column and matched against the regime 'date'.
    """
    trades_df = pd.read_csv(trades_file)
    if trades_df.empty:
        return None
    trades_df['Ticker'] = ticker_name
    trades_df['Strategy'] = strategy_name
    trades_df['date'] = pd.to_datetime(trades_df['Entry Timestamp']).dt.date
    # reset_index() is kept on purpose: it stores the per-file row number in an
    # 'index' column, so the output schema matches previously saved aggregates.
    return trades_df.reset_index().merge(
        regime_lookup,
        left_on=['Ticker', 'date'],
        right_on=['ticker', 'date'],
        how='left'
    )


def test_hypotheses(df, results_file=None):
    """
    Part 2: Runs one-sided Welch t-tests on trade returns for the H1a-H1e
    regime hypotheses and saves the detailed results to a CSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Trade-level data with at least the columns 'Strategy', 'Return',
        'volatility_regime' and 'trend_regime'. A 'Strategy Category' column
        is added to this frame in place (existing behaviour, kept so that
        code running after this call can still rely on it).
    results_file : str, optional
        Where to write the results CSV. Defaults to the module-level
        ``HYPOTHESIS_RESULTS_FILE``.
        NOTE(review): later configuration cells in this notebook reassign
        ``HYPOTHESIS_RESULTS_FILE`` (to hypothesis_2.csv / hypothesis_3.csv),
        so re-running this function after those cells writes the H1 results
        to the wrong file unless ``results_file`` is passed explicitly.

    Returns
    -------
    None
        Results are printed and written to ``results_file``.
    """
    print("\n--- Part 2: Starting Hypothesis Testing ---")

    if df is None or df.empty:
        print("ERROR: Input DataFrame is empty. Cannot run tests.")
        return

    required_columns = ['Strategy', 'Return', 'volatility_regime', 'trend_regime']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        # Fail with a readable message instead of a KeyError deep in a filter.
        print(f"ERROR: Input DataFrame is missing required column(s): {missing_columns}. Cannot run tests.")
        return

    if results_file is None:
        results_file = HYPOTHESIS_RESULTS_FILE

    def get_category(strategy_name):
        for category, names in STRATEGY_MAP.items():
            if strategy_name in names:
                return category
        return 'other'
    df['Strategy Category'] = df['Strategy'].apply(get_category)

    hypothesis_results = []

    def run_ttest(h_id, description, group1, group2, alternative, group1_name, group2_name):
        """Run one one-sided Welch t-test (group1 vs group2) and record the outcome."""
        print("-" * 50)
        print(f"Evaluating Hypothesis {h_id}: {description}")

        # Series.count() ignores NaN, matching nan_policy='omit' in the test.
        group1_size = group1.count()
        group2_size = group2.count()
        print(f"Sample size for Group 1 ('{group1_name}'): {group1_size}")
        print(f"Sample size for Group 2 ('{group2_name}'): {group2_size}")

        if group1_size < 2 or group2_size < 2:
            print("Result: INCONCLUSIVE. At least one group has fewer than 2 data points.")
            t_stat, p_value = np.nan, np.nan
            is_significant = "Inconclusive"
            conclusion = "Test could not be performed due to insufficient data (< 2 trades) in one or both categories."
        else:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                t_stat, p_value = stats.ttest_ind(
                    group1, group2, nan_policy='omit', equal_var=False, alternative=alternative
                )

            alpha = 0.05
            if np.isnan(p_value):
                # With RuntimeWarnings silenced, a degenerate test (e.g. zero
                # variance in both groups) yields NaN. NaN < alpha is False,
                # which used to be reported as "not significant".
                is_significant = "Inconclusive"
                conclusion = "Test statistic is undefined (e.g. zero variance in both groups)."
                print("Result: INCONCLUSIVE. The test statistic could not be computed.")
            elif p_value < alpha:
                is_significant = "Yes"
                conclusion = f"Data supports that mean return for '{group1_name}' is {'greater' if alternative=='greater' else 'less'} than for '{group2_name}'."
                print(f"Result: SIGNIFICANT (p < {alpha}). We reject the null hypothesis.")
            else:
                is_significant = "No"
                conclusion = "Fail to reject the null hypothesis."
                print(f"Result: NOT SIGNIFICANT (p >= {alpha}). We fail to reject the null hypothesis.")
            print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")

        hypothesis_results.append({
            "Hypothesis ID": h_id,
            "Description": description,
            "Group 1": group1_name,
            "Group 1 Sample Size": group1_size,
            "Group 2": group2_name,
            "Group 2 Sample Size": group2_size,
            "T-statistic": t_stat,
            "P-value": p_value,
            "Significant (alpha=0.05)": is_significant,
            "Conclusion": conclusion
        })
        print("-" * 50)

    # --- Running tests ---
    breakout_df = df[df['Strategy Category'] == 'breakout']
    meanrev_df = df[df['Strategy Category'] == 'mean_reversion']
    momentum_df = df[df['Strategy Category'] == 'momentum']

    # H1a: Breakout strategies in medium/high vs. low volatility
    run_ttest("H1a", "Breakout strategies perform better in medium/high volatility than in low volatility.", 
              breakout_df[breakout_df['volatility_regime'].isin(['High', 'Medium'])]['Return'], 
              breakout_df[breakout_df['volatility_regime'] == 'Low']['Return'], 
              'greater', "Breakout (Medium/High Vol)", "Breakout (Low Vol)")

    # H1b: Breakout strategies in trending vs. ranging markets
    run_ttest("H1b", "Breakout strategies perform better in trending vs ranging markets.", 
              breakout_df[breakout_df['trend_regime'].isin(['Uptrend', 'Downtrend'])]['Return'], 
              breakout_df[breakout_df['trend_regime'] == 'Range']['Return'], 
              'greater', "Breakout (Trending)", "Breakout (Ranging)")

    # H1c: Mean-reversion strategies in low/medium vs. high volatility
    run_ttest("H1c", "Mean-reversion strategies perform better in low/medium volatility than in high volatility.", 
              meanrev_df[meanrev_df['volatility_regime'].isin(['Low', 'Medium'])]['Return'], 
              meanrev_df[meanrev_df['volatility_regime'] == 'High']['Return'], 
              'greater', "Mean-Reversion (Low/Medium Vol)", "Mean-Reversion (High Vol)")

    # H1d: Mean-reversion strategies in uptrending vs. ranging markets
    run_ttest("H1d", "Mean-reversion strategies perform worse in uptrends than in ranging markets.", 
              meanrev_df[meanrev_df['trend_regime'] == 'Uptrend']['Return'], 
              meanrev_df[meanrev_df['trend_regime'] == 'Range']['Return'], 
              'less', "Mean-Reversion (Uptrend)", "Mean-Reversion (Ranging)")

    # H1e: Momentum strategies in trending/high-volatility vs. ranging/low-volatility markets
    run_ttest("H1e", "Momentum strategies perform better in trending/high-volatility markets than in low-vol/ranging.", 
              momentum_df[(momentum_df['trend_regime'].isin(['Uptrend', 'Downtrend'])) & 
                          (momentum_df['volatility_regime'] == 'High')]['Return'], 
              momentum_df[(momentum_df['trend_regime'] == 'Range') & 
                          (momentum_df['volatility_regime'] == 'Low')]['Return'], 
              'greater', "Momentum (Trend/High-Vol)", "Momentum (Range/Low-Vol)")

    # Results to CSV
    results_df = pd.DataFrame(hypothesis_results)
    column_order = [
        "Hypothesis ID", "Description", "Group 1", "Group 1 Sample Size", 
        "Group 2", "Group 2 Sample Size", "T-statistic", "P-value", 
        "Significant (alpha=0.05)", "Conclusion"
    ]
    results_df = results_df[column_order]
    # The frame may come from somewhere other than aggregate_trade_data(), so
    # do not assume the output directory already exists.
    os.makedirs(os.path.dirname(results_file) or ".", exist_ok=True)
    results_df.to_csv(results_file, index=False)
    print(f"\nHypothesis testing results successfully saved to: {results_file}")
    print("\n--- Final Results Summary ---")
    print(results_df)

In [19]:
# Entry point for Hypothesis 1: build (or load) the aggregated trades first,
# and only run the tests when that step actually returned a DataFrame.
if __name__ == "__main__":
    aggregated_df = aggregate_trade_data()
    if aggregated_df is None:
        print("\nProcess failed. Cannot run hypothesis tests.")
    else:
        test_hypotheses(aggregated_df)

--- Part 1: Starting Data Aggregation ---
Aggregated file already exists at /home/jupyter-kohv04@vse.cz/kohv04/backtesting_final/hypothesis_testing/trade_level_regime_analysis.csv. Loading it.

--- Part 2: Starting Hypothesis Testing ---
--------------------------------------------------
Evaluating Hypothesis H1a: Breakout strategies perform better in medium/high volatility than in low volatility.
Sample size for Group 1 ('Breakout (Medium/High Vol)'): 21267
Sample size for Group 2 ('Breakout (Low Vol)'): 4650
Result: NOT SIGNIFICANT (p >= 0.05). We fail to reject the null hypothesis.
T-statistic: 0.6910, P-value: 0.2448
--------------------------------------------------
--------------------------------------------------
Evaluating Hypothesis H1b: Breakout strategies perform better in trending vs ranging markets.
Sample size for Group 1 ('Breakout (Trending)'): 6837
Sample size for Group 2 ('Breakout (Ranging)'): 19080
Result: NOT SIGNIFICANT (p >= 0.05). We fail to reject the null hyp

# Hypothesis 2 - Volume as a Confirmation Signal
Incorporating volume-based parameters improves the profitability of intraday strategies compared to price-only baselines.

In [21]:
# NOTE(review): this cell reassigns HOME_DIR, BASE_DIR, SIMULATION_RESULTS_DIR,
# OUTPUT_DIR and HYPOTHESIS_RESULTS_FILE, which the Hypothesis 1 configuration
# cell also defines. After running it, HYPOTHESIS_RESULTS_FILE points at
# hypothesis_2.csv for every function that reads that global.

# 1. Project root
HOME_DIR = Path.home()
BASE_DIR = HOME_DIR.joinpath("kohv04", "backtesting_final")

# 2. Input: the per-(ticker, strategy) summary table is the only data source here.
SIMULATION_RESULTS_DIR = str(BASE_DIR / "simulation_results")
SUMMARY_FILE = os.path.join(SIMULATION_RESULTS_DIR, "simulation_summary.csv")

# 3. Output
OUTPUT_DIR = str(BASE_DIR / "hypothesis_testing")
HYPOTHESIS_RESULTS_FILE = os.path.join(OUTPUT_DIR, "hypothesis_2.csv")

# 4. Pairs to compare, as {baseline strategy: volume-enhanced counterpart}.
STRATEGY_PAIRS = {
    "Baseline Breakout": "Volume-Enhanced Breakout",
    "Baseline Momentum": "Volume-Enhanced Momentum",
    "Baseline Bollinger Bands": "Volume-Enhanced VWAP Reversion",
}

# 5. Metrics to test, as (summary column, H1 direction, display name).
#    'greater' -> the volume-enhanced version is expected to be higher;
#    'less'    -> the volume-enhanced version is expected to be lower.
#    NOTE(review): 'less' for drawdown presumes "Max Drawdown [%]" is stored as a
#    positive magnitude (smaller is better) - confirm against the summary file.
METRICS_TO_TEST = [
    ("Sharpe Ratio", 'greater', "Sharpe Ratio"),
    ("Profit Factor", 'greater', "Profit Factor"),
    ("Max Drawdown [%]", 'less', "Max Drawdown"),
]

In [30]:
def test_hypothesis_2():
    """
    Evaluates Hypothesis 2 by performing paired t-tests between baseline
    and volume-enhanced strategies with robust data checking.

    Reads ``SUMMARY_FILE``, pairs the strategies listed in ``STRATEGY_PAIRS``
    per ticker, tests every metric in ``METRICS_TO_TEST`` with a one-sided
    paired t-test (volume-enhanced minus baseline), prints the outcome and
    writes one row per (pair, metric) to ``HYPOTHESIS_RESULTS_FILE``.

    NOTE(review): ``STRATEGY_PAIRS``, ``METRICS_TO_TEST`` and
    ``HYPOTHESIS_RESULTS_FILE`` are module-level names that the Hypothesis 3
    configuration cell reassigns; run this function right after its own
    configuration cell.

    Returns
    -------
    None
    """
    print("--- Part 1: Loading and Preparing Data ---")

    try:
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        print(f"Output directory ensured at: {OUTPUT_DIR}")
    except Exception as e:
        print(f"FATAL ERROR: Could not create directory: {e}")
        return

    if not os.path.exists(SUMMARY_FILE):
        print(f"FATAL ERROR: The summary file was not found at {SUMMARY_FILE}")
        return

    try:
        df = pd.read_csv(SUMMARY_FILE)
        print("Successfully loaded simulation_summary.csv")
    except Exception as e:
        print(f"FATAL ERROR: Could not read summary file. Error: {e}")
        return

    # 'Ticker' and 'Strategy' drive both the pair filter and the pivot below;
    # without them the run would die with an uncaught KeyError.
    missing_keys = [col for col in ('Ticker', 'Strategy') if col not in df.columns]
    if missing_keys:
        print(f"FATAL ERROR: Summary file is missing required column(s): {missing_keys}")
        return

    # Upfront Data Cleaning
    print("\nCleaning and validating data types...")
    for col, _, _ in METRICS_TO_TEST:
        if col in df.columns:
            # Forcing column to be numeric, coercing any non-numeric values into NaN
            df[col] = pd.to_numeric(df[col], errors='coerce')
        else:
            print(f"Warning: Metric column '{col}' not found in the summary file.")
    # Replacing any infinite values with NaN
    df = df.replace([np.inf, -np.inf], np.nan)

    # pivot_table aggregates with the mean by default, so repeated
    # (Ticker, Strategy) rows would be silently averaged before pairing.
    duplicate_rows = int(df.duplicated(subset=['Ticker', 'Strategy']).sum())
    if duplicate_rows:
        print(f"Warning: {duplicate_rows} duplicate (Ticker, Strategy) rows found; "
              "their metric values are averaged before pairing.")
    print("Data cleaning complete.")

    print("\n--- Part 2: Starting Paired T-Tests for Hypothesis 2 ---")

    all_results = []

    for baseline_strat, volume_strat in STRATEGY_PAIRS.items():
        print("="*60)
        print(f"Comparing: '{baseline_strat}' vs. '{volume_strat}'")
        print("="*60)

        pair_df = df[df['Strategy'].isin([baseline_strat, volume_strat])]

        for col_name, alternative, metric_name in METRICS_TO_TEST:
            sample_size, t_stat, p_value, is_significant, conclusion = _run_h2_paired_test(
                pair_df, baseline_strat, volume_strat, col_name, alternative, metric_name
            )

            all_results.append({
                "Comparison": f"{baseline_strat} vs. {volume_strat}",
                "Metric": metric_name,
                "H1 Direction": f"Volume-Enhanced {('>' if alternative == 'greater' else '<')} Baseline",
                "Paired Samples (Tickers)": sample_size,
                "T-statistic": t_stat,
                "P-value": p_value,
                "Significant (alpha=0.05)": is_significant,
                "Conclusion": conclusion
            })

    print("\n--- Part 3: Saving Hypothesis 2 Results ---")
    if not all_results:
        print("No results were generated. Cannot save file.")
        return

    results_df = pd.DataFrame(all_results)
    column_order = [
        "Comparison", "Metric", "H1 Direction", "Paired Samples (Tickers)",
        "T-statistic", "P-value", "Significant (alpha=0.05)", "Conclusion"
    ]
    results_df = results_df[column_order]

    results_df.to_csv(HYPOTHESIS_RESULTS_FILE, index=False)
    print(f"\nHypothesis 2 testing results successfully saved to: {HYPOTHESIS_RESULTS_FILE}")
    print("\n--- Final Results Summary ---")
    print(results_df)


def _run_h2_paired_test(pair_df, baseline_strat, volume_strat, col_name, alternative, metric_name, alpha=0.05):
    """Paired one-sided t-test of one metric for one baseline/volume pair.

    Returns ``(sample_size, t_stat, p_value, is_significant, conclusion)``.
    ``is_significant`` is "Yes", "No", "Inconclusive" (too few pairs, zero
    variance, or an undefined statistic) or "Error" (data missing).
    """
    print(f"\n-- Testing Metric: {metric_name} --")

    # A missing metric column used to surface as "data missing for one of the
    # strategies" via a blanket KeyError handler; report the real cause.
    if col_name not in pair_df.columns:
        print(f"ERROR: Metric column '{col_name}' is not in the summary file. Skipping.")
        return 0, np.nan, np.nan, "Error", f"Metric column '{col_name}' not found in the summary file."

    pivoted = None
    if not pair_df.empty:
        pivoted = pair_df.pivot_table(index='Ticker', columns='Strategy', values=col_name)
    if pivoted is None or baseline_strat not in pivoted.columns or volume_strat not in pivoted.columns:
        print(f"ERROR: Could not find data for one of the strategies. Skipping.")
        return 0, np.nan, np.nan, "Error", "Data missing for one or both strategies in the pair."

    # Keep only tickers that have a value for both strategies.
    pivoted = pivoted.dropna()
    group1 = pivoted[baseline_strat]
    group2 = pivoted[volume_strat]
    sample_size = len(pivoted)
    print(f"Paired sample size (tickers with both results): {sample_size}")

    differences = group2 - group1
    print(f"\nDifferences (Volume - Baseline) summary:\n{differences.describe()}")

    std_diff = differences.std()
    if std_diff == 0:
        print("\nCRITICAL WARNING: The standard deviation of the differences is zero. T-test is invalid.")
    print("--------------------\n")

    if sample_size < 2 or std_diff == 0:
        print("Result: INCONCLUSIVE. Insufficient data or zero variance in differences.")
        return (sample_size, np.nan, np.nan, "Inconclusive",
                "Test invalid due to insufficient samples or zero variance in paired differences.")

    t_stat, p_value = stats.ttest_rel(group2, group1, alternative=alternative, nan_policy='omit')

    if np.isnan(p_value):
        # NaN < alpha is False, which would otherwise read as "not significant".
        is_significant = "Inconclusive"
        conclusion = f"Test statistic is undefined for {metric_name}."
        print("Result: INCONCLUSIVE. The test statistic could not be computed.")
    elif p_value < alpha:
        is_significant = "Yes"
        conclusion = f"Data supports that '{volume_strat}' has a significantly {'better' if alternative != 'less' else 'lower'} {metric_name}."
        print(f"Result: SIGNIFICANT (p < {alpha}). We reject the null hypothesis.")
    else:
        is_significant = "No"
        conclusion = f"Fail to reject the null hypothesis for {metric_name}."
        print(f"Result: NOT SIGNIFICANT (p >= {alpha}). We fail to reject the null hypothesis.")

    print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")
    return sample_size, t_stat, p_value, is_significant, conclusion


In [31]:
# Entry point for Hypothesis 2: run the paired baseline vs. volume-enhanced tests.
if __name__ == "__main__":
    test_hypothesis_2()

--- Part 1: Loading and Preparing Data ---
Output directory ensured at: /home/jupyter-kohv04@vse.cz/kohv04/backtesting_final/hypothesis_testing
Successfully loaded simulation_summary.csv

Cleaning and validating data types...
Data cleaning complete.

--- Part 2: Starting Paired T-Tests for Hypothesis 2 ---
Comparing: 'Baseline Breakout' vs. 'Volume-Enhanced Breakout'

-- Testing Metric: Sharpe Ratio --
Paired sample size (tickers with both results): 100

Differences (Volume - Baseline) summary:
count    100.000000
mean       0.789416
std        8.686696
min      -17.895328
25%       -5.568098
50%        0.388636
75%        5.883303
max       21.452769
dtype: float64
--------------------

Result: NOT SIGNIFICANT (p >= 0.05). We fail to reject the null hypothesis.
T-statistic: 0.9088, P-value: 0.1828

-- Testing Metric: Profit Factor --
Paired sample size (tickers with both results): 99

Differences (Volume - Baseline) summary:
count    99.000000
mean     -0.039895
std       0.438805
min

# Hypothesis 3 - Transformer-Based Volume Prediction
Augmenting strategies with transformer-based forecasts of short-term volume improves their performance metrics.

In [32]:
# NOTE(review): this cell reassigns STRATEGY_PAIRS, METRICS_TO_TEST and
# HYPOTHESIS_RESULTS_FILE, which the Hypothesis 2 configuration cell also
# defines. Functions that read these globals see whichever cell ran last.

# 1. Project root
HOME_DIR = Path.home()
BASE_DIR = HOME_DIR.joinpath("kohv04", "backtesting_final")

# 2. Input: the per-(ticker, strategy) summary table.
SIMULATION_RESULTS_DIR = str(BASE_DIR / "simulation_results")
SUMMARY_FILE = os.path.join(SIMULATION_RESULTS_DIR, "simulation_summary.csv")

# 3. Output
OUTPUT_DIR = str(BASE_DIR / "hypothesis_testing")
HYPOTHESIS_RESULTS_FILE = os.path.join(OUTPUT_DIR, "hypothesis_3.csv")

# 4. Pairs to compare for H3, as {volume-enhanced strategy: deep-learning counterpart}.
STRATEGY_PAIRS = {
    "Volume-Enhanced Breakout": "Deep Learning Breakout",
    "Volume-Enhanced Momentum": "Deep Learning Momentum",
    "Volume-Enhanced VWAP Reversion": "Deep Learning VWAP Reversion",
}

# 5. Metrics to test, as (summary column, H1 direction, display name).
#    The direction states how the deep-learning version is expected to compare
#    with the volume-enhanced one.
METRICS_TO_TEST = [
    ("Sharpe Ratio", 'greater', "Sharpe Ratio"),
    ("Profit Factor", 'greater', "Profit Factor"),
    ("Max Drawdown [%]", 'less', "Max Drawdown"),
]

In [33]:
def test_hypothesis_3():
    """
    Evaluates Hypothesis 3 by performing paired t-tests between
    Volume-Enhanced and Deep Learning (Transformer) strategies.

    Reads ``SUMMARY_FILE``, pairs the strategies listed in ``STRATEGY_PAIRS``
    per ticker, tests every metric in ``METRICS_TO_TEST`` with a one-sided
    paired t-test (deep learning minus volume-enhanced), prints the outcome
    and writes one row per (pair, metric) to ``HYPOTHESIS_RESULTS_FILE``.

    NOTE(review): ``STRATEGY_PAIRS``, ``METRICS_TO_TEST`` and
    ``HYPOTHESIS_RESULTS_FILE`` are module-level names shared with the
    Hypothesis 2 cells; run this function right after its own configuration
    cell.

    Returns
    -------
    None
    """
    print("--- Part 1: Loading and Preparing Data ---")

    try:
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        print(f"Output directory ensured at: {OUTPUT_DIR}")
    except Exception as e:
        print(f"FATAL ERROR: Could not create directory: {e}")
        return

    if not os.path.exists(SUMMARY_FILE):
        print(f"FATAL ERROR: The summary file was not found at {SUMMARY_FILE}")
        return

    try:
        df = pd.read_csv(SUMMARY_FILE)
        print("Successfully loaded simulation_summary.csv")
    except Exception as e:
        print(f"FATAL ERROR: Could not read summary file. Error: {e}")
        return

    # 'Ticker' and 'Strategy' drive both the pair filter and the pivot below;
    # without them the run would die with an uncaught KeyError.
    missing_keys = [col for col in ('Ticker', 'Strategy') if col not in df.columns]
    if missing_keys:
        print(f"FATAL ERROR: Summary file is missing required column(s): {missing_keys}")
        return

    # Upfront Data Cleaning
    print("\nCleaning and validating data types...")
    for col, _, _ in METRICS_TO_TEST:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        else:
            print(f"Warning: Metric column '{col}' not found in the summary file.")
    df = df.replace([np.inf, -np.inf], np.nan)

    # pivot_table aggregates with the mean by default, so repeated
    # (Ticker, Strategy) rows would be silently averaged before pairing.
    duplicate_rows = int(df.duplicated(subset=['Ticker', 'Strategy']).sum())
    if duplicate_rows:
        print(f"Warning: {duplicate_rows} duplicate (Ticker, Strategy) rows found; "
              "their metric values are averaged before pairing.")
    print("Data cleaning complete.")

    print("\n--- Part 2: Starting Paired T-Tests for Hypothesis 3 ---")

    all_results = []

    for volume_strat, dl_strat in STRATEGY_PAIRS.items():
        print("="*60)
        print(f"Comparing: '{volume_strat}' vs. '{dl_strat}'")
        print("="*60)

        pair_df = df[df['Strategy'].isin([volume_strat, dl_strat])]

        for col_name, alternative, metric_name in METRICS_TO_TEST:
            sample_size, t_stat, p_value, is_significant, conclusion = _run_h3_paired_test(
                pair_df, volume_strat, dl_strat, col_name, alternative, metric_name
            )

            all_results.append({
                "Comparison": f"{volume_strat} vs. {dl_strat}",
                "Metric": metric_name,
                "H1 Direction": f"Deep Learning {('>' if alternative == 'greater' else '<')} Volume-Enhanced",
                "Paired Samples (Tickers)": sample_size,
                "T-statistic": t_stat,
                "P-value": p_value,
                "Significant (alpha=0.05)": is_significant,
                "Conclusion": conclusion
            })

    # Saving Results
    print("\n--- Part 3: Saving Hypothesis 3 Results ---")
    if not all_results:
        print("No results were generated. Cannot save file.")
        return

    results_df = pd.DataFrame(all_results)
    column_order = [
        "Comparison", "Metric", "H1 Direction", "Paired Samples (Tickers)",
        "T-statistic", "P-value", "Significant (alpha=0.05)", "Conclusion"
    ]
    results_df = results_df[column_order]

    results_df.to_csv(HYPOTHESIS_RESULTS_FILE, index=False)
    print(f"\nHypothesis 3 testing results successfully saved to: {HYPOTHESIS_RESULTS_FILE}")
    print("\n--- Final Results Summary ---")
    print(results_df)


def _run_h3_paired_test(pair_df, volume_strat, dl_strat, col_name, alternative, metric_name, alpha=0.05):
    """Paired one-sided t-test of one metric for one volume/deep-learning pair.

    Returns ``(sample_size, t_stat, p_value, is_significant, conclusion)``.
    ``is_significant`` is "Yes", "No", "Inconclusive" (too few pairs, zero
    variance, or an undefined statistic) or "Error" (data missing).
    """
    print(f"\n-- Testing Metric: {metric_name} --")

    # A missing metric column used to be reported as a missing *strategy*
    # column by the blanket KeyError handler; report the real cause.
    if col_name not in pair_df.columns:
        print(f"ERROR: Metric column '{col_name}' is not in the summary file. Skipping.")
        return 0, np.nan, np.nan, "Error", f"Metric column '{col_name}' not found in the summary file."

    pivoted = None
    if not pair_df.empty:
        pivoted = pair_df.pivot_table(index='Ticker', columns='Strategy', values=col_name)
    available = [] if pivoted is None else list(pivoted.columns)
    missing_strategies = [name for name in (volume_strat, dl_strat) if name not in available]
    if missing_strategies:
        print(f"ERROR: Could not find strategy column: {missing_strategies}. Skipping.")
        return (0, np.nan, np.nan, "Error",
                f"Data missing for one or both strategies in the pair: {missing_strategies}")

    # Keep only tickers that have a value for both strategies.
    pivoted = pivoted.dropna()
    group1 = pivoted[volume_strat]
    group2 = pivoted[dl_strat]
    sample_size = len(pivoted)
    print(f"Paired sample size (tickers with both results): {sample_size}")

    if sample_size < 2:
        print("Result: INCONCLUSIVE. Insufficient paired data.")
        return (sample_size, np.nan, np.nan, "Inconclusive",
                "Test could not be performed due to insufficient paired data.")

    # Same guard as the Hypothesis 2 test: with identical paired differences
    # the t statistic is undefined and must not be reported as "No".
    if (group2 - group1).std() == 0:
        print("Result: INCONCLUSIVE. Zero variance in paired differences.")
        return (sample_size, np.nan, np.nan, "Inconclusive",
                "Test invalid due to zero variance in paired differences.")

    t_stat, p_value = stats.ttest_rel(group2, group1, alternative=alternative, nan_policy='omit')

    if np.isnan(p_value):
        # NaN < alpha is False, which would otherwise read as "not significant".
        is_significant = "Inconclusive"
        conclusion = f"Test statistic is undefined for {metric_name}."
    elif p_value < alpha:
        is_significant = "Yes"
        conclusion = f"Data supports that '{dl_strat}' has a significantly {'better' if alternative != 'less' else 'lower'} {metric_name}."
    else:
        is_significant = "No"
        conclusion = f"Fail to reject the null hypothesis for {metric_name}."

    print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")
    return sample_size, t_stat, p_value, is_significant, conclusion

In [34]:
# Entry point for Hypothesis 3: run the paired volume-enhanced vs. deep-learning tests.
if __name__ == "__main__":
    test_hypothesis_3()

--- Part 1: Loading and Preparing Data ---
Output directory ensured at: /home/jupyter-kohv04@vse.cz/kohv04/backtesting_final/hypothesis_testing
Successfully loaded simulation_summary.csv

Cleaning and validating data types...
Data cleaning complete.

--- Part 2: Starting Paired T-Tests for Hypothesis 3 ---
Comparing: 'Volume-Enhanced Breakout' vs. 'Deep Learning Breakout'

-- Testing Metric: Sharpe Ratio --
Paired sample size (tickers with both results): 100
T-statistic: -3.4948, P-value: 0.9996

-- Testing Metric: Profit Factor --
Paired sample size (tickers with both results): 99
T-statistic: 0.8192, P-value: 0.2073

-- Testing Metric: Max Drawdown --
Paired sample size (tickers with both results): 100
T-statistic: 7.6788, P-value: 1.0000
Comparing: 'Volume-Enhanced Momentum' vs. 'Deep Learning Momentum'

-- Testing Metric: Sharpe Ratio --
Paired sample size (tickers with both results): 101
T-statistic: 0.2923, P-value: 0.3853

-- Testing Metric: Profit Factor --
Paired sample size (