In [4]:
import pandas as pd
import numpy as np
import yfinance as yf

spy = yf.Ticker("SPY")
df = spy.history(period='max', interval='1d')[['Open', 'High', 'Low', 'Close']]

In [6]:
def triple_barrier_labeling(prices: pd.Series, volatility_window: int, upper_multiplier: float, lower_multiplier: float, time_horizon: int) -> pd.Series:
    """
    Applies the triple barrier labeling method to the given price series.

    Parameters:
    - prices: pd.Series of asset prices, indexed by date/time.
    - volatility_window: int, rolling window for volatility calculation (e.g., 20 for 20-day rolling volatility).
    - upper_multiplier: float, scale factor for the upper barrier relative to volatility.
    - lower_multiplier: float, scale factor for the lower barrier relative to volatility.
    - time_horizon: int, the time horizon (in periods) over which to evaluate barrier breaches.

    Returns:
    - labels: pd.Series of labels (-1, 0, 1), where:
        - 1 means the upper barrier was hit first (positive outcome),
        - -1 means the lower barrier was hit first (negative outcome),
        - 0 means no barrier was hit within the time horizon.
    """

    # Calculate rolling volatility
    log_returns = np.log(prices / prices.shift(1))
    volatility = log_returns.rolling(volatility_window).std()

    # Initialize labels as neutral (0)
    labels = pd.Series(0, index=prices.index)

    for i in range(len(prices)):
        if i + time_horizon >= len(prices):
            # Avoid index out of bounds for the last few data points
            break
        
        # Set the initial price
        initial_price = prices.iloc[i]
        
        # Define the dynamic barriers
        upper_barrier = initial_price * (1 + upper_multiplier * volatility.iloc[i])
        lower_barrier = initial_price * (1 - lower_multiplier * volatility.iloc[i])
        
        # Get the future price path for the next 'time_horizon' periods
        future_prices = prices[i:i + time_horizon]
        
        # Check if the upper or lower barrier is hit first
        for future_price in future_prices:
            if future_price >= upper_barrier:
                labels.iloc[i] = 1  # Upper barrier hit
                break
            elif future_price <= lower_barrier:
                labels.iloc[i] = -1  # Lower barrier hit
                break

    return labels

In [14]:
labels = triple_barrier_labeling(df['Close'], volatility_window=14, upper_multiplier=1, lower_multiplier=1, time_horizon=3)

In [16]:
import pandas as pd

def create_labels(df, threshold=0.01):
    """
    Create labels for daily price data based on returns and target thresholds.

    Parameters:
    df (pd.DataFrame): Daily price data with 'Open' and 'Close' columns.
    threshold (float): Percentage threshold for labeling and target calculation. Default is 0.01 (1%).

    Returns:
    pd.DataFrame: DataFrame with columns: 'bin', 'return', and 'target', indexed by date.
    """
    
    # Ensure the input df has the necessary columns
    if not {'Open', 'Close'}.issubset(df.columns):
        raise ValueError("DataFrame must contain 'Open' and 'Close' columns.")
    
    # Calculate daily returns from Open to Close
    df['return'] = (df['Close'] - df['Open']) / df['Open']
    
    # Create bin column based on threshold
    df['bin'] = df['return'].apply(lambda x: 1 if x > threshold else (-1 if x < -threshold else 0))
    
    # Calculate target values: [upper_target, lower_target] based on threshold
    df['target'] = df['Open'].apply(lambda x: [x * (1 + threshold), x * (1 - threshold)])
    
    # Select only the required columns
    return_data = df[['bin', 'return', 'target']]
    
    return return_data

# Example usage
# df is a DataFrame with 'Open', 'High', 'Low', 'Close' and a datetime index (daily frequency)
# result = create_labels(df, threshold=0.02)  # Use a 2% threshold

In [26]:
create_labels(df, threshold=0.015)['bin'].value_counts()

bin
 0    7217
-1     430
 1     331
Name: count, dtype: int64

In [27]:
import pandas as pd

def create_labels_with_volatility(df, multiplier=1):
    """
    Create labels based on returns and dynamic volatility threshold from hourly price data.

    Parameters:
    df (pd.DataFrame): Hourly price data with 'Open', 'High', 'Low', and 'Close' columns.
    multiplier (float): Multiplier to apply to the average intraday volatility (default 1).

    Returns:
    pd.DataFrame: DataFrame with columns: 'bin', 'return', and 'target', indexed by date.
    """
    
    # Ensure the input df has the necessary columns
    if not {'Open', 'High', 'Low', 'Close'}.issubset(df.columns):
        raise ValueError("DataFrame must contain 'Open', 'High', 'Low', and 'Close' columns.")
    
    # Resample the data to daily frequency, keeping the first Open, the highest High, lowest Low, and last Close
    daily_data = df.resample('D').agg({
        'Open': 'first', 
        'High': 'max', 
        'Low': 'min', 
        'Close': 'last'
    })
    
    # Calculate daily returns from Open to Close
    daily_data['return'] = (daily_data['Close'] - daily_data['Open']) / daily_data['Open']
    
    # Calculate intraday volatility (High - Low) for each day
    daily_data['intraday_volatility'] = daily_data['High'] - daily_data['Low']
    
    # Compute the rolling average volatility over the last 10 days
    daily_data['avg_volatility'] = daily_data['intraday_volatility'].rolling(window=10).mean()
    
    # Dynamic threshold based on the multiplier and the average volatility
    daily_data['threshold'] = daily_data['avg_volatility'] * multiplier
    
    # Create bin column based on the dynamic threshold
    daily_data['bin'] = daily_data.apply(
        lambda row: 1 if row['return'] > row['threshold'] / row['Open'] else 
                    (-1 if row['return'] < -row['threshold'] / row['Open'] else 0), axis=1)
    
    # Calculate target values: [upper_target, lower_target] based on dynamic threshold
    daily_data['target'] = daily_data.apply(
        lambda row: [row['Open'] * (1 + row['threshold'] / row['Open']), 
                     row['Open'] * (1 - row['threshold'] / row['Open'])], axis=1)
    
    # Select only the required columns
    return_data = daily_data[['bin', 'return', 'target']]
    
    return return_data

# Example usage
# df is a DataFrame with 'Open', 'High', 'Low', 'Close' columns and a datetime index (hourly frequency)
# result = create_labels_with_volatility(df, multiplier=1.5)  # Use a 1.5x multiplier of the average volatility

In [29]:
df = spy.history(period='2y', interval='1h')[['Open', 'High', 'Low', 'Close']]

In [31]:
create_labels_with_volatility(df)['bin'].value_counts()

bin
0    731
Name: count, dtype: int64