# Oversold Events: A Distributional Analysis on Bitcoin

This notebook investigates what historically happened after extreme RSI(14) oversold conditions on Bitcoin.

Instead of building a trading strategy with fixed entry and exit rules, the objective is different:

> Given that the market was truly oversold, what did the distribution of future returns look like?

---

# Import Tools & Define Config

In [None]:
%pip install -r requirements.txt

from datetime import datetime
import os

import polars as pl
import numpy as np
from talib import RSI
from scipy.stats import gaussian_kde, skew, kurtosis
import matplotlib.pyplot as plt
import yfinance as yf


# Output locations. Both directories are created up front; whether anything is
# written into them is decided by the `save_plot` / `save_data` arguments of the
# analysis functions below.
FIG_DIR = "figures"  # destination folder for saved plots
DATA_DIR = "data"  # destination folder for saved CSV exports
for _output_dir in (FIG_DIR, DATA_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# Analysis parameters.
TICKER = "BTC-USD"  # symbol passed to yfinance
INTERVAL = "1d"  # bar interval passed to yfinance
START_DATE = datetime(2016, 1, 1)
RSI_THRESHOLDS = [20, 30]  # each threshold gets its own analysis run
# Half-open (start, end) row offsets after an oversold day: with daily bars,
# (31, 121) covers forward days 31 through 120.
FORWARD_RETURN_DAYS = (31, 121)
RSI_PERIOD = 14

# Download Data (RSI is calculated later, inside `calculate_forward_returns`)

In [None]:
# Fetch the price history for TICKER from Yahoo Finance, from START_DATE up to now.
print(f"Downloading {TICKER} market data from {START_DATE} to {datetime.now()} from Yahoo Finance...")
market_data = yf.download(
    TICKER,
    interval=INTERVAL,
    start=START_DATE,
    end=datetime.now(),
    multi_level_index=False,
)

# Stop right away when the download produced nothing usable.
if market_data is None or market_data.empty:
    raise ValueError(f"No market data found for {TICKER} from {START_DATE} to {datetime.now()}.")

# Turn the index into a regular column, rename "Date" to "timestamp", and hand
# the result to Polars for the rest of the analysis.
market_data = pl.from_pandas(
    market_data.reset_index().rename(columns={"Date": "timestamp"})
)

# Function Definitions for Analysis

In [None]:
def _first_window_forward_returns(
    closes: np.ndarray,
    oversold_indices: list[int],
    forward_start: int,
    forward_end: int,
) -> np.ndarray:
    """
    Compute percentage forward returns for the rows that follow oversold days.

    For every oversold row ``i``, the rows ``i + forward_start`` up to (but not
    including) ``i + forward_end`` receive ``(close[row] / close[i] - 1) * 100``.
    A row covered by the windows of several oversold days keeps the value from
    the first oversold day processed; later ones do not overwrite it.

    Args:
        closes (np.ndarray): Closing prices, one per row.
        oversold_indices (list[int]): Row positions of the oversold days, in processing order.
        forward_start (int): First row offset of the forward window (inclusive).
        forward_end (int): Last row offset of the forward window (exclusive).

    Returns:
        np.ndarray: Array with the same length as ``closes``; NaN where no forward return applies.
    """
    n_rows = len(closes)
    forward_returns = np.full(n_rows, np.nan)
    # Separate "already filled" mask so a legitimately-NaN return is never recomputed.
    assigned = np.zeros(n_rows, dtype=bool)

    for oversold_idx in oversold_indices:
        # Clamp the window to the available data.
        start_idx = min(oversold_idx + forward_start, n_rows)
        end_idx = min(oversold_idx + forward_end, n_rows)
        if start_idx >= end_idx:
            continue

        window = slice(start_idx, end_idx)
        window_returns = (closes[window] / closes[oversold_idx] - 1) * 100  # %
        # Only fill rows that no earlier oversold day has claimed.
        forward_returns[window] = np.where(
            assigned[window], forward_returns[window], window_returns
        )
        assigned[window] = True

    return forward_returns


def calculate_forward_returns(
    market_data: pl.DataFrame, 
    rsi_threshold: int, 
    rsi_period: int, 
    forward_return_days: tuple[int, int],
    save_data: bool = True
) -> pl.DataFrame:
    """
    Calculate RSI, find the oversold days and attach forward returns to the data frame.

    A day is oversold when its RSI is strictly below ``rsi_threshold``. For each
    oversold day, the rows inside the forward window get the percentage change of
    their 'Close' relative to that oversold day's 'Close'. When windows of several
    oversold days overlap, a row keeps the return measured against the first
    oversold day (in row order) whose window reaches it.

    Also prints the total and per-year count of oversold days.

    Args:
        market_data (pl.DataFrame): The market data containing 'timestamp' and 'Close'
            (plus 'Open', 'High', 'Low', 'Volume' when ``save_data`` is True).
        rsi_threshold (int): The RSI threshold to identify oversold days.
        rsi_period (int): The period to calculate RSI.
        forward_return_days (tuple[int, int]): Half-open (start, end) row offsets of the
            forward window, e.g. (31, 121) covers offsets 31 through 120.
        save_data (bool): Whether to save the oversold days to
            ``DATA_DIR/rsi_below_<rsi_threshold>.csv``.

    Returns:
        pl.DataFrame: The input data with three added columns: 'idx' (row index),
        'RSI_<rsi_period>' and 'forward_returns_<rsi_threshold>' (NaN where no
        forward return applies).
    """
    rsi_column = f'RSI_{rsi_period}'

    # Extract the closes once as float64; reused for RSI and for the forward returns.
    closes = market_data['Close'].cast(pl.Float64).to_numpy()

    print(f"Calculating RSI ({rsi_period} period) using talib...")
    rsi_values = RSI(closes, timeperiod=rsi_period)
    market_data = (
        market_data
        .with_columns(pl.Series(rsi_column, rsi_values))
        .with_row_index("idx")
    )

    # Keep only the days where RSI is below the threshold (row order preserved,
    # because that order decides which oversold day "wins" an overlapping window).
    oversold_rows = market_data.filter(pl.col(rsi_column) < rsi_threshold)
    oversold_indices = oversold_rows["idx"].to_list()

    extreme_oversold = oversold_rows.sort('timestamp').with_columns(
        pl.col("timestamp").dt.year().alias("year")
    )

    # Statistics about how many such days per year
    yearly_stats = extreme_oversold.group_by('year').agg(
        pl.count('timestamp').alias('count')
    ).sort('year')

    print(f"Total of {len(extreme_oversold)} days when RSI < {rsi_threshold}:")
    print("Yearly breakdown:")
    print(yearly_stats)

    if save_data:
        output_file = os.path.join(DATA_DIR, f'rsi_below_{rsi_threshold}.csv')
        extreme_oversold.select([
            'timestamp', 'Open', 'High', 'Low', 'Close', 'Volume', rsi_column
        ]).write_csv(output_file)
        print(f"RSI < {rsi_threshold} days saved to: {output_file}")

    # Calculate forward returns for the oversold days
    forward_start, forward_end = forward_return_days
    forward_returns = _first_window_forward_returns(
        closes, oversold_indices, forward_start, forward_end
    )

    # Add the forward return column to the DataFrame
    return market_data.with_columns(
        pl.Series(f"forward_returns_{rsi_threshold}", forward_returns)
    )


def distribution_analysis(
    market_data: pl.DataFrame,
    rsi_threshold: int,
    forward_return_days: tuple[int, int],
    asset_name: str = TICKER,
    start_date: datetime = START_DATE,
    save_plot: bool = True
) -> None:
    """
    Print summary statistics and plot the distribution (histogram + KDE) of forward returns.

    Reads the 'forward_returns_<rsi_threshold>' column, ignores missing values,
    prints descriptive statistics and draws a density histogram. A Gaussian KDE
    curve is overlaid when the sample allows one to be fitted (at least two
    samples with non-zero spread). If there are no samples at all, a message is
    printed and nothing is plotted.

    Args:
        market_data (pl.DataFrame): Data containing the 'forward_returns_<rsi_threshold>' column.
        rsi_threshold (int): The RSI threshold used to identify oversold days.
        forward_return_days (tuple[int, int]): Half-open (start, end) forward window;
            used for the plot title and the output file name.
        asset_name (str): Asset label used in the plot title and the output file name.
        start_date (datetime): Start of the analysed period, shown in the plot title.
        save_plot (bool): Whether to save the figure as an SVG file inside ``FIG_DIR``.
    """
    # Extract valid forward returns for the given RSI threshold (NaN marks "no value")
    values = market_data[f"forward_returns_{rsi_threshold}"].to_numpy()
    returns = values[~np.isnan(values)]

    if len(returns) == 0:
        print(f"No forward return samples available for RSI threshold {rsi_threshold}.")
        return
    
    print("\n" + "="*60)
    print("Distribution Analysis")
    print("="*60)

    print(f"Total samples: {len(returns)}")
    print(f"Average return: {returns.mean():.2f}%")
    print(f"Median return: {np.median(returns):.2f}%")
    print(f"Std dev: {returns.std():.2f}%")
    print(f"Min: {returns.min():.2f}%")
    print(f"Max: {returns.max():.2f}%")

    # NOTE: despite the "(KDE)" labels, these are bias-corrected statistics of the
    # raw sample, not of the fitted KDE.
    sample_skew = skew(returns, bias=False)
    sample_kurtosis = kurtosis(returns, fisher=True, bias=False)
    print(f"Skew (KDE): {sample_skew:.4f}")
    print(f"Kurtosis (KDE, excess): {sample_kurtosis:.4f}")

    # Plot
    plt.figure(figsize=(12, 6))

    plt.hist(
        returns,
        bins=50,
        density=True,
        alpha=0.3,
        label=f"Histogram ({len(returns)} samples)"
    )

    # gaussian_kde cannot be fitted to a single sample or to constant data,
    # so only draw the curve when the sample has some spread.
    if len(returns) >= 2 and returns.std() > 0:
        kde = gaussian_kde(returns)
        x_range = np.linspace(returns.min(), returns.max(), 1000)
        plt.plot(
            x_range,
            kde(x_range),
            linewidth=2,
            label="KDE estimated PDF"
        )
    else:
        print("KDE skipped: at least two distinct return values are required.")

    plt.xlabel("Return (%)")
    plt.ylabel("Density")
    # \u2013 is an en dash, written as an escape so it survives encoding round-trips.
    plt.title(
        f"Distribution of {forward_return_days[0]}\u2013{forward_return_days[1]-1} "
        f"day forward returns after RSI < {rsi_threshold}\n"
        f"({asset_name}, from {start_date.date()} to {datetime.now().date()})"
    )

    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()

    if save_plot:
        output_plot = os.path.join(FIG_DIR, (
            f"{asset_name}_rsi_below_{rsi_threshold}_"
            f"forward_returns_{forward_return_days[0]}_"
            f"{forward_return_days[1]-1}_distribution.svg"
        ))
        plt.savefig(output_plot, dpi=300, bbox_inches="tight")
        print(f"Plot saved to: {output_plot}")

    plt.show()

# Main Loop to Perform Analysis

In [None]:
# Run the full pipeline once per configured RSI threshold. Every pass starts
# from the same downloaded `market_data`, so the thresholds do not affect each other.
for rsi_threshold in RSI_THRESHOLDS:
    # Step 1: flag oversold days and attach the forward-return column.
    market_data_with_returns = calculate_forward_returns(
        market_data=market_data,
        rsi_threshold=rsi_threshold,
        rsi_period=RSI_PERIOD,
        forward_return_days=FORWARD_RETURN_DAYS,
        save_data=True,
    )

    # Step 2: summarise and plot the distribution of those forward returns.
    distribution_analysis(
        market_data=market_data_with_returns,
        rsi_threshold=rsi_threshold,
        forward_return_days=FORWARD_RETURN_DAYS,
        asset_name=TICKER,
        start_date=START_DATE,
        save_plot=True,
    )