In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pickle
import yfinance as yf

In [2]:
def get_Data(ticker, start_date = "2005-01-01", end_date = "2025-01-19"):
    """
    Download daily price history for a single ticker through yfinance.

    Parameters:
        ticker: Symbol handed to yf.Ticker.
        start_date: Value forwarded as `start` to Ticker.history (default: "2005-01-01").
        end_date: Value forwarded as `end` to Ticker.history (default: "2025-01-19").

    Returns:
        Whatever Ticker.history(interval="1d", ...) returns (presumably a
        pandas DataFrame -- callers in this notebook call reset_index() on it),
        or None when any exception is raised while fetching. In that case the
        error is printed and NOT re-raised, so callers must check for None.
    """
    try:
        return yf.Ticker(ticker).history(interval="1d", start=start_date, end=end_date)
    except Exception as err:
        print(f"Error fetching data for {ticker}: {err}")
        return None

In [3]:
# base model data
# Download the history for every base-model ticker and keep one frame per symbol.
tickers = ["GLD", "GDX"]
data_base = {}
for ticker in tickers:
    print(f"Fetching data for {ticker}...")
    data = get_Data(ticker, start_date="2005-01-01", end_date="2025-01-19")
    # get_Data prints the error and returns None when the download fails; stop
    # with a clear message instead of an AttributeError on None.reset_index().
    if data is None:
        raise RuntimeError(f"No data returned for {ticker}; cannot build base model data")
    # Move the index returned by the download into a regular column.
    data = data.reset_index()
    data_base[ticker] = data

Fetching data for GLD...
Fetching data for GDX...


In [None]:
# target data (ensemble)
# Download the history for the ensemble's target ticker(s), one frame per symbol.
target_stock = ["GLD"]
data_ensemble = {}
for ticker in target_stock:
    target_stock_data = get_Data(ticker, start_date="2005-01-01", end_date="2025-01-19")
    # get_Data prints the error and returns None when the download fails; stop
    # with a clear message instead of an AttributeError on None.reset_index().
    if target_stock_data is None:
        raise RuntimeError(f"No data returned for {ticker}; cannot build ensemble target data")
    # Move the index returned by the download into a regular column.
    target_stock_data = target_stock_data.reset_index()
    data_ensemble[ticker] = target_stock_data

In [None]:
import numpy as np
import pandas as pd

def calculate_rsi(df: pd.DataFrame, period: int = 14) -> pd.Series:
    """
    Calculate the Relative Strength Index (RSI) using the 'Close' column.

    Average gain and average loss are simple rolling means of the positive and
    negative close-to-close changes over `period` rows (no exponential
    smoothing). RSI = 100 - 100 / (1 + avg_gain / avg_loss).

    Parameters:
        df (pd.DataFrame): DataFrame containing at least a 'Close' column.
        period (int): Look-back period for the RSI (default: 14).

    Returns:
        pd.Series: RSI values. The first `period` rows are NaN because a full
        window needs `period` price changes, i.e. `period + 1` closes. A window
        with no change at all (avg_gain == avg_loss == 0) also yields NaN, and
        a missing close makes every window that touches it NaN.
    """
    close = df['Close']
    delta = close.diff()
    # clip() keeps the leading NaN produced by diff(), whereas where(cond, 0)
    # would turn it into a fake "no change" observation and let the first RSI
    # value be computed from only period - 1 real price changes.
    gain = delta.clip(lower=0).rolling(window=period).mean()
    # Unary minus applies to the finished rolling mean, making avg loss >= 0.
    loss = -delta.clip(upper=0).rolling(window=period).mean()
    # avg_loss == 0 with avg_gain > 0 gives rs = +/-inf, which maps to RSI 100.
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi


def calculate_bollinger_band_width(df: pd.DataFrame, window: int = 20, num_std: float = 2) -> pd.Series:
    """
    Compute the relative width of the Bollinger Bands of the 'Close' column.

    The bands sit `num_std` rolling standard deviations above and below the
    rolling mean; the width is (upper - lower) divided by that rolling mean.

    Parameters:
        df (pd.DataFrame): DataFrame containing at least a 'Close' column.
        window (int): Rolling window period (default: 20).
        num_std (float): Number of standard deviations for the bands (default: 2).

    Returns:
        pd.Series: Bollinger Bands width (NaN until a full window is available).
    """
    rolling_close = df['Close'].rolling(window=window)
    middle = rolling_close.mean()
    offset = num_std * rolling_close.std()
    # Built as upper minus lower so the arithmetic matches the band definition.
    return ((middle + offset) - (middle - offset)) / middle


def calculate_adx(df: pd.DataFrame, window: int = 14) -> pd.Series:
    """
    Calculate the Average Directional Index (ADX) using the 'High', 'Low', and 'Close' columns.

    Steps:
        1. True range per row: max(high - low, |high - prev close|, |low - prev close|).
        2. Directional movement: +DM is the rise in the high when it is positive
           and larger than the drop in the low; -DM is the drop in the low when
           it is positive and larger than the rise in the high; otherwise 0.
        3. +DI / -DI: 100 * rolling sum of DM / rolling sum of true range.
        4. DX = 100 * |+DI - -DI| / (+DI + -DI), defined as 0 when both DIs are 0.
        5. ADX = rolling mean of DX over `window`.

    Simple rolling sums/means are used throughout rather than Wilder's
    exponential smoothing.

    Parameters:
        df (pd.DataFrame): DataFrame containing 'High', 'Low', and 'Close' columns.
        window (int): Look-back period (default: 14).

    Returns:
        pd.Series: ADX values; the first 2 * window - 1 rows are NaN (one window
        to build DX, another to average it).
    """
    high = df['High']
    low = df['Low']
    prev_close = df['Close'].shift()

    # Row-wise max skips NaN, so the first row (no previous close) uses high - low.
    true_range = pd.concat(
        [high - low, (high - prev_close).abs(), (low - prev_close).abs()],
        axis=1,
    ).max(axis=1)

    up_move = high.diff()
    down_move = -low.diff()
    # Only the dominant, positive move counts; a bar never contributes to both.
    plus_dm = up_move.where((up_move > down_move) & (up_move > 0), 0.0)
    minus_dm = down_move.where((down_move > up_move) & (down_move > 0), 0.0)
    # Comparisons against NaN are False and would become 0 above; restore NaN
    # so rows without a previous bar do not count as "no movement".
    undefined = up_move.isna() | down_move.isna()
    plus_dm = plus_dm.mask(undefined)
    minus_dm = minus_dm.mask(undefined)

    tr_sum = true_range.rolling(window).sum()
    plus_di = 100 * (plus_dm.rolling(window).sum() / tr_sum)
    minus_di = 100 * (minus_dm.rolling(window).sum() / tr_sum)

    di_total = plus_di + minus_di
    dx = 100 * (plus_di - minus_di).abs() / di_total
    # 0/0 (no directional movement in the window) means "no trend", not missing
    # data. NaN != 0 is True, so warm-up rows stay NaN.
    dx = dx.where(di_total != 0, 0.0)

    # The averaging step that turns the directional index (DX) into ADX.
    return dx.rolling(window).mean()


def calculate_volume_roc(df: pd.DataFrame, period: int = 20) -> pd.Series:
    """
    Calculate the Volume Rate of Change (ROC) using the 'Volume' column.

    ROC = (volume - volume `period` rows earlier) / (volume `period` rows earlier) * 100.

    Parameters:
        df (pd.DataFrame): DataFrame containing a 'Volume' column.
        period (int): Look-back period (default: 20).

    Returns:
        pd.Series: Volume ROC values (in percent). NaN for the first `period`
        rows and wherever the baseline volume is 0 (the percent change is
        undefined there).
    """
    volume = df['Volume']
    baseline = volume.shift(period)
    # A zero baseline would produce +/-inf (or 0/0); blank it out so the result
    # is NaN. NaN != 0 is True, so existing NaNs pass through untouched.
    baseline = baseline.where(baseline != 0)
    roc = ((volume - baseline) / baseline) * 100
    return roc


def calculate_price_zscore(df: pd.DataFrame, window: int = 20) -> pd.Series:
    """
    Compute the rolling Z-score of the 'Close' column.

    Each close is compared with the mean and standard deviation of the trailing
    `window` closes (the current row included).

    Parameters:
        df (pd.DataFrame): DataFrame containing a 'Close' column.
        window (int): Rolling window period (default: 20).

    Returns:
        pd.Series: Z-score values (NaN until a full window is available).
    """
    close = df['Close']
    trailing = close.rolling(window=window)
    return (close - trailing.mean()) / trailing.std()


def calculate_skewness(df: pd.DataFrame, window: int = 20) -> pd.Series:
    """
    Compute the rolling skewness of the 'Close' column.

    Parameters:
        df (pd.DataFrame): DataFrame containing a 'Close' column.
        window (int): Rolling window period (default: 20).

    Returns:
        pd.Series: Skewness of the trailing `window` closes for each row.
    """
    trailing_close = df['Close'].rolling(window=window)
    return trailing_close.skew()


def calculate_macd(df: pd.DataFrame, fast_period: int = 12, slow_period: int = 26, signal_period: int = 9) -> pd.DataFrame:
    """
    Compute the MACD line, its signal line, and the histogram from the 'Close' column.

    Parameters:
        df (pd.DataFrame): DataFrame containing a 'Close' column.
        fast_period (int): Span of the fast EMA (default: 12).
        slow_period (int): Span of the slow EMA (default: 26).
        signal_period (int): Span of the EMA applied to the MACD line (default: 9).

    Returns:
        pd.DataFrame: Columns 'MACD' (fast EMA - slow EMA), 'MACD_Signal'
        (EMA of the MACD line) and 'MACD_Hist' (MACD - signal).
    """
    def _ema(series: pd.Series, span: int) -> pd.Series:
        # Recursive (adjust=False) exponential moving average.
        return series.ewm(span=span, adjust=False).mean()

    close = df['Close']
    macd_line = _ema(close, fast_period) - _ema(close, slow_period)
    signal_line = _ema(macd_line, signal_period)
    return pd.DataFrame({
        'MACD': macd_line,
        'MACD_Signal': signal_line,
        'MACD_Hist': macd_line - signal_line
    })


def calculate_stochastic(df: pd.DataFrame, k_period: int = 14, d_period: int = 3) -> pd.DataFrame:
    """
    Compute the Stochastic Oscillator from the 'High', 'Low', and 'Close' columns.

    %K places the close within the lowest-low / highest-high range of the
    trailing `k_period` rows, scaled to 0-100; %D is the rolling mean of %K
    over `d_period` rows.

    Parameters:
        df (pd.DataFrame): DataFrame containing 'High', 'Low', and 'Close' columns.
        k_period (int): Look-back period for %K (default: 14).
        d_period (int): Smoothing period for %D (default: 3).

    Returns:
        pd.DataFrame: Columns 'Stochastic_%K' and 'Stochastic_%D'.
    """
    range_floor = df['Low'].rolling(window=k_period).min()
    range_ceiling = df['High'].rolling(window=k_period).max()
    percent_k = 100 * (df['Close'] - range_floor) / (range_ceiling - range_floor)
    return pd.DataFrame({
        'Stochastic_%K': percent_k,
        'Stochastic_%D': percent_k.rolling(window=d_period).mean()
    })


def calculate_std(df: pd.DataFrame, window: int = 20) -> pd.Series:
    """
    Compute the rolling standard deviation of the 'Close' column.

    Parameters:
        df (pd.DataFrame): DataFrame containing a 'Close' column.
        window (int): Rolling window period (default: 20).

    Returns:
        pd.Series: Standard deviation of the trailing `window` closes for each row.
    """
    trailing_close = df['Close'].rolling(window=window)
    return trailing_close.std()


def calculate_fibonacci_retracement(df: pd.DataFrame,
                                    fib_levels: list = [0.236, 0.382, 0.5, 0.618, 0.786]) -> pd.DataFrame:
    """
    Build Fibonacci retracement levels from the overall high and low of the DataFrame.

    The swing high is the maximum of 'High' and the swing low the minimum of
    'Low' over ALL rows of `df`, so every row receives the same level values
    (early rows therefore reflect extremes that occur later in the frame).

    Parameters:
        df (pd.DataFrame): DataFrame containing 'High' and 'Low' columns.
        fib_levels (list): Retracement ratios (default: [0.236, 0.382, 0.5, 0.618, 0.786]).

    Returns:
        pd.DataFrame: One column per ratio, named 'Fib_<int(ratio * 100)>'
        (e.g. 'Fib_23', 'Fib_61'), indexed like `df`, holding
        swing_high - (swing_high - swing_low) * ratio in every row.
    """
    top = df['High'].max()
    bottom = df['Low'].min()
    price_range = top - bottom
    n_rows = len(df)

    level_columns = {}
    for ratio in fib_levels:
        # int() truncates, so 0.618 is labelled 61 rather than 62.
        level_columns[f'Fib_{int(ratio*100)}'] = [top - price_range * ratio] * n_rows
    return pd.DataFrame(level_columns, index=df.index)


In [None]:
def calculate_all_indicators(df: pd.DataFrame) -> pd.DataFrame:
    """
    Assemble a feature DataFrame: the price/volume columns found in `df` plus
    every technical indicator defined in this notebook.

    Parameters:
        df (pd.DataFrame): DataFrame containing the columns the indicator
            functions read ('High', 'Low', 'Close', 'Volume').

    Returns:
        pd.DataFrame: Indexed like `df`. Holds whichever of 'Open', 'High',
        'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits', 'Capital Gains'
        exist in `df`, followed by the indicator columns.
    """
    # Columns copied through unchanged when present (OHLCV first, then extras).
    passthrough = ['Open', 'High', 'Low', 'Close', 'Volume',
                   'Dividends', 'Stock Splits', 'Capital Gains']

    features = pd.DataFrame(index=df.index)
    for name in passthrough:
        if name in df.columns:
            features[name] = df[name]

    # Indicators that come back as a single Series.
    single_series = {
        'RSI_14': calculate_rsi(df, period=14),
        'BB_Width_20': calculate_bollinger_band_width(df, window=20, num_std=2),
        'ADX_14': calculate_adx(df, window=14),
        'Volume_ROC_20': calculate_volume_roc(df, period=20),
        'Price_Z_Score_20': calculate_price_zscore(df, window=20),
        'Skewness_20': calculate_skewness(df, window=20),
    }
    for name, values in single_series.items():
        features[name] = values

    # MACD and Stochastic come back as multi-column frames; join on the index.
    features = features.join(calculate_macd(df)).join(calculate_stochastic(df))

    # Rolling standard deviation of the 'Close' price.
    features['Std_20'] = calculate_std(df, window=20)

    # Fibonacci retracement levels (the same value in every row).
    return features.join(calculate_fibonacci_retracement(df))

In [None]:
def indicator_building(data: pd.DataFrame) -> pd.DataFrame:
    """
    Compute technical indicators and merge them with the original dataset.

    Parameters:
        data (pd.DataFrame): A DataFrame containing the original data. The DataFrame
                             must include the required columns (e.g., 'Open', 'High',
                             'Low', 'Close', 'Volume') for computing the technical indicators.

    Returns:
        pd.DataFrame: A new DataFrame that contains the original data columns
                      (each exactly once) followed by the additional technical
                      indicator features.
    """
    features_df = calculate_all_indicators(data)
    # calculate_all_indicators copies the OHLCV / dividend columns into its
    # result. DataFrame.join raises ValueError on overlapping column names when
    # no suffix is given, so attach only the columns `data` does not have yet.
    new_columns = [col for col in features_df.columns if col not in data.columns]
    return data.join(features_df[new_columns])

In [None]:
# Replace every stored frame with its indicator-enriched version.
# Only values of existing keys are reassigned, so iterating over items() is safe.

# base model data indicators
for ticker, frame in data_base.items():
    data_base[ticker] = indicator_building(frame)

# ensemble model data indicators
for ticker, frame in data_ensemble.items():
    data_ensemble[ticker] = indicator_building(frame)

In [None]:
def normalize_data(data):
    """
    Placeholder for the normalization step.

    The intended behaviour (not implemented yet) is to take a DataFrame built
    from the raw data, normalize it with MinMaxScaler, and return the
    normalized data. For now the input is returned unchanged.
    """
    # TODO: apply MinMaxScaler here.
    return data

In [None]:
def difference(data):
    """
    Placeholder for the differencing step.

    The intended behaviour (not implemented yet) is to take a DataFrame built
    from the raw data, difference it using the shift function, and return the
    differenced data. For now the input is returned unchanged.
    """
    # TODO: difference the data with shift().
    return data