In [1]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta

ticker_symbol = "AAPL"
ticker = yf.Ticker(ticker_symbol)

end_date = datetime.today()
start_date = end_date - timedelta(days= 3 * 365)

start_date_str = start_date.strftime("%Y-%m-%d")
end_date_str = end_date.strftime("%Y-%m-%d")

history: pd.DataFrame = ticker.history(start=start_date_str, end=end_date_str)

history = history[['Open', 'High', 'Low', 'Close', 'Volume']]

history


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-02-07 00:00:00-05:00,170.290743,171.364538,168.409128,169.108582,77251200
2022-02-08 00:00:00-05:00,169.177516,172.743721,168.881972,172.231445,74829200
2022-02-09 00:00:00-05:00,173.433350,174.024423,172.300434,173.659927,71285000
2022-02-10 00:00:00-05:00,171.551702,172.871782,169.000202,169.561722,90865900
2022-02-11 00:00:00-05:00,169.768641,170.507494,165.542396,166.133484,98670700
...,...,...,...,...,...
2025-01-28 00:00:00-05:00,230.850006,240.190002,230.809998,238.259995,75707600
2025-01-29 00:00:00-05:00,234.119995,239.860001,234.009995,239.360001,45486100
2025-01-30 00:00:00-05:00,238.669998,240.789993,237.210007,237.589996,55658300
2025-01-31 00:00:00-05:00,247.190002,247.190002,233.440002,236.000000,101075100


In [2]:
def add_features(data: pd.DataFrame,
                 window_short: int = 5,
                 window_long: int = 10,
                 rsi_period: int = 14,
                 mfi_period: int = 14) -> pd.DataFrame:
    """
    Adds various technical indicators to the stock data.
    
    Features Added:
    - Close_lag1: Previous day's Close price.
    - SMA_5: 5-day Simple Moving Average.
    - SMA_10: 10-day Simple Moving Average.
    - Daily_Return: Percentage change of the Close price.
    - Volatility_5: Rolling standard deviation of Daily_Return over 5 days.
    - Range_HL: Difference between the day's High and Low.
    - RSI: Relative Strength Index computed over rsi_period days.
    - MFI: Money Flow Index computed over mfi_period days.
    """
    df = data.copy()
    
    # If the index is Date and not a column, reset the index.
    if 'Date' not in df.columns:
        df = df.reset_index()
    
    # -----------------------------
    # Basic features (from earlier)
    # -----------------------------
    # 1. Previous day's Close price
    df['Close_lag1'] = df['Close'].shift(1)
    
    # 2. Simple Moving Averages
    df['SMA_5'] = df['Close'].rolling(window=window_short).mean()
    df['SMA_10'] = df['Close'].rolling(window=window_long).mean()
    
    # 3. Daily Return (% change)
    df['Daily_Return'] = df['Close'].pct_change()
    
    # 4. Rolling Volatility: Standard deviation of Daily_Return over short window
    df['Volatility_5'] = df['Daily_Return'].rolling(window=window_short).std()
    
    # 5. High-Low Range
    df['Range_HL'] = df['High'] - df['Low']
    
    # -----------------------------
    # RSI Calculation (Relative Strength Index)
    # -----------------------------
    delta = df['Close'].diff()
    gain = delta.clip(lower=0)
    loss = (-delta).clip(lower=0)
    
    # Calculate average gain and loss using a rolling window
    avg_gain = gain.rolling(window=rsi_period, min_periods=rsi_period).mean()
    avg_loss = loss.rolling(window=rsi_period, min_periods=rsi_period).mean()
    
    # Relative Strength (RS) and then RSI
    rs = avg_gain / avg_loss
    df['RSI'] = 100 - (100 / (1 + rs))
    
    # -----------------------------
    # MFI Calculation (Money Flow Index)
    # -----------------------------
    # Typical Price (TP)
    tp = (df['High'] + df['Low'] + df['Close']) / 3.0
    # Raw Money Flow
    mf = tp * df['Volume']
    
    # Determine positive and negative money flow
    tp_diff = tp.diff()
    pos_mf = mf.where(tp_diff > 0, 0)
    neg_mf = mf.where(tp_diff < 0, 0).abs()
    
    # Rolling sums for positive and negative money flow
    pos_mf_sum = pos_mf.rolling(window=mfi_period, min_periods=mfi_period).sum()
    neg_mf_sum = neg_mf.rolling(window=mfi_period, min_periods=mfi_period).sum()
    
    # MFI calculation
    df['MFI'] = 100 - (100 / (1 + (pos_mf_sum / neg_mf_sum)))
    
    return df

In [3]:
history_with_features = add_features(history)

history_with_features.dropna(inplace=True)
history_with_features

Unnamed: 0,Date,Open,High,Low,Close,Volume,Close_lag1,SMA_5,SMA_10,Daily_Return,Volatility_5,Range_HL,RSI,MFI
14,2022-02-28 00:00:00-05:00,160.636377,162.961300,160.015736,162.665756,95056600,162.399811,160.991061,164.271565,0.001638,0.018738,2.945564,40.432923,33.114819
15,2022-03-01 00:00:00-05:00,162.252028,164.123797,159.562608,160.774323,83474400,162.665756,160.770392,163.712006,-0.011628,0.017645,4.561188,32.341290,26.593508
16,2022-03-02 00:00:00-05:00,161.946606,164.872463,160.528007,164.084351,79724800,160.774323,162.049088,163.098264,0.020588,0.013078,4.344456,36.050371,26.832261
17,2022-03-03 00:00:00-05:00,165.966020,166.399483,163.089422,163.759308,76678400,164.084351,162.736710,162.475659,-0.001981,0.012655,3.310061,40.503045,33.601292
18,2022-03-04 00:00:00-05:00,162.045139,163.089381,159.690663,160.744751,83737200,163.759308,162.405698,161.913142,-0.018408,0.014878,3.398719,41.059052,34.122700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,2025-01-28 00:00:00-05:00,230.850006,240.190002,230.809998,238.259995,75707600,229.860001,227.678000,229.041998,0.036544,0.018964,9.380005,43.722049,43.539289
746,2025-01-29 00:00:00-05:00,234.119995,239.860001,234.009995,239.360001,45486100,238.259995,230.784000,229.649998,0.004617,0.019048,5.850006,47.259082,48.792798
747,2025-01-30 00:00:00-05:00,238.669998,240.789993,237.210007,237.589996,55658300,239.360001,233.569998,229.621999,-0.007395,0.020480,3.579987,45.203679,54.340699
748,2025-01-31 00:00:00-05:00,247.190002,247.190002,233.440002,236.000000,101075100,237.589996,236.213998,230.395999,-0.006692,0.021056,13.750000,49.132824,63.275303


In [4]:
import optuna
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

df = history_with_features.copy()
df['Target'] = df['Close'].shift(-1)

df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

feature_cols = [
    'Open', 'High', 'Low', 'Close', 'Volume',
    'Close_lag1', 'SMA_5', 'SMA_10', 'Daily_Return',
    'Volatility_5', 'Range_HL', 'RSI', 'MFI'
]

X = df[feature_cols]
y = df['Target']

tscv = TimeSeriesSplit(n_splits=5)

def objective(trial):
    """
    Objective function for Optuna to optimize CatBoostRegressor hyperparameters.
    """
    iterations = trial.suggest_int("iterations", 100, 1000)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.1)
    depth = trial.suggest_int("depth", 4, 8)

    model = CatBoostRegressor(
        iterations=iterations,
        learning_rate=learning_rate,
        depth=depth,
        random_seed=42,
        verbose=0
    )

    maes = []
    for train_idx, val_idx in tscv.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        
        mae = mean_absolute_error(y_val, y_pred)
        maes.append(mae)
    
    return np.mean(maes)  # Minimize MAE

study = optuna.create_study(direction="minimize")  
study.optimize(objective, n_trials=200)

best_params = study.best_params
print("Best Hyperparameters:", best_params)
print("Best CV Score (MAE):", study.best_value)


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-02-04 09:06:23,152] A new study created in memory with name: no-name-c150100e-3f8f-4e38-8b01-de2be9c9d9b4
[I 2025-02-04 09:06:28,357] Trial 0 finished with value: 7.842672296558737 and parameters: {'iterations': 574, 'learning_rate': 0.08561984121141111, 'depth': 7}. Best is trial 0 with value: 7.842672296558737.
[I 2025-02-04 09:06:31,883] Trial 1 finished with value: 6.875676781994234 and parameters: {'iterations': 852, 'learning_rate': 0.060190002305152646, 'depth': 5}. Best is trial 1 with value: 6.875676781994234.
[I 2025-02-04 09:06:35,137] Trial 2 finished with value: 6.948343634153128 and parameters: {'iterations': 797, 'learning_rate': 0.03995038793603464, 'depth': 5}. Best is trial 1 with value: 6.875676781994234.
[I 2025-02-04 09:06:37,160] Trial 3 finished with value: 7.979062062300213 and parameters: {'iterations': 121, 'learning_rate': 0.06528901438134475, 'depth': 8}. Best is trial 1 with value: 6.875676781994234.

Best Hyperparameters: {'iterations': 438, 'learning_rate': 0.023732718931410737, 'depth': 4}
Best CV Score (MAE): 6.406389627486642


In [6]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

train_size = int(0.8 * len(X))
X_train = X.iloc[:train_size]
y_train = y.iloc[:train_size]
X_test  = X.iloc[train_size:]
y_test  = y.iloc[train_size:]

print("Using Best Hyperparameters for Forecasting:")
print(best_params)

best_model = CatBoostRegressor(**best_params, random_seed=42, verbose=0)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)


mae_value  = mean_absolute_error(y_test, y_pred)
mse_value  = mean_squared_error(y_test, y_pred)
rmse_value = np.sqrt(mse_value)
print(f"Test MAE:  {mae_value:.4f}")
print(f"Test RMSE: {rmse_value:.4f}")



Using Best Hyperparameters for Forecasting:
{'iterations': 438, 'learning_rate': 0.023732718931410737, 'depth': 4}
Test MAE:  21.0467
Test RMSE: 23.1973


In [7]:
import plotly.graph_objects as go

test_dates = df['Date'].iloc[train_size:].reset_index(drop=True)

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=test_dates,
    y=y_test,
    mode='lines',
    name='Actual'
))

fig.add_trace(go.Scatter(
    x=test_dates,
    y=y_pred,
    mode='lines',
    name='Forecast'
))

fig.update_layout(
    title="Actual vs Forecasted Next Day Close Price",
    xaxis_title="Date",
    yaxis_title="Price",
    hovermode="x unified"
)

fig.show()