In [27]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import yfinance as yf

In [28]:
# Load the price history from parquet. Per the output rendered below, the frame is
# indexed by date and carries open/high/low/close/adj close/volume columns.
df = pd.read_parquet("../data/stock_data.parquet")

In [29]:
def create_lag_features(data, target_column, lag_steps=1):
    """
    Adds lagged copies of a column to a DataFrame as new columns.

    Every lag ``k`` produces a column named ``f"{target_column}_lag_{k}"`` holding
    ``data[target_column].shift(k)``.

    Parameters:
    - data: The DataFrame to extend. It is modified in place and also returned.
    - target_column: The name of the column to lag.
    - lag_steps: Either an integer ``n`` (lags 1..n are created) or an iterable of
      integer lags (list, tuple, range, NumPy array, ...).

    Returns:
    - The same ``data`` object, with the lag columns added.

    Raises:
    - TypeError: If ``lag_steps`` is neither an integer nor an iterable of lags.
    """
    if isinstance(lag_steps, (int, np.integer)):
        # An integer n means "every lag from 1 up to and including n".
        steps = range(1, int(lag_steps) + 1)
    elif isinstance(lag_steps, (str, bytes)):
        # Strings are iterable but never a meaningful collection of lags.
        raise TypeError("lag_steps must be an integer or an iterable of integers.")
    else:
        try:
            steps = list(lag_steps)
        except TypeError:
            raise TypeError(
                "lag_steps must be an integer or an iterable of integers."
            ) from None

    # Columns are written onto the caller's frame on purpose: existing callers
    # rely on the in-place update and ignore the return value.
    for step in steps:
        data[f"{target_column}_lag_{step}"] = data[target_column].shift(step)

    return data


def create_rolling_mean(data, target_column, window_size=3):
    """
    Adds the rolling mean of a column to a DataFrame.

    Parameters:
    - data: The DataFrame to extend. It is modified in place and also returned.
    - target_column: The name of the column to average.
    - window_size: The number of rows in each rolling window.

    Returns:
    - The same ``data`` object, with ``f"{target_column}_rolling_mean"`` added.
    """
    output_name = f"{target_column}_rolling_mean"
    windowed = data[target_column].rolling(window=window_size)
    data[output_name] = windowed.mean()
    return data


def forward_fill_with_decay(df, column, decay_factor):
    """
    Forward fills missing values in a specified column with a decay factor applied.

    A non-missing value is kept as is. Each missing value that follows it is
    replaced by ``value * decay_factor ** k``, where ``k`` is the number of rows
    since that value. Missing values before the first non-missing value stay NaN.

    Rows are processed by position, so the DataFrame may carry any index
    (dates, non-contiguous integers, duplicates, ...).

    Parameters:
    - df: The DataFrame containing the data.
    - column: The column name (or index) to apply forward fill with decay.
    - decay_factor: The factor by which the previous value decays (0 < decay_factor < 1).

    Returns:
    - A copy of the DataFrame with missing values filled with decayed forward fill.

    Raises:
    - ValueError: If ``decay_factor`` is not strictly between 0 and 1.
    """
    if not 0 < decay_factor < 1:
        raise ValueError("Decay factor must be between 0 and 1.")

    df = df.copy()  # Avoid modifying the original DataFrame
    values = df[column].astype(float).to_numpy()  # Float so NaN marks the gaps

    observed = ~np.isnan(values)
    positions = np.arange(len(values))

    # Position of the most recent observed value at or before each row
    # (-1 while nothing has been observed yet).
    last_seen = np.maximum.accumulate(np.where(observed, positions, -1))
    has_source = last_seen >= 0

    # Rows elapsed since that observation; 0 on the observed rows themselves,
    # which therefore keep their value unchanged.
    age = positions - last_seen

    filled = np.full(len(values), np.nan)
    filled[has_source] = values[last_seen[has_source]] * np.power(
        float(decay_factor), age[has_source]
    )

    df[column] = filled
    return df


def apply_event(data: pd.DataFrame, event: pd.DataFrame, falloff: bool = True, decay_factor: float = 0.99) -> pd.DataFrame:
    """
    Left-joins an events table onto ``data`` by date and forward fills the events.

    Every non-date column of ``event`` is added under an ``event_`` prefix. The
    ``event_name`` and ``event_sentiment`` columns are then forward filled, so each
    row carries the most recent event at or before its date. An ``event_decay``
    column is set to 1 on rows where an event matched and NaN elsewhere; with
    ``falloff`` enabled it is passed through ``forward_fill_with_decay``.

    The caller's ``event`` frame is not modified, so the function can be called
    repeatedly with the same events table.

    Parameters:
    - data: The DataFrame to enrich; joined on ``"date"``.
    - event: The events table. Must contain ``date``, ``name`` and ``sentiment``
      columns; ``date`` is parsed with ``pd.to_datetime``.
    - falloff: Whether to apply the decayed forward fill to ``event_decay``.
    - decay_factor: The decay factor forwarded to ``forward_fill_with_decay``.

    Returns:
    - A new DataFrame holding ``data`` plus the ``event_*`` columns.
    """
    # Work on a private copy: renaming the caller's frame in place would make a
    # second call see "event_event_*" columns and fail.
    prefixed = event.copy()
    prefixed["date"] = pd.to_datetime(prefixed["date"])

    # Rename Events columns to event_* to avoid conflicts
    prefixed = prefixed.rename(
        columns={name: f"event_{name}" for name in prefixed.columns if name != "date"}
    )

    data = data.merge(prefixed, on="date", how="left")

    # Mark the rows where an event matched *before* forward filling hides them.
    data["event_decay"] = np.where(pd.notna(data["event_name"]), 1, np.nan)
    data["event_name"] = data["event_name"].ffill()
    data["event_sentiment"] = data["event_sentiment"].ffill()

    # forward fill the event marker with decay
    if falloff:
        data = forward_fill_with_decay(data, "event_decay", decay_factor)

    return data

In [30]:
# Add open_lag_1 .. open_lag_59 (range(1, 60) excludes 60). create_lag_features
# writes the columns onto df in place; the returned frame is only displayed here.
create_lag_features(df, "open", lag_steps=list(range(1, 60)))

Unnamed: 0_level_0,open,high,low,close,adj close,volume,open_lag_1,open_lag_2,open_lag_3,open_lag_4,...,open_lag_50,open_lag_51,open_lag_52,open_lag_53,open_lag_54,open_lag_55,open_lag_56,open_lag_57,open_lag_58,open_lag_59
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-08-06,23.687500,23.870001,23.677500,23.740000,20.998871,154232000,,,,,...,,,,,,,,,,
2014-08-07,23.732500,23.987499,23.525000,23.620001,20.996645,186844000,23.687500,,,,...,,,,,,,,,,
2014-08-08,23.565001,23.705000,23.320000,23.684999,21.054426,167460000,23.732500,23.687500,,,...,,,,,,,,,,
2014-08-11,23.817499,24.020000,23.709999,23.997499,21.332224,146340000,23.565001,23.732500,23.687500,,...,,,,,,,,,,
2014-08-12,24.010000,24.219999,23.902500,23.992500,21.327776,135180000,23.817499,23.565001,23.732500,23.687500,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-31,221.440002,223.820007,220.630005,222.080002,222.080002,50036300,219.190002,216.960007,218.699997,218.929993,...,189.509995,190.470001,187.910004,187.509995,185.440002,184.899994,182.559998,182.850006,183.449997,182.350006
2024-08-01,224.369995,224.479996,217.020004,218.360001,218.360001,62501000,221.440002,219.190002,216.960007,218.699997,...,189.330002,189.509995,190.470001,187.910004,187.509995,185.440002,184.899994,182.559998,182.850006,183.449997
2024-08-02,219.149994,225.600006,217.710007,219.860001,219.860001,105568600,224.369995,221.440002,219.190002,216.960007,...,191.089996,189.330002,189.509995,190.470001,187.910004,187.509995,185.440002,184.899994,182.559998,182.850006
2024-08-05,199.089996,213.500000,196.000000,209.270004,209.270004,119548600,219.149994,224.369995,221.440002,219.190002,...,192.270004,191.089996,189.330002,189.509995,190.470001,187.910004,187.509995,185.440002,184.899994,182.559998


In [31]:
# apple= yf.Ticker("aapl")
# apple.actions["Stock Splits"] != 0
# apple.balance_sheet
# apple.dividends
# apple.cashflow
# apple.major_holders
# apple.news
# apple.recommendations
# apple.recommendations_summary
# apple.history(period="1y", interval="1h")

In [32]:
# Load the events table. apply_event reads its "date" column and, once prefixed
# with "event_", its "name" and "sentiment" columns.
events = pd.read_parquet("../data/events.parquet")

In [33]:
# Left-join the events onto the price rows by date and forward fill them, decaying
# the event marker by 0.98 per row.
# NOTE: this rebinds df to the merged result, so re-running the cell without
# reloading df would merge onto a frame that already has the event_* columns.
df = apply_event(df, events, falloff=True, decay_factor=0.98)

In [34]:
# Display the merged frame (the rendered output below is truncated in rows and columns).
df

Unnamed: 0,date,open,high,low,close,adj close,volume,open_lag_1,open_lag_2,open_lag_3,...,open_lag_53,open_lag_54,open_lag_55,open_lag_56,open_lag_57,open_lag_58,open_lag_59,event_name,event_sentiment,event_decay
0,2014-08-06,23.687500,23.870001,23.677500,23.740000,20.998871,154232000,,,,...,,,,,,,,,,
1,2014-08-07,23.732500,23.987499,23.525000,23.620001,20.996645,186844000,23.687500,,,...,,,,,,,,,,
2,2014-08-08,23.565001,23.705000,23.320000,23.684999,21.054426,167460000,23.732500,23.687500,,...,,,,,,,,,,
3,2014-08-11,23.817499,24.020000,23.709999,23.997499,21.332224,146340000,23.565001,23.732500,23.687500,...,,,,,,,,,,
4,2014-08-12,24.010000,24.219999,23.902500,23.992500,21.327776,135180000,23.817499,23.565001,23.732500,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2512,2024-07-31,221.440002,223.820007,220.630005,222.080002,222.080002,50036300,219.190002,216.960007,218.699997,...,187.509995,185.440002,184.899994,182.559998,182.850006,183.449997,182.350006,ALLTIMEHIGH,50.0,0.903921
2513,2024-08-01,224.369995,224.479996,217.020004,218.360001,218.360001,62501000,221.440002,219.190002,216.960007,...,187.910004,187.509995,185.440002,184.899994,182.559998,182.850006,183.449997,ALLTIMEHIGH,50.0,0.885842
2514,2024-08-02,219.149994,225.600006,217.710007,219.860001,219.860001,105568600,224.369995,221.440002,219.190002,...,190.470001,187.910004,187.509995,185.440002,184.899994,182.559998,182.850006,ALLTIMEHIGH,50.0,0.868126
2515,2024-08-05,199.089996,213.500000,196.000000,209.270004,209.270004,119548600,219.149994,224.369995,221.440002,...,189.509995,190.470001,187.910004,187.509995,185.440002,184.899994,182.559998,ALLTIMEHIGH,50.0,0.850763


In [35]:
# Persist the feature set next to the input files. Forward slashes are used because
# "\d" is an invalid escape sequence in a normal string literal, and a backslash
# path only resolves to ../data/ on Windows; "/" works on every platform.
df.to_parquet("../data/dataset.parquet")

  df.to_parquet("..\data\dataset.parquet")
