In [73]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import yfinance as yf

In [74]:
df = pd.read_parquet("../data/stock_data.parquet")
events = pd.read_parquet("../data/events.parquet")
dividends = pd.read_parquet("../data/dividends.parquet")
splits = pd.read_parquet("../data/splits.parquet")
# df

In [75]:
def create_lag_features(data, target_column, lag_steps=1):
    if isinstance(lag_steps, int):
        for i in range(1, lag_steps + 1):
            data[f"{target_column}_lag_{i}"] = data[target_column].shift(i)

    if isinstance(lag_steps, list):
        for i in lag_steps:
            data[f"{target_column}_lag_{i}"] = data[target_column].shift(i)

    return data


def create_rolling_mean(data, target_column, window_size=3):
    data[f"{target_column}_rolling_mean"] = data[target_column].rolling(window=window_size).mean()

    return data


def forward_fill_with_decay(df, column, decay_factor):
    """
    Forward fills missing values in a specified column with a decay factor applied.

    Parameters:
    - df: The DataFrame containing the data.
    - column: The column name (or index) to apply forward fill with decay.
    - decay_factor: The factor by which the previous value decays (0 < decay_factor < 1).

    Returns:
    - The DataFrame with missing values filled with decayed forward fill.
    """
    if not 0 < decay_factor < 1:
        raise ValueError("Decay factor must be between 0 and 1.")

    df = df.copy()  # Avoid modifying the original DataFrame
    df[column] = df[column].astype(float)  # Ensure column is of float type for NaNs

    # Initialize variables
    last_value = np.nan
    decay = 1.0

    for i in range(len(df)):
        if pd.notna(df.at[i, column]):
            # Update the last value and reset decay
            last_value = df.at[i, column]
            decay = 1.0
            df.at[i, column] = last_value * decay
            decay *= decay_factor  # Apply the decay factor for the next value

        elif pd.isna(df.at[i, column]) and pd.notna(last_value):
            # Apply decay to the last value and fill the current cell
            df.at[i, column] = last_value * decay
            decay *= decay_factor  # Apply the decay factor for the next value

    return df


def apply_event(data: pd.DataFrame, event: pd.DataFrame, prefix_name, falloff: bool = True, decay_factor: float = 0.99) -> pd.DataFrame:
    """Join Events df to data df based on Date"""
    event = event.copy(deep=True)

    event["date"] = pd.to_datetime(event["date"])

    # Rename Events columns to event_* to avoid conflicts
    for column in event.columns:
        if column != "date":
            event.rename(columns={column: f"event_{column}"}, inplace=True)

    df = data.merge(event, on="date", how="left").copy(deep=True)

    df["event_decay"] = np.where(pd.notna(df["event_name"]), 1, np.nan)
    df["event_name"] = df["event_name"].ffill()
    df["event_value"] = df["event_value"].ffill()
    df["event_sentiment"] = df["event_sentiment"].ffill()

    df = df.rename(columns={
        "event_name": f"{prefix_name}_event_name",
        "event_value": f"{prefix_name}_event_value",
        "event_sentiment": f"{prefix_name}_event_sentiment",
        "event_decay": f"{prefix_name}_event_decay"
    })

    # forward fill sentiment with decay
    if falloff:
        df = forward_fill_with_decay(df, f"{prefix_name}_event_decay", decay_factor)

    return df


def day_of_week(data: pd.DataFrame) -> pd.DataFrame:
    data["day_of_week"] = data["date"].dt.dayofweek
    return data


def day_of_month(data: pd.DataFrame) -> pd.DataFrame:
    data["day_of_month"] = data["date"].dt.day
    return data


def month(data: pd.DataFrame) -> pd.DataFrame:
    data["month"] = data["date"].dt.month
    return data


def year(data: pd.DataFrame) -> pd.DataFrame:
    data["year"] = data["date"].dt.year
    return data


def week_of_year(data: pd.DataFrame) -> pd.DataFrame:
    data["week_of_year"] = data["date"].dt.isocalendar().week
    return data


def is_not_nan_column_and_default(data: pd.DataFrame, column) -> pd.DataFrame:
    if isinstance(column, list):
        for col in column:
            data[f"is_nan_{col}"] = data[col].notna().astype(int)
            data[col] = data[col].fillna(0)
        return data

    if isinstance(column, str):
        data[f"is_nan_{column}"] = data[column].notna().astype(int)
        data[column] = data[column].fillna(0)
        return data

In [76]:
df = apply_event(df, dividends, prefix_name="dividends", falloff=True, decay_factor=0.98)
df = apply_event(df, splits, prefix_name="splits", falloff=True, decay_factor=0.98)
df

Unnamed: 0,date,open,high,low,close,adj close,volume,dividends_event_value,dividends_event_name,dividends_event_sentiment,dividends_event_decay
0,2014-08-06,23.687500,23.870001,23.677500,23.740000,20.998871,154232000,,,,
1,2014-08-07,23.732500,23.987499,23.525000,23.620001,20.996645,186844000,0.1175,dividend,1.0,1.000000
2,2014-08-08,23.565001,23.705000,23.320000,23.684999,21.054426,167460000,0.1175,dividend,1.0,0.980000
3,2014-08-11,23.817499,24.020000,23.709999,23.997499,21.332214,146340000,0.1175,dividend,1.0,0.960400
4,2014-08-12,24.010000,24.219999,23.902500,23.992500,21.327770,135180000,0.1175,dividend,1.0,0.941192
...,...,...,...,...,...,...,...,...,...,...,...
2512,2024-07-31,221.440002,223.820007,220.630005,222.080002,222.080002,50036300,0.2500,dividend,1.0,0.329181
2513,2024-08-01,224.369995,224.479996,217.020004,218.360001,218.360001,62501000,0.2500,dividend,1.0,0.322597
2514,2024-08-02,219.149994,225.600006,217.710007,219.860001,219.860001,105568600,0.2500,dividend,1.0,0.316145
2515,2024-08-05,199.089996,213.500000,196.000000,209.270004,209.270004,119548600,0.2500,dividend,1.0,0.309822


In [77]:
df = is_not_nan_column_and_default(df, ["dividends_event_value", "dividends_event_sentiment"])
df = is_not_nan_column_and_default(df, ["splits_event_value", "splits_event_sentiment"])
# df = is_not_nan_column_and_default(df, "dividends_event_value")
df

Unnamed: 0,date,open,high,low,close,adj close,volume,dividends_event_value,dividends_event_name,dividends_event_sentiment,dividends_event_decay,is_nan_dividends_event_value,is_nan_dividends_event_sentiment
0,2014-08-06,23.687500,23.870001,23.677500,23.740000,20.998871,154232000,0.0000,,0.0,,0,0
1,2014-08-07,23.732500,23.987499,23.525000,23.620001,20.996645,186844000,0.1175,dividend,1.0,1.000000,1,1
2,2014-08-08,23.565001,23.705000,23.320000,23.684999,21.054426,167460000,0.1175,dividend,1.0,0.980000,1,1
3,2014-08-11,23.817499,24.020000,23.709999,23.997499,21.332214,146340000,0.1175,dividend,1.0,0.960400,1,1
4,2014-08-12,24.010000,24.219999,23.902500,23.992500,21.327770,135180000,0.1175,dividend,1.0,0.941192,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2512,2024-07-31,221.440002,223.820007,220.630005,222.080002,222.080002,50036300,0.2500,dividend,1.0,0.329181,1,1
2513,2024-08-01,224.369995,224.479996,217.020004,218.360001,218.360001,62501000,0.2500,dividend,1.0,0.322597,1,1
2514,2024-08-02,219.149994,225.600006,217.710007,219.860001,219.860001,105568600,0.2500,dividend,1.0,0.316145,1,1
2515,2024-08-05,199.089996,213.500000,196.000000,209.270004,209.270004,119548600,0.2500,dividend,1.0,0.309822,1,1


In [78]:
# create_lag_features(df, "open", lag_steps=list(range(1, 60)))
# create_lag_features(df, "high", lag_steps=list(range(1, 60)))
# create_lag_features(df, "low", lag_steps=list(range(1, 60)))
# create_lag_features(df, "volume", lag_steps=list(range(1, 60)))

# df = day_of_week(df)
# df = day_of_month(df)
# df = month(df)
# df = year(df)
# df = week_of_year(df)


# create_lag_features(df, "event_name", lag_steps=list(range(1, 60)))
# create_lag_features(df, "high", lag_steps=list(range(1, 60)))
# create_lag_features(df, "low", lag_steps=list(range(1, 60)))

In [79]:
df

Unnamed: 0,date,open,high,low,close,adj close,volume,dividends_event_value,dividends_event_name,dividends_event_sentiment,dividends_event_decay,is_nan_dividends_event_value,is_nan_dividends_event_sentiment
0,2014-08-06,23.687500,23.870001,23.677500,23.740000,20.998871,154232000,0.0000,,0.0,,0,0
1,2014-08-07,23.732500,23.987499,23.525000,23.620001,20.996645,186844000,0.1175,dividend,1.0,1.000000,1,1
2,2014-08-08,23.565001,23.705000,23.320000,23.684999,21.054426,167460000,0.1175,dividend,1.0,0.980000,1,1
3,2014-08-11,23.817499,24.020000,23.709999,23.997499,21.332214,146340000,0.1175,dividend,1.0,0.960400,1,1
4,2014-08-12,24.010000,24.219999,23.902500,23.992500,21.327770,135180000,0.1175,dividend,1.0,0.941192,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2512,2024-07-31,221.440002,223.820007,220.630005,222.080002,222.080002,50036300,0.2500,dividend,1.0,0.329181,1,1
2513,2024-08-01,224.369995,224.479996,217.020004,218.360001,218.360001,62501000,0.2500,dividend,1.0,0.322597,1,1
2514,2024-08-02,219.149994,225.600006,217.710007,219.860001,219.860001,105568600,0.2500,dividend,1.0,0.316145,1,1
2515,2024-08-05,199.089996,213.500000,196.000000,209.270004,209.270004,119548600,0.2500,dividend,1.0,0.309822,1,1


In [80]:
df.to_parquet("..\data\dataset.parquet")

  df.to_parquet("..\data\dataset.parquet")
