In [163]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import yfinance as yf

In [164]:
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [165]:
df = pd.read_parquet("../data/stock_data.parquet")
events = pd.read_parquet("../data/events.parquet")
dividends = pd.read_parquet("../data/dividends.parquet")
splits = pd.read_parquet("../data/splits.parquet")
# df

In [166]:

def fib_retracement(data: pd.DataFrame, high: str, low: str, target_column: str, levels: list = [0.236, 0.382, 0.618, 1.0]):
    for level in levels:
        data[f"{target_column}_fib_{level}"] = data[high] - (data[high] - data[low]) * level

    return data


def create_lag_features(data: pd.DataFrame, target_column, lag_steps=1):
    if isinstance(lag_steps, int):
        for i in range(1, lag_steps + 1):
            data[f"{target_column}_lag_{i}"] = data[target_column].shift(i)

    if isinstance(lag_steps, list):
        for i in lag_steps:
            data[f"{target_column}_lag_{i}"] = data[target_column].shift(i)

    return data


def simple_moving_average(data: pd.DataFrame, target_column: str | list, window_size: int | list = 3):
    if isinstance(target_column, list) and isinstance(window_size, list):
        for col in target_column:
            for window in window_size:
                data[f"{col}_rolling_mean_{window}"] = data[col].rolling(window=window).mean()

    elif isinstance(target_column, str):
        data[f"{target_column}_rolling_mean_{window_size}"] = data[target_column].rolling(window=window_size).mean()

    elif isinstance(target_column, list):
        for col in target_column:
            data[f"{col}_rolling_mean_{window_size}"] = data[col].rolling(window=window_size).mean()

    return data


def exponential_moving_average(data: pd.DataFrame, target_column: str | list, window_size: int | list = 3):
    if isinstance(target_column, str):
        target_column = [target_column]
    if isinstance(window_size, int):
        window_size = [window_size]

    for col in target_column:
        for window in window_size:
            data[f"{col}_rolling_exp_mean_{window}"] = data[col].ewm(span=window).mean()

    return data


def bollinger_bands(data: pd.DataFrame, target_column: str, window_size: int = 20):
    data[f"{target_column}_rolling_mean_{window_size}"] = data[target_column].rolling(window=window_size).mean()
    data[f"{target_column}_rolling_std_{window_size}"] = data[target_column].rolling(window=window_size).std()
    data[f"{target_column}_bollinger_upper_{window_size}"] = data[f"{target_column}_rolling_mean_{window_size}"] + \
        2 * data[f"{target_column}_rolling_std_{window_size}"]
    data[f"{target_column}_bollinger_lower_{window_size}"] = data[f"{target_column}_rolling_mean_{window_size}"] - \
        2 * data[f"{target_column}_rolling_std_{window_size}"]

    return data


def greater_than(data: pd.DataFrame, src_column: str, target_column: str):
    """
    Check if the src_column is greater then target_column
    Return 1 if src_column is greater then target_column and 0 otherwise
    """
    data[f"{src_column}_greater_than_{target_column}"] = np.where(data[src_column] > data[target_column], 1, 0)

    return data


# def plot_bollinger_bands(data: pd.DataFrame, target_column: str, window_size: int = 20):
#     data[f"{target_column}_rolling_mean_{window_size}"] = data[target_column].rolling(window=window_size).mean()
#     data[f"{target_column}_rolling_std_{window_size}"] = data[target_column].rolling(window=window_size).std()
#     data[f"{target_column}_bollinger_upper_{window_size}"] = data[f"{target_column}_rolling_mean_{window_size}"] + 2 * data[
#         f"{target_column}_rolling_std_{window_size}"]
#     data[f"{target_column}_bollinger_lower_{window_size}"] = data[f"{target_column}_rolling_mean_{window_size}"] - 2 * data[
#         f"{target_column}_rolling_std_{window_size}"]

#     plt.figure(figsize=(12, 6))
#     plt.plot(data[target_column], label="Close Price", color="blue")
#     plt.plot(data[f"{target_column}_rolling_mean_{window_size}"], label="Rolling Mean", color="red")
#     plt.plot(data[f"{target_column}_bollinger_upper_{window_size}"], label="Bollinger Upper", color="green")
#     plt.plot(data[f"{target_column}_bollinger_lower_{window_size}"], label="Bollinger Lower", color="green")
#     plt.title(f"{target_column} Bollinger Bands")
#     plt.legend()
#     plt.show()

#     return data


def average_true_range(data: pd.DataFrame, window_size: int = 14):
    data[f"high_low_{window_size}"] = data["high"] - data["low"]
    data[f"high_close_{window_size}"] = np.abs(data["high"] - data["close"].shift(1))
    data[f"low_close_{window_size}"] = np.abs(data["low"] - data["close"].shift(1))
    data[f"true_range_{window_size}"] = np.max(
        [data[f"high_low_{window_size}"], data[f"high_close_{window_size}"], data[f"low_close_{window_size}"]], axis=0)
    data[f"average_true_range_{window_size}"] = data[f"true_range_{window_size}"].rolling(window=window_size).mean()

    return data


def relative_strength_index(data: pd.DataFrame, column: str = "close", window_size: int = 14):
    delta = data[column].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window_size).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window_size).mean()
    rs = gain / loss
    data[f"rsi_{window_size}"] = 100 - (100 / (1 + rs))

    return data


def macd(data: pd.DataFrame, column: str = "close", short_window: int = 12, long_window: int = 26, signal_window: int = 9):
    data[f"short_ema_{short_window}"] = data[column].ewm(span=short_window, adjust=False).mean()
    data[f"long_ema_{long_window}"] = data[column].ewm(span=long_window, adjust=False).mean()
    data[f"macd_{short_window}_{long_window}"] = data[f"short_ema_{short_window}"] - data[f"long_ema_{long_window}"]
    data[f"signal_{signal_window}"] = data[f"macd_{short_window}_{
        long_window}"].ewm(span=signal_window, adjust=False).mean()
    data[f"macd_hist_{short_window}_{long_window}_{signal_window}"] = data[f"macd_{
        short_window}_{long_window}"] - data[f"signal_{signal_window}"]

    return data


def stochastic_oscillator(data: pd.DataFrame, window_size: int = 14):
    data[f"stochastic_oscillator_{window_size}"] = (data["close"] - data["low"].rolling(window=window_size).min()) / (
        data["high"].rolling(window=window_size).max() - data["low"].rolling(window=window_size).min())

    return data


def williams_r(data: pd.DataFrame, window_size: int = 14):
    data[f"williams_r_{window_size}"] = (data["high"].rolling(window=window_size).max() - data["close"]) / (
        data["high"].rolling(window=window_size).max() - data["low"].rolling(window=window_size).min())

    return data


def money_flow_index(data: pd.DataFrame, window_size: int = 14):
    typical_price = (data["high"] + data["low"] + data["close"]) / 3
    raw_money_flow = typical_price * data["volume"]
    money_flow_ratio = raw_money_flow.rolling(window=window_size).sum() / (typical_price * data["volume"]).rolling(
        window=window_size).sum()
    data[f"money_flow_index_{window_size}"] = 100 - (100 / (1 + money_flow_ratio))

    return data


def on_balance_volume(data: pd.DataFrame, offset: int = 1):
    data[f"on_balance_volume_{offset}"] = np.where(
        data["close"] > data["close"].shift(offset), data["volume"], -data["volume"])
    data[f"on_balance_volume_{offset}"] = data[f"on_balance_volume_{offset}"].cumsum()

    return data


def volume_weighted_average_price(data: pd.DataFrame, window_size: int = 14):
    data[f"volume_weighted_average_price_{window_size}"] = (
        data["close"] * data["volume"]
    ).rolling(window=window_size).sum() / data["volume"].rolling(window=window_size).sum()

    return data


def volitility(data: pd.DataFrame, column: str = "close", window_size: int = 14):
    data[f"volitility_{column}_{window_size}"] = data[column].rolling(window=window_size).std()
    data[f"volitility_pct_change_{window_size}_{column}"] = data[column].pct_change().rolling(window=window_size).std()

    return data


def apply_technical_indicators(data: pd.DataFrame, target_column: str | list = "close", window_size: int | list = 14):

    if isinstance(target_column, str):
        target_column = [target_column]
    if isinstance(window_size, int):
        window_size = [window_size]

    for col in target_column:
        for window in window_size:
            data = volitility(data, col, window)
            data = average_true_range(data, window)
            data = simple_moving_average(data, col, window)
            data = exponential_moving_average(data, col, window)
            data = relative_strength_index(data, col, window)
            data = macd(data, col)
            data = stochastic_oscillator(data, window)
            data = williams_r(data, window)
            data = money_flow_index(data, window)
            data = on_balance_volume(data)
            data = volume_weighted_average_price(data, window)
            data = bollinger_bands(data, col, window)
            # data = fib_retracement(data, "high", "low", col)

    return data


def forward_fill_with_decay(df: pd.DataFrame, column, decay_factor):
    """
    Forward fills missing values in a specified column with a decay factor applied.

    Parameters:
    - df: The DataFrame containing the data.
    - column: The column name (or index) to apply forward fill with decay.
    - decay_factor: The factor by which the previous value decays (0 < decay_factor < 1).

    Returns:
    - The DataFrame with missing values filled with decayed forward fill.
    """
    if not 0 < decay_factor < 1:
        raise ValueError("Decay factor must be between 0 and 1.")

    df = df.copy()  # Avoid modifying the original DataFrame
    df[column] = df[column].astype(float)  # Ensure column is of float type for NaNs

    # Initialize variables
    last_value = np.nan
    decay = 1.0

    for i in range(len(df)):
        if pd.notna(df.at[i, column]):
            # Update the last value and reset decay
            last_value = df.at[i, column]
            decay = 1.0
            df.at[i, column] = last_value * decay
            decay *= decay_factor  # Apply the decay factor for the next value

        elif pd.isna(df.at[i, column]) and pd.notna(last_value):
            # Apply decay to the last value and fill the current cell
            df.at[i, column] = last_value * decay
            decay *= decay_factor  # Apply the decay factor for the next value

    return df


def apply_event(data: pd.DataFrame, event: pd.DataFrame, prefix_name, falloff: bool = True, decay_factor: float = 0.99) -> pd.DataFrame:
    """Join Events df to data df based on Date"""
    event = event.copy(deep=True)

    event["date"] = pd.to_datetime(event["date"])

    # Rename Events columns to event_* to avoid conflicts
    for column in event.columns:
        if column != "date":
            event.rename(columns={column: f"event_{column}"}, inplace=True)

    df = data.merge(event, on="date", how="left").copy(deep=True)

    df["event_decay"] = np.where(pd.notna(df["event_name"]), 1, np.nan)
    df["event_name"] = df["event_name"].ffill().infer_objects(copy=False)
    df["event_value"] = df["event_value"].ffill().infer_objects(copy=False)
    df["event_sentiment"] = df["event_sentiment"].ffill().infer_objects(copy=False)

    df = df.rename(columns={
        "event_name": f"{prefix_name}_event_name",
        "event_value": f"{prefix_name}_event_value",
        "event_sentiment": f"{prefix_name}_event_sentiment",
        "event_decay": f"{prefix_name}_event_decay"
    })

    # forward fill sentiment with decay
    if falloff:
        df = forward_fill_with_decay(df, f"{prefix_name}_event_decay", decay_factor)

    return df


def day_of_week(data: pd.DataFrame) -> pd.DataFrame:
    data["day_of_week"] = data["date"].dt.dayofweek.astype(int)
    return data


def day_of_month(data: pd.DataFrame) -> pd.DataFrame:
    data["day_of_month"] = data["date"].dt.day.astype(int)
    return data


def month(data: pd.DataFrame) -> pd.DataFrame:
    data["month"] = data["date"].dt.month.astype(int)
    return data


def year(data: pd.DataFrame) -> pd.DataFrame:
    data["year"] = data["date"].dt.year.astype(int)
    return data


def week_of_year(data: pd.DataFrame) -> pd.DataFrame:
    data["week_of_year"] = data["date"].dt.isocalendar().week.astype(int)
    return data


def is_not_nan_column_and_default(data: pd.DataFrame, column) -> pd.DataFrame:
    if isinstance(column, list):
        for col in column:
            data[f"is_nan_{col}"] = data[col].notna().astype(int)
            data[col] = data[col].fillna(0)
        return data

    if isinstance(column, str):
        data[f"is_nan_{column}"] = data[column].notna().astype(int)
        data[column] = data[column].fillna(0)
        return data


def future_value(data: pd.DataFrame, column: str, offset: int, drop_recent: bool = True) -> pd.DataFrame:
    data[f"target_{column}"] = data[column].shift(-offset)
    # Drop last X offset rows
    if drop_recent:  # Drop to remove NaNs
        data = drop_recent_with_offset(data, offset)

    return data


def drop_recent_with_offset(data: pd.DataFrame, offset: int) -> pd.DataFrame:
    return data[:-offset]


def percent_change_between_columns(data: pd.DataFrame, column: str, column2: str) -> pd.DataFrame:
    data[f"percent_change_{column}"] = (data[column2] - data[column]) / data[column]
    return data


def percent_change(data: pd.DataFrame, column: str, offset: int | list = 1, drop_recent: bool = True) -> pd.DataFrame:
    if isinstance(offset, int):
        offset = [offset]

    for off in offset:
        data[f"{column}_percent_change_{off}"] = (data[column].shift(-off) - data[column]) / data[column]

    if drop_recent:  # Drop to remove NaNs
        data = drop_recent_with_offset(data, max(offset))

    return data


def up_down(data: pd.DataFrame, column: str, offset: int, drop_recent: bool = True) -> pd.DataFrame:
    """
    Simply tells wheither the price went up or down in the last window_size days
        Returns 1 if the price went up, 0 if the price went down, and 0 if the price stayed the same
    """
    if isinstance(offset, int):
        offset = [offset]

    for off in offset:
        data[f"up_down_{column}_{off}"] = np.where(
            data[column].shift(-off) > data[column],
            1,
            np.where(data[column].shift(-off) < data[column], 0, 0)
        )

    if drop_recent:  # Drop to remove NaNs
        data = drop_recent_with_offset(data, max(offset))

    return data


def create_event_for_crossing(data: pd.DataFrame,  src_column: str, target_column: str, decay_factor: float = 0.98) -> pd.DataFrame:
    """
    Create an event for when the target_column crosses the src_column.
    If the target_column crosses the src_column from below to above, the apply_event

    create_event_for_crossing(df, "close_bollinger_lower_30", "close") will create an event for when the close price crosses the lower bollinger band

    IF you want to create an event for when the close price crosses the upper bollinger band, you can use create_event_for_crossing(df, "close", "close_bollinger_upper_30")

    The src column should be the lower value and the target column should be the higher value

    """
    # Create a Dataframe of the events. The Event schema needs to be in this format [date, name, value, sentiment]
    events = pd.DataFrame(columns=["date", "name", "value", "sentiment"])
    event_name = f"crossed_{src_column}_{target_column}"

    # Loop through the data and check for the crossing. If the target_column crosses the src_column from below to above, add an event
    for i in range(1, len(data)):
        if data[src_column][i] > data[target_column][i] and data[src_column][i-1] < data[target_column][i-1]:

            temp_df = pd.DataFrame([[data["date"][i], "crossed", 1, 1]], columns=events.columns)
            events = pd.concat([events, temp_df], axis=0)

    data = apply_event(data, events, event_name, decay_factor=decay_factor)

    return data


def adx(data: pd.DataFrame, period: int = 14) -> pd.DataFrame:
    """
    Calculate Average Directional Index (ADX)
    """

    data["high_low"] = data["high"] - data["low"]
    data["high_close"] = abs(data["high"] - data["close"].shift())
    data["low_close"] = abs(data["low"] - data["close"].shift())
    data["tr"] = data[["high_low", "high_close", "low_close"]].max(axis=1)

    # Calculate Directional Movement
    data["up_move"] = data["high"] - data["high"].shift()
    data["down_move"] = data["low"].shift() - data["low"]

    data["plus_dm"] = np.where((data["up_move"] > data["down_move"]) & (data["up_move"] > 0), data["up_move"], 0)
    data["minus_dm"] = np.where((data["down_move"] > data["up_move"]) & (data["down_move"] > 0), data["down_move"], 0)

    # Calculate ADX
    data["+di"] = 100 * (data["plus_dm"].ewm(span=period).mean() / data["tr"].ewm(span=period).mean())
    data["-di"] = 100 * (data["minus_dm"].ewm(span=period).mean() / data["tr"].ewm(span=period).mean())
    data["dx"] = 100 * abs(data["+di"] - data["-di"]) / (data["+di"] + data["-di"])
    data["adx"] = data["dx"].ewm(span=period).mean()

    return data

def calculate_hurst_exponent(data: pd.DataFrame, column: str, max_lag: int = 20, window_size: int = 100) -> pd.DataFrame:
    """Calculate the Hurst exponent for a given column in the DataFrame using a rolling window."""
    def hurst_exponent(ts):
        lags = range(2, max_lag)
        tau = [np.std(np.subtract(ts[lag:], ts[:-lag])) for lag in lags]
        hurst = np.polyfit(np.log(lags), np.log(tau), 1)[0] * 2.0
        return hurst

    data[f"hurst_exponent_{column}_lag_{max_lag}_window_{window_size}"] = data[column].rolling(window=window_size).apply(hurst_exponent, raw=True)
    return data






In [167]:
# # df = adx(df, 14)
# df = calculate_hurst_exponent(df, "close")
# df = calculate_hurst_exponent(df, "open")
df

Unnamed: 0,date,open,high,low,close,adj close,volume
0,2004-09-07,0.632143,0.646250,0.629107,0.638571,0.539257,301957600
1,2004-09-08,0.637500,0.653036,0.637143,0.649107,0.548155,343526400
2,2004-09-09,0.644643,0.648214,0.630000,0.637500,0.538353,461339200
3,2004-09-10,0.636786,0.646964,0.633214,0.640536,0.540917,328014400
4,2004-09-13,0.640714,0.644107,0.630714,0.635536,0.536694,281976800
...,...,...,...,...,...,...,...
5029,2024-08-29,230.100006,232.919998,228.880005,229.789993,229.789993,51906300
5030,2024-08-30,230.190002,230.399994,227.479996,229.000000,229.000000,52990800
5031,2024-09-03,228.550003,229.000000,221.169998,222.770004,222.770004,50042900
5032,2024-09-04,221.660004,221.779999,217.479996,220.850006,220.850006,43840200


In [168]:
df = up_down(df, "close", 1)
# df

In [169]:
df = apply_technical_indicators(df, ["open", "close", "high", "low", "volume"], [7, 14, 30])
# df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [170]:
print(list(filter(lambda x : x.startswith("high_low") , list(df.columns))))

['high_low_7', 'high_low_14', 'high_low_30']


In [171]:
df = create_event_for_crossing(df, "close_bollinger_lower_30", "close")
df = create_event_for_crossing(df, "close", "close_bollinger_upper_30")

# df = create_event_for_crossing(df, "signal_9", "macd_12_26")

df


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behav

Unnamed: 0,date,open,high,low,close,adj close,volume,up_down_close_1,volitility_open_7,volitility_pct_change_7_open,...,volume_bollinger_upper_30,volume_bollinger_lower_30,crossed_close_bollinger_lower_30_close_event_name,crossed_close_bollinger_lower_30_close_event_value,crossed_close_bollinger_lower_30_close_event_sentiment,crossed_close_bollinger_lower_30_close_event_decay,crossed_close_close_bollinger_upper_30_event_name,crossed_close_close_bollinger_upper_30_event_value,crossed_close_close_bollinger_upper_30_event_sentiment,crossed_close_close_bollinger_upper_30_event_decay
0,2004-09-07,0.632143,0.646250,0.629107,0.638571,0.539257,301957600,1,,,...,,,,,,,,,,
1,2004-09-08,0.637500,0.653036,0.637143,0.649107,0.548155,343526400,0,,,...,,,,,,,,,,
2,2004-09-09,0.644643,0.648214,0.630000,0.637500,0.538353,461339200,1,,,...,,,,,,,,,,
3,2004-09-10,0.636786,0.646964,0.633214,0.640536,0.540917,328014400,0,,,...,,,,,,,,,,
4,2004-09-13,0.640714,0.644107,0.630714,0.635536,0.536694,281976800,0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5028,2024-08-28,227.919998,229.860001,225.679993,226.490005,226.490005,38052200,1,0.923404,0.006101,...,8.983124e+07,1.042743e+07,crossed,1.0,1.0,0.162311,crossed,1.0,1.0,0.335899
5029,2024-08-29,230.100006,232.919998,228.880005,229.789993,229.789993,51906300,0,1.512537,0.006784,...,8.891226e+07,1.040453e+07,crossed,1.0,1.0,0.159064,crossed,1.0,1.0,0.329181
5030,2024-08-30,230.190002,230.399994,227.479996,229.000000,229.000000,52990800,0,1.821776,0.006832,...,8.905843e+07,1.051431e+07,crossed,1.0,1.0,0.155883,crossed,1.0,1.0,0.322597
5031,2024-09-03,228.550003,229.000000,221.169998,222.770004,222.770004,50042900,0,1.845363,0.007475,...,8.911530e+07,1.058018e+07,crossed,1.0,1.0,0.152765,crossed,1.0,1.0,0.316145


In [172]:
df

Unnamed: 0,date,open,high,low,close,adj close,volume,up_down_close_1,volitility_open_7,volitility_pct_change_7_open,...,volume_bollinger_upper_30,volume_bollinger_lower_30,crossed_close_bollinger_lower_30_close_event_name,crossed_close_bollinger_lower_30_close_event_value,crossed_close_bollinger_lower_30_close_event_sentiment,crossed_close_bollinger_lower_30_close_event_decay,crossed_close_close_bollinger_upper_30_event_name,crossed_close_close_bollinger_upper_30_event_value,crossed_close_close_bollinger_upper_30_event_sentiment,crossed_close_close_bollinger_upper_30_event_decay
0,2004-09-07,0.632143,0.646250,0.629107,0.638571,0.539257,301957600,1,,,...,,,,,,,,,,
1,2004-09-08,0.637500,0.653036,0.637143,0.649107,0.548155,343526400,0,,,...,,,,,,,,,,
2,2004-09-09,0.644643,0.648214,0.630000,0.637500,0.538353,461339200,1,,,...,,,,,,,,,,
3,2004-09-10,0.636786,0.646964,0.633214,0.640536,0.540917,328014400,0,,,...,,,,,,,,,,
4,2004-09-13,0.640714,0.644107,0.630714,0.635536,0.536694,281976800,0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5028,2024-08-28,227.919998,229.860001,225.679993,226.490005,226.490005,38052200,1,0.923404,0.006101,...,8.983124e+07,1.042743e+07,crossed,1.0,1.0,0.162311,crossed,1.0,1.0,0.335899
5029,2024-08-29,230.100006,232.919998,228.880005,229.789993,229.789993,51906300,0,1.512537,0.006784,...,8.891226e+07,1.040453e+07,crossed,1.0,1.0,0.159064,crossed,1.0,1.0,0.329181
5030,2024-08-30,230.190002,230.399994,227.479996,229.000000,229.000000,52990800,0,1.821776,0.006832,...,8.905843e+07,1.051431e+07,crossed,1.0,1.0,0.155883,crossed,1.0,1.0,0.322597
5031,2024-09-03,228.550003,229.000000,221.169998,222.770004,222.770004,50042900,0,1.845363,0.007475,...,8.911530e+07,1.058018e+07,crossed,1.0,1.0,0.152765,crossed,1.0,1.0,0.316145


In [173]:
# %%capture
df = percent_change(df, "close", [1, 7, 14, 30])
# df = future_value(df, "close", 1)
# df.tail(10)

In [174]:
df = apply_event(df, dividends, prefix_name="dividends", falloff=True, decay_factor=0.95)
df = apply_event(df, splits, prefix_name="splits", falloff=True, decay_factor=0.95)
df = is_not_nan_column_and_default(df, ["dividends_event_value", "dividends_event_sentiment"])
df = is_not_nan_column_and_default(df, ["splits_event_value", "splits_event_sentiment"])
df

Unnamed: 0,date,open,high,low,close,adj close,volume,up_down_close_1,volitility_open_7,volitility_pct_change_7_open,...,dividends_event_sentiment,dividends_event_decay,splits_event_value,splits_event_name,splits_event_sentiment,splits_event_decay,is_nan_dividends_event_value,is_nan_dividends_event_sentiment,is_nan_splits_event_value,is_nan_splits_event_sentiment
0,2004-09-07,0.632143,0.646250,0.629107,0.638571,0.539257,301957600,1,,,...,0.0,,0.0,,0.0,,0,0,0,0
1,2004-09-08,0.637500,0.653036,0.637143,0.649107,0.548155,343526400,0,,,...,0.0,,0.0,,0.0,,0,0,0,0
2,2004-09-09,0.644643,0.648214,0.630000,0.637500,0.538353,461339200,1,,,...,0.0,,0.0,,0.0,,0,0,0,0
3,2004-09-10,0.636786,0.646964,0.633214,0.640536,0.540917,328014400,0,,,...,0.0,,0.0,,0.0,,0,0,0,0
4,2004-09-13,0.640714,0.644107,0.630714,0.635536,0.536694,281976800,0,,,...,0.0,,0.0,,0.0,,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4998,2024-07-17,229.449997,231.460007,226.639999,228.880005,228.615387,57345900,0,3.289316,0.017871,...,1.0,0.099440,4.0,splits,1.0,2.008129e-22,1,1,1,1
4999,2024-07-18,230.279999,230.440002,222.270004,224.179993,223.920807,66034600,1,3.006525,0.017869,...,1.0,0.094468,4.0,splits,1.0,1.907722e-22,1,1,1,1
5000,2024-07-19,224.820007,226.800003,223.279999,224.309998,224.050659,49151500,0,3.906402,0.020045,...,1.0,0.089745,4.0,splits,1.0,1.812336e-22,1,1,1,1
5001,2024-07-22,227.009995,227.779999,223.089996,223.960007,223.701080,48201800,1,4.158552,0.020108,...,1.0,0.085258,4.0,splits,1.0,1.721719e-22,1,1,1,1


In [175]:

df = create_lag_features(df, "open", lag_steps=list(range(1, 30)))
df = create_lag_features(df, "close", lag_steps=list(range(1, 30)))
df = create_lag_features(df, "high", lag_steps=list(range(1, 30)))
df = create_lag_features(df, "low", lag_steps=list(range(1, 30)))
df = create_lag_features(df, "volume", lag_steps=list(range(1, 30)))
df = create_lag_features(df, "close_percent_change_1", lag_steps=[1, 2, 3, 4, 5, 6, 7, 10, 14, 21, 30, 60])

df = day_of_week(df)
df = day_of_month(df)
df = month(df)
df = year(df)
df = week_of_year(df)


# df = create_lag_features(df, "event_name", lag_steps=list(range(1, 60)))
df

Unnamed: 0,date,open,high,low,close,adj close,volume,up_down_close_1,volitility_open_7,volitility_pct_change_7_open,...,close_percent_change_1_lag_10,close_percent_change_1_lag_14,close_percent_change_1_lag_21,close_percent_change_1_lag_30,close_percent_change_1_lag_60,day_of_week,day_of_month,month,year,week_of_year
0,2004-09-07,0.632143,0.646250,0.629107,0.638571,0.539257,301957600,1,,,...,,,,,,1,7,9,2004,37
1,2004-09-08,0.637500,0.653036,0.637143,0.649107,0.548155,343526400,0,,,...,,,,,,2,8,9,2004,37
2,2004-09-09,0.644643,0.648214,0.630000,0.637500,0.538353,461339200,1,,,...,,,,,,3,9,9,2004,37
3,2004-09-10,0.636786,0.646964,0.633214,0.640536,0.540917,328014400,0,,,...,,,,,,4,10,9,2004,37
4,2004-09-13,0.640714,0.644107,0.630714,0.635536,0.536694,281976800,0,,,...,,,,,,0,13,9,2004,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4998,2024-07-17,229.449997,231.460007,226.639999,228.880005,228.615387,57345900,0,3.289316,0.017871,...,0.005811,0.003986,0.019671,0.001649,0.005091,2,17,7,2024,29
4999,2024-07-18,230.279999,230.440002,222.270004,224.179993,223.920807,66034600,1,3.006525,0.017869,...,0.021620,-0.016254,-0.010984,0.007821,0.006392,3,18,7,2024,29
5000,2024-07-19,224.820007,226.800003,223.279999,224.309998,224.050659,49151500,0,3.906402,0.020045,...,0.006539,0.029105,-0.021513,-0.007097,0.012702,4,19,7,2024,29
5001,2024-07-22,227.009995,227.779999,223.089996,223.960007,223.701080,48201800,1,4.158552,0.020108,...,0.003775,0.016240,-0.010444,0.012392,0.005147,0,22,7,2024,30


In [176]:
df = macd(df, "close", 50, 200, 9)

In [177]:
df.filter(like= "open")

Unnamed: 0,open,volitility_open_7,volitility_pct_change_7_open,open_rolling_mean_7,open_rolling_exp_mean_7,open_rolling_std_7,open_bollinger_upper_7,open_bollinger_lower_7,volitility_open_14,volitility_pct_change_14_open,...,open_lag_20,open_lag_21,open_lag_22,open_lag_23,open_lag_24,open_lag_25,open_lag_26,open_lag_27,open_lag_28,open_lag_29
0,0.632143,,,,0.632143,,,,,,...,,,,,,,,,,
1,0.637500,,,,0.635204,,,,,,...,,,,,,,,,,
2,0.644643,,,,0.639286,,,,,,...,,,,,,,,,,
3,0.636786,,,,0.638372,,,,,,...,,,,,,,,,,
4,0.640714,,,,0.639139,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4998,229.449997,3.289316,0.017871,231.209998,230.138702,3.289316,237.788631,224.631365,7.897135,0.015991,...,213.369995,213.850006,214.740005,207.369995,193.649994,196.899994,194.649994,195.690002,195.399994,194.639999
4999,230.279999,3.006525,0.017869,231.545713,230.174026,3.006525,237.558762,225.532664,7.463101,0.015779,...,217.589996,213.369995,213.850006,214.740005,207.369995,193.649994,196.899994,194.649994,195.690002,195.399994
5000,224.820007,3.906402,0.020045,230.905714,228.835522,3.906402,238.718517,223.092910,6.895896,0.017563,...,213.929993,217.589996,213.369995,213.850006,214.740005,207.369995,193.649994,196.899994,194.649994,195.690002
5001,227.009995,4.158552,0.020108,230.279999,228.379140,4.158552,238.597102,221.962895,5.518254,0.016635,...,210.389999,213.929993,217.589996,213.369995,213.850006,214.740005,207.369995,193.649994,196.899994,194.649994


In [178]:
df.to_parquet("../data/dataset.parquet")