In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.cm as cm



### Data preprocessing and feature engineering functions

In [4]:
def load_data(filepath):
    """
    Read a CSV file into a DataFrame and parse its 'date' column as datetimes.

    Args:
        filepath (str): Location of the CSV file to read.

    Returns:
        pd.DataFrame: The contents of the file, with the 'date' column holding
        datetime values instead of the raw text read from disk.

    Raises:
        KeyError: If the CSV has no 'date' column.
    """
    # assign() replaces the existing 'date' column in its original position.
    return pd.read_csv(filepath).assign(
        date=lambda frame: pd.to_datetime(frame['date'])
    )

def process_date_column(df, date_column):
    """
    Parse a column as datetimes and add its day, month, and year as new columns.

    The DataFrame passed in is modified in place and is also returned.

    Args:
    df (pd.DataFrame): DataFrame holding the column to parse.
    date_column (str): Name of the column to parse and split into components.

    Returns:
    pd.DataFrame: The same DataFrame, with `date_column` converted to datetime and
    three added columns named `<date_column>_day`, `<date_column>_month`, and
    `<date_column>_year`.
    """
    timestamps = pd.to_datetime(df[date_column])
    df[date_column] = timestamps

    # One new column per calendar component, added in day, month, year order
    for component in ('day', 'month', 'year'):
        df[f'{date_column}_{component}'] = getattr(timestamps.dt, component)

    return df


def create_lagged_features(df, column_name, lags):
    """
    Add one shifted copy of a column per requested lag.

    Each lag is a row offset: the new column `<column_name>_lag_<lag>` holds the
    value of `column_name` from `lag` rows earlier, with the rows that have no
    earlier value left as NaN. The DataFrame is modified in place and also returned.

    Args:
    df (pd.DataFrame): DataFrame holding the column to lag.
    column_name (str): Name of the column to build lagged copies of.
    lags (list of int): Row offsets to shift by, one new column per entry.

    Returns:
    pd.DataFrame: The same DataFrame with the lag columns added.
    """
    for periods in lags:
        lagged_name = f'{column_name}_lag_{periods}'
        df[lagged_name] = df[column_name].shift(periods=periods)
    return df

def calculate_rolling_statistics(df, column_prefix, windows):
    """
    Add rolling mean and standard deviation columns for each window size.

    For every `window` in `windows`, the statistics are taken over the series
    lagged by `window` rows (not over the raw column), using a rolling window of
    `window` rows with `min_periods=1`. The lagged series is read from the
    `<column_prefix>_lag_<window>` column when it exists; otherwise it is derived
    on the fly as `df[column_prefix].shift(window)`, so the lag columns do not
    have to be created beforehand. No lag column is added to `df` in that case.

    The DataFrame is modified in place and also returned.

    Note: the standard deviation is the sample standard deviation, so any window
    holding a single observation (always the case for `window == 1`) yields NaN.

    Args:
    df (pd.DataFrame): DataFrame containing the data.
    column_prefix (str): Name of the base column; also the prefix of the lag
        columns read and of the statistic columns written.
    windows (list of int): Window sizes, each also used as the lag offset.

    Returns:
    pd.DataFrame: The same DataFrame with `<column_prefix>_mean_<window>d` and
    `<column_prefix>_std_<window>d` added for each window.

    Raises:
    KeyError: If, for some window, neither `<column_prefix>_lag_<window>` nor
        `column_prefix` is a column of `df`.
    """
    for window in windows:
        lag_column = f'{column_prefix}_lag_{window}'
        if lag_column in df.columns:
            lagged = df[lag_column]
        elif column_prefix in df.columns:
            # Same definition create_lagged_features uses for its lag columns.
            lagged = df[column_prefix].shift(window)
        else:
            raise KeyError(
                f"Neither '{lag_column}' nor '{column_prefix}' is a column of df; "
                f"cannot compute rolling statistics for window={window}."
            )

        # Build the rolling view once and reuse it for both statistics.
        roller = lagged.rolling(window=window, min_periods=1)
        df[f'{column_prefix}_mean_{window}d'] = roller.mean()
        df[f'{column_prefix}_std_{window}d'] = roller.std()
    return df



In [14]:
# Load combined_AAPL.csv; load_data also parses its 'date' column as datetimes.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs
# where this exact directory layout exists.
df = load_data('/home/amy/work/RIT/TDess/DSCI-601-Amy/Data/Combined/combined_AAPL.csv')


In [15]:
# Add date_day, date_month and date_year columns. load_data has already converted
# 'date' to datetime, so the conversion inside process_date_column is redundant here.
df = process_date_column(df, 'date')

#### Setting the index

In [16]:
# Make 'date' the index, modifying df in place.
# NOTE(review): not re-runnable. After the first run 'date' is no longer a
# column, so executing this cell again raises KeyError.
df.set_index('date', inplace=True)

In [19]:
# Define lags and window sizes.
# Lags are row offsets (shift by position), not calendar-day offsets.
lags = [1, 7, 30]
windows = [1, 7, 30] # rolling window sizes; each must also appear in `lags`, because calculate_rolling_statistics reads the 'RET_lag_<window>' column
# NOTE(review): a window of 1 yields an all-NaN 'RET_std_1d', since the sample
# standard deviation of a single observation is undefined.

# Create lagged features: adds RET_lag_1, RET_lag_7 and RET_lag_30
df = create_lagged_features(df, 'RET', lags)

In [21]:
# Add RET_mean_<w>d and RET_std_<w>d for each window w, each computed over the
# matching RET_lag_<w> column created in the previous cell.
df = calculate_rolling_statistics(df, 'RET', windows)

In [23]:
# Preview the first rows to check the engineered date, lag and rolling columns.
print(df.head())

                 RET  VOL_CHANGE  BA_SPREAD   ILLIQUIDITY    sprtrn  \
date                                                                  
1992-01-02  0.055432    0.717745   0.008403  4.510000e-10  0.000408   
1992-01-03 -0.008403   -0.172890   0.004237 -8.340000e-11  0.004985   
1992-01-06 -0.016949   -0.399632   0.004310 -2.850000e-10 -0.003291   
1992-01-07  0.019397    0.237283   0.004228  2.590000e-10 -0.001340   
1992-01-08  0.023256    0.645321   0.004132  1.840000e-10  0.001677   

             TURNOVER  DJI_Return  date_day  date_month  date_year  RET_lag_1  \
date                                                                            
1992-01-02  17.419850    0.000000         2           1       1992        NaN   
1992-01-03  14.408127    0.009173         3           1       1992   0.055432   
1992-01-06   8.650181   -0.000437         6           1       1992  -0.008403   
1992-01-07  10.702726    0.001469         7           1       1992  -0.016949   
1992-01-08  17.6