## Build Functions for ELT

In [1]:
# Install Requirments (Updated on 9/17/2024)
# !pip3 install -r requirements.txt

In [2]:
import pandas as pd
import numpy as np
from yfinance import Ticker
from pykalman import KalmanFilter

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

from sklearn.model_selection import cross_val_predict



from src import elt as et

%load_ext autoreload
%autoreload 2

In [20]:
def download(symbol, interval, period):
    stock = Ticker(symbol)
    stock_df = stock.history(interval=interval,
                             period=period,
                             auto_adjust=False,
                             prepost=True, # include aftermarket hours
                            )
    stock_df.columns = stock_df.columns.str.lower().str.replace(' ', '_')
    stock_df.to_pickle(f'./data/{symbol}_{interval}_df.pkl')
    
def load(symbol, interval):
    return pd.read_pickle(f'./data/{symbol}_{interval}_df.pkl')

def load_model_df(symbol, interval):
    return pd.read_pickle(f'./models/{symbol}_{interval}_model_df.pkl')

#########################################
# functions for use to transform tables #
#########################################

# candle parts percentages
def candle_parts_pcts(o, c, h, l):
    full = h - l
    if full == 0:
        # If full is zero, return 0 for all components to avoid division by zero
        return 0, 0, 0
    body = abs(o - c)
    if o > c:
        top_wick = h - o
        bottom_wick = c - l
    else:
        top_wick = h - c
        bottom_wick = o - l
    return top_wick / full, body / full, bottom_wick / full

# previous close and open gap % of pervious candle size
def gap_up_down_pct(o, pc, ph, pl):
    if o == pc:
        return 0
    else:
        return (o - pc) / (ph - pl)
    
    
# z-score calculation
def zscore(x, mu, stdev):
    return (x - mu) / stdev

# direction calculation:
def direction(pctc, mean, stdev):
    
    pct_pos = mean + 0.43073 / 2 * stdev
    pct_neg = mean - 0.43073 / 2 * stdev
    if pctc >= pct_pos:
        return 1
    elif pctc <= pct_neg:
        return 2
    else:
        return 0


def transform(symbol, interval, period):
    
    if load(symbol, interval).shape[0] > 0:
        df = load(symbol, interval)
        
    else:
        download(symbol, interval, period)
        df = load(symbol,interval)
    
    # Kalman filtering (noise reduction algorithm) 
    kf = KalmanFilter(transition_matrices = [1],
                      observation_matrices = [1],
                      initial_state_mean = 0,
                      initial_state_covariance = 1,
                      observation_covariance=1,
                      transition_covariance=0.01
                     )

    state_means, _ = kf.filter(df['adj_close'].values)
    state_means = pd.Series(state_means.flatten(), index=df.index)
    df['kma'] = state_means
    df['sma40'] = df['adj_close'].rolling(window=40).mean().copy()
    df['kma_sma40_diff'] = (df['kma'] - df['sma40']).copy()
    df['kma_sma40_diff_stdev21'] = df['kma_sma40_diff'].rolling(window=21).std().copy()
    df['kma_sma40_diff_mu21'] = df['kma_sma40_diff'].rolling(window=21).mean().copy()

    # Calculate Kalman Filter vs SMA40 difference z-score
    df['kma_sma40_diff_z21'] = df.apply(lambda row: zscore(row['kma_sma40_diff'], row['kma_sma40_diff_mu21'], row['kma_sma40_diff_stdev21']), axis=1, result_type='expand').copy()

    #update 1 day table: candle parts %'s
    df[['pct_top_wick', 'pct_body', 'pct_bottom_wick']] = df.apply(lambda row: candle_parts_pcts(row['open'], row['close'], row['high'],  row['low']), axis=1, result_type='expand').copy()

    #stdev of adjusted close
    df['top_stdev21'] = df['pct_top_wick'].rolling(window=21).std().copy() 
    df['body_stdev21'] = df['pct_body'].rolling(window=21).std().copy() 
    df['bottom_stdev21'] = df['pct_bottom_wick'].rolling(window=21).std().copy()

    #mean of adjusted close
    df['top_mu21'] = df['pct_top_wick'].rolling(window=21).mean().copy() 
    df['body_mu21'] = df['pct_body'].rolling(window=21).mean().copy() 
    df['bottom_mu21'] = df['pct_bottom_wick'].rolling(window=21).mean().copy()

    #z-score of adjusted close
    df['top_z21'] = df.apply(lambda row: zscore(row['pct_top_wick'], row['top_mu21'], row['top_stdev21']), axis=1, result_type='expand').copy()
    df['body_z21'] = df.apply(lambda row: zscore(row['pct_body'], row['body_mu21'], row['body_stdev21']), axis=1, result_type='expand').copy()
    df['bottom_z21'] = df.apply(lambda row: zscore(row['pct_bottom_wick'], row['bottom_mu21'], row['bottom_stdev21']), axis=1, result_type='expand').copy()

    #update 1 day table: % gap btwn current open relative to previous candle size
    df['pc'] = df['close'].shift(1).copy()
    df['ph'] = df['high'].shift(1).copy()
    df['pl'] = df['low'].shift(1).copy()
    df['pct_gap_up_down'] = df.apply(lambda row: gap_up_down_pct(row['open'], row['pc'], row['ph'], row['pl']), axis=1, result_type='expand').copy()

    #stdev of adjusted close
    df['ac_stdev5'] = df['adj_close'].rolling(window=5).std().copy() 
    df['ac_stdev8'] = df['adj_close'].rolling(window=8).std().copy() 
    df['ac_stdev13'] = df['adj_close'].rolling(window=13).std().copy()

    #mean of adjusted close
    df['ac_mu5'] = df['adj_close'].rolling(window=5).mean().copy() 
    df['ac_mu8'] = df['adj_close'].rolling(window=8).mean().copy() 
    df['ac_mu13'] = df['adj_close'].rolling(window=13).mean().copy()

    #z-score of adjusted close
    df['ac_z5'] = df.apply(lambda row: zscore(row['adj_close'], row['ac_mu5'], row['ac_stdev5']), axis=1, result_type='expand').copy()
    df['ac_z8'] = df.apply(lambda row: zscore(row['adj_close'], row['ac_mu8'], row['ac_stdev8']), axis=1, result_type='expand').copy()
    df['ac_z13'] = df.apply(lambda row: zscore(row['adj_close'], row['ac_mu13'], row['ac_stdev13']), axis=1, result_type='expand').copy()

    #target column: direction: -1, 0, 1
    df['adj_close_pctc'] = df['adj_close'].pct_change()
    mean = df['adj_close_pctc'].mean()
    stdev = df['adj_close_pctc'].std()
    df['direction'] = df.apply(lambda row: direction(row['adj_close_pctc'], mean, stdev), axis=1, result_type='expand').copy() 

    # day of month, week, hour of day
    df['day_of_month'] = df.index.day        # Day of the month (1-31)
    df['day_of_week'] = df.index.weekday     # Day of the week (0 = Monday, 6 = Sunday)
    df['hour_of_day'] = df.index.hour        # Hour of the day (0-23)
  
    # categorical features
    categorical_features = ['day_of_month',
                                'day_of_week',
                                'hour_of_day']
    
    # Change data types of categorical columns to 'category'
    for column in categorical_features:
        df[column] = df[column].astype('category')
    
    # save 1d file for model building
    df[['top_z21', 
        'body_z21', 
        'bottom_z21',
        'top_z21',
        'body_z21',
        'bottom_z21',
        'pct_gap_up_down',
        'ac_z5',
        'ac_z8',
        'ac_z13',
        'kma_sma40_diff_z21',
        'adj_close',
        'day_of_month',
        'day_of_week',
        'hour_of_day',
        'direction',
       ]
      ].to_pickle(f'./models/{symbol}_{interval}_model_df.pkl')

    
def model(symbol, interval):
    # Load data
    data = load_model_df(symbol, interval)
    data.dropna(inplace=True, axis=0)
    X = data.drop(columns=['direction'], axis=1)
    y = data['direction']
    
    # Print column names to check for issues
    print("Columns in X before preprocessing:")
    print(X.columns)
    
    # Remove duplicate columns
    X = X.loc[:, ~X.columns.duplicated()]

    # Check if categorical_features are present in X
    categorical_features = ['day_of_month', 'day_of_week', 'hour_of_day']
    missing_features = [col for col in categorical_features if col not in X.columns]
    if missing_features:
        print(f"Missing categorical features: {missing_features}")

    # Make categorical transformer
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore',sparse_output=False))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'  # This will include all other columns in the transformed output
    )
    
    # Define your models
    models = {
        'XGBoost': XGBClassifier(random_state=42, n_jobs=-1),
        'RandomForest': RandomForestClassifier(random_state=42, n_jobs=-1),
        # 'LogisticRegression': LogisticRegression(solver='liblinear', max_iter=300),
        'SVC': SVC(random_state=42)
    }
    
    # Create a pipeline that first preprocesses the data and then trains the model
    pipelines = {}
    for model_name, model in models.items():
        pipelines[model_name] = Pipeline(steps=[('preprocessor', preprocessor),
                                                ('classifier', model)])
    
    #     X_validation_dfs = {}
    #     y_validation_series = {}
    models = {}
    
    # Create a function to get the column names after transformation
    def get_feature_names_out(column_transformer):
        feature_names = []
        for name, transformer, columns in column_transformer.transformers_:
            if hasattr(transformer, 'get_feature_names_out'):
                feature_names.extend(transformer.get_feature_names_out())
            else:
                feature_names.extend(columns)
        return feature_names

    for model_name, pipeline in pipelines.items():
        # Apply the pipeline's preprocessor to the data
        X_transformed = pipeline.named_steps['preprocessor'].fit_transform(X)

        # Get feature names after transformation
        feature_names = get_feature_names_out(pipeline.named_steps['preprocessor'])

        # Convert the sparse matrix to a dense array and then to a DataFrame with proper column names
        X_transformed = pd.DataFrame(X_transformed, columns=feature_names)

        # Store current prediction data 
        curr_prediction = X_transformed.iloc[-1].copy()

        # Drop last row, model can't see this because it is used for prediction
        X_transformed = X_transformed.iloc[:-1]
        
        # Take a 3.5% sample for validation for validation
        #         X_validate = X_transformed.sample(frac=0.035)
        #         X_transformed.drop(X_validate.index,axis=0,inplace=True)
        #         y_validate = y[X_validate.index].copy()
        #         y = y[~X_validate.index]
        
        # Store validate dataframes in validaion_dfs
        #         X_validation_dfs[pipeline.named_steps['classifier']] = X_validate
        #         y_validation_series[pipeline.named_steps['classifier']] = y_validate

        # Now perform train_test_split on the transformed data
        X_train, X_test, y_train, y_test = train_test_split(X_transformed, y[:-1], test_size=0.2, random_state=42)

        # cols
        cols = X_train.columns
        
        # Fit and evaluate the model
        model = pipeline.named_steps['classifier']
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        # store model in models dictionary
        models[pipeline.named_steps['classifier']] = model
        
        # Evaluation metrics
        #         accuracy = accuracy_score(y_test, y_pred)
        #         precision = precision_score(y_test, y_pred, average='weighted')
        #         recall = recall_score(y_test, y_pred, average='weighted')
        #         f1 = f1_score(y_test, y_pred, average='weighted')

        print(f"Model: {model.__class__.__name__}")
        #         print(f"Accuracy: {accuracy:.4f}")
        #         print(f"Precision: {precision:.4f}")
        #         print(f"Recall: {recall:.4f}")
        #         print(f"F1 Score: {f1:.4f}")
        #         print("\n")
        
        # Evaluate
        print(classification_report(y_test, y_pred, zero_division=0))
    
    return curr_prediction, models, cols





In [7]:
download('NVDA', '15m', '1mo')

In [21]:
curr_prediction, models, cols = model('NVDA', '15m')

Columns in X before preprocessing:
Index(['top_z21', 'body_z21', 'bottom_z21', 'top_z21', 'body_z21',
       'bottom_z21', 'pct_gap_up_down', 'ac_z5', 'ac_z8', 'ac_z13',
       'kma_sma40_diff_z21', 'adj_close', 'day_of_month', 'day_of_week',
       'hour_of_day'],
      dtype='object')
Model: XGBClassifier
              precision    recall  f1-score   support

           0       0.74      0.84      0.78        80
           1       0.77      0.72      0.75        79
           2       0.83      0.78      0.80        95

    accuracy                           0.78       254
   macro avg       0.78      0.78      0.78       254
weighted avg       0.78      0.78      0.78       254

Model: RandomForestClassifier
              precision    recall  f1-score   support

           0       0.70      0.80      0.74        80
           1       0.73      0.71      0.72        79
           2       0.84      0.75      0.79        95

    accuracy                           0.75       254
   macro

In [22]:
curr_prediction

day_of_month_3          0.000000
day_of_month_4          0.000000
day_of_month_5          0.000000
day_of_month_6          0.000000
day_of_month_9          0.000000
day_of_month_10         0.000000
day_of_month_11         0.000000
day_of_month_12         0.000000
day_of_month_13         0.000000
day_of_month_16         0.000000
day_of_month_17         1.000000
day_of_month_19         0.000000
day_of_month_20         0.000000
day_of_month_21         0.000000
day_of_month_22         0.000000
day_of_month_23         0.000000
day_of_month_26         0.000000
day_of_month_27         0.000000
day_of_month_28         0.000000
day_of_month_29         0.000000
day_of_month_30         0.000000
day_of_week_0           0.000000
day_of_week_1           1.000000
day_of_week_2           0.000000
day_of_week_3           0.000000
day_of_week_4           0.000000
hour_of_day_4           0.000000
hour_of_day_5           0.000000
hour_of_day_6           0.000000
hour_of_day_7           0.000000
hour_of_da

In [23]:
cols

Index(['day_of_month_3', 'day_of_month_4', 'day_of_month_5', 'day_of_month_6',
       'day_of_month_9', 'day_of_month_10', 'day_of_month_11',
       'day_of_month_12', 'day_of_month_13', 'day_of_month_16',
       'day_of_month_17', 'day_of_month_19', 'day_of_month_20',
       'day_of_month_21', 'day_of_month_22', 'day_of_month_23',
       'day_of_month_26', 'day_of_month_27', 'day_of_month_28',
       'day_of_month_29', 'day_of_month_30', 'day_of_week_0', 'day_of_week_1',
       'day_of_week_2', 'day_of_week_3', 'day_of_week_4', 'hour_of_day_4',
       'hour_of_day_5', 'hour_of_day_6', 'hour_of_day_7', 'hour_of_day_8',
       'hour_of_day_9', 'hour_of_day_10', 'hour_of_day_11', 'hour_of_day_12',
       'hour_of_day_13', 'hour_of_day_14', 'hour_of_day_15', 'hour_of_day_16',
       'hour_of_day_17', 'hour_of_day_18', 'hour_of_day_19', 'top_z21',
       'body_z21', 'bottom_z21', 'pct_gap_up_down', 'ac_z5', 'ac_z8', 'ac_z13',
       'kma_sma40_diff_z21', 'adj_close'],
      dtype='object')