## Build Functions for ELT

In [1]:
# Install Requirments (Updated on 9/17/2024)
# !pip3 install -r requirements.txt

In [16]:
import pandas as pd
import numpy as np
from yfinance import Ticker
from pykalman import KalmanFilter

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

from sklearn.model_selection import cross_val_predict



from src import functions as f

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [121]:
# f.download('META', '1d', 'max')

f.transform('META', '1d', 'max')

curr_prediction, models, feature_names = f.model('META', '1d')

Columns in X before preprocessing:
Index(['top_z21', 'body_z21', 'bottom_z21', 'top_z21', 'body_z21',
       'bottom_z21', 'pct_gap_up_down', 'ac_z5', 'ac_z8', 'ac_z13',
       'kma_sma40_diff_z21', 'adj_close', 'day_of_month', 'day_of_week',
       'hour_of_day'],
      dtype='object')
Model: XGBClassifier
              precision    recall  f1-score   support

           0       0.72      0.74      0.73       209
           1       0.79      0.78      0.78       210
           2       0.77      0.76      0.77       192

    accuracy                           0.76       611
   macro avg       0.76      0.76      0.76       611
weighted avg       0.76      0.76      0.76       611

Model: RandomForestClassifier
              precision    recall  f1-score   support

           0       0.65      0.76      0.70       209
           1       0.80      0.70      0.75       210
           2       0.74      0.70      0.72       192

    accuracy                           0.72       611
   macro

In [122]:
predictions, prediction_probas = f.make_prediction(models, curr_prediction, feature_names)

In [123]:
predictions

{'XGBoost': np.int64(0),
 'RandomForest': np.int64(1),
 'GradientBoosting': np.int64(1),
 'KNN': np.int64(1)}

In [124]:
prediction_probas

{'XGBoost': array([[0.6, 0.4, 0. ]]),
 'RandomForest': array([[0.112185  , 0.884254  , 0.00356105]], dtype=float32),
 'GradientBoosting': array([[0.08227294, 0.91772706, 0.        ]]),
 'KNN': array([[0.22812798, 0.60870418, 0.16316784]])}

## Kelly Criterion

In [7]:
# fraction = p / l - q / g
# p = probability of success
# q = 1 - p = probability or failure
# l = % loss (ex: $10 -> $9 would mean l = .1)
# g = % gain (ex: $10 -> $12 would mean g = .2)

def kelly_c(p, l, g): 
    return p / l - (1 - p) / g

## Hyperparameter Searches

In [86]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

def model(symbol, interval, search_type='none'):
    # Load data
    data = f.load_model_df(symbol, interval)
    data.dropna(inplace=True, axis=0)
    X = data.drop(columns=['direction'], axis=1)
    y = data['direction']
    
    # Print column names to check for issues
    print("Columns in X before preprocessing:")
    print(X.columns)
    
    # Remove duplicate columns
    X = X.loc[:, ~X.columns.duplicated()]
    
    # Check if categorical_features are present in X
    categorical_features = ['day_of_month', 'day_of_week', 'hour_of_day']
    missing_features = [col for col in categorical_features if col not in X.columns]
    if missing_features:
        print(f"Missing categorical features: {missing_features}")
    
    # Store current prediction data (last row)
    curr_prediction = X.iloc[-1].copy()

    # Drop last row from X and y to prevent the model from seeing it
    X = X.iloc[:-1]
    y = y.iloc[:-1]
    
    # Create the categorical transformer
    categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    
    # Create the preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_features)
        ],
        force_int_remainder_cols=False # This will include all other columns in the transformed output
    )
    
    # Define your models
    models = {
        'XGBoost': XGBClassifier(random_state=42, n_jobs=-1, learning_rate=0.1, max_depth=3, n_estimators=100),
        'RandomForest': RandomForestClassifier(random_state=42, n_jobs=-1, max_depth=10, min_samples_split=2, n_estimators=200),
        'GradientBoosting': GradientBoostingClassifier(random_state=42, learning_rate=0.5, max_depth=5, n_estimators=300, validation_fraction=0.3, n_iter_no_change=31),
        # 'LightGBM': LGBMClassifier(random_state=42,force_col_wise=True),
        'KNN': KNeighborsClassifier(n_neighbors=7, p=1,weights='distance')
    }
    
    # Hyperparameters to search
    param_grids = {
        'XGBoost': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [3, 5, 7, 9],
            'classifier__learning_rate': [0.01, 0.1, 0.2, 0.3]
        },
        'RandomForest': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [None, 10, 20, 30],
            'classifier__min_samples_split': [2, 5, 10, 13]
        },
        'GradientBoosting': {
            'classifier__n_estimators': [100, 200, 300, 400],
            'classifier__max_depth': [3, 5, 7, 13],
            'classifier__learning_rate': [0.01, 0.1, 0.2, 0.5]
        },
        'KNN': {
            'classifier__n_neighbors': [3, 5, 7, 13],
            'classifier__weights': ['uniform', 'distance'],
            'classifier__p': [1, 2]  # 1: Manhattan, 2: Euclidean
        }
    }
    
    # Create a function to get feature names after transformation
    def get_feature_names_out(column_transformer):
        feature_names = []
        for name, transformer, columns in column_transformer.transformers_:
            if transformer == 'drop' or transformer == 'passthrough':
                if transformer == 'passthrough':
                    feature_names.extend(columns)
                continue
            if hasattr(transformer, 'get_feature_names_out'):
                names = transformer.get_feature_names_out(columns)
                feature_names.extend(names)
            else:
                feature_names.extend(columns)
        return feature_names
    
    # Split data before preprocessing to avoid data leakage
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42)
    
    # Fit the preprocessor on training data
    preprocessor.fit(X_train)
    
    # Transform training and test data
    X_train_transformed = preprocessor.transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)
    
    # Get feature names after transformation
    feature_names = get_feature_names_out(preprocessor)
    
    # Convert transformed data to DataFrame
    X_train_transformed = pd.DataFrame(X_train_transformed, columns=feature_names)
    X_test_transformed = pd.DataFrame(X_test_transformed, columns=feature_names)
    
    # Transform curr_prediction
    curr_prediction_transformed = preprocessor.transform(
        curr_prediction.to_frame().T)
    curr_prediction_transformed = pd.DataFrame(
        curr_prediction_transformed, columns=feature_names)
    
    for model_name, model in models.items():
        # Create a pipeline with the classifier
        pipeline = Pipeline(steps=[
            ('classifier', model)
        ])
        
        # Get the parameter grid for the current model
        param_grid = param_grids.get(model_name, {})
        
        # Use GridSearchCV or RandomizedSearchCV
        if search_type == 'grid' and param_grid:
            search = GridSearchCV(
                pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        elif search_type == 'random' and param_grid:
            search = RandomizedSearchCV(
                pipeline, param_grid, cv=5, scoring='accuracy',
                n_jobs=-1, n_iter=10, random_state=42)
        else:
            search = pipeline
        
        # Fit the model
        search.fit(X_train_transformed, y_train)
        
        # If using search, get the best estimator
        if search_type in ['grid', 'random'] and param_grid:
            best_model = search.best_estimator_
            print(f"Best parameters for {model_name}: {search.best_params_}")
            model = best_model.named_steps['classifier']
        else:
            model = search.named_steps['classifier']
        
        # Store the model
        models[model_name] = model
        
        # Predict on test data
        y_pred = search.predict(X_test_transformed)
        
        # Evaluate the model
        print(f"Model: {model_name}")
        print(classification_report(y_test, y_pred, zero_division=0))
    
    return curr_prediction_transformed, models, feature_names

In [125]:
curr_prediction, models, feature_names = model('META', '1d', 'grid')

Columns in X before preprocessing:
Index(['top_z21', 'body_z21', 'bottom_z21', 'top_z21', 'body_z21',
       'bottom_z21', 'pct_gap_up_down', 'ac_z5', 'ac_z8', 'ac_z13',
       'kma_sma40_diff_z21', 'adj_close', 'day_of_month', 'day_of_week',
       'hour_of_day'],
      dtype='object')
Best parameters for XGBoost: {'classifier__learning_rate': 0.01, 'classifier__max_depth': 3, 'classifier__n_estimators': 200}
Model: XGBoost
              precision    recall  f1-score   support

           0       0.37      0.85      0.51       220
           1       0.27      0.09      0.14       199
           2       0.38      0.06      0.11       192

    accuracy                           0.36       611
   macro avg       0.34      0.34      0.25       611
weighted avg       0.34      0.36      0.26       611

Best parameters for RandomForest: {'classifier__max_depth': 10, 'classifier__min_samples_split': 13, 'classifier__n_estimators': 100}
Model: RandomForest
              precision    recall  f