In [1]:
# Install Requirements (Updated on 12/17/2024)
# !pip3 install -r requirements.txt

In [2]:
import os
import sys

# TensorFlow / CUDA runtime configuration. Set before `import tensorflow` below
# so the values are already in the environment when the library initialises.
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'  # Use asynchronous CUDA memory allocator
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Set TensorFlow log level
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'  # Disable oneDNN optimizations
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Restrict TensorFlow to GPU 0
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'  # Let GPU memory grow on demand

# Silence Python-level STDERR while the heavy libraries are imported and the GPU
# is configured. Only the Python `sys.stderr` object is swapped here; native code
# that writes straight to file descriptor 2 is not affected by this redirect.
stderr = sys.stderr
_devnull = open(os.devnull, 'w')
sys.stderr = _devnull
try:
    import pandas as pd
    import numpy as np
    from IPython.display import display
    from yfinance import Ticker
    from pykalman import KalmanFilter

    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.compose import ColumnTransformer

    from sklearn.pipeline import Pipeline

    from xgboost import XGBClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from lightgbm import LGBMClassifier

    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
    from sklearn.model_selection import cross_val_predict

    from datetime import datetime

    import tensorflow as tf
    import logging

    # Suppress TensorFlow and absl logs
    logging.getLogger('tensorflow').setLevel(logging.ERROR)
    logging.getLogger('absl').setLevel(logging.ERROR)

    # Clear previous sessions
    tf.keras.backend.clear_session()

    # GPU configuration: use the first GPU only and enable memory growth on it
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            tf.config.set_visible_devices(gpus[0], 'GPU')
            tf.config.experimental.set_memory_growth(gpus[0], True)
            print("GPU is configured properly.")
        except RuntimeError as e:
            print("Error initializing GPU:", e)
    else:
        print("No GPU detected. TensorFlow will use the CPU.")
finally:
    # Always restore STDERR (even if an import or the GPU setup raised) so later
    # cells are not left silenced, and release the devnull handle.
    sys.stderr = stderr
    _devnull.close()

# Check available GPUs
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))


E0000 00:00:1734897573.049059    5282 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734897573.062515    5282 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


GPU is configured properly.
Num GPUs Available: 1


In [3]:
# Project helper module, aliased as `f`; later cells call f.predictions,
# f.transform, f.model, f.make_prediction and f.predictions_summary.
from src import modules as f # f is for function
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are reloaded before each cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Download, Transform, and Model All in One

In [4]:
# Run the all-in-one pipeline for TSLA. The captured output below shows one
# summary table per interval (5m, 15m, 1h, 1d, 1wk, 1mo).
f.predictions('tsla')

I0000 00:00:1734897586.368817    5282 gpu_process_state.cc:201] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1734897586.369189    5282 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14274 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6
I0000 00:00:1734897590.509970    6280 cuda_dnn.cc:529] Loaded cuDNN version 90300


TSLA 5m Interval Timestamp: 2024-12-22 14:59:34


Unnamed: 0,XGBoost,GradientBoosting,RandomForest,KNN,LSTM
prediction,up,up,up,down,down
kelly_1:2.5,0.196355,0.202778,0.2,0.153672,0.127796
prob_up,0.428506,0.598791,0.398895,0.25,0.0
prob_static,0.241081,0.144078,0.265654,0.125,0.0
prob_down,0.330413,0.257131,0.335451,0.625,1.0
precision,0.425968,0.430556,0.428571,0.39548,0.376997
recall,0.394515,0.392405,0.405063,0.295359,0.746835
f1,0.409639,0.410596,0.416486,0.338164,0.501062
support,"[474.0, 478.0, 474.0]","[474.0, 478.0, 474.0]","[474.0, 478.0, 474.0]","[474.0, 478.0, 474.0]","[474.0, 478.0, 474.0]"


TSLA 15m Interval Timestamp: 2024-12-22 15:00:55


Unnamed: 0,XGBoost,GradientBoosting,RandomForest,KNN,LSTM
prediction,static,static,static,static,static
kelly_1:2.5,0.349189,0.336842,0.33198,0.215741,0.372727
prob_up,0.238062,0.121056,0.268735,0.25,0.0
prob_static,0.559355,0.805649,0.442713,0.5,1.0
prob_down,0.202583,0.073294,0.288552,0.25,0.0
precision,0.535135,0.526316,0.522843,0.439815,0.551948
recall,0.622642,0.566038,0.647799,0.597484,0.534591
f1,0.575581,0.545455,0.578652,0.506667,0.543131
support,"[158.0, 159.0, 159.0]","[158.0, 159.0, 159.0]","[158.0, 159.0, 159.0]","[158.0, 159.0, 159.0]","[158.0, 159.0, 159.0]"


TSLA 1h Interval Timestamp: 2024-12-22 15:01:14


Unnamed: 0,XGBoost,GradientBoosting,RandomForest,KNN,LSTM
prediction,up,down,down,up,up
kelly_1:2.5,0.156506,0.167123,0.161337,0.068696,0.065534
prob_up,0.444387,0.293295,0.328379,0.75,1.0
prob_static,0.270454,0.246292,0.19826,0.0,0.0
prob_down,0.285159,0.460413,0.473361,0.25,0.0
precision,0.397504,0.405088,0.400955,0.334783,0.332524
recall,0.406934,0.377737,0.306569,0.281022,1.0
f1,0.402164,0.390935,0.347466,0.305556,0.499089
support,"[548.0, 552.0, 548.0]","[548.0, 552.0, 548.0]","[548.0, 552.0, 548.0]","[548.0, 552.0, 548.0]","[548.0, 552.0, 548.0]"


TSLA 1d Interval Timestamp: 2024-12-22 15:03:08


Unnamed: 0,XGBoost,GradientBoosting,RandomForest,KNN,LSTM
prediction,down,down,down,up,static
kelly_1:2.5,0.056388,0.082759,0.040609,0.118519,0.067956
prob_up,0.159949,0.126984,0.246712,0.5,0.0
prob_static,0.313583,0.230544,0.215593,0.25,1.0
prob_down,0.526468,0.642473,0.537695,0.25,0.0
precision,0.325991,0.344828,0.314721,0.37037,0.334254
recall,0.307054,0.33195,0.257261,0.373444,1.0
f1,0.316239,0.338266,0.283105,0.371901,0.501035
support,"[241.0, 242.0, 241.0]","[241.0, 242.0, 241.0]","[241.0, 242.0, 241.0]","[241.0, 242.0, 241.0]","[241.0, 242.0, 241.0]"


TSLA 1wk Interval Timestamp: 2024-12-22 15:03:53


Unnamed: 0,XGBoost,GradientBoosting,RandomForest,KNN,LSTM
prediction,down,down,down,up,up
kelly_1:2.5,0.088372,0.1,0.138462,0.154717,0.066667
prob_up,0.213893,0.102329,0.231424,0.375,1.0
prob_static,0.064094,0.041408,0.168821,0.25,0.0
prob_down,0.722013,0.856263,0.599755,0.375,0.0
precision,0.348837,0.357143,0.384615,0.396226,0.333333
recall,0.3,0.3,0.3,0.428571,1.0
f1,0.322581,0.326087,0.337079,0.411765,0.5
support,"[49.0, 48.0, 50.0]","[49.0, 48.0, 50.0]","[49.0, 48.0, 50.0]","[49.0, 48.0, 50.0]","[49.0, 48.0, 50.0]"


TSLA 1mo Interval Timestamp: 2024-12-22 15:04:12


Unnamed: 0,XGBoost,GradientBoosting,RandomForest,KNN,LSTM
prediction,up,up,up,up,down
kelly_1:2.5,0.44,0.109091,0.3,-0.05,0.066667
prob_up,0.90895,0.998518,0.549902,0.375,0.0
prob_static,0.039017,0.000824,0.221069,0.25,0.0
prob_down,0.052034,0.000658,0.229029,0.375,1.0
precision,0.6,0.363636,0.5,0.25,0.333333
recall,0.6,0.4,0.6,0.3,1.0
f1,0.6,0.380952,0.545455,0.272727,0.5
support,"[10.0, 10.0, 10.0]","[10.0, 10.0, 10.0]","[10.0, 10.0, 10.0]","[10.0, 10.0, 10.0]","[10.0, 10.0, 10.0]"


In [None]:
# Step-by-step version of the pipeline for a single symbol and interval.
# The names bound here (models, feature_names, predictions,
# classification_reports) are inspected by the cells that follow.
symbol='NVDA'
interval='1d'

# NOTE(review): the timestamp and download steps below are disabled. `pytz` is
# not imported in the visible import cell and `period` is not defined in this
# cell, so both would have to be supplied before re-enabling them.
# # Define Eastern Time Zone
# eastern = pytz.timezone('US/Eastern')

# # Get current time in Eastern Time Zone
# eastern_time = datetime.now(eastern)

# # Format the time to include hour, minute, and seconds
# time_stamp = eastern_time.strftime('%Y-%m-%d %H:%M:%S')

# print(f'DL Time: {time_stamp}')

# f.download(symbol, interval, period)
# With the download disabled, this presumably transforms data fetched by an
# earlier run -- TODO confirm.
f.transform(symbol, interval)
# f.model returns four values here (the notebook-local model() defined under
# "Hyperparameter Searches" returns only three).
curr_prediction, models, feature_names, classification_reports = f.model(symbol, interval)
predictions, prediction_probas = f.make_prediction(models, curr_prediction, feature_names)

f.predictions_summary(predictions, prediction_probas, classification_reports)

In [None]:
# Inspect the models mapping returned by f.model (indexed by model name, e.g. 'XGBoost')
models

In [None]:
# Raw feature importances of the XGBoost model
models['XGBoost'].feature_importances_

In [None]:
# Feature names returned by f.model; paired with the importances in the next cell
feature_names

In [None]:
# Pair every feature name with its XGBoost importance and rank the pairs,
# most important first.
merged_list = sorted(
    zip(feature_names, models['XGBoost'].feature_importances_),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)
merged_list

In [None]:
# Predictions returned by f.make_prediction for the current observation
predictions

In [None]:
# Classification reports returned by f.model
classification_reports

### Hyperparameter Searches

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

def model(symbol, interval, search_type='none'):
    """Train the candidate classifiers for one symbol/interval, optionally tuning them.

    The model frame is loaded with ``f.load_model_df``. Rows containing missing
    values are dropped and the last remaining row is held out as the "current"
    observation to predict on. The remaining rows are split 80/20 (stratified,
    shuffled, ``random_state=42``). The calendar columns are one-hot encoded and
    every other column is passed through unchanged. Each classifier is then
    fitted, either directly or through a hyperparameter search, and its
    classification report on the test split is printed.

    NOTE(review): the train/test split and the 5-fold search are not
    time-aware. If the rows are time-ordered, test rows can precede training
    rows -- confirm this is acceptable for the evaluation.

    Parameters
    ----------
    symbol, interval
        Forwarded unchanged to ``f.load_model_df``.
    search_type : str, default 'none'
        ``'grid'`` runs ``GridSearchCV`` and ``'random'`` runs
        ``RandomizedSearchCV`` (10 iterations) over the grid defined for each
        model; any other value fits the models with the settings given below.

    Returns
    -------
    curr_prediction_transformed : pandas.DataFrame
        One-row frame holding the held-out last observation after preprocessing.
    models : dict
        Model name -> fitted classifier (the tuned one when a search was run).
    feature_names : list
        Column names of the preprocessed feature matrix.

    Raises
    ------
    ValueError
        If fewer than two rows remain after dropping missing values.
    """
    # Load data. dropna() returns a new frame, so the object handed back by the
    # loader is never mutated.
    data = f.load_model_df(symbol, interval)
    data = data.dropna(axis=0)
    if len(data) < 2:
        raise ValueError(
            f"Not enough complete rows to train on for {symbol} {interval}: {len(data)}")
    X = data.drop(columns=['direction'])
    y = data['direction']

    # Print column names to check for issues
    print("Columns in X before preprocessing:")
    print(X.columns)

    # Remove duplicate columns (keep the first occurrence of each name)
    X = X.loc[:, ~X.columns.duplicated()]

    # Only encode the categorical columns that are actually present; a column
    # that is absent is reported and skipped instead of failing inside the
    # transformer later on.
    categorical_features = ['day_of_month', 'day_of_week', 'hour_of_day']
    missing_features = [col for col in categorical_features if col not in X.columns]
    if missing_features:
        print(f"Missing categorical features: {missing_features}")
        categorical_features = [col for col in categorical_features if col in X.columns]

    # Hold out the last row as the observation to predict on. Keeping it as a
    # one-row DataFrame preserves the per-column dtypes.
    curr_prediction = X.iloc[[-1]]

    # Drop last row from X and y to prevent the model from seeing it
    X = X.iloc[:-1]
    y = y.iloc[:-1]

    # One-hot encode the categorical columns and pass every other column through.
    # Without remainder='passthrough' the transformer would drop all
    # non-categorical columns and the models would train on calendar dummies only.
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat',
             OneHotEncoder(handle_unknown='ignore', sparse_output=False),
             categorical_features)
        ],
        remainder='passthrough',
        verbose_feature_names_out=False  # keep plain column names, no 'cat__' prefix
    )

    # Define your models
    models = {
        'XGBoost': XGBClassifier(random_state=42, n_jobs=-1),
        'RandomForest': RandomForestClassifier(random_state=42, n_jobs=-1),
        'GradientBoosting': GradientBoostingClassifier(random_state=42, validation_fraction=0.25, n_iter_no_change=31),
        # 'LightGBM': LGBMClassifier(random_state=42,force_col_wise=True),
        'KNN': KNeighborsClassifier(n_neighbors=7, p=1,weights='distance')
    }

    # Hyperparameters to search. Keys use the '<pipeline step>__<parameter>'
    # form because every classifier is wrapped in a one-step pipeline below.
    param_grids = {
        'XGBoost': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [3, 5, 7, 9],
            'classifier__learning_rate': [0.01, 0.1, 0.2, 0.3]
        },
        'RandomForest': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [None, 10, 20, 30],
            'classifier__min_samples_split': [2, 5, 10, 13]
        },
        'GradientBoosting': {
            'classifier__n_estimators': [100, 200, 300, 400],
            'classifier__max_depth': [3, 5, 7, 13],
            'classifier__learning_rate': [0.01, 0.1, 0.2, 0.5]
        },
        'KNN': {
            'classifier__n_neighbors': [3, 5, 7, 13],
            'classifier__weights': ['uniform', 'distance'],
            'classifier__p': [1, 2]  # 1: Manhattan, 2: Euclidean
        }
    }

    # Split data before preprocessing to avoid data leakage
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42)

    # Fit the preprocessor on training data only
    preprocessor.fit(X_train)

    # Column names of the transformed matrix, as reported by the transformer
    feature_names = list(preprocessor.get_feature_names_out())

    # Transform the training rows, the test rows and the held-out current row
    X_train_transformed = pd.DataFrame(
        preprocessor.transform(X_train), columns=feature_names)
    X_test_transformed = pd.DataFrame(
        preprocessor.transform(X_test), columns=feature_names)
    curr_prediction_transformed = pd.DataFrame(
        preprocessor.transform(curr_prediction), columns=feature_names)

    # Collect the fitted classifiers in a fresh dict instead of rewriting
    # `models` while iterating over it.
    fitted_models = {}
    for model_name, estimator in models.items():
        # Create a pipeline with the classifier
        pipeline = Pipeline(steps=[
            ('classifier', estimator)
        ])

        # Get the parameter grid for the current model
        param_grid = param_grids.get(model_name, {})

        # Use GridSearchCV or RandomizedSearchCV when requested
        if search_type == 'grid' and param_grid:
            search = GridSearchCV(
                pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        elif search_type == 'random' and param_grid:
            search = RandomizedSearchCV(
                pipeline, param_grid, cv=5, scoring='accuracy',
                n_jobs=-1, n_iter=10, random_state=42)
        else:
            search = None

        # Fit either the search (keeping its refitted best pipeline) or the
        # plain pipeline.
        if search is not None:
            search.fit(X_train_transformed, y_train)
            print(f"Best parameters for {model_name}: {search.best_params_}")
            fitted_pipeline = search.best_estimator_
        else:
            pipeline.fit(X_train_transformed, y_train)
            fitted_pipeline = pipeline

        # Store the fitted classifier
        fitted_models[model_name] = fitted_pipeline.named_steps['classifier']

        # Predict on test data
        y_pred = fitted_pipeline.predict(X_test_transformed)

        # Evaluate the model
        print(f"Model: {model_name}")
        print(classification_report(y_test, y_pred, zero_division=0))

    return curr_prediction_transformed, fitted_models, feature_names

In [None]:
# Grid-search run for AMD on the 5m interval using the notebook-local model().
# NOTE: this rebinds curr_prediction, models and feature_names, replacing the
# values set by the earlier NVDA cell.
curr_prediction, models, feature_names = model('AMD', '5m', 'grid')