In [1]:
!pip install yfinance
!pip install pycaret



In [3]:
# IMPORTS
import numpy as np
import pandas as pd

#Fin Data Sources
import yfinance as yf
import pandas_datareader as pdr

#Data viz
import plotly.graph_objs as go
import plotly.graph_objects as go
import plotly.express as px

import time
from datetime import date

# for graphs
import matplotlib.pyplot as plt

In [115]:
df_full = pd.read_parquet("/content/stocks_df_combined_2024_11_01.parquet.brotli")

In [116]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 235684 entries, 0 to 2549
Columns: 200 entries, Adj Close_x to growth_btc_usd_365d
dtypes: datetime64[ns](3), float64(124), int32(64), int64(7), object(2)
memory usage: 303.9+ MB


In [117]:
df_full.keys()

Index(['Adj Close_x', 'Close', 'High', 'Low', 'Open', 'Volume', 'Ticker',
       'Year', 'Month', 'Weekday',
       ...
       'growth_brent_oil_7d', 'growth_brent_oil_30d', 'growth_brent_oil_90d',
       'growth_brent_oil_365d', 'growth_btc_usd_1d', 'growth_btc_usd_3d',
       'growth_btc_usd_7d', 'growth_btc_usd_30d', 'growth_btc_usd_90d',
       'growth_btc_usd_365d'],
      dtype='object', length=200)

In [118]:
# growth indicators (but not future growth)
GROWTH = [g for g in df_full.keys() if (g.find('growth_')==0)& (g.find('future')<0)]

OHLCV = ['Open','High','Low','Close','Adj Close_x','Volume']

CATEGORICAL = ['Month', 'Weekday', 'Ticker', 'ticker_type']

TO_PREDICT = [g for g in df_full.keys() if (g.find('future')>=0)]

TO_DROP = ['Year','Date','index_x', 'index_y', 'index', 'Quarter','Adj Close_y'] + CATEGORICAL + OHLCV

GROWTH

['growth_1d',
 'growth_3d',
 'growth_7d',
 'growth_30d',
 'growth_90d',
 'growth_365d',
 'growth_dax_1d',
 'growth_dax_3d',
 'growth_dax_7d',
 'growth_dax_30d',
 'growth_dax_90d',
 'growth_dax_365d',
 'growth_snp500_1d',
 'growth_snp500_3d',
 'growth_snp500_7d',
 'growth_snp500_30d',
 'growth_snp500_90d',
 'growth_snp500_365d',
 'growth_dji_1d',
 'growth_dji_3d',
 'growth_dji_7d',
 'growth_dji_30d',
 'growth_dji_90d',
 'growth_dji_365d',
 'growth_epi_1d',
 'growth_epi_3d',
 'growth_epi_7d',
 'growth_epi_30d',
 'growth_epi_90d',
 'growth_epi_365d',
 'growth_wti_oil_1d',
 'growth_wti_oil_3d',
 'growth_wti_oil_7d',
 'growth_wti_oil_30d',
 'growth_wti_oil_90d',
 'growth_wti_oil_365d',
 'growth_brent_oil_1d',
 'growth_brent_oil_3d',
 'growth_brent_oil_7d',
 'growth_brent_oil_30d',
 'growth_brent_oil_90d',
 'growth_brent_oil_365d',
 'growth_btc_usd_1d',
 'growth_btc_usd_3d',
 'growth_btc_usd_7d',
 'growth_btc_usd_30d',
 'growth_btc_usd_90d',
 'growth_btc_usd_365d']

In [119]:
df_full['ln_volume'] = df_full.Volume.apply(lambda x: np.log(x))

  df_full['ln_volume'] = df_full.Volume.apply(lambda x: np.log(x))


In [120]:
CUSTOM_NUMERICAL = ['SMA10', 'SMA20', 'growing_moving_average', 'high_minus_low_relative','volatility', 'ln_volume']

In [121]:
TECHNICAL_INDICATORS = ['adx', 'adxr', 'apo', 'aroon_1','aroon_2', 'aroonosc',
 'bop', 'cci', 'cmo','dx', 'macd', 'macdsignal', 'macdhist', 'macd_ext',
 'macdsignal_ext', 'macdhist_ext', 'macd_fix', 'macdsignal_fix',
 'macdhist_fix', 'mfi', 'minus_di', 'mom', 'plus_di', 'dm', 'ppo',
 'roc', 'rocp', 'rocr', 'rocr100', 'rsi', 'slowk', 'slowd', 'fastk',
 'fastd', 'fastk_rsi', 'fastd_rsi', 'trix', 'ultosc', 'willr',
 'ad', 'adosc', 'obv', 'atr', 'natr', 'ht_dcperiod', 'ht_dcphase',
 'ht_phasor_inphase', 'ht_phasor_quadrature', 'ht_sine_sine', 'ht_sine_leadsine',
 'ht_trendmod', 'avgprice', 'medprice', 'typprice', 'wclprice']

FORECAST_IND = ['trend', 'yhat_lower', 'yhat_upper', 'trend_lower', 'trend_upper',
       'additive_terms', 'additive_terms_lower', 'additive_terms_upper',
       'weekly', 'weekly_lower', 'weekly_upper', 'yearly', 'yearly_lower',
       'yearly_upper', 'multiplicative_terms', 'multiplicative_terms_lower',
       'multiplicative_terms_upper', 'yhat']

In [122]:
TECHNICAL_PATTERNS = [g for g in df_full.keys() if g.find('cdl')>=0]
print(f'Technical patterns count = {len(TECHNICAL_PATTERNS)}, examples = {TECHNICAL_PATTERNS[0:5]}')


Technical patterns count = 61, examples = ['cdl2crows', 'cdl3blackrows', 'cdl3inside', 'cdl3linestrike', 'cdl3outside']


In [123]:
MACRO = ['gdppot_us_yoy', 'gdppot_us_qoq', 'cpi_core_yoy', 'cpi_core_mom', 'FEDFUNDS',
 'DGS1', 'DGS5', 'DGS10']

In [132]:
NUMERICAL = GROWTH + TECHNICAL_INDICATORS + TECHNICAL_PATTERNS + CUSTOM_NUMERICAL + MACRO #+ FORECAST_IND

In [133]:
# CHECK: NO OTHER INDICATORS LEFT
OTHER = [k for k in df_full.keys() if k not in OHLCV + CATEGORICAL + NUMERICAL + TO_DROP]
OTHER

['growth_future_5d',
 'growth_future_3d',
 'growth_future_1d',
 'is_positive_growth_5d_future',
 'is_positive_growth_3d_future',
 'is_positive_growth_1d_future']

In [134]:
df = df_full[df_full.Date>='2000-01-01']
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 197217 entries, 3490 to 2549
Columns: 201 entries, Adj Close_x to ln_volume
dtypes: datetime64[ns](3), float64(125), int32(64), int64(7), object(2)
memory usage: 255.8+ MB


In [135]:
# dummy variables are not generated from Date and numeric variables
df.loc[:,'Month'] = df.Month.dt.strftime('%B')
df.loc[:,'Weekday'] = df.Weekday.astype(str)

In [136]:
dummy_variables = pd.get_dummies(df[CATEGORICAL], dtype='int32')

In [137]:
DUMMIES = dummy_variables.keys().to_list()

In [138]:
df_with_dummies = pd.concat([df, dummy_variables], axis=1)

In [139]:
df_with_dummies[NUMERICAL+DUMMIES].info()

<class 'pandas.core.frame.DataFrame'>
Index: 197217 entries, 3490 to 2549
Columns: 238 entries, growth_1d to ticker_type_US
dtypes: float64(115), int32(122), int64(1)
memory usage: 267.8 MB


In [140]:
def temporal_split(df, min_date, max_date, train_prop=0.7, val_prop=0.15, test_prop=0.15):
    """
    Splits a DataFrame into three buckets based on the temporal order of the 'Date' column.

    Args:
        df (DataFrame): The DataFrame to split.
        min_date (str or Timestamp): Minimum date in the DataFrame.
        max_date (str or Timestamp): Maximum date in the DataFrame.
        train_prop (float): Proportion of data for training set (default: 0.6).
        val_prop (float): Proportion of data for validation set (default: 0.2).
        test_prop (float): Proportion of data for test set (default: 0.2).

    Returns:
        DataFrame: The input DataFrame with a new column 'split' indicating the split for each row.
    """
    # Define the date intervals
    train_end = min_date + pd.Timedelta(days=(max_date - min_date).days * train_prop)
    val_end = train_end + pd.Timedelta(days=(max_date - min_date).days * val_prop)

    # Assign split labels based on date ranges
    split_labels = []
    for date in df['Date']:
        if date <= train_end:
            split_labels.append('train')
        elif date <= val_end:
            split_labels.append('validation')
        else:
            split_labels.append('test')

    # Add 'split' column to the DataFrame
    df['split'] = split_labels

    return df

In [141]:
min_date_df = df_with_dummies.Date.min()
max_date_df = df_with_dummies.Date.max()

df_with_dummies = temporal_split(df_with_dummies,
                                 min_date = min_date_df,
                                 max_date = max_date_df)

In [142]:
new_df = df_with_dummies.copy()

In [143]:
TO_PREDICT

['growth_future_5d',
 'growth_future_3d',
 'growth_future_1d',
 'is_positive_growth_5d_future',
 'is_positive_growth_3d_future',
 'is_positive_growth_1d_future']

In [144]:
new_df['growth_future_1d'].describe()

Unnamed: 0,growth_future_1d
count,197180.0
mean,1.000956
std,0.026693
min,0.230055
25%,0.990423
50%,1.000492
75%,1.010983
max,4.374775


In [145]:
from prophet import Prophet
import pandas as pd

btc = new_df[new_df.Ticker == 'BTC-USD']


PERIODS_TO_PREDICT = 0

# Prepare data in the format Prophet expects
# Prophet requires a DataFrame with columns 'ds' for dates and 'y' for values
df = pd.DataFrame({
    'ds': btc['Date'],  # Assumes train_validation has a datetime index
    'y': btc['Close']   # The time series values
})

# Initialize and configure Prophet model
model = Prophet(
    weekly_seasonality=True,         # Enable weekly seasonality
    daily_seasonality=False,         # Disable daily to avoid overfitting
    yearly_seasonality=True          # Enable yearly if long-term patterns exist
)

# Optional: Add additional seasonalities if needed
# model.add_seasonality(name='monthly', period=30.5, fourier_order=5)

# Fit the model
model.fit(df)

# Forecast for a specified period (e.g., PERIODS_TO_PREDICT days)
future = model.make_future_dataframe(periods=PERIODS_TO_PREDICT)
forecast = model.predict(future)

# Display forecasted values
print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(PERIODS_TO_PREDICT))

DEBUG:cmdstanpy:input tempfile: /tmp/tmpthf_hi9p/6lf1xciy.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpthf_hi9p/bzmu64e3.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=7601', 'data', 'file=/tmp/tmpthf_hi9p/6lf1xciy.json', 'init=/tmp/tmpthf_hi9p/bzmu64e3.json', 'output', 'file=/tmp/tmpthf_hi9p/prophet_modelxdk_b3es/prophet_model-20241101113450.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
11:34:50 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
11:34:51 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


Empty DataFrame
Columns: [ds, yhat, yhat_lower, yhat_upper]
Index: []


In [146]:
forecast.rename(columns={"ds": "Date"}, inplace = True)

In [147]:
forecast.Date = pd.to_datetime(forecast.Date, errors='coerce')
new_df.Date = pd.to_datetime(new_df.Date, errors='coerce')
forecast.set_index('Date', inplace=True)


new_df = pd.merge(new_df,
              forecast,
              how='left',
              left_on='Date',
              right_index=True,
              validate = "many_to_one"
              )

In [164]:
new_df.tail(5)

Unnamed: 0,Adj Close_x,Close,High,Low,Open,Volume,Ticker,Year,Month,Weekday,...,weekly,weekly_lower,weekly_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
2545,0.519227,0.519227,0.520838,0.511382,0.516381,837346700.0,XRP-USD,2024,October,0,...,41.940362,41.940362,41.940362,271.252801,271.252801,271.252801,0.0,0.0,0.0,69584.511185
2546,0.527916,0.527916,0.53031,0.518466,0.519226,1076281000.0,XRP-USD,2024,October,1,...,-9.516783,-9.516783,-9.516783,316.714908,316.714908,316.714908,0.0,0.0,0.0,69654.581896
2547,0.522974,0.522974,0.528055,0.519588,0.527919,900858900.0,XRP-USD,2024,October,2,...,27.131417,27.131417,27.131417,350.341211,350.341211,350.341211,0.0,0.0,0.0,69800.922146
2548,0.509237,0.509237,0.52335,0.5039,0.522974,1015745000.0,XRP-USD,2024,October,3,...,-23.601191,-23.601191,-23.601191,372.210318,372.210318,372.210318,0.0,0.0,0.0,69848.124394
2549,0.519187,0.519187,0.519373,0.504883,0.509254,1200762000.0,XRP-USD,2024,November,4,...,-16.901204,-16.901204,-16.901204,382.595052,382.595052,382.595052,0.0,0.0,0.0,69941.274862


In [156]:

features_list = NUMERICAL+DUMMIES+FORECAST_IND
to_predict = 'is_positive_growth_3d_future'

train_df = new_df[new_df.split.isin(['train','validation'])].copy(deep=True)
test_df = new_df[new_df.split.isin(['test'])].copy(deep=True)

X_train = train_df[features_list+[to_predict,'Date','Ticker']]
X_test = test_df[features_list+[to_predict,'Date','Ticker']]

print(f'length: X_train {X_train.shape},  X_test {X_test.shape}')

length: X_train (161172, 259),  X_test (36045, 259)


In [157]:

# Disable SettingWithCopyWarning
pd.options.mode.chained_assignment = None  # default='warn'

X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

# Need to fill NaNs somehow
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

print(f'length: X_train_imputed {X_train.shape},  X_test_imputed {X_test.shape}')

length: X_train_imputed (161172, 259),  X_test_imputed (36045, 259)


In [158]:
full_df = pd.concat([X_train, X_test])
btc_df = full_df[full_df.Ticker == 'BTC-USD']
btc_df = btc_df.drop(columns=['Ticker_BTC-USD', 'Ticker','avgprice', 'medprice', 'typprice', 'wclprice'])


btc_df['Year'] = btc_df['Date'].dt.year
btc_df['Month'] = btc_df['Date'].dt.month
btc_df['Day'] = btc_df['Date'].dt.day
btc_df = btc_df.drop(columns=['Date'])

btc_df.shape

(3699, 255)

In [151]:
pd.set_option('display.max_rows', None)

btc_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3689,3690,3691,3692,3693,3694,3695,3696,3697,3698
growth_1d,0.0,0.9280744,0.9301574,1.035735,0.9753415,1.008352,1.083647,0.9711192,0.9725169,0.9826301,...,0.9862056,1.026024,0.9777198,1.005586,1.013648,1.029125,1.040235,0.9947614,0.9706336,0.9881495
growth_3d,0.0,0.0,0.0,0.8941036,0.9396405,1.018632,1.065754,1.06114,1.023429,0.9280251,...,0.9627617,1.011774,0.9893263,1.008768,0.9965998,1.048998,1.085142,1.064924,1.004398,0.9541066
growth_7d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.925374,0.9696871,1.02439,...,0.9825399,1.011294,0.9740367,0.9802811,0.9844583,1.037702,1.079557,1.088923,1.030136,1.041125
growth_30d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.048988,1.060015,1.055418,1.028132,1.032507,1.061015,1.107948,1.142272,1.154153,1.144317
growth_90d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.009957,1.003666,0.9827332,0.9818159,1.016603,1.055992,1.125369,1.106828,1.143289,1.143424
growth_365d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.959563,1.975521,1.951082,1.976263,1.992671,2.024054,2.107696,2.08665,1.981395,1.985878
growth_dax_1d,1.002966,1.014142,1.000115,0.0,0.0,0.9949262,0.9841521,1.006976,0.9842724,0.9979537,...,0.9977195,1.003374,1.001059,0.0,0.0,1.003495,0.9972584,0.9886677,0.9906633,1.00265
growth_dax_3d,1.001074,1.014338,1.017267,0.0,0.0,1.009112,0.9792716,0.9859898,0.9754316,0.989111,...,0.9857687,0.9990653,1.002146,0.0,0.0,1.007947,1.001804,0.9894033,0.9767515,0.9820326
growth_dax_7d,0.9901076,1.009003,1.010215,0.0,0.0,1.006012,0.9941872,1.000242,0.9872396,0.982306,...,0.9933018,0.9977836,1.001584,0.0,0.0,0.9973564,0.9908788,0.9895253,0.9822689,0.9871233
growth_dax_30d,1.05821,1.083987,1.08768,0.0,0.0,1.061956,1.057948,1.050342,1.030884,1.043766,...,1.057138,1.049929,1.040867,0.0,0.0,1.048221,1.040157,1.029172,1.003955,1.0218


In [161]:
from pycaret.classification import *

In [162]:
exp_clf101 = setup(data = btc_df, target = 'is_positive_growth_3d_future',
                   session_id=123, use_gpu=True, index=False, train_size=0.91)

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bi

Unnamed: 0,Description,Value
0,Session id,123
1,Target,is_positive_growth_3d_future
2,Target type,Binary
3,Original data shape,"(3699, 255)"
4,Transformed data shape,"(3699, 255)"
5,Transformed train set shape,"(3366, 255)"
6,Transformed test set shape,"(333, 255)"
7,Numeric features,254
8,Preprocess,True
9,Imputation type,simple


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1


In [163]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.7044,0.7649,0.7715,0.7115,0.7401,0.3986,0.4008,0.502
et,Extra Trees Classifier,0.7029,0.7603,0.7606,0.7144,0.7366,0.3967,0.3979,0.311
xgboost,Extreme Gradient Boosting,0.6934,0.7467,0.7595,0.7031,0.7299,0.3765,0.3785,0.596
lightgbm,Light Gradient Boosting Machine,0.6922,0.7359,0.7666,0.6992,0.7312,0.3729,0.3753,0.391
gbc,Gradient Boosting Classifier,0.6218,0.6606,0.7688,0.6252,0.6893,0.2189,0.227,14.109
knn,K Neighbors Classifier,0.6168,0.6586,0.6431,0.6516,0.6469,0.2279,0.2282,0.084
dt,Decision Tree Classifier,0.6004,0.5968,0.6361,0.6336,0.6344,0.1938,0.1942,0.642
ada,Ada Boost Classifier,0.5808,0.5995,0.6926,0.6008,0.6433,0.1411,0.1433,2.779
ridge,Ridge Classifier,0.5754,0.598,0.6746,0.5985,0.6342,0.1326,0.1341,0.189
lda,Linear Discriminant Analysis,0.5671,0.598,0.6512,0.5946,0.6216,0.1184,0.1191,0.212


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

  master_display_.apply(


In [110]:
# plot_model(best_model, plot = 'auc')

In [48]:
plot_model(best_model, plot='feature')

TypeError: Feature Importance and RFE plots not available for estimators that doesnt support coef_ or feature_importances_ attribute.

In [111]:
# plot_model(best_model, plot = 'confusion_matrix')

In [49]:
predict_model(best_model);

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Dummy Regressor,0.0235,0.0012,0.035,-0.0027,0.0175,0.0237


In [107]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.3-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.2/233.2 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [122]:
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import numpy as np

# Define X and y
X = btc_df.drop(columns=['growth_future_5d'])
y = btc_df['growth_future_5d']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.91, random_state=123)

# Define the objective function for Optuna
def objective(trial):
    # Define the hyperparameters to optimize
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.3),
        'random_state': 123,
        'use_label_encoder': False,
        'enable_categorical': True,
        'tree_method': 'gpu_hist'
    }

    # Create and train the model
    model = XGBRegressor(**params)
    model.fit(X_train, y_train)

    # Make predictions and calculate MSE
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return mse  # Optuna will minimize this

# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50, timeout=600)  # 50 trials or 10 minutes

# Best parameters and model
best_params = study.best_params
best_model = XGBRegressor(**best_params, random_state=123, use_label_encoder=False, enable_categorical=True)
best_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

# Print results
print(f"Best Parameters: {best_params}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Absolute Percentage Error (MAPE): {mape}")

[I 2024-11-01 07:39:36,456] A new study created in memory with name: no-name-08b1399d-883d-4c3f-83e8-671284a77ec7
[I 2024-11-01 07:39:36,853] Trial 0 finished with value: 0.02327066002683815 and parameters: {'n_estimators': 154, 'learning_rate': 0.09756821294506071, 'max_depth': 3, 'subsample': 0.6717892199804592, 'colsample_bytree': 0.9751524032103661, 'gamma': 0.13212842054524315}. Best is trial 0 with value: 0.02327066002683815.
[I 2024-11-01 07:39:37,444] Trial 1 finished with value: 0.028078923856060246 and parameters: {'n_estimators': 303, 'learning_rate': 0.062427856527106106, 'max_depth': 7, 'subsample': 0.9971426247150059, 'colsample_bytree': 0.8714171345489782, 'gamma': 0.05901680552635262}. Best is trial 0 with value: 0.02327066002683815.
[I 2024-11-01 07:39:37,804] Trial 2 finished with value: 0.018900824588698056 and parameters: {'n_estimators': 152, 'learning_rate': 0.12019422231665244, 'max_depth': 6, 'subsample': 0.6738651919313681, 'colsample_bytree': 0.731524829447620

Best Parameters: {'n_estimators': 158, 'learning_rate': 0.11920035653586827, 'max_depth': 6, 'subsample': 0.6219340335788875, 'colsample_bytree': 0.763458316218074, 'gamma': 0.0004379579579139557}
Mean Squared Error (MSE): 0.017747483000207034
Root Mean Squared Error (RMSE): 0.13321967947794738
Mean Absolute Error (MAE): 0.06708735437003288
Mean Absolute Percentage Error (MAPE): 16470786422979.957
