In [None]:
#!git clone https://github.com/triet4p/itapia.git

# 1. Preparing data and libraries

In [None]:
import sys
import os

# Make the project sources under ./ai_service_quick importable (the `app.*`
# imports further down rely on this entry being on sys.path).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
AI_SERVICE_PATH = "./ai_service_quick"
if AI_SERVICE_PATH not in sys.path:
    sys.path.append(AI_SERVICE_PATH)

In [None]:
import os
from getpass import getpass

# Kaggle credentials are exposed through environment variables.
# Values already present in the environment are respected instead of being
# overwritten, so a real key injected from outside the notebook is not
# clobbered by a placeholder.
os.environ.setdefault('KAGGLE_USERNAME', 'trietp1253201581')

if not os.environ.get('KAGGLE_KEY'):
    # Never commit the API key to the notebook: ask for it interactively
    # (input is not echoed) when it has not been provided via the environment.
    os.environ['KAGGLE_KEY'] = getpass('Kaggle API key: ')

In [None]:
#!pip install python-dotenv

In [None]:
from datetime import datetime
import pandas as pd
import numpy as np
import pickle
import json

from numpy.lib.stride_tricks import sliding_window_view

# Machine Learning & Validation
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor, LGBMClassifier 
#from xgboost import XGBRegressor, XGBClassifier
# Hyperparameter Tuning (recommended)
import optuna 

# Explainability
import shap

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

from app.forecasting.training import TrainingOrchestrator
from app.forecasting.task.triple_barrier import TripleBarrierTask, find_triple_barrier_optimal_params
from app.forecasting.task.ndays_distribution import NDaysDistributionTask
from app.forecasting.model import ScikitLearnForecastingModel
from app.forecasting.training.data_split import train_test_split
from app.forecasting.post_processing import NDaysDistributionPostProcessor, RoundingProcessor
from app.forecasting.training.optim import LGBMClassifierObjective, MultiOutLGBMRegressionObjective, get_best_params_for_kernel_model
from app.forecasting.training.data_split import get_walk_forward_splits
from app.core.utils import FORECASTING_TRAINING_BONUS_FEATURES, FORECASTING_TRAINING_SCORE_WEIGHTS
import app.core.config as cfg

# Notebook configuration
MAX_DISPLAY_COLUMNS = 100  # column limit pandas uses when rendering frames
pd.set_option('display.max_columns', MAX_DISPLAY_COLUMNS)

# Load SHAP's JavaScript so its interactive plots can render in the notebook.
shap.initjs()

In [None]:
# Sector code: selects the training CSV that is loaded and is embedded in
# every task id built below.
SECTOR: str = "ENER"

In [None]:
# Read the sector's training CSV, indexed by its `datetime_utc` column.
training_csv_path = f'/kaggle/input/itapia-training-data/training_{SECTOR}.csv'
enriched_df = pd.read_csv(training_csv_path, index_col='datetime_utc')

# The index is read as plain values; convert it to datetimes explicitly.
enriched_df.index = pd.to_datetime(enriched_df.index)

# Work on a copy so `enriched_df` stays untouched by later steps.
df = enriched_df.copy()

In [None]:
# Split used only for the triple-barrier parameter search in the next cell.
# NOTE(review): these boundaries (2025-01-02 / 2025-06-20) differ from the ones
# later passed to orchestrator.split_data (2024-12-31 / 2025-05-31), and the
# same variable names are reassigned there — confirm this is intentional.
train_test_split_date = datetime(year=2025, month=1, day=2)
test_last_date = datetime(year=2025, month=6, day=20)

split_frames = train_test_split(df, train_test_split_date, test_last_date)
df_train, df_test = split_frames

In [None]:
# Candidate grids handed to the triple-barrier parameter search.
horizon_grid = [5, 10, 15, 20]
tp_pct_grid = np.arange(0.025, 0.1, 0.01)
sl_pct_grid = np.arange(0.01, 0.07, 0.007)

# The search works on the `close` column of the train/test frames and returns
# the selected parameters together with a results frame.
search_outcome = find_triple_barrier_optimal_params(
    df_train,
    df_test,
    'close',
    horizons=horizon_grid,
    tp_pcts=tp_pct_grid,
    sl_pcts=sl_pct_grid,
)
best_params, results_df = search_outcome

In [None]:
# Build the training orchestrator on the full, unsplit frame `df`; the
# train/test split is requested later through orchestrator.split_data(...).
orchestrator = TrainingOrchestrator(df)

In [None]:
# Task 1: triple-barrier task for the current sector.
task1_id = cfg.TASK_ID_SECTOR_TEMPLATE.format(problem=cfg.TRIPLE_BARRIER_PROBLEM_ID,
                                              sector=SECTOR)

# Parameters selected by the search above, in the order the task receives them.
barrier_args = (best_params['h'], best_params['tp_pct'], best_params['sl_pct'])

# NOTE(review): the trailing positional 7 and 45 are also used for the
# N-days tasks below; their meaning is not visible here — confirm against
# the task constructors.
task1 = TripleBarrierTask(task1_id, *barrier_args, 7, 45)

# Pair the task with an 'LGBM' model and hand both to the orchestrator.
model1 = ScikitLearnForecastingModel('LGBM')
orchestrator.register_model_for_task(model1, task1)

In [None]:
# Post-processor shared by both 'Multi-LGBM' models below (it is the last
# entry of each model's post_processors list).
# NOTE(review): presumably rounds predictions to 4 decimal places — confirm
# against RoundingProcessor.
rnd_prc = RoundingProcessor(4)

In [None]:
def _make_ndays_task(problem_id, n_days):
    """Build the id and the NDaysDistributionTask for one problem.

    The id is rendered from cfg.TASK_ID_SECTOR_TEMPLATE with the given problem
    id and the notebook-level SECTOR. `n_days` is forwarded as the task's second
    positional argument (presumably the horizon in days — confirm against
    NDaysDistributionTask); the trailing 7 and 45 are the same constants used
    for the triple-barrier task.

    Returns a ``(task_id, task)`` tuple.
    """
    task_id = cfg.TASK_ID_SECTOR_TEMPLATE.format(problem=problem_id, sector=SECTOR)
    return task_id, NDaysDistributionTask(task_id, n_days, 7, 45)


task2_id, task2 = _make_ndays_task(cfg.REG_5D_DIS_PROBLEM_ID, 5)
task3_id, task3 = _make_ndays_task(cfg.REG_20D_DIS_PROBLEM_ID, 20)

In [None]:
# One distribution post-processor per N-days task (task2 first, then task3).
n5d_prc, n20d_prc = (
    NDaysDistributionPostProcessor(ndays_task) for ndays_task in (task2, task3)
)

In [None]:
# Both regression models are 'Multi-LGBM' models; each applies its own
# distribution post-processor first and the shared rounding processor second.
MULTI_LGBM = 'Multi-LGBM'
model_2 = ScikitLearnForecastingModel(MULTI_LGBM, post_processors=[n5d_prc, rnd_prc])
model_3 = ScikitLearnForecastingModel(MULTI_LGBM, post_processors=[n20d_prc, rnd_prc])

# Register each model with its task, in the same order as it was built.
for forecasting_model, forecasting_task in ((model_2, task2), (model_3, task3)):
    orchestrator.register_model_for_task(forecasting_model, forecasting_task)

In [None]:
# Ask the orchestrator to prepare targets now that all three tasks are registered.
# NOTE(review): presumably builds the target columns for every registered
# task — confirm against TrainingOrchestrator.prepare_all_targets.
orchestrator.prepare_all_targets()

In [None]:
# Feature-selection settings: project-level score weights and bonus features,
# with a 1.1 multiplier for the bonus features.
feature_selection_kwargs = {
    'weights': FORECASTING_TRAINING_SCORE_WEIGHTS,
    'bonus_features': FORECASTING_TRAINING_BONUS_FEATURES,
    'bonus_multiplier': 1.1,
}
orchestrator.run_feature_selection(**feature_selection_kwargs)

In [None]:
# Boundaries for the orchestrator's own train/test split.
# NOTE(review): this reassigns train_test_split_date / test_last_date with
# values that differ from the earlier parameter-search split
# (2025-01-02 / 2025-06-20) — confirm the mismatch is intentional.
train_test_split_date = datetime(year=2024, month=12, day=31)
test_last_date = datetime(year=2025, month=5, day=31)

orchestrator.split_data(train_test_split_date, test_last_date)

In [None]:
# Tune on a copy of the orchestrator's training partition so the orchestrator's
# own frame is not modified from here.
train_df = orchestrator._train_df.copy()
# Optional: restrict tuning to recent history only.
#train_df = train_df[train_df.index >= pd.to_datetime('2023-01-01')]

# Walk-forward splits: 4 validation months, at most 30 training months.
generator = get_walk_forward_splits(train_df, validation_months=4, max_train_months=30)

# Objective for the LGBM classifier (model1); its score is maximised.
obj1_settings = {
    'direction': 'maximize',
    'generator': generator,
    'time_weighted': 'new-prior',
    'weight_bias': 1,
    'max_cv': 3,
}
obj1 = LGBMClassifierObjective(model1, train_df, **obj1_settings)

In [None]:
# Settings shared by both multi-output regression objectives (score is minimised).
walk_forward_kwargs = {'validation_months': 4, 'max_train_months': 30}
regression_objective_kwargs = {
    'direction': 'minimize',
    'time_weighted': 'new-prior',
    'weight_bias': 1,
    'max_cv': 3,
}

# Each objective receives its own freshly created split generator.
generator = get_walk_forward_splits(train_df, **walk_forward_kwargs)
obj2 = MultiOutLGBMRegressionObjective(model_2, train_df,
                                       generator=generator,
                                       **regression_objective_kwargs)

generator = get_walk_forward_splits(train_df, **walk_forward_kwargs)
obj3 = MultiOutLGBMRegressionObjective(model_3, train_df,
                                       generator=generator,
                                       **regression_objective_kwargs)

In [None]:
# Search hyper-parameters for the classifier objective (120 trials), then
# install an LGBMClassifier built from the result as model1's kernel template.
md1_params = get_best_params_for_kernel_model(obj1, n_trials=120)
tuned_classifier = LGBMClassifier(**md1_params)
model1.kernel_model_template = tuned_classifier

In [None]:
# Search hyper-parameters for obj2 (120 trials); the tuned LGBMRegressor is
# wrapped in a MultiOutputRegressor (n_jobs=-1) and becomes model_2's template.
md2_params = get_best_params_for_kernel_model(obj2, n_trials=120)
tuned_regressor_2 = LGBMRegressor(**md2_params)
model_2.kernel_model_template = MultiOutputRegressor(tuned_regressor_2, n_jobs=-1)

In [None]:
# Search hyper-parameters for obj3 (120 trials); the tuned LGBMRegressor is
# wrapped in a MultiOutputRegressor (n_jobs=-1) and becomes model_3's template.
md3_params = get_best_params_for_kernel_model(obj3, n_trials=120)
tuned_regressor_3 = LGBMRegressor(**md3_params)
model_3.kernel_model_template = MultiOutputRegressor(tuned_regressor_3, n_jobs=-1)

In [None]:
# Run walk-forward validation with the tuned kernel templates.
# NOTE(review): the positional 4 presumably is the validation window in months,
# matching get_walk_forward_splits(validation_months=4, max_train_months=30)
# used during tuning — confirm against the orchestrator's signature.
orchestrator.run_walk_forward_validation(4, max_train_months=30)

In [None]:
import app.core.config as cfg

In [None]:
# Final step: train and register the models, passing the Kaggle username
# exposed by the project config.
# NOTE(review): presumably publishes under that Kaggle account and therefore
# needs valid Kaggle credentials in the environment — confirm.
orchestrator.run_final_training_and_registration(cfg.KAGGLE_USERNAME)