In [2]:
import os
import sys
from pathlib import Path
import cProfile
import pstats


import pandas as pd
import sqlite3
import numpy as np
from tqdm.notebook import tqdm
from sklearn.preprocessing import MinMaxScaler
from Pattern.perceptually_important import find_pips
from sklearn.metrics import mean_squared_error
from Pattern.Utils.multi_config_recognizer import ConfigBasedRecognizer

from Data.Database.db import Database

from RL.Data.Utils.preprocessor import preprocess_data

 # Connect to database

# Module-level database handle; load_training_data() below reads it as a global.
# The path is relative, so it presumably resolves against the notebook's working directory.
db = Database("../../data/Storage/data.db")


Initializing database connection to ../../data/Storage/data.db
Connected to SQLite database: ../../data/Storage/data.db


# Main Function

In [None]:
from matplotlib import pyplot as plt


def load_training_data(stock_id=1, timeframe_id=5, start_date="2024-01-01", end_date="2025-01-01", window_size=48):
    """
    Build pattern-based training samples for one stock and timeframe.

    Price data is fetched through the module-level ``db`` and run through
    ``preprocess_data``. A ``ConfigBasedRecognizer`` (default technique
    ``"svm"``) is trained, then every bar that has a full trailing window is
    scored: the recognizer predicts the best cluster for the window, and bars
    with no cluster or with an ``expected_value`` below 0.01 are dropped.

    Args:
        stock_id: ID of the stock to process.
        timeframe_id: ID of the timeframe to process.
        start_date, end_date: Date range passed to ``db.get_stock_data_range``.
        window_size: Number of bars before the current bar included in each
            window (a window holds ``window_size + 1`` closes). The last
            ``window_size + 1`` bars of the series are never scored.

    Returns:
        list[dict]: One dict per accepted bar holding the cluster statistics
        and the preprocessed feature columns for that bar.

    Raises:
        IndexError: If the best cluster refers to a ``config_id`` that is not
            present in the configs loaded for this stock and timeframe.
    """
    # Columns copied unchanged from the preprocessed frame into every sample.
    feature_columns = (
        "rsi",
        "atr",
        "atr_ratio",
        "unified_sentiment",
        "change_nonrept_long",
        "change_nonrept_short",
        "change_noncommercial_long",
        "change_noncommercial_short",
        "change_noncommercial_delta",
        "change_nonreportable_delta",
        "hour_sin",
        "hour_cos",
        "day_sin",
        "day_cos",
        "asian_session",
        "london_session",
        "ny_session",
    )
    # Integer action codes by cluster label; any other label maps to 0.
    action_codes = {"Buy": 1, "Sell": 2}

    # Fetch and preprocess stock data.
    stock_data = db.get_stock_data_range(stock_id, timeframe_id, start_date, end_date)
    stock_data = preprocess_data(stock_data)

    close_prices = stock_data["close_price"].values
    n_prices = len(close_prices)

    # Train the recognizer.
    recognizer = ConfigBasedRecognizer(db, default_technique="svm")
    recognizer.train_recognizer(stock_id, timeframe_id)

    # Configs and clusters are loaded once and reused for every window.
    configs = db.get_configs_by_stock_and_timeframe(stock_id, timeframe_id)
    clusters = db.get_clusters_by_stock_id(stock_id)

    data_samples = []

    for i in tqdm(range(window_size, n_prices - window_size - 1), desc="Processing price windows"):
        date = stock_data.index[i]
        window = close_prices[i - window_size:i + 1]

        best_cluster, _ = recognizer.predict_best_cluster(stock_id, timeframe_id, window, configs, clusters)
        if best_cluster is None:
            continue

        best_cluster = best_cluster.to_dict()

        # Validation criteria: keep only clusters whose expected value is at least 0.01.
        if best_cluster["expected_value"] < 0.01:
            continue

        config_id = best_cluster["config_id"]

        # Look up the hold period of the config that produced the best cluster.
        matching_configs = configs[configs["config_id"] == config_id]
        if matching_configs.empty:
            raise IndexError(
                f"No config with config_id={config_id!r} found for "
                f"stock_id={stock_id!r}, timeframe_id={timeframe_id!r}"
            )
        hold_period = matching_configs.iloc[0]["hold_period"]

        # Drop bars whose hold period would run past the end of the series
        # instead of failing with IndexError on the look-ahead bar.
        # NOTE(review): the realised return over the hold period is not stored
        # in the sample. If it is meant to be a training label, compute it here
        # from close_prices[i] and close_prices[i + hold_period] and add it to
        # the sample explicitly.
        if i + hold_period >= n_prices:
            continue

        # One label lookup per bar instead of one per feature column.
        row = stock_data.loc[date]

        sample = {
            "date": date,
            "config_id": config_id,
            "timeframe_id": timeframe_id,
            "probability": best_cluster["probability_score_dir"],
            "action": action_codes.get(best_cluster["label"], 0),
            "reward_risk_ratio": best_cluster["reward_risk_ratio"],
            "max_gain": best_cluster["max_gain"],
            "max_drawdown": best_cluster["max_drawdown"],
            "mse": best_cluster["mse"],
            "expected_value": best_cluster["expected_value"],
        }
        sample.update((column, row[column]) for column in feature_columns)
        data_samples.append(sample)

    return data_samples

In [None]:
  # Example usage
# Profile one full dataset build and save the raw profile for offline inspection.
with cProfile.Profile() as pr:
    # Adjust the arguments as needed, e.g. a one-week range
    # (start_date="2024-01-01", end_date="2024-01-08") for a quick run.
    data_samples = load_training_data(
        stock_id=1,
        timeframe_id=5,
        start_date="2024-01-01",
        end_date="2025-01-01",
    )

# sort_stats() returns the Stats object itself, so the call can be chained.
stats_enhanced = pstats.Stats(pr).sort_stats(pstats.SortKey.TIME)
stats_enhanced.dump_stats('rl_data_process.prof')


Processing price windows:   0%|          | 0/6115 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [4]:
# Turn the list of per-bar sample dicts returned by load_training_data() into a DataFrame.
# NOTE: this rebinds `data_samples` from a list to a DataFrame; the export cell
# further down calls DataFrame methods on it, so this cell must run first.
data_samples = pd.DataFrame(data_samples)
data_samples

Unnamed: 0,date,config_id,timeframe_id,probability,action,reward_risk_ratio,max_gain,max_drawdown,mfe,mae,...,change_noncommercial_short,change_noncommercial_delta,change_nonreportable_delta,hour_sin,hour_cos,day_sin,day_cos,asian_session,london_session,ny_session
0,2024-01-04 03:00:00,63,5,0.705882,1,2.647162,0.004138,-0.001563,0.002835,0.000000,...,932.0,-69.0,3202.0,0.707107,7.071068e-01,0.433884,-0.900969,1.0,0.0,0.0
1,2024-01-04 04:00:00,64,5,0.772727,1,2.374708,0.002082,-0.000877,0.001497,-0.000739,...,932.0,-69.0,3202.0,0.866025,5.000000e-01,0.433884,-0.900969,1.0,0.0,0.0
2,2024-01-04 05:00:00,64,5,0.772727,1,2.374708,0.002082,-0.000877,0.001477,-0.000758,...,932.0,-69.0,3202.0,0.965926,2.588190e-01,0.433884,-0.900969,1.0,0.0,0.0
3,2024-01-04 06:00:00,64,5,0.772727,1,2.374708,0.002082,-0.000877,0.002413,0.000000,...,932.0,-69.0,3202.0,1.000000,6.123234e-17,0.433884,-0.900969,1.0,0.0,0.0
4,2024-01-04 07:00:00,64,5,0.772727,1,2.374708,0.002082,-0.000877,0.002080,0.000000,...,932.0,-69.0,3202.0,0.965926,-2.588190e-01,0.433884,-0.900969,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5834,2024-12-27 16:00:00,62,5,1.000000,1,3.000028,0.002764,-0.000921,0.001847,0.000000,...,850.0,-14412.0,2515.0,-0.866025,-5.000000e-01,-0.433884,-0.900969,0.0,0.0,1.0
5835,2024-12-27 17:00:00,79,5,0.625000,2,3.209205,-0.004798,0.001495,0.000917,-0.001085,...,850.0,-14412.0,2515.0,-0.965926,-2.588190e-01,-0.433884,-0.900969,0.0,0.0,1.0
5836,2024-12-27 18:00:00,71,5,0.750000,2,3.208704,-0.005791,0.001805,0.003258,-0.000982,...,850.0,-14412.0,2515.0,-1.000000,-1.836970e-16,-0.433884,-0.900969,0.0,0.0,1.0
5837,2024-12-27 19:00:00,62,5,1.000000,1,3.000028,0.002764,-0.000921,0.003051,-0.000241,...,850.0,-14412.0,2515.0,-0.965926,2.588190e-01,-0.433884,-0.900969,0.0,0.0,1.0


In [None]:
# Persist the dataset twice: a CSV copy next to the notebook and a `dataset`
# table in the samples SQLite database (any existing table is replaced).
data_samples.to_csv("data_samples_output.csv", index=False)
conn = sqlite3.connect("../../RL/Data/Storage/samples.db")
try:
    data_samples.to_sql("dataset", conn, if_exists="replace", index=False)
finally:
    # Close the connection even when the write fails, so the database file
    # is not left open by a lingering handle in the kernel.
    conn.close()


In [9]:
# Read the stored dataset back from SQLite to check that the export round-trips.
conn = sqlite3.connect("../../RL/Data/Storage/samples.db")
try:
    # Plain string: the query has no placeholders, so no f-string is needed.
    df = pd.read_sql_query("SELECT * FROM dataset", conn)
finally:
    # Close the connection even when the query fails.
    conn.close()
print(df.head())

                  date  config_id  timeframe_id  probability  action  \
0  2024-01-04 03:00:00         63             5     0.705882       1   
1  2024-01-04 04:00:00         64             5     0.772727       1   
2  2024-01-04 05:00:00         64             5     0.772727       1   
3  2024-01-04 06:00:00         64             5     0.772727       1   
4  2024-01-04 07:00:00         64             5     0.772727       1   

   reward_risk_ratio  max_gain  max_drawdown       mfe       mae  ...  \
0           2.647162  0.004138     -0.001563  0.002835  0.000000  ...   
1           2.374708  0.002082     -0.000877  0.001497 -0.000739  ...   
2           2.374708  0.002082     -0.000877  0.001477 -0.000758  ...   
3           2.374708  0.002082     -0.000877  0.002413  0.000000  ...   
4           2.374708  0.002082     -0.000877  0.002080  0.000000  ...   

   change_noncommercial_short  change_noncommercial_delta  \
0                       932.0                       -69.0   
1     