In [1]:
import sys
import os
from pathlib import Path
# Put the parent of the current working directory on sys.path so the
# project-local packages imported below can be resolved from this notebook.
current_dir = Path(os.getcwd())
main_dir = str(current_dir.parent) 
sys.path.append(main_dir)


import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from Pattern.perceptually_important import find_pips
from sklearn.metrics import mean_squared_error


from Data.db import Database
from Sentiment.alphavantage_api import get_news_sentiment_analysis
from sklearn.svm import SVC
# Shared database handle used by every helper in this notebook; the path is
# relative to the notebook's working directory.
db = Database("../Data/data.db")
from TwitterAPI_Sentiment import TwitterAPI
# Twitter sentiment client, constructed on top of the same database handle.
twitter_api = TwitterAPI(db)


Connected to offline sqlite database


  from .autonotebook import tqdm as notebook_tqdm


# Get Trained SVM on Clusters

In [2]:
def get_svm_trained_model(stock_id):
    """
    Fit an RBF-kernel SVM on the stored cluster centres of one stock.

    Every cluster row contributes exactly one training sample: its
    ``AVGPricePoints`` string parsed into floats, labelled with the row's
    position (0, 1, 2, ...) in the returned frame.

    Args:
        stock_id: identifier handed to ``db.get_clusters_by_stock_id``.

    Returns:
        tuple: ``(svm, clusters, cluster_features)`` - the fitted classifier,
        the cluster frame as returned by the database, and the parsed feature
        matrix with one row per cluster.
    """
    clusters = db.get_clusters_by_stock_id(stock_id)

    # AVGPricePoints holds comma-separated numbers; turn each string into a float vector.
    parsed_rows = [
        np.array(raw_points.split(','), dtype=float)
        for raw_points in clusters['AVGPricePoints'].values
    ]
    cluster_features = np.array(parsed_rows)

    # Positional labels: the predicted class is the row position of the cluster.
    labels = np.array(list(range(len(cluster_features))))

    model = SVC(probability=True, kernel='rbf')
    model.fit(cluster_features, labels)

    return model, clusters, cluster_features

# Get Stock Data

In [3]:
def get_stock_data(stock_id, start_date=None, end_date=None):
    """
    Load one stock's rows for a date range, ordered by index.

    Args:
        stock_id: identifier handed to ``db.get_stock_data_range``.
        start_date: lower bound forwarded to the database call (may be None).
        end_date: upper bound forwarded to the database call (may be None).

    Returns:
        tuple: ``(prices, stock_data)`` - the ``ClosePrice`` column as an
        array, and the index-sorted frame it was taken from.
    """
    sorted_frame = db.get_stock_data_range(stock_id, start_date, end_date).sort_index()
    close_prices = sorted_frame['ClosePrice'].values
    return close_prices, sorted_frame


# Extract PIP

In [4]:
def extract_pips(window, n_pips=5, dist_type=3):
    """
    Extract PIPs from a price window and min-max scale their values.

    Args:
        window: sequence of prices handed to ``find_pips``.
        n_pips: number of points requested from ``find_pips``.
        dist_type: distance-measure selector forwarded to ``find_pips``.

    Returns:
        tuple | None: ``(x, norm_y)`` on success, where ``x`` is whatever
        ``find_pips`` returned as the point positions and ``norm_y`` is the
        1-D array of point values scaled with ``MinMaxScaler``. ``None`` when
        extraction or scaling raises an ordinary exception, so callers must
        check for ``None`` before unpacking.
    """
    try:
        x, y = find_pips(window, n_pips, dist_type)
        scaler = MinMaxScaler()
        norm_y = scaler.fit_transform(np.array(y).reshape(-1, 1)).flatten()
        return x, norm_y
    except Exception:
        # Best-effort: a window that cannot be processed is reported as None.
        # Only Exception is caught so that KeyboardInterrupt / SystemExit still
        # stop a long-running notebook loop instead of being silently swallowed.
        return None

In [None]:
def load_training_data(stock_id=3, start_date="2024-01-01", end_date="2025-01-01", lookahead_hours=6, window_size=24 , n_pips=5 , mse_threshold=0.03, ticker="AAPL"):
    """
    Build training samples by sliding a price window over one stock's history.

    For each position ``i`` the last ``window_size + 1`` close prices are
    reduced to ``n_pips`` scaled PIPs, assigned to a stored cluster by the SVM
    and, if the pattern is close enough to that cluster, combined with the
    day's sentiment scores and the realised forward return.

    Args:
        stock_id: identifier used for clusters, prices and tweet sentiment.
        start_date: start of the price range forwarded to ``get_stock_data``.
        end_date: end of the price range forwarded to ``get_stock_data``.
        lookahead_hours: number of rows ahead of ``i`` used for the future price.
        window_size: number of rows before ``i`` included in each window.
        n_pips: number of PIPs extracted per window.
        mse_threshold: windows whose MSE against the predicted cluster exceeds
            this value are skipped.
        ticker: symbol passed to ``get_news_sentiment_analysis``. Defaults to
            ``"AAPL"``, the value that used to be hard-coded, so existing calls
            behave as before; pass the symbol that matches ``stock_id``.

    Returns:
        list[dict]: one dict per accepted window with the keys ``date``,
        ``pattern`` (cluster statistics plus ``high`` / ``low``), ``sentiment``,
        ``price`` and ``actual_return``.
    """
    svm, clusters, cluster_features = get_svm_trained_model(stock_id)
    prices, stock_data = get_stock_data(stock_id, start_date, end_date)

    action_codes = {"Buy": 1, "Sell": 2}  # any other label is encoded as 0
    sentiment_cache = {}
    data_samples = []
    dates = stock_data.index

    # Global PIP positions of the last accepted pattern, used to skip repeats.
    last_pips_x = [0] * n_pips

    for i in range(window_size, len(dates) - lookahead_hours):
        date = dates[i]
        print(f"Processing date: {date}")
        window = prices[i-window_size:i+1]

        # extract_pips returns None on failure, so check before unpacking.
        extracted = extract_pips(window, n_pips=n_pips, dist_type=3)
        if extracted is None:
            continue
        pips_x, pips_y = extracted
        if pips_y is None:
            continue

        # Convert window-relative positions to positions in the full series.
        start_i = i - window_size
        global_pips_x = [x + start_i for x in pips_x]

        # A window counts as a repeat when any interior PIP sits at the same
        # global position as in the previously accepted pattern.
        if any(global_pips_x[j] == last_pips_x[j] for j in range(1, n_pips - 1)):
            print("Pattern already seen, skipping...")
            continue

        # Remember this pattern for the next comparison.
        last_pips_x = global_pips_x

        # extract_pips already min-max scales the values, so no second scaling pass.
        pips_y_normalized = np.asarray(pips_y, dtype=float)

        # Predict cluster; labels are row positions, so look the row up positionally.
        cluster_id = svm.predict(pips_y_normalized.reshape(1, -1))[0]
        cluster = clusters.iloc[cluster_id]

        # Reject windows that are too far from the predicted cluster centre.
        mse = mean_squared_error(cluster_features[cluster_id], pips_y_normalized)
        if mse > mse_threshold:
            print(f"Skipping due to high MSE: {mse}")
            continue

        current_price = prices[i]
        future_price = prices[i + lookahead_hours]
        actual_return = (future_price - current_price) / current_price

        # === Calculate MFE and MAE ===
        # NOTE(review): this slice spans prices[i] .. prices[i + lookahead_hours - 1],
        # i.e. it includes the entry row but not the row used for actual_return.
        # Kept as-is; confirm that is the intended horizon.
        window_mfe_mae = prices[i:i+lookahead_hours]
        max_price = np.max(window_mfe_mae)
        min_price = np.min(window_mfe_mae)
        high = (max_price - current_price) / current_price
        low = (min_price - current_price) / current_price

        # === Get Sentiment (cache daily) ===
        date_key = str(date.date())

        if date_key not in sentiment_cache:
            sentiment_news = get_news_sentiment_analysis(ticker, date_key)
            sentiment_twitter = twitter_api.get_tweets_sentiment_analysis(ticker_id=stock_id, specific_date=date_key)
            # Treat a missing payload like an empty one so the defaults below apply.
            if sentiment_news is None:
                sentiment_news = {}
            if sentiment_twitter is None:
                sentiment_twitter = {}
            sentiment_cache[date_key] = {
                "impact_score": sentiment_news.get("Predicted Impact Score", 0),
                "news_score": sentiment_news.get("Predicted News Sentiment Score", 0),
                "twitter_score": sentiment_twitter.get("tweets_sentiment_score", 0),
            }

        sentiment = sentiment_cache[date_key]

        # === Build Training Sample ===
        data_samples.append({
            "date": date,
            "pattern": {
                "probability": cluster["ProbabilityScore"],
                "action": action_codes.get(cluster["Label"], 0),
                # Epsilon is added outside abs() so the denominator can never be
                # zero, including for negative drawdown values.
                "reward_risk_ratio": abs(cluster["MaxGain"]) / (abs(cluster["MaxDrawdown"]) + 1e-6),
                "max_gain": cluster["MaxGain"],
                "max_drawdown": cluster["MaxDrawdown"],
                "high": high,
                "low": low,
            },
            "sentiment": sentiment,
            "price": current_price,
            "actual_return": actual_return
        })

    return data_samples

In [13]:
  # Example usage
data_samples = load_training_data(stock_id=2, start_date="2024-01-01", end_date="2024-02-01", lookahead_hours=6, window_size=24, n_pips=5, mse_threshold=0.03)
# Show the sample count and a short preview instead of dumping every sample
# into the cell output.
print(f"Collected {len(data_samples)} samples")
print(data_samples[:3])

Processing date: 2024-01-02 00:00:00
Using stock ticker: AAPL
From date: 20240101T0000, To date: 20240102T0000
Found [45] tweets in database for period [2024-01-01 00:00:00] to [2024-01-02] for ticker_id: 2 and ticker_name: BTCUSD
Sufficient tweets found in database. Proceeding with analysis...
Loading tweets from database for period [2024-01-01] to [2024-01-02]
Loaded [45] tweets from database for date range 2024-01-01 00:00:00 to 2024-01-02 with ticker BTCUSD
Using CPU.

Sentiment Distribution:
sentiment_label
neutral     27
positive    18
Name: count, dtype: int64

Sentiment Score Statistics:
count    45.000000
mean      0.307784
std       0.394084
min       0.000000
25%       0.000000
50%       0.000000
75%       0.702808
max       0.970850
Name: sentiment_score, dtype: float64

Weighted Sentiment Statistics:
count    45.000000
mean      0.123119
std       0.232765
min       0.000000
25%       0.000000
50%       0.000000
75%       0.134035
max       1.000000
Name: final_weighted_se

In [None]:
import pandas as pd

# Keys copied out of each sample's nested "pattern" and "sentiment" dicts,
# in the order they should appear as columns.
_pattern_keys = ("probability", "action", "reward_risk_ratio", "max_gain", "max_drawdown", "high", "low")
_sentiment_keys = ("impact_score", "news_score", "twitter_score")

# One flat row per sample: date, pattern fields, sentiment fields, price, return.
flat_data = [
    {
        "date": entry["date"],
        **{key: entry["pattern"][key] for key in _pattern_keys},
        **{key: entry["sentiment"][key] for key in _sentiment_keys},
        "price": entry["price"],
        "actual_return": entry["actual_return"],
    }
    for entry in data_samples
]

# Build the table once from the collected rows and write it out as CSV.
df = pd.DataFrame(flat_data)
df.to_csv("data_samples_output.csv", index=False)


In [15]:
import sqlite3

# Store the flattened samples in the rl_dataset table, replacing any existing
# table of that name. try/finally ensures the connection is closed even when
# the write fails.
conn = sqlite3.connect("samples.db")
try:
    df.to_sql("rl_dataset", conn, if_exists="replace", index=False)
finally:
    conn.close()


In [21]:
# Read the stored samples back from the rl_dataset table. try/finally ensures
# the connection is closed even when the query fails.
conn = sqlite3.connect('samples.db')
try:
    df = pd.read_sql_query("SELECT * FROM rl_dataset", conn)
finally:
    conn.close()
print(df)

                   date  probability  action  reward_risk_ratio  max_gain  \
0   2024-01-02 00:00:00     0.546302       1           1.599424  0.015763   
1   2024-01-02 03:00:00     0.546302       1           1.599424  0.015763   
2   2024-01-03 02:00:00     0.533510       1           1.098526  0.011251   
3   2024-01-03 12:00:00     0.565257       1           1.226180  0.014635   
4   2024-01-03 13:00:00     0.536232       2           1.031629 -0.014404   
..                  ...          ...     ...                ...       ...   
61  2024-01-29 20:00:00     0.506554       2           1.003741 -0.012534   
62  2024-01-30 06:00:00     0.506554       2           1.003741 -0.012534   
63  2024-01-30 09:00:00     0.506554       2           1.003741 -0.012534   
64  2024-01-30 18:00:00     0.504244       1           0.989524  0.010936   
65  2024-01-31 11:00:00     0.553642       1           1.186008  0.013958   

    max_drawdown  impact_score  news_score  twitter_score     price  \
0   