In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt

# Custom libraries
from Components.TrainModel import DataModule, TEMPUS, torchscript_predict
from Components.TickerData import TickerData, upload_data_sql, fetch_sql_data
from Components.BackTesting import BackTesting
from Components.MarketRegimes import MarketRegimes

# Torch ML libraries
import torch
import torch.nn as nn
from torch.optim import AdamW

# Choose the compute device once for the notebook. TF32 matmul / cuDNN kernels
# are switched on only when a CUDA device is present.
if torch.cuda.is_available():
    device = "cuda"
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
else:
    device = "cpu"

In [None]:
#TODO: Include alpha in the backtesting results (based on index comparison) from quantstats package
#TODO: Streamlit Page for future prediction

# -- Model Features/Data --
#TODO: Use following news sentiment features: ["positive_count","neutral_count","negative_count","total_count","pos_sent_ratio","neg_sent_ratio","net_sentiment"]
#TODO: Parse financials data from Polygon.io
#TODO: Include index volatility in the training data

# -- Model Training --

In [None]:
# Build the ticker universe: draw a random sample of symbols from each index
# constituents table on Wikipedia, then merge the samples and drop duplicates.
sample_size = 50


def _sample_index_tickers(url, table_index, column_index, size):
    """Return up to ``size`` distinct symbols drawn at random from a Wikipedia table.

    Args:
        url: Page passed to ``pd.read_html``.
        table_index: Position of the constituents table in the list of tables
            parsed from the page.
        column_index: Position of the symbol column inside that table.
        size: Number of symbols requested. Clamped to the number of rows so a
            short table does not make ``np.random.choice(..., replace=False)`` raise.

    Returns:
        1-D NumPy array with the sampled symbols.
    """
    symbols = pd.read_html(url)[table_index].iloc[:, column_index].to_numpy()
    return np.random.choice(symbols, size=min(size, len(symbols)), replace=False)


# NOTE(review): the table/column positions below are tied to the current layout
# of each page; re-check them if a page is restructured.
nasdaq_tickers = _sample_index_tickers("https://en.wikipedia.org/wiki/Nasdaq-100", 4, 1, sample_size)
rusell_tickers = _sample_index_tickers("https://en.wikipedia.org/wiki/Russell_1000_Index", 3, 1, sample_size)
SnP500_tickers = _sample_index_tickers("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies", 0, 0, sample_size)
SnP600_tickers = _sample_index_tickers("https://en.wikipedia.org/wiki/List_of_S%26P_600_companies", 0, 0, sample_size)

# A symbol can belong to several indices; keep each one only once.
tickers = np.unique(np.concatenate((nasdaq_tickers, rusell_tickers, SnP500_tickers, SnP600_tickers)))

In [None]:
# Single-ticker override (disabled):
#tickers = ['IONQ']

# Columns requested from TickerData through `indicator_list`.
indicators = [
    'ema_20', 'ema_50', 'ema_200',
    'stoch_rsi14', 'stoch_rsi28', 'nasdaq_rsi14',
    'macd', 'b_percent', 'keltner_lower', 'keltner_upper',
    'State',
    'bearish', 'bullish', 'hold', 'mixed',
    'negative', 'neutral', 'positive',
    'z_score', 'atr', 'price_momentum', 'volume_momentum',
    'Close',
]
# Alternative, sentiment-only column list (disabled):
#indicators = ['bearish','bullish','hold','mixed','negative','neutral','positive']

# Five years of history per ticker with a 5-step prediction window.
training_data, raw_stock_data = TickerData(
    tickers,
    years=5,
    prediction_window=5,
    indicator_list=indicators,
).process_all()
training_data

In [None]:
#Best config: {'lr': 4.390449033248878e-05, 'hidden_size': 256, 'num_layers': 1, 'dropout': 0.3477694988633191, 'weight_decay': 0.0001801390872725824, 'batch_size': 16, 'window_size': 10, 'grad_clip_norm': 0.8393802881451728}
# NOTE(review): the tuned result above lists window_size 10 and names the clipping
# value 'grad_clip_norm', while this config uses window_size 5 and the key
# "clip_size" — confirm which values/keys TEMPUS is meant to receive.

config = {
    "lr": 4.390449033248878e-05,
    "weight_decay": 0.0001801390872725824,
    "hidden_size": 256, # old was 256
    "num_layers": 1, # old was 1
    "dropout": 0.3477694988633191,
    "batch_size": 16, # old was 16
    "window_size": 5,
    "clip_size": 0.8393802881451728,
    "attention_heads": 4, #Deepseek R1 uses 128
    "epochs": 20,
    # Prefer CUDA, then Apple MPS, and fall back to CPU. The previous fallback
    # was unconditionally "mps", which fails on machines without an MPS device.
    "device": (
        "cuda" if torch.cuda.is_available()
        else "mps" if torch.backends.mps.is_available()
        else "cpu"
    ),
}

# Windowed train/val/test loaders; the module also exposes the feature count
# and the scaler that the model is constructed with.
data_module = DataModule(training_data, window_size=config["window_size"], batch_size=config["batch_size"])
config["input_size"] = data_module.num_features

# Instantiate the model
model = TEMPUS(config,scaler=data_module.scaler).to(config["device"])
#model = torch.compile(model, backend="inductor",mode="default")

# Train Model
history = model.train_model(data_module.train_loader, data_module.val_loader, data_module.test_loader, config["epochs"])

In [None]:
# Ask the trained model for its training-history figure and render it.
training_fig = model.plot_training_history()
training_fig.show()

In [None]:
# Export the trained TEMPUS model
# The test loader and a CPU device are handed to the exporter.
# NOTE(review): the prediction cells further down load "Models/Tempus_v2.1.pt" and
# "Models/Tempus_v2.2.pt", not the file written here — confirm that is intended.
script_path = model.export_model_to_torchscript(
    save_path="Models/Echo_v1.0.pt",
    data_loader=data_module.test_loader,
    device="cpu"
)

In [None]:
import random

# Draw (up to) 10 symbols from the sampled Nasdaq-100 tickers for the backtest.
sampled_tickers = random.sample(list(nasdaq_tickers), min(10, len(nasdaq_tickers)))
initial_capital = 1000.0

preds_dfs = []
returns = []
for ticker in sampled_tickers:
    out_of_sample_data, raw_stock_data = TickerData(ticker, years=4, prediction_window=5).process_all()

    # Skip symbols for which no processed data came back.
    if out_of_sample_data is None:
        continue

    # Load the model and make predictions
    # NOTE(review): window_size=50 and the Tempus_v2.1 checkpoint differ from the
    # window size / export path used in the training cells — confirm they match
    # the checkpoint being loaded.
    preds_df = torchscript_predict(
        model_path="Models/Tempus_v2.1.pt",
        input_df=out_of_sample_data,
        device="cpu",
        window_size=50,
        target_col="shifted_prices"
    )
    # Attach the raw OHLCV columns to the predictions, aligned on the index.
    preds_df = pd.merge(preds_df, raw_stock_data[['Open', 'High', 'Low', 'Volume','Close']], left_index=True, right_index=True, how='left')
    preds_dfs.append(preds_df)

    backtester = BackTesting(preds_df, ticker, initial_capital, pct_change_entry=0.05, pct_change_exit=0.03)
    backtester.run_simulation()
    bt_results = pd.DataFrame(backtester.pf.returns())
    # Compound the per-period returns into a running cumulative return, in percent.
    bt_results['cumulative_return'] = np.array(((1 + bt_results[0]).cumprod() - 1)*100)
    bt_results['ticker'] = ticker
    returns.append(bt_results)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that says what actually happened.
if not preds_dfs:
    raise ValueError("No out-of-sample data was returned for any sampled ticker; nothing to backtest.")

preds_dfs = pd.concat(preds_dfs, ignore_index=False)
returns = pd.concat(returns, ignore_index=False)

In [None]:
# Plot each ticker's cumulative-return curve, then summarise how many tickers
# finished above zero versus at or below zero.

# Interactive line chart: one line per ticker.
fig = px.line(
    returns.reset_index(),
    x='index',
    y='cumulative_return',
    color='ticker',
    title='Cumulative Returns by Ticker',
    labels={'index': 'Date', 'cumulative_return': 'Cumulative Return'},
)

fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Cumulative Return (%)',
    showlegend=False,
    height=600,
    template='ggplot2',
    xaxis={
        # Quick-zoom buttons; the range slider itself is hidden.
        'rangeselector': {
            'buttons': [
                {'count': 1, 'label': "1m", 'step': "month", 'stepmode': "backward"},
                {'count': 6, 'label': "6m", 'step': "month", 'stepmode': "backward"},
                {'count': 1, 'label': "YTD", 'step': "year", 'stepmode': "todate"},
                {'count': 1, 'label': "1y", 'step': "year", 'stepmode': "backward"},
                {'step': "all"},
            ],
        },
        'rangeslider': {'visible': False},
        'type': "date",
    },
)
fig.show()

# Final cumulative return per ticker (last row of each group).
last_returns = returns.groupby('ticker')['cumulative_return'].last()

# Tally winners and non-winners.
positive_count = (last_returns > 0).sum()
negative_count = (last_returns <= 0).sum()
total_count = len(last_returns)

# Tabular view of the final returns, best first.
last_returns_df = last_returns.reset_index()
last_returns_df.columns = ['Ticker', 'Final Return']
last_returns_df = last_returns_df.sort_values('Final Return', ascending=False)

# Pie chart of the positive/negative split.
fig_pie = px.pie(
    values=[positive_count, negative_count],
    names=['Positive', 'Negative'],
    title='Proportion of Tickers with Positive vs Negative Returns',
    color_discrete_sequence=['green', 'red'],
    template='ggplot2',
)
fig_pie.update_traces(textinfo='percent+label')
fig_pie.update_layout(showlegend=False)
fig_pie.show()

# Text summary of the same split.
if total_count <= 0:
    print("No ticker data available for analysis")
else:
    positive_proportion = positive_count / total_count
    print(f"Proportion of tickers with positive cumulative returns: {positive_proportion:.2%}")
    print(f"Positive tickers: {positive_count} out of {total_count}")
    print(f"Negative tickers: {negative_count} out of {total_count}")

In [None]:
# Pick one of the backtested tickers at random and keep only its rows.
selected_ticker = random.choice(preds_dfs['Ticker'].unique())
preds_df = preds_dfs.loc[preds_dfs['Ticker'] == selected_ticker]

# Overlay the model output on the unshifted close and on the 'Actual' column
# (labelled as the shifted close).
fig = go.Figure(
    data=[
        go.Scatter(y=preds_df['Predicted'], x=preds_df.index, mode='lines', name='Predicted', line=dict(color="Grey")),
        go.Scatter(y=preds_df['Close'], x=preds_df.index, mode='lines', name='Close (Unshifted)', line=dict(color="Blue")),
        go.Scatter(y=preds_df['Actual'], x=preds_df.index, mode='lines', name='Close (Shifted)'),
    ]
)
fig.update_layout(
    title=f'Prediction for {selected_ticker}',
    xaxis_title='Date',
    yaxis_title='Price (USD)',
    height=600,
    legend={'orientation': "h", 'yanchor': "bottom", 'y': 1.02},
    template='ggplot2',
)
fig.show()

In [None]:
from Components.BackTesting import BackTesting
import pandas as pd
# Single-ticker run in prediction mode: one year of data for PLTR.
ticker = 'PLTR'
out_of_sample_data, raw_stock_data = TickerData(
    ticker,
    years=1,
    prediction_window=5,
    prediction_mode=True,
).process_all()

preds_df = torchscript_predict(
    model_path="Models/Tempus_v2.2.pt",
    input_df=out_of_sample_data,
    device="cpu",
    window_size=50,
    prediction_mode=True,
)

# Attach the raw OHLCV columns (index-aligned left join), then add the close
# shifted 5 rows back so it lines up with the 5-step prediction window.
preds_df = preds_df.merge(
    raw_stock_data[['Open', 'High', 'Low', 'Volume','Close']],
    left_index=True,
    right_index=True,
    how='left',
)
preds_df['shifted_prices'] = preds_df['Close'].shift(-5)

In [None]:
# Compare the prediction with the shifted and unshifted close on one chart.
fig = go.Figure(
    data=[
        go.Scatter(y=preds_df['Predicted'], x=preds_df.index, mode='lines', name='Predicted', line={'color': "Grey"}),
        go.Scatter(y=preds_df['shifted_prices'], x=preds_df.index, mode='lines', name='Close (Shifted)', line={'color': "Blue"}),
        go.Scatter(y=preds_df['Close'], x=preds_df.index, mode='lines', name='Close (Unshifted)', line={'color': "Orange"}),
    ]
)
fig.update_layout(template='ggplot2')
fig.show()

In [None]:
import quantstats as qs

# Backtest the single-ticker predictions and produce a quantstats report.
# NOTE: this rebinds `returns`, which earlier held the per-ticker returns table
# from the multi-ticker backtest loop.
backtester = BackTesting(preds_df, ticker, initial_capital, pct_change_entry=0.05,pct_change_exit=0.02)
backtester.run_simulation()
returns = backtester.pf.returns()
# Drop timezone information from the index before handing it to quantstats.
returns.index = returns.index.tz_localize(None)

#html = qs.reports.full(returns, "NDAQ")
# Pass `ticker` as the benchmark rather than a hard-coded "PLTR" so the report
# follows whichever symbol was backtested above.
qs.reports.basic(returns, ticker, rf=0.0025, display=False)


In [None]:
# Derive sentiment ratio features. `df_sentiment` is the same object as
# `training_data` (no copy is made), so every column added here lands on it too.
df_sentiment = training_data

# Total of the positive, negative and neutral columns per row.
df_sentiment['total_count'] = (
    df_sentiment['positive']
    .add(df_sentiment['negative'])
    .add(df_sentiment['neutral'])
)

# Share of each sentiment class in the row total.
df_sentiment['pos_sent_ratio'] = df_sentiment['positive'].div(df_sentiment['total_count'])
df_sentiment['neg_sent_ratio'] = df_sentiment['negative'].div(df_sentiment['total_count'])
df_sentiment['neu_sent_ratio'] = df_sentiment['neutral'].div(df_sentiment['total_count'])

# Net sentiment and its 3-row rolling mean.
# NOTE(review): the rolling window runs over the frame's row order; if rows from
# several tickers are stacked it will span ticker boundaries — confirm.
df_sentiment['net_sentiment'] = (
    df_sentiment['positive'].sub(df_sentiment['negative']).div(df_sentiment['total_count'])
)
df_sentiment['roll3_net'] = df_sentiment['net_sentiment'].rolling(3).mean()

# Replace every NaN in the frame (e.g. from 0/0 ratios or the rolling warm-up) with 0.
df_sentiment.fillna(0, inplace=True)
df_sentiment

In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import shap
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# ————————————
# 1) Prepare your data
# ————————————
# Rebinds `data_module` (built earlier from `config` for the TEMPUS run) with a
# 10-step window, batch size 32 and 'shifted_prices' as the target column.
data_module = DataModule(training_data, window_size=10, batch_size=32,target_col='shifted_prices')
# ————————————
# 2) Define the LSTM model
# ————————————
class LSTMRegressor(nn.Module):
    """LSTM followed by a linear head that emits one scalar per input sequence.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout between stacked LSTM layers. ``nn.LSTM`` only applies
            it between layers, so it is passed through only when
            ``num_layers > 1``; with a single layer it would have no effect
            and PyTorch would emit a UserWarning.
    """

    def __init__(self, input_dim, hidden_dim=64, num_layers=1, dropout=0.2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.out = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, features) to predictions of shape (batch,)."""
        _, (hn, _) = self.lstm(x)
        # hn is indexed by layer; the last entry is the top layer's final hidden state.
        h_last = hn[-1]               # shape (batch, hidden_dim)
        return self.out(h_last).squeeze(-1)

# Prefer CUDA, then Apple MPS, and fall back to CPU so the cell also runs on
# machines with neither accelerator (the old fallback was unconditionally "mps").
# NOTE: this rebinds `device` and `model`, which earlier cells also define.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model = LSTMRegressor(input_dim=data_module.num_features).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

# ————————————
# 3) Training loop
# ————————————
EPOCHS = 20
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0
    for xb, yb in data_module.train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        pred = model(xb)
        # Reduce any extra output axes to one value per sample. The previous
        # guard called pred.size(2) on 2-D outputs, which raises IndexError.
        if pred.dim() == 3:
            pred = pred[:, -1, 0]     # last time step, first output channel
        elif pred.dim() == 2:
            pred = pred[:, -1]        # last column; keeps the batch axis
        # NOTE(review): assumes yb has shape (batch,); a (batch, 1) target would
        # broadcast against pred inside MSELoss — confirm against DataModule.
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        # Weight the batch-mean loss by batch size so the epoch figure is a per-sample mean.
        total_loss += loss.item() * xb.size(0)

    avg_loss = total_loss / len(data_module.train_loader.dataset)
    print(f"Epoch {epoch:2d} — Train MSE: {avg_loss}")

In [None]:
# ————————————
# 4) SHAP feature‐importance
# ————————————
import numpy as np
import shap
import torch
import matplotlib.pyplot as plt

# 1) Take the first batch from the train and test loaders.
#    Each batch is unpacked as (x, y); x is unpacked below into three axes
#    (batch, seq_len, n_features).
train_batch, _ = next(iter(data_module.train_loader))
test_batch, _ = next(iter(data_module.test_loader))

# KernelExplainer works on NumPy arrays, so bring both batches to CPU/NumPy
# and read the window length and feature count from the train batch.
train_np, test_np = (tensor.cpu().numpy() for tensor in (train_batch, test_batch))
batch_size, seq_len, n_features = train_np.shape

# 2) Background set: at most the first 100 train windows, each flattened to one row.
bg = train_np[:100]                                   # (B_bg, seq_len, n_feat)
bg_flat = bg.reshape(len(bg), -1)                     # (B_bg, seq_len*n_feat)

# Flatten every test window the same way for the explainer.
test_flat = test_np.reshape(len(test_np), -1)         # (batch_size, seq_len*n_feat)

# 3) Define a “flatten→3D→model→1D” wrapper
def predict_flat(x_flat):
    """Run the model on flattened windows and return one prediction per window.

    ``x_flat`` is reshaped to (-1, seq_len, n_features) using the module-level
    ``seq_len`` / ``n_features``, moved to ``device`` as float32, and passed
    through ``model`` in eval mode without gradients. The result is returned
    as a 1-D NumPy array.
    """
    windows = np.array(x_flat).reshape(-1, seq_len, n_features)
    batch = torch.from_numpy(windows).float().to(device)
    model.eval()
    with torch.no_grad():
        preds = model(batch)
        flat_preds = preds.cpu().numpy().reshape(-1)
    return flat_preds

# 4) Fit a KernelExplainer against the flattened background windows.
explainer = shap.KernelExplainer(predict_flat, bg_flat)

# Explain at most 50 test windows (fewer if the batch is smaller).
n_explain = min(50, test_np.shape[0])
shap_vals_flat = explainer.shap_values(test_flat[:n_explain])

# Some SHAP versions return a list (one entry per output); keep the first entry.
if isinstance(shap_vals_flat, list):
    shap_vals_flat = shap_vals_flat[0]

# 5) Restore the (window, time step, feature) layout.
shap_vals = np.array(shap_vals_flat).reshape(n_explain, seq_len, n_features)

# 6) Average |SHAP| over the explained windows, then over time steps.
mean_abs_time = np.abs(shap_vals).mean(axis=0)      # (seq_len, n_feat)
feat_imp = mean_abs_time.mean(axis=0)               # (n_feat,)

# NOTE(review): assumes the DataModule feeds exactly these columns, in this
# order, as features — confirm that len(feature_names) == n_features.
feature_names = list(training_data.drop(columns=["shifted_prices"]).columns)

fig = go.Figure(data=[go.Bar(y=feat_imp, x=feature_names)])
fig.update_layout(
    title="Feature importance (averaged over time)",
    xaxis_title='Feature Names',
    yaxis_title='Mean |SHAP value|',
    template='ggplot2',
)
fig.show()

In [1]:
from Components.Fundamentals import FundementalData
from Components.Fundamentals import search_line_items
from Components.AgentManager import AgentManager
from Agents.phil_fisher import PhilFisherAgent
from Agents.ben_graham import BenGrahamAgent
from Agents.bill_ackman import BillAckmanAgent
from Agents.cathie_wood import CathieWoodAgent
from Agents.charlie_munger import CharlieMungerAgent
import warnings

warnings.filterwarnings('ignore')

# Fetch fundamentals for two symbols, hand them and the listed agent classes to
# AgentManager, and display whatever agent_analysis() returns.
# NOTE: this rebinds `tickers`, which earlier held the sampled index universe.
tickers = ['IONQ','PLTR']
financials = FundementalData(
    tickers,
    workers=5,
    fetch_stock_price=False,
    fetch_market_cap=True,
).fetch()
manager = AgentManager(
    tickers,
    financials,
    [
        PhilFisherAgent,
        BenGrahamAgent,
        BillAckmanAgent,
        CathieWoodAgent,
        CharlieMungerAgent,
    ],
)

manager.agent_analysis()

{'IONQ': {'PhilFisherAgent': {'signal': 'bearish',
   'score': 2.5,
   'max_score': 10,
   'growth_quality': {'score': 5.555555555555555,
    'details': 'Very strong multi-period revenue growth: 1952.1%; Oldest EPS near zero; skipping EPS growth calculation.; R&D ratio 317.7% is very high (could be good if well-managed)'},
   'margins_stability': {'score': 0.0,
    'details': 'Operating margin may be negative or uncertain; Low gross margin: -769.8%; Operating margin volatility is high'},
   'management_efficiency': {'score': 1.6666666666666665,
    'details': 'Recent net income is zero or negative, hurting ROE; Manageable debt-to-equity: 0.32; Free cash flow is inconsistent or often negative'},
   'valuation_analysis': {'score': 0.0,
    'details': 'No positive net income for P/E calculation; No positive free cash flow for P/FCF calculation'},
   'insider_activity': {'score': 5,
    'details': 'No insider trades data; defaulting to neutral'},
   'sentiment_analysis': {'score': 5,
    '

In [None]:
import os
# %%
# Import stock_data dataframe into an Azure SQL database table using SQLAlchemy
#upload_data_sql(stock_data,"SNP600_1day")
#SNP500_1day = fetch_sql_data('SNP500_1day')
#SNP600_1day = fetch_sql_data('SNP600_1day')
#russell2000_1day = fetch_sql_data('russell2000_1day')
#dowjones_1day = fetch_sql_data('dowjones_1day')
#nasdaq_1day = fetch_sql_data('nasdaq_1day')
#stock_data = pd.concat([SNP500_1day, SNP600_1day, dowjones_1day, nasdaq_1day], ignore_index=True)
# Remove duplicates based on the 'Date' and 'Ticker' columns
#stock_data = stock_data[~stock_data.index.duplicated(keep='first')]
# Before conversion
#print("Column types before:", [type(col).__name__ for col in training_data.columns])

# Apply conversion
#training_data.columns = [str(col) for col in training_data.columns]

# After conversion
#print("Column types after:", [type(col).__name__ for col in training_data.columns])