In [10]:
# Import required modules
import pandas as pd
import numpy as np
import time
import yfinance as yf
from datetime import datetime, timedelta
import joblib

from sentiment_utils import FinancialNewsAggregator
from features import compute_behavioral_features, label_reaction
from model_train import train_behavioral_model


In [11]:
def get_sp500_tickers():
    """Fetch the S&P 500 constituent symbols from the datahub.io CSV.

    Returns:
        list: The values of the ``Symbol`` column, in file order. If anything
        goes wrong (download, parsing, missing column) the error is printed
        and an empty list is returned instead of raising.
    """
    constituents_url = "https://datahub.io/core/s-and-p-500-companies/r/constituents.csv"
    try:
        constituents = pd.read_csv(constituents_url)
        symbols = constituents['Symbol'].tolist()
    except Exception as err:
        # Best-effort: report the problem and let the caller see an empty list.
        print("‚ùå Failed to get tickers:", err)
        symbols = []
    return symbols

# Fetch the constituent list once; an empty list here means the download failed
# (get_sp500_tickers prints the error and returns []).
raw_tickers = get_sp500_tickers()
print(f"‚úÖ Pulled {len(raw_tickers)} raw tickers")


‚úÖ Pulled 503 raw tickers


In [8]:
# Analysis window: the 30 days ending at the moment this cell is executed.
# NOTE(review): datetime.today() makes every downstream result depend on when
# the notebook is run — pin an explicit date if reproducible output is needed.
end = datetime.today()
start = end - timedelta(days=30)


In [None]:
def is_ticker_valid(ticker):
    """Return True when yfinance yields at least one non-null close for ``ticker``.

    Downloads data for the notebook-level ``start``/``end`` window and checks
    that the result is non-empty, has a ``Close`` column, and that column holds
    at least one non-null value.

    Args:
        ticker: Symbol passed straight to ``yf.download``.

    Returns:
        bool: Always a plain Python ``bool``. Any exception raised during the
        download or the checks is treated as "not valid" and yields ``False``.
    """
    try:
        df = yf.download(ticker, start=start, end=end, progress=False)
        if df.empty or 'Close' not in df.columns:
            return False
        # df['Close'] is a Series for flat columns but a DataFrame when the
        # columns are a (field, ticker) MultiIndex. Reducing through NumPy
        # collapses both shapes to one scalar, so the caller never receives a
        # pandas object whose truth value is ambiguous.
        return bool(df['Close'].notna().to_numpy().any())
    except Exception:
        # Best-effort check: a failed download simply marks the ticker invalid.
        return False

# Validate lazily and stop as soon as enough tickers are found. Each check is
# a network download, so validating every raw ticker before slicing would cost
# one request per constituent only to discard all but the first few results.
MAX_TEST_TICKERS = 10  # limit to 10 for test
tickers = []
for candidate in raw_tickers:
    if len(tickers) >= MAX_TEST_TICKERS:
        break
    if is_ticker_valid(candidate):
        tickers.append(candidate)
print(f"‚úÖ Valid tickers: {tickers}")



Failed to get ticker 'MMM' reason: Expecting value: line 1 column 1 (char 0)

1 Failed download:
['MMM']: YFTzMissingError('$%ticker%: possibly delisted; No timezone found')
Failed to get ticker 'AOS' reason: Expecting value: line 1 column 1 (char 0)

1 Failed download:
['AOS']: YFTzMissingError('$%ticker%: possibly delisted; No timezone found')
Failed to get ticker 'ABT' reason: Expecting value: line 1 column 1 (char 0)

1 Failed download:
['ABT']: YFTzMissingError('$%ticker%: possibly delisted; No timezone found')
Failed to get ticker 'ABBV' reason: Expecting value: line 1 column 1 (char 0)

1 Failed download:
['ABBV']: YFTzMissingError('$%ticker%: possibly delisted; No timezone found')
Failed to get ticker 'ACN' reason: Expecting value: line 1 column 1 (char 0)

1 Failed download:
['ACN']: YFTzMissingError('$%ticker%: possibly delisted; No timezone found')
Failed to get ticker 'ADBE' reason: Expecting value: line 1 column 1 (char 0)

1 Failed download:
['ADBE']: YFTzMissingError('$%

KeyboardInterrupt: 

Failed to get ticker 'BR' reason: Expecting value: line 1 column 1 (char 0)


In [None]:
# Accumulator for the per-ticker feature dicts appended by the processing loop.
records = []
# News/sentiment client from the local sentiment_utils module; built once and
# reused for every ticker.
news_aggregator = FinancialNewsAggregator()

In [None]:
def safe_yfinance_download(ticker, start, end, retries=2):
    """Download price data for ``ticker`` with a bounded number of attempts.

    Args:
        ticker: Symbol passed to ``yf.download``.
        start: Start of the download window (forwarded unchanged).
        end: End of the download window (forwarded unchanged).
        retries: Total number of download attempts. With ``retries <= 0`` no
            download is attempted and an empty frame is returned.

    Returns:
        pandas.DataFrame: The downloaded frame on the first attempt that yields
        a non-empty frame with a ``Close`` column and at least one non-null
        value; otherwise an empty ``DataFrame``. Never raises for download or
        validation errors — they are printed and retried.
    """
    for attempt in range(retries):
        try:
            df = yf.download(ticker, start=start, end=end, progress=False)
            # Treat an unusable frame like a failed download so it is retried.
            if df.empty or df.isnull().all().all() or 'Close' not in df.columns:
                raise ValueError("Empty or invalid data frame")
            return df
        except Exception as e:
            print(f"‚ö†Ô∏è Attempt {attempt+1} failed for {ticker}: {e}")
            # Pause only when another attempt follows; sleeping after the
            # final failure would just delay giving up.
            if attempt + 1 < retries:
                time.sleep(1)
    print(f"‚ùå Giving up on {ticker} after {retries} attempts.")
    return pd.DataFrame()


In [None]:
# Start from an empty list so re-running this cell rebuilds the dataset
# instead of appending a second copy of every row.
records = []

# Loop-invariant values: the window bounds as strings for the news client and
# the cut-off separating "pre" from "post" news (15 days before `end`).
window_start_str = start.strftime("%Y-%m-%d")
window_end_str = end.strftime("%Y-%m-%d")
sentiment_split = end - timedelta(days=15)

for ticker in tickers:
    print(f"\n‚ñ∂ Processing: {ticker}")
    time.sleep(1)  # throttle: one-second pause before each ticker's requests

    # Market data: skip the ticker when nothing usable could be downloaded.
    market_df = safe_yfinance_download(ticker, start, end)
    if market_df.empty:
        continue
    features_df = compute_behavioral_features(market_df)

    # Sentiment data: a ticker with no news, or a failing news lookup, is skipped.
    try:
        news_data = news_aggregator.get_sentiment_data(
            ticker,
            window_start_str,
            window_end_str
        )
        if news_data.empty:
            print(f"‚ö†Ô∏è No news for {ticker}")
            continue
    except Exception as e:
        print(f"‚ö†Ô∏è Sentiment error: {e}")
        continue

    # Sentiment delta: average sentiment after the cut-off minus the average before it.
    # NOTE(review): assumes news_data['date'] is comparable with a datetime — confirm
    # against FinancialNewsAggregator.get_sentiment_data.
    pre_news = news_data[news_data['date'] < sentiment_split]
    post_news = news_data[news_data['date'] >= sentiment_split]
    delta = news_aggregator.get_avg_sentiment(post_news) - news_aggregator.get_avg_sentiment(pre_news)
    label = label_reaction(delta)

    # Final record: one row per ticker, each behavioral feature averaged over the window.
    records.append({
        "ticker": ticker,
        "event_date": window_end_str,
        "price_change": features_df['price_change'].mean(),
        "volatility": features_df['volatility'].mean(),
        "loss_aversion_score": features_df['loss_aversion_score'].mean(),
        "reaction_speed": features_df['reaction_speed'].mean(),
        "herding_index": features_df['herding_index'].mean(),
        "sentiment_delta": delta,
        "reaction_label": label
    })



‚ñ∂ Processing: MMM


Failed to get ticker 'MMM' reason: Expecting value: line 1 column 1 (char 0)
[*********************100%%**********************]  1 of 1 completed

1 Failed download:
['MMM']: YFTzMissingError('$%ticker%: possibly delisted; No timezone found')


‚ùå No market data.

‚ñ∂ Processing: AOS


Failed to get ticker 'AOS' reason: Expecting value: line 1 column 1 (char 0)
[*********************100%%**********************]  1 of 1 completed

1 Failed download:
['AOS']: YFTzMissingError('$%ticker%: possibly delisted; No timezone found')


‚ùå No market data.

‚ñ∂ Processing: ABT


Failed to get ticker 'ABT' reason: Expecting value: line 1 column 1 (char 0)
[*********************100%%**********************]  1 of 1 completed

1 Failed download:
['ABT']: YFTzMissingError('$%ticker%: possibly delisted; No timezone found')


‚ùå No market data.

‚ñ∂ Processing: ABBV


Failed to get ticker 'ABBV' reason: Expecting value: line 1 column 1 (char 0)
[*********************100%%**********************]  1 of 1 completed

1 Failed download:
['ABBV']: YFTzMissingError('$%ticker%: possibly delisted; No timezone found')


‚ùå No market data.

‚ñ∂ Processing: ACN


Failed to get ticker 'ACN' reason: Expecting value: line 1 column 1 (char 0)
[*********************100%%**********************]  1 of 1 completed

1 Failed download:
['ACN']: YFTzMissingError('$%ticker%: possibly delisted; No timezone found')


‚ùå No market data.


In [None]:
# Turn the collected per-ticker records into a frame, persist it to
# live_training_data.csv, and preview the first rows.
# NOTE(review): the CSV is written even when `records` is empty, which would
# overwrite a previously saved dataset with an empty file.
df_final = pd.DataFrame(records)
df_final.to_csv("live_training_data.csv", index=False)
df_final.head()


In [None]:
# Reset first so `model` is always defined after this cell and a stale model
# from an earlier run is never mistaken for the result of this one.
model = None

if df_final.empty:
    print("‚ùå No data collected. Skipping model training.")
else:
    # Train on the collected records and persist the fitted model with joblib.
    model = train_behavioral_model(df_final)
    joblib.dump(model, "trained_behavioral_model.pkl")
    print("‚úÖ Model saved to trained_behavioral_model.pkl")


In [None]:
from xgboost import plot_importance
import matplotlib.pyplot as plt

# Training is skipped when no data was collected, so `model` may be undefined
# or None at this point; look it up defensively instead of raising NameError.
trained_model = globals().get("model")

if trained_model is None:
    print("No trained model available. Skipping feature importance plot.")
else:
    # plot_importance returns the Axes it drew on; title it explicitly.
    # NOTE(review): assumes train_behavioral_model returns an XGBoost model — confirm.
    ax = plot_importance(trained_model)
    ax.set_title("Feature Importance")
    plt.tight_layout()
    plt.show()
