In [None]:
import pandas as pd
import numpy as np
import random
import requests
from bs4 import BeautifulSoup

# Set reproducibility.
# The data-generation cell draws from BOTH NumPy's global RNG (np.random.*)
# and the stdlib `random` module (random.choice), so both must be seeded;
# seeding only NumPy leaves ticker selection and fallback targets random.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)


In [None]:
# Function to get S&P 500 tickers from Wikipedia
def get_sp500_tickers():
    """Scrape the S&P 500 ticker symbols from Wikipedia.

    Downloads the "List of S&P 500 companies" page, locates the table whose
    ``id`` is ``constituents`` and reads the first ``<td>`` of every data row.

    Returns:
        list[str]: Ticker symbols in table order, with ``.`` replaced by ``-``
        (e.g. ``BRK.B`` becomes ``BRK-B``).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
        ValueError: If the constituents table is not found in the page.
    """
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    # An explicit User-Agent is sent to avoid the request being blocked.
    headers = {'User-Agent': 'Mozilla/5.0'}
    # Bound the wait so a stalled connection cannot hang the notebook.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail on 4xx/5xx here rather than parsing an error page and reporting
    # a misleading "table not found" below.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'id': 'constituents'})

    if table is None:
        raise ValueError("❌ Could not find the S&P 500 table. Wikipedia may have changed the layout.")

    tickers = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if not cells:
            # Header rows hold only <th> cells; skipping by content (instead
            # of blindly dropping the first row) also protects against any
            # other row without data cells raising IndexError.
            continue
        symbol = cells[0].text.strip()
        tickers.append(symbol.replace('.', '-'))  # e.g., BRK.B → BRK-B

    return tickers

# Fetch the constituent list when this cell runs (performs a live HTTP
# request). `tickers` is a notebook-level variable that the data-generation
# cell below passes to generate_mock_data.
tickers = get_sp500_tickers()
print(f"✅ Pulled {len(tickers)} S&P 500 tickers")




✅ Pulled 502 S&P 500 tickers


In [5]:
def generate_mock_data(tickers, n=2000):
    """Build a synthetic training set of per-ticker market/behavioral features.

    Each row is drawn independently. Randomness comes from two global
    generators: the stdlib ``random`` module (ticker choice, fallback target)
    and ``np.random`` (all numeric features). Seed both before calling if a
    reproducible dataset is required.

    Args:
        tickers: Non-empty sequence of ticker symbols to sample from.
        n: Number of rows to generate.

    Returns:
        pandas.DataFrame with columns ``ticker``, ``price_change``,
        ``volatility``, ``sentiment_score``, ``loss_aversion_score``,
        ``herding_index``, ``reaction_speed`` and ``target``. The columns are
        present even when ``n`` is 0.

    Raises:
        IndexError: If ``tickers`` is empty and ``n`` is greater than 0
            (raised by ``random.choice``).
    """
    # Declared up front so an empty result still carries the full schema.
    columns = [
        'ticker',
        'price_change',
        'volatility',
        'sentiment_score',
        'loss_aversion_score',
        'herding_index',
        'reaction_speed',
        'target',
    ]
    data = []

    for _ in range(n):
        ticker = random.choice(tickers)
        price_change = np.round(np.random.normal(0, 2), 2)
        volatility = np.round(abs(np.random.normal(1, 0.5)), 2)
        sentiment_score = np.round(np.random.uniform(-1, 1), 2)

        # Behavioral Features
        # Non-zero only when the price fell while sentiment was positive.
        # Always a Python float so the column dtype is float64 even when no
        # row in the sample has a positive score.
        if price_change < 0:
            loss_aversion_score = max(0.0, float(-sentiment_score * price_change))
        else:
            loss_aversion_score = 0.0
        herding_index = np.random.randint(0, 10)   # integer in 0..9
        reaction_speed = np.random.randint(1, 6)   # integer in 1..5

        # Target (classification): 1 = market up, 0 = market down.
        # Clear agreement between sentiment and price move fixes the label;
        # every other case is a coin flip.
        if sentiment_score > 0.2 and price_change > 0.5:
            target = 1
        elif sentiment_score < -0.2 and price_change < -0.5:
            target = 0
        else:
            target = random.choice([0, 1])

        data.append({
            'ticker': ticker,
            'price_change': price_change,
            'volatility': volatility,
            'sentiment_score': sentiment_score,
            'loss_aversion_score': loss_aversion_score,
            'herding_index': herding_index,
            'reaction_speed': reaction_speed,
            'target': target
        })

    return pd.DataFrame(data, columns=columns)



In [6]:
# Generate the dataset: 2000 synthetic rows sampled over the scraped tickers
df_mock = generate_mock_data(tickers, n=2000)

# Save to CSV in the current working directory (index=False omits the row index)
df_mock.to_csv("mock_training_data.csv", index=False)
print("✅ Saved mock_training_data.csv")

# View sample: the trailing expression is rendered as the cell's output
df_mock.head()



✅ Saved mock_training_data.csv


Unnamed: 0,ticker,price_change,volatility,sentiment_score,loss_aversion_score,herding_index,reaction_speed,target
0,GE,0.99,0.93,0.46,0.0,4,2,1
1,IT,0.56,1.51,0.42,0.0,5,5,1
2,LW,-1.16,0.74,-0.63,0.0,9,5,0
3,PM,-3.45,0.72,0.22,0.759,9,4,0
4,MPWR,0.44,0.62,-0.24,0.0,3,1,0
