In [1]:
from utilities import database as db
import pandas as pd
import numpy as np
import requests
import yfinance as yf
from datetime import datetime
import time
import logging

from config import FMP_SP500_CONSTITUENT_URL
from tqdm import tqdm


In [2]:
### Additional setings ###

# Logging
logging.basicConfig(
    filename="data_collection.log",
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# Get Historical Constituesnts of SP500
**NOTE:** returns periods when ticker (stock) was added into SP500 and revomed<br>
date_removed equals to *2262-01-01* in such cases when stock is currently in SP500.

In [3]:
def get_sp500_hist():
    resp = requests.get(FMP_SP500_CONSTITUENT_URL)

    if resp.status_code != 200:
        return None

    df = pd.DataFrame.from_records(resp.json())

    removes = df[df['removedTicker'] != ''][['date', 'removedTicker']].dropna(ignore_index=True)
    removes.columns = ['date_removed', 'Ticker']

    adds = df[df['symbol'] != ''][['date', 'symbol']].dropna(ignore_index=True)
    adds.columns = ['date_added', 'Ticker']

    rotations = adds.merge(removes, how='left', on=['Ticker'])
    rotations['date_removed'] = rotations['date_removed'].fillna('2262-01-01')
    rotations = rotations[rotations['date_added'] < rotations['date_removed']]
    rotations = rotations.sort_values(by=['Ticker','date_removed'], ascending=True)
    rotations = rotations.drop_duplicates(subset=['Ticker','date_removed'], keep='first', ignore_index=True)
    rotations['date_added'] = pd.to_datetime(rotations['date_added'], format='%Y-%m-%d')
    rotations['date_removed'] = pd.to_datetime(rotations['date_removed'], format='%Y-%m-%d')
    rotations.columns = [col.lower() for col in rotations.columns]
    return rotations


historical_constituents = get_sp500_hist()
historical_constituents

Unnamed: 0,date_added,ticker,date_removed
0,1988-10-06,AAL,1997-01-14
1,2015-03-20,AAL,2024-09-23
2,2015-07-08,AAP,2023-08-24
3,1982-11-18,AAPL,2262-01-01
4,1957-03-03,ABA,1973-05-31
...,...,...,...
1385,2001-08-06,ZBH,2262-01-01
1386,2019-12-20,ZBRA,2262-01-01
1387,2001-06-22,ZION,2024-03-15
1388,1982-03-11,ZRN,1995-12-19


### Write into the DB

In [4]:
historical_constituents.to_sql('sp500_hist_constituents', db.get_engine(), if_exists='replace', index=False)

390

### Read from the DB

In [5]:
df = pd.read_sql('SELECT * from sp500_hist_constituents', db.get_engine())
df

Unnamed: 0,date_added,ticker,date_removed
0,1988-10-06,AAL,1997-01-14
1,2015-03-20,AAL,2024-09-23
2,2015-07-08,AAP,2023-08-24
3,1982-11-18,AAPL,2262-01-01
4,1957-03-03,ABA,1973-05-31
...,...,...,...
1385,2001-08-06,ZBH,2262-01-01
1386,2019-12-20,ZBRA,2262-01-01
1387,2001-06-22,ZION,2024-03-15
1388,1982-03-11,ZRN,1995-12-19


# Collect Stock Prices
**NOTE:** Script takes all stocks seen in SP500 at least ones and requests data from YahooFinance (old stocks may not exist in YF database)

In [None]:
def fetch_stock_prices(ticker, start_date=None, end_date=None, interval="1d"):
    stock = yf.Ticker(ticker)

    try:
        if end_date is None:
            data = stock.history(period="max", interval=interval)
        else:
            data = stock.history(start=start_date, end=end_date, interval=interval)
    except:
        return None
    data.reset_index(inplace=True)
    data['ticker'] = ticker
    data.drop_duplicates(subset=['Date'])
    data['Date'] = pd.to_datetime(data['Date']) 
    data.columns = [col.lower() for col in data.columns]
    return data

def get_sp_500_prices():

    tickers_query = '''
    SELECT DISTINCT ticker as Ticker
    FROM sp500_hist_constituents as sph
    --WHERE ticker not in (
    --    SELECT DISTINCT ticker
    --    FROM sp500_constituents_hist_prices_raw)
    '''

    df = pd.read_sql(tickers_query, db.get_engine())
    tickers = list(df['ticker'].unique())

    no_data_tickers, data_tickers = [], []
    strategy = 'replace'

    for ticker in tqdm(tickers):
        data = fetch_stock_prices(ticker)

        if (data is not None) and (not data.empty):

            try:
                
                data.to_sql(
                    'sp500_constituents_hist_prices_raw',
                    db.get_engine(),
                    chunksize=1000,
                    if_exists=strategy,
                    index=False
                    )

                logging.info(f'[OK] - {ticker} - {data.shape[0]}')
                data_tickers.append(ticker)
            except:
                logging.info(f'[FAIL] - {ticker} - -1')
                no_data_tickers.append(ticker)
        else:
            logging.info(f'[FAIL] - {ticker} - 0')
            no_data_tickers.append(ticker)

        strategy = 'append'
        time.sleep(0.5)

    return {'OK' : data_tickers, 'FAILED' : no_data_tickers}



In [7]:
stats = get_sp_500_prices()

KeyError: 'Ticker'

In [None]:
1339

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker
0,2025-05-09 04:00:00+00:00,11.11,11.23,10.93,11.11,59485000,0.0,0.0,AAL
1,2025-05-09 04:00:00+00:00,31.5,31.870001,30.84,30.879999,2267400,0.0,0.0,AAP
2,2025-05-09 04:00:00+00:00,199.0,200.539993,197.539993,198.529999,36415700,0.0,0.0,AAPL
3,2025-05-09 04:00:00+00:00,185.75,189.770004,184.229996,184.600006,7806100,0.0,0.0,ABBV
