# In [None]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import yfinance as yf
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

# Wikipedia page fetched by scrape_wikipedia_sp500(); the scraper reads the
# table whose id is "constituents" from this page.
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

def scrape_wikipedia_sp500():
    """Scrape S&P 500 tickers, company names and sectors from Wikipedia.

    Returns:
        A list of ``(ticker, company_name, sector)`` tuples, one per data row
        of the table with id ``constituents`` on ``WIKI_URL``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a non-success status code.
        ValueError: If the page contains no table with id ``constituents``.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(WIKI_URL, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table', {'id': 'constituents'})
    if table is None:
        raise ValueError(
            f"No table with id 'constituents' found at {WIKI_URL}"
        )

    companies_data = []
    for row in table.find_all('tr')[1:]:  # Skip header row
        cols = row.find_all('td')
        if len(cols) < 4:
            # Rows without enough <td> cells (e.g. extra header or spacer
            # rows) carry no company data and would raise IndexError below.
            continue
        ticker = cols[0].text.strip()
        company_name = cols[1].text.strip()
        # NOTE(review): index 3 is assumed to be the sector column; confirm
        # against the current layout of the Wikipedia table.
        sector = cols[3].text.strip()
        companies_data.append((ticker, company_name, sector))

    return companies_data

def download_market_cap_data(ticker, start_date, end_date):
    """Build a quarterly market-capitalization table for one ticker.

    Market cap is approximated as the daily ``Close`` price multiplied by the
    single ``sharesOutstanding`` value found in ``stock.info``. The same share
    count is applied to every date in the window.

    Args:
        ticker: Symbol as it appears in the scraped list (e.g. ``"BRK.B"``).
            Dots are replaced by dashes for the Yahoo Finance lookup only; the
            ``Ticker`` column keeps the value passed in.
        start_date: Start of the price window, forwarded to
            ``Ticker.history``.
        end_date: End of the price window, forwarded to ``Ticker.history``.

    Returns:
        A DataFrame with columns ``MarketCap``, ``Ticker`` and ``Date`` (one
        row per quarter, default integer index), or ``None`` when no price
        history or share count is available or the download fails.
    """
    # Yahoo Finance writes share-class symbols with a dash (BRK-B) while the
    # Wikipedia list uses a dot (BRK.B); without this the lookup finds nothing.
    yahoo_symbol = ticker.replace('.', '-')

    try:
        stock = yf.Ticker(yahoo_symbol)
        df = stock.history(start=start_date, end=end_date)
        if df.empty:
            return None

        # Calculate Market Cap = Close Price * Shares Outstanding
        shares_outstanding = stock.info.get('sharesOutstanding')
        if not shares_outstanding:
            # Missing (None) or zero share count: no meaningful market cap.
            return None

        df['MarketCap'] = df['Close'] * shares_outstanding
        df = df[['MarketCap']]  # Only keep MarketCap column

        # Resample to get the last value at the end of each quarter.
        # NOTE(review): bins are labelled with the calendar quarter-end date,
        # so a partial final quarter is presumably labelled with a date past
        # the last downloaded price -- confirm this is acceptable downstream.
        df = df.resample('QE').last()
        df['Ticker'] = ticker  # Add ticker as a column
        df['Date'] = df.index.date  # Add Date column in YYYY-MM-DD format
        df.reset_index(drop=True, inplace=True)  # Drop original index to avoid duplication
        return df
    except Exception as exc:
        # Best-effort: one bad ticker must not abort the batch, but report the
        # reason instead of hiding it.
        print(f"Error downloading data for {ticker}: {exc}")
        return None

def calculate_returns(data):
    """Calculate period-over-period returns from the ``MarketCap`` column.

    The input frame is left untouched; a new frame is returned.

    Args:
        data: DataFrame with a ``MarketCap`` column, ordered in time.

    Returns:
        A copy of ``data`` with an added ``Return`` column holding the
        fractional change of ``MarketCap`` from the previous row. Rows whose
        return is NaN (always the first row) are dropped.
    """
    # fill_method=None: do not forward-fill gaps before computing the change;
    # the implicit fill is deprecated in pandas and would invent returns
    # across missing periods.
    returns = data['MarketCap'].pct_change(fill_method=None)
    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    return data.assign(Return=returns).dropna(subset=['Return'])

def process_company(ticker, start_date, end_date):
    """Download one ticker's quarterly market caps and attach returns.

    Returns ``None`` when ``download_market_cap_data`` yields nothing;
    otherwise returns whatever ``calculate_returns`` produces for that data.
    """
    quarterly_caps = download_market_cap_data(ticker, start_date, end_date)
    # Nothing downloaded: pass the "no data" signal straight through.
    return None if quarterly_caps is None else calculate_returns(quarterly_caps)

def main():
    """Scrape the company list, process every ticker, and write CSV files.

    Each ticker is handled by ``process_company`` on a pool of five worker
    threads. A non-empty result is written to
    ``company_mkt_cap_results/<ticker>_mkt_cap_quarter_end.csv``; otherwise a
    "No data found" message is printed.
    """
    start_date = pd.Timestamp('2019-10-31')
    end_date = pd.Timestamp('2024-10-31')

    # Company tuples from Wikipedia; only the first field (ticker) is used here.
    companies_data = scrape_wikipedia_sp500()

    # Output folder for the per-company CSV files.
    output_dir = "company_mkt_cap_results"
    os.makedirs(output_dir, exist_ok=True)

    with ThreadPoolExecutor(max_workers=5) as executor:
        # Remember which ticker each future belongs to.
        future_to_ticker = {}
        for entry in companies_data:
            symbol = entry[0]
            pending = executor.submit(process_company, symbol, start_date, end_date)
            future_to_ticker[pending] = symbol

        # Handle results in completion order, not submission order.
        for done in as_completed(future_to_ticker):
            ticker = future_to_ticker[done]
            frame = done.result()

            if frame is None or frame.empty:
                print(f"No data found for {ticker}")
                continue

            out_path = os.path.join(output_dir, f"{ticker}_mkt_cap_quarter_end.csv")
            frame.to_csv(out_path, index=False)  # Save without index
            print(f"Results saved to {out_path}")

    print("All companies processed.")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
