In [15]:
!pip3 install requests beautifulsoup4



In [21]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import yfinance as yf

WIKI_URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

def scrape_wikipedia_sp500():
    response = requests.get(WIKI_URL)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Locate the table containing S&P 500 companies
    table = soup.find('table', {'id': 'constituents'})
    rows = table.find_all('tr')[1:]  # Skip the header row
    
    companies_data = []
    for row in rows:
        cols = row.find_all('td')
        ticker = cols[0].text.strip()
        company_name = cols[1].text.strip()
        sector = cols[3].text.strip()
        companies_data.append((ticker, company_name, sector))
    
    return companies_data

def get_sp500_sectors(companies_data):
    sectors = list(set(company[2] for company in companies_data))
    return sectors

def get_sector_companies(sector, companies_data):
    companies = [company[0] for company in companies_data if company[2] == sector]
    return companies

def download_stock_data(ticker, start_date, end_date):
    try:
        stock = yf.Ticker(ticker)
        df = stock.history(start=start_date, end=end_date)
        if df.empty:
            print(f"No data found for {ticker}")
            return None
        df = df[['Close']]
        return df
    except Exception as e:
        print(f"Error downloading data for {ticker}: {str(e)}")
        return None

def calculate_quarterly_yoy_growth(data):
    if data is None or data.empty:
        return None
    quarterly_data = data['Close'].resample('Q').last()
    yoy_growth = quarterly_data.pct_change(periods=4)
    return yoy_growth

def process_company(ticker, start_date, end_date):
    try:
        stock_data = download_stock_data(ticker, start_date, end_date)
        quarterly_yoy_growth = calculate_quarterly_yoy_growth(stock_data)
        return ticker, quarterly_yoy_growth
    except Exception as e:
        print(f"Error processing {ticker}: {str(e)}")
        return ticker, None

def process_sector(sector, start_date, end_date, companies_data):
    companies = get_sector_companies(sector, companies_data)
    if len(companies) <= 3:
        print(f"Skipping {sector} sector: Only {len(companies)} companies found.")
        return None

    print(f"Processing {sector} sector ({len(companies)} companies)...")
    results = {}
    
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_ticker = {executor.submit(process_company, ticker, start_date, end_date): ticker for ticker in companies}
        for future in as_completed(future_to_ticker):
            ticker, result = future.result()
            if result is not None and not result.empty:
                results[ticker] = result
            time.sleep(2)  # To avoid overwhelming the Yahoo Finance API

    if not results:
        print(f"No valid results for {sector} sector.")
        return None

    try:
        combined_results = pd.concat(results, axis=1)
        combined_results.columns = results.keys()  # Use simple column names (ticker strings)
        return combined_results
    except ValueError as e:
        print(f"Error combining results for {sector} sector: {str(e)}")
        return None


def main():
    # Set the date range from Q1 2015 to Q2 2024 (or current date if earlier)
    start_date = pd.Timestamp('2015-01-01')
    end_date = min(pd.Timestamp('2024-06-30'))

    # Scrape Wikipedia for S&P 500 companies and sectors
    companies_data = scrape_wikipedia_sp500()

    # Get all unique sectors
    sectors = get_sp500_sectors(companies_data)

    # Create a directory for output files
    output_dir = "sector_growth_results"
    os.makedirs(output_dir, exist_ok=True)

    # Process each sector
    for sector in sectors:
        sector_results = process_sector(sector, start_date, end_date, companies_data)
        if sector_results is not None:
            # Save results to CSV
            csv_filename = os.path.join(output_dir, f"{sector}_quarterly_yoy_growth.csv")
            sector_results.to_csv(csv_filename)
            print(f"Results saved to {csv_filename}")

            # Display summary statistics
            print("\nSummary Statistics:")
            print(sector_results.mean().sort_values(ascending=False))
            print("\n" + "="*50 + "\n")
        else:
            print(f"No results to save for {sector} sector.")
            print("\n" + "="*50 + "\n")

    print("All sectors processed.")

if __name__ == "__main__":
    main()

TypeError: 'Timestamp' object is not iterable