In [15]:
!pip3 install requests beautifulsoup4



In [35]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import yfinance as yf

WIKI_URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

def scrape_wikipedia_sp500():
    response = requests.get(WIKI_URL)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Locate the table containing S&P 500 companies
    table = soup.find('table', {'id': 'constituents'})
    rows = table.find_all('tr')[1:]  # Skip the header row
    
    companies_data = []
    for row in rows:
        cols = row.find_all('td')
        ticker = cols[0].text.strip()
        company_name = cols[1].text.strip()
        sector = cols[3].text.strip()
        companies_data.append((ticker, company_name, sector))
    
    return companies_data

def get_sp500_sectors(companies_data):
    sectors = list(set(company[2] for company in companies_data))
    print(pd.DataFrame(sectors))
    return sectors

def get_sector_companies(sector, companies_data):
    companies = [company[0] for company in companies_data if company[2] == sector]
    return companies

def download_stock_data(ticker, start_date, end_date):
    try:
        stock = yf.Ticker(ticker)
        df = stock.history(start=start_date, end=end_date)
        if df.empty:
            print(f"No data found for {ticker}")
            return None
        df = df[['Close']]
        return df
    except Exception as e:
        print(f"Error downloading data for {ticker}: {str(e)}")
        return None

def calculate_quarterly_yoy_growth(data):
    if data is None or data.empty:
        return None
    quarterly_data = data['Close'].resample('QE').last()
    yoy_growth = quarterly_data.pct_change(periods=4)
    return yoy_growth

def process_company(ticker, start_date, end_date):
    try:
        stock_data = download_stock_data(ticker, start_date, end_date)
        quarterly_yoy_growth = calculate_quarterly_yoy_growth(stock_data)
        return ticker, quarterly_yoy_growth
    except Exception as e:
        print(f"Error processing {ticker}: {str(e)}")
        return ticker, None

def process_sector(sector, start_date, end_date, companies_data):
    companies = get_sector_companies(sector, companies_data)
    if len(companies) <= 3:
        print(f"Skipping {sector} sector: Only {len(companies)} companies found.")
        return None

    print(f"Processing {sector} sector ({len(companies)} companies)...")
    results = {}
    
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_ticker = {executor.submit(process_company, ticker, start_date, end_date): ticker for ticker in companies}
        for future in as_completed(future_to_ticker):
            ticker, result = future.result()
            if result is not None and not result.empty:
                results[ticker] = result
            time.sleep(2)  # To avoid overwhelming the Yahoo Finance API

    if not results:
        print(f"No valid results for {sector} sector.")
        return None

    try:
        combined_results = pd.concat(results, axis=1)
        combined_results.columns = results.keys()  # Use simple column names (ticker strings)
        return combined_results
    except ValueError as e:
        print(f"Error combining results for {sector} sector: {str(e)}")
        return None


def main():
    # Set the date range from Q1 2015 to Q2 2024 (or current date if earlier)
    start_date = pd.Timestamp('2015-01-01')
    end_date = pd.Timestamp('2024-06-30')
    end_date = min(end_date, pd.Timestamp.now())

    # Scrape Wikipedia for S&P 500 companies and sectors
    companies_data = scrape_wikipedia_sp500()

    # Get all unique sectors
    sectors = get_sp500_sectors(companies_data)

    # Create a directory for output files
    output_dir = "sector_growth_results"
    os.makedirs(output_dir, exist_ok=True)

    # Process each sector
    for sector in sectors:
        sector_results = process_sector(sector, start_date, end_date, companies_data)
        if sector_results is not None:
            # Save results to CSV
            csv_filename = os.path.join(output_dir, f"{sector}_quarterly_yoy_growth.csv")
            sector_results.to_csv(csv_filename)
            print(f"Results saved to {csv_filename}")

            # Display summary statistics
            print("\nSummary Statistics:")
            print(sector_results.mean().sort_values(ascending=False))
            print("\n" + "="*50 + "\n")
        else:
            print(f"No results to save for {sector} sector.")
            print("\n" + "="*50 + "\n")

    print("All sectors processed.")


if __name__ == "__main__":
    main()

                                        0
0                  Packaged Foods & Meats
1    Fertilizers & Agricultural Chemicals
2                     Rail Transportation
3     Wireless Telecommunication Services
4                    Real Estate Services
..                                    ...
122                              Footwear
123                                  Gold
124          Diversified Support Services
125                 Electronic Components
126   Apparel, Accessories & Luxury Goods

[127 rows x 1 columns]
Processing Packaged Foods & Meats sector (12 companies)...
Date
2015-03-31 00:00:00-04:00         NaN
2015-06-30 00:00:00-04:00         NaN
2015-09-30 00:00:00-04:00         NaN
2015-12-31 00:00:00-05:00         NaN
2016-03-31 00:00:00-04:00    0.252401
2016-06-30 00:00:00-04:00    0.120022
2016-09-30 00:00:00-04:00    0.190614
2016-12-31 00:00:00-05:00    0.232954
2017-03-31 00:00:00-04:00    0.186780
2017-06-30 00:00:00-04:00   -0.018669
2017-09-30 00:00:00-04:00   

KeyboardInterrupt: 

In [37]:
d=pd.read_csv('sector_growth_results/Packaged Foods & Meats_quarterly_yoy_growth.csv')
d

Unnamed: 0,Date,CAG,CPB,GIS,K,HSY,KHC,HRL,MDLZ,LW,SJM,MKC,TSN
0,2015-03-31 00:00:00-04:00,,,,,,,,,,,,
1,2015-06-30 00:00:00-04:00,,,,,,,,,,,,
2,2015-09-30 00:00:00-04:00,,,,,,,,,,,,
3,2015-12-31 00:00:00-05:00,,,,,,,,,,,,
4,2016-03-31 00:00:00-04:00,0.252401,0.405741,0.154757,0.195096,-0.063911,,0.546123,0.129298,,0.147801,0.316274,0.757927
5,2016-06-30 00:00:00-04:00,0.120022,0.429515,0.319671,0.339292,0.311549,,0.318918,0.124284,,0.43773,0.343281,0.582391
6,2016-09-30 00:00:00-04:00,0.190614,0.102903,0.172206,0.195811,0.06712,0.316239,0.21658,0.066063,,0.213373,0.238211,0.749111
7,2016-12-31 00:00:00-05:00,0.232954,0.176347,0.103092,0.047519,0.187533,0.234396,-0.106192,0.005653,,0.060301,0.110544,0.168667
8,2017-03-31 00:00:00-04:00,0.18678,-0.082933,-0.040891,-0.025574,0.21502,0.187955,-0.18596,0.092312,,0.031087,-0.001663,-0.063325
9,2017-06-30 00:00:00-04:00,-0.018669,-0.197784,-0.199327,-0.12561,-0.031972,-0.005379,-0.051459,-0.034391,,-0.206349,-0.069007,-0.04973
