In [3]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import yfinance as yf
import time

In [4]:
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

def scrape_wikipedia_sp500():
    """Scrape S&P 500 companies and their sectors from Wikipedia.

    Fetches ``WIKI_URL``, locates the table with ``id="constituents"`` and
    reads one ``(ticker, company_name, sector)`` tuple per data row.

    The ticker, company-name and sector columns are looked up by their header
    text ("Symbol", "Security", "GICS Sector"). If a header is not recognised
    the historical positions 0, 1 and 3 are used as a fallback.

    Returns:
        list[tuple[str, str, str]]: one tuple per company, in table order.

    Raises:
        requests.HTTPError: if the page responds with an error status.
        ValueError: if the constituents table is not present in the page.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(WIKI_URL, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table', {'id': 'constituents'})
    if table is None:
        raise ValueError(f"Could not find the 'constituents' table at {WIKI_URL}")

    rows = table.find_all('tr')
    if not rows:
        return []

    # Normalise header text so nested tags / footnote markers do not break matching.
    header = [
        ' '.join(cell.get_text(' ', strip=True).split()).lower()
        for cell in rows[0].find_all(['th', 'td'])
    ]

    def column_index(label, default):
        """Return the position of the header starting with ``label``, else ``default``."""
        wanted = label.lower()
        for position, text in enumerate(header):
            if text.startswith(wanted):
                return position
        return default

    ticker_idx = column_index('Symbol', 0)
    name_idx = column_index('Security', 1)
    # The page layout has changed over time, so a fixed position for the
    # sector column is not reliable; prefer the header lookup.
    sector_idx = column_index('GICS Sector', 3)
    required_cells = max(ticker_idx, name_idx, sector_idx) + 1

    companies_data = []
    for row in rows[1:]:  # Skip header row
        cols = row.find_all('td')
        if len(cols) < required_cells:
            # Malformed or non-data row: skip it instead of raising IndexError.
            continue
        ticker = cols[ticker_idx].text.strip()
        company_name = cols[name_idx].text.strip()
        sector = cols[sector_idx].text.strip()
        companies_data.append((ticker, company_name, sector))

    return companies_data

In [None]:




def get_sp500_sectors(companies_data):
    """Return the unique sectors found in ``companies_data``, sorted by name.

    Args:
        companies_data: iterable of ``(ticker, company_name, sector)`` tuples.

    Returns:
        list: each distinct sector once, in ascending order. Sorting makes the
        processing order (and therefore the log output) reproducible between
        runs, which a bare ``set`` does not guarantee.
    """
    return sorted({company[2] for company in companies_data})

def get_sector_companies(sector, companies_data):
    """Collect the tickers of every company whose sector equals ``sector``.

    Each entry of ``companies_data`` is indexed positionally: element 0 is the
    ticker and element 2 is the sector. Input order is preserved.
    """
    tickers = []
    for entry in companies_data:
        if entry[2] == sector:
            tickers.append(entry[0])
    return tickers

def download_market_cap_data(ticker, start_date, end_date):
    """Download a ticker's approximate market capitalisation at each quarter end.

    The market cap is computed as ``Close * sharesOutstanding``. A single
    share count (the one reported in ``stock.info``) is applied to every date,
    so historical values are an approximation that ignores changes in the
    share count over time.

    Args:
        ticker: symbol as listed in the constituents data (e.g. ``BRK.B``).
        start_date: passed to ``history`` as ``start``.
        end_date: passed to ``history`` as ``end``.
            NOTE(review): yfinance documents ``end`` as exclusive - confirm
            whether the final day is meant to be included.

    Returns:
        DataFrame indexed by quarter end with columns ``MarketCap`` and
        ``Ticker`` (the symbol as passed in), or ``None`` when no price data
        or share count is available or the download fails. Every ``None``
        path prints the reason.
    """
    # Wikipedia writes share classes with a dot (BRK.B, BF.B) while Yahoo
    # Finance expects a dash (BRK-B, BF-B); without this those tickers fail.
    yahoo_symbol = ticker.replace('.', '-')
    try:
        stock = yf.Ticker(yahoo_symbol)
        df = stock.history(start=start_date, end=end_date)
        if df.empty:
            print(f"No data found for {ticker}")
            return None

        # Calculate Market Cap = Close Price * Shares Outstanding
        shares_outstanding = stock.info.get('sharesOutstanding', None)
        if shares_outstanding is None:
            print(f"No shares outstanding data for {ticker}")
            return None

        market_cap = (df['Close'] * shares_outstanding).to_frame('MarketCap')

        # Resample to get the last value at the end of each quarter
        quarterly = market_cap.resample('QE').last()
        quarterly['Ticker'] = ticker  # Keep the caller's symbol for joins downstream
        return quarterly
    except Exception as e:
        # Best effort: one failing ticker must not abort the whole sector.
        print(f"Error downloading data for {ticker}: {str(e)}")
        return None

def process_sector(sector, start_date, end_date, companies_data):
    """Download quarter-end market caps for every company of one sector.

    Sectors with fewer than three companies are skipped. Downloads run on a
    pool of five worker threads; each worker pauses for one second after a
    request so the delay actually spaces out calls to Yahoo Finance (a sleep
    in the result-collection loop would not, because all requests are already
    queued by then).

    Args:
        sector: sector name to process.
        start_date, end_date: forwarded to ``download_market_cap_data``.
        companies_data: iterable of ``(ticker, company_name, sector)`` tuples.

    Returns:
        DataFrame with the per-ticker frames stacked in the order the tickers
        appear in ``companies_data`` and the date index turned into a column,
        or ``None`` when the sector is skipped or nothing could be downloaded.
    """
    companies = get_sector_companies(sector, companies_data)
    if len(companies) < 3:
        print(f"Skipping {sector} sector: Only {len(companies)} companies found.")
        return None

    print(f"Processing {sector} sector ({len(companies)} companies)...")
    print(companies)

    def _throttled_download(ticker):
        """Download one ticker, then pause inside the worker to limit the request rate."""
        result = download_market_cap_data(ticker, start_date, end_date)
        time.sleep(1)  # To avoid overwhelming the Yahoo Finance API
        return result

    results_by_ticker = {}
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_ticker = {executor.submit(_throttled_download, ticker): ticker for ticker in companies}
        for future in as_completed(future_to_ticker):
            result = future.result()
            if result is not None and not result.empty:
                results_by_ticker[future_to_ticker[future]] = result

    # Re-order by the input ticker list so the output does not depend on
    # which download happened to finish first.
    results = [results_by_ticker[ticker] for ticker in companies if ticker in results_by_ticker]

    if not results:
        print(f"No valid results for {sector} sector.")
        return None

    try:
        # Combine the results into a single DataFrame
        combined_results = pd.concat(results).reset_index()  # Date index becomes a column
        # An unnamed index comes back as 'index'; normalise it to 'Date'.
        combined_results = combined_results.rename(columns={'index': 'Date'})
        return combined_results
    except ValueError as e:
        print(f"Error combining results for {sector} sector: {str(e)}")
        return None

def main():
    """Scrape the constituents, then write one quarter-end market-cap CSV per sector.

    Output files go to ``sector_mkt_cap_results/`` and are named after the
    sector. A short preview of each saved frame is printed.
    """
    # Window: 2019-09-01 through 2024-09-30, capped at the current time.
    start_date = pd.Timestamp('2019-09-01')
    end_date = min(pd.Timestamp('2024-09-30'), pd.Timestamp.now())

    # Constituents and the sectors they belong to
    companies_data = scrape_wikipedia_sp500()
    sectors = get_sp500_sectors(companies_data)

    # Destination folder for the per-sector CSV files
    output_dir = "sector_mkt_cap_results"
    os.makedirs(output_dir, exist_ok=True)

    separator = "\n" + "="*50 + "\n"

    for sector in sectors:
        sector_results = process_sector(sector, start_date, end_date, companies_data)

        if sector_results is None:
            print(f"No results to save for {sector} sector.")
            print(separator)
            continue

        # Persist the sector frame without the positional index
        csv_filename = os.path.join(output_dir, f"{sector}_mkt_cap_quarter_end.csv")
        sector_results.to_csv(csv_filename, index=False)
        print(f"Results saved to {csv_filename}")

        # Preview of what was written
        print(f"Summary for {sector} sector:")
        print(sector_results.head())
        print(separator)

    print("All sectors processed.")

if __name__ == "__main__":
    main()


In [None]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import yfinance as yf
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

WIKI_URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

def scrape_wikipedia_sp500():
    """Scrape S&P 500 companies and their sectors from Wikipedia.

    Fetches ``WIKI_URL``, locates the table with ``id="constituents"`` and
    reads one ``(ticker, company_name, sector)`` tuple per data row.

    The ticker, company-name and sector columns are looked up by their header
    text ("Symbol", "Security", "GICS Sector"). If a header is not recognised
    the historical positions 0, 1 and 3 are used as a fallback.

    Returns:
        list[tuple[str, str, str]]: one tuple per company, in table order.

    Raises:
        requests.HTTPError: if the page responds with an error status.
        ValueError: if the constituents table is not present in the page.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(WIKI_URL, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table', {'id': 'constituents'})
    if table is None:
        raise ValueError(f"Could not find the 'constituents' table at {WIKI_URL}")

    rows = table.find_all('tr')
    if not rows:
        return []

    # Normalise header text so nested tags / footnote markers do not break matching.
    header = [
        ' '.join(cell.get_text(' ', strip=True).split()).lower()
        for cell in rows[0].find_all(['th', 'td'])
    ]

    def column_index(label, default):
        """Return the position of the header starting with ``label``, else ``default``."""
        wanted = label.lower()
        for position, text in enumerate(header):
            if text.startswith(wanted):
                return position
        return default

    ticker_idx = column_index('Symbol', 0)
    name_idx = column_index('Security', 1)
    # The page layout has changed over time, so a fixed position for the
    # sector column is not reliable; prefer the header lookup.
    sector_idx = column_index('GICS Sector', 3)
    required_cells = max(ticker_idx, name_idx, sector_idx) + 1

    companies_data = []
    for row in rows[1:]:  # Skip header row
        cols = row.find_all('td')
        if len(cols) < required_cells:
            # Malformed or non-data row: skip it instead of raising IndexError.
            continue
        ticker = cols[ticker_idx].text.strip()
        company_name = cols[name_idx].text.strip()
        sector = cols[sector_idx].text.strip()
        companies_data.append((ticker, company_name, sector))

    return companies_data

def get_sector_companies(sector, companies_data):
    """Return the tickers (element 0) of the records whose sector (element 2) is ``sector``."""
    in_sector = filter(lambda record: record[2] == sector, companies_data)
    return [record[0] for record in in_sector]

def download_market_cap_data(ticker, start_date, end_date):
    """Download a ticker's approximate market capitalisation at each quarter end.

    The market cap is computed as ``Close * sharesOutstanding``. A single
    share count (the one reported in ``stock.info``) is applied to every date,
    so historical values are an approximation that ignores changes in the
    share count over time.

    Args:
        ticker: symbol as listed in the constituents data (e.g. ``BRK.B``).
        start_date: passed to ``history`` as ``start``.
        end_date: passed to ``history`` as ``end``.
            NOTE(review): yfinance documents ``end`` as exclusive - confirm
            whether the final day is meant to be included.

    Returns:
        DataFrame indexed by quarter end with columns ``MarketCap`` and
        ``Ticker`` (the symbol as passed in), or ``None`` when no price data
        or share count is available or the download fails. Every ``None``
        path prints the reason so dropped tickers are visible in the output.
    """
    # Wikipedia writes share classes with a dot (BRK.B, BF.B) while Yahoo
    # Finance expects a dash (BRK-B, BF-B); without this those tickers fail.
    yahoo_symbol = ticker.replace('.', '-')
    try:
        stock = yf.Ticker(yahoo_symbol)
        df = stock.history(start=start_date, end=end_date)
        if df.empty:
            print(f"No data found for {ticker}")
            return None

        # Calculate Market Cap = Close Price * Shares Outstanding
        shares_outstanding = stock.info.get('sharesOutstanding', None)
        if shares_outstanding is None:
            print(f"No shares outstanding data for {ticker}")
            return None

        market_cap = (df['Close'] * shares_outstanding).to_frame('MarketCap')

        # Resample to get the last value at the end of each quarter
        quarterly = market_cap.resample('QE').last()
        quarterly['Ticker'] = ticker  # Keep the caller's symbol for joins downstream
        return quarterly
    except Exception as e:
        # Best effort: report the failure instead of hiding it, but keep going
        # so one bad ticker does not abort the whole sector.
        print(f"Error downloading data for {ticker}: {str(e)}")
        return None

def calculate_returns(data):
    """Calculate period-over-period returns from the ``MarketCap`` column.

    Args:
        data: DataFrame with a ``MarketCap`` column, ordered in time.

    Returns:
        A new DataFrame with an added ``Return`` column (fractional change
        versus the previous row); rows without a return - at least the first
        one - are dropped. The input frame is left untouched.
    """
    # ``assign`` works on a copy, so the caller's frame does not silently gain a column.
    with_returns = data.assign(Return=data['MarketCap'].pct_change())
    return with_returns.dropna(subset=['Return'])

def calculate_beta(sector_returns, market_returns):
    """Calculate beta = cov(sector, market) / var(market) on matching dates.

    Args:
        sector_returns: pandas Series of sector returns.
        market_returns: pandas Series of market returns.

    Returns:
        The beta as a float, or NaN when fewer than two paired observations
        remain or the market variance is zero.
    """
    # Align the series to have matching dates
    aligned_sector, aligned_market = sector_returns.align(market_returns, join='inner')

    # A NaN in either series would turn the whole estimate into NaN, so keep
    # only the dates where both returns are present.
    valid = aligned_sector.notna() & aligned_market.notna()
    sector_values = aligned_sector[valid].to_numpy(dtype=float)
    market_values = aligned_market[valid].to_numpy(dtype=float)

    if market_values.size < 2:
        return np.nan

    # Take covariance and variance from the same matrix so both use the same
    # normalisation. Mixing np.cov (divides by n-1) with np.var (divides by n)
    # overstates beta by a factor of n/(n-1).
    cov_matrix = np.cov(sector_values, market_values)
    market_variance = cov_matrix[1, 1]
    if market_variance == 0:
        return np.nan

    return cov_matrix[0, 1] / market_variance


def process_sector_with_beta(sector, start_date, end_date, companies_data, market_returns):
    """Calculate quarterly sector returns and the sector's beta against the market.

    Market caps are downloaded per ticker on five worker threads. The sector
    return for a quarter is computed only from tickers that have a market cap
    in both that quarter and the previous one. Summing whatever is available
    per date would show a spurious jump whenever a ticker's data starts or
    stops inside the window.

    Args:
        sector: sector name.
        start_date, end_date: forwarded to ``download_market_cap_data``.
        companies_data: iterable of ``(ticker, company_name, sector)`` tuples.
        market_returns: Series of market returns passed on to ``calculate_beta``.

    Returns:
        The value returned by ``calculate_beta``, or ``None`` when no data
        could be downloaded or fewer than two sector returns are available.
    """
    companies = get_sector_companies(sector, companies_data)
    results = []

    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [
            executor.submit(download_market_cap_data, ticker, start_date, end_date)
            for ticker in companies
        ]
        for future in as_completed(futures):
            result = future.result()
            if result is not None and not result.empty:
                results.append(result)

    if not results:
        return None

    # Combine the results into a single DataFrame
    combined_results = pd.concat(results).reset_index()
    # An unnamed index comes back as 'index'; normalise it to 'Date'.
    combined_results = combined_results.rename(columns={'index': 'Date'})

    # One row per date, one column per ticker.
    caps = combined_results.pivot_table(
        index='Date', columns='Ticker', values='MarketCap', aggfunc='last'
    ).sort_index()
    previous_caps = caps.shift(1)

    # Constant-composition return: only tickers present in both quarters count.
    in_both = caps.notna() & previous_caps.notna()
    current_total = caps.where(in_both).sum(axis=1)
    previous_total = previous_caps.where(in_both).sum(axis=1)
    sector_returns = (current_total / previous_total.where(previous_total > 0) - 1).dropna()
    sector_returns.name = 'Return'

    if len(sector_returns) < 2:
        # A covariance needs at least two observations.
        return None

    # Calculate beta
    return calculate_beta(sector_returns, market_returns)

def main():
    """Compute one beta per sector against the S&P 500 index and save them to CSV.

    Writes ``sector_beta_values.csv`` with columns ``Sector`` and ``Beta``.
    Sectors for which no beta could be computed are left out.
    """
    start_date = pd.Timestamp('2019-09-30')
    end_date = pd.Timestamp('2024-09-30')

    # Scrape Wikipedia for S&P 500 companies and sectors
    companies_data = scrape_wikipedia_sp500()

    # Unique sectors, sorted so the run order and CSV row order are reproducible
    sectors = sorted({company[2] for company in companies_data})

    # Load market index data for beta calculation (assuming S&P 500)
    market_data = yf.Ticker('^GSPC').history(start=start_date, end=end_date)
    market_close = market_data['Close'].resample('QE').last()  # Quarterly end
    # Keep the returns as their own Series. Assigning the dropna() result back
    # into the frame would re-align on the index and bring the NaN back.
    market_returns = market_close.pct_change().dropna().rename('Return')

    # Calculate and store sector betas
    sector_betas = {}
    for sector in sectors:
        beta_value = process_sector_with_beta(sector, start_date, end_date, companies_data, market_returns)
        if beta_value is not None:
            sector_betas[sector] = beta_value

    # Save to CSV
    beta_df = pd.DataFrame(list(sector_betas.items()), columns=['Sector', 'Beta'])
    beta_df.to_csv('sector_beta_values.csv', index=False)
    print("Sector beta values saved to sector_beta_values.csv")

if __name__ == "__main__":
    main()


In [None]:
def classify_growth(value):
    """Map a YoY growth figure onto 1 (above 5), -1 (below -5) or 0 (anything else).

    Both thresholds are exclusive, so exactly 5 and exactly -5 map to 0.
    """
    if value > 5:
        return 1
    return -1 if value < -5 else 0

def calculate_yoy_growth(df, column):
    """Calculate YoY growth of ``column`` and classify it with ``classify_growth``.

    Growth is the percentage change versus the value four rows earlier. When
    the frame has a ``Ticker`` column the change is computed within each
    ticker, so a company's value is never compared with another company's
    rows that happen to sit four positions above it.

    Assumes rows are ordered in time (within each ticker) at quarterly
    spacing - TODO confirm against the input files.

    Args:
        df: DataFrame containing ``column`` (and optionally ``Ticker``).
            It is modified in place.
        column: name of the column to measure.

    Returns:
        The same ``df`` with ``{column}_YoY_Growth`` and
        ``{column}_Growth_Class`` added and the rows lacking a growth value removed.
    """
    growth_col = f'{column}_YoY_Growth'
    class_col = f'{column}_Growth_Class'

    if 'Ticker' in df.columns:
        change = df.groupby('Ticker')[column].pct_change(periods=4)
    else:
        change = df[column].pct_change(periods=4)

    df[growth_col] = change * 100  # YoY percentage change
    df[class_col] = df[growth_col].apply(classify_growth)
    df.dropna(subset=[growth_col], inplace=True)
    return df

def calculate_beta_growth_covariance(df, period):
    """Return the covariance between the ``Beta`` column and its own growth.

    Growth is the fractional change of ``Beta`` versus ``period`` rows
    earlier. It is stored on ``df`` as ``Beta_Growth_{period}_M`` as a side
    effect of the call.
    """
    growth_column = f'Beta_Growth_{period}_M'
    beta = df['Beta']
    df[growth_column] = beta.pct_change(periods=period)
    return df[growth_column].cov(beta)

def calculate_sector_index_variations(df, sorted_companies, total_overperformance):
    """Compare a contribution-weighted sector index with the simple average.

    Each company in ``sorted_companies`` gets the weight
    ``count**2 / total_overperformance``. For every date the weighted index is
    the sum of ``MarketCap_Growth_Class * weight`` over the weighted tickers.
    The simple index is the mean growth class over all tickers in ``df``.

    Args:
        df: DataFrame with ``Date``, ``Ticker`` and ``MarketCap_Growth_Class``.
        sorted_companies: iterable of ``(ticker, count)`` pairs.
        total_overperformance: denominator of the weights.

    Returns:
        DataFrame with columns ``Date``, ``Weighted_Index``,
        ``Simple_Avg_Index`` and ``Difference`` (absolute gap), one row per
        date on which at least one weighted ticker has data. Empty, with the
        same columns, when no ticker in ``df`` has a weight.

    Raises:
        ZeroDivisionError: if ``sorted_companies`` is non-empty and
            ``total_overperformance`` is zero.
    """
    output_columns = ['Date', 'Weighted_Index', 'Simple_Avg_Index', 'Difference']

    fractional_contribution = {company: count**2 / total_overperformance for company, count in sorted_companies}

    # Tickers without a weight map to NaN and are excluded from the weighted index.
    weights = df['Ticker'].map(fractional_contribution)
    has_weight = weights.notna()
    if not has_weight.any():
        return pd.DataFrame(columns=output_columns)

    weighted_growth = df.loc[has_weight, 'MarketCap_Growth_Class'] * weights[has_weight]

    # A single grouped sum replaces merging one column per ticker in a loop.
    weighted_index = (
        weighted_growth.groupby(df.loc[has_weight, 'Date'])
        .sum()
        .rename('Weighted_Index')
        .reset_index()
    )

    # Calculate simple average
    simple_avg_index = (
        df.groupby('Date')['MarketCap_Growth_Class']
        .mean()
        .rename('Simple_Avg_Index')
        .reset_index()
    )
    sector_index = pd.merge(weighted_index, simple_avg_index, on='Date', how='left')

    # Calculate the difference between the weighted and simple averages
    sector_index['Difference'] = (sector_index['Weighted_Index'] - sector_index['Simple_Avg_Index']).abs()

    return sector_index[output_columns]

def calculate_variance_and_covariances(df):
    """Calculate the variance of the growth class and two beta-growth covariances.

    Args:
        df: DataFrame with ``MarketCap_Growth_Class`` and ``Beta`` columns.
            ``calculate_beta_growth_covariance`` adds its growth columns to it.

    Returns:
        tuple: ``(variance, beta_cov_6m, beta_cov_5y)``.
    """
    # Variance of YoY growth in market cap
    variance = df['MarketCap_Growth_Class'].var()

    # ``period`` counts rows, and rows are treated as quarters here (5 years
    # is 20 rows), so 6 months is 2 rows rather than 6.
    beta_cov_6m = calculate_beta_growth_covariance(df, period=2)
    beta_cov_5y = calculate_beta_growth_covariance(df, period=20)  # 5 years = 20 quarters

    return variance, beta_cov_6m, beta_cov_5y

def _print_ranking(title, rankings):
    """Print ``title`` followed by ``rankings`` from highest to lowest, NaN entries last."""
    def sort_key(item):
        value = item[1]
        missing = pd.isna(value)
        # NaN compares false against everything, which makes a plain sort
        # order undefined; push missing values to the end explicitly.
        return (missing, 0 if missing else -value)

    print(title)
    for sector, value in sorted(rankings.items(), key=sort_key):
        print(f"{sector}: {value}")


def get_sector_rankings(input_dir, output_file='sector_rankings.csv'):
    """Compute per-sector statistics from the CSVs in ``input_dir`` and rank the sectors.

    For every ``*.csv`` file (processed in sorted name order) the function
    derives four figures: the mean absolute gap between the weighted and
    simple growth index, the variance of the market-cap growth class, and the
    two beta-growth covariances. The table is written to ``output_file`` and
    each ranking is printed.

    NOTE(review): ``calculate_sector_leader_and_rank`` is not defined in the
    visible part of this notebook - it must already exist in the kernel.
    Each file is expected to have ``Date``, ``Ticker``, ``MarketCap``,
    ``Revenue`` and ``Beta`` columns, judging by how they are used below.

    Args:
        input_dir: directory holding one CSV per sector.
        output_file: path of the CSV to write.
    """
    rankings_variation = {}
    rankings_variance = {}
    rankings_cov_6m_beta = {}
    rankings_cov_5y_beta = {}

    # Sorted so the processing order and CSV row order are reproducible.
    for sector_file in sorted(os.listdir(input_dir)):
        if not sector_file.endswith(".csv"):
            continue

        sector = sector_file.replace("_mkt_cap_quarter_end.csv", "")
        print(f"Processing {sector} sector...")

        # Load the CSV file
        file_path = os.path.join(input_dir, sector_file)
        df = pd.read_csv(file_path, parse_dates=['Date'])

        # Convert 'Date' to UTC
        df['Date'] = pd.to_datetime(df['Date'], utc=True)

        # Filter the data from 2019 onwards. The explicit copy matters: the
        # helpers below add columns and drop rows in place, which must not
        # happen on a slice of another frame.
        df = df[df['Date'] >= pd.Timestamp('2019-01-01', tz='UTC')].copy()

        # Calculate YoY growth for MarketCap and Revenue
        df = calculate_yoy_growth(df, 'MarketCap')
        df = calculate_yoy_growth(df, 'Revenue')

        # Calculate sector leader and fractional contributions
        sector_leader, sorted_companies, total_overperformance = calculate_sector_leader_and_rank(df, sector)

        # Calculate sector index variations
        sector_index_variations = calculate_sector_index_variations(df, sorted_companies, total_overperformance)

        # Calculate variance of YoY growth and beta covariances
        variance, beta_cov_6m, beta_cov_5y = calculate_variance_and_covariances(df)

        # Summarize the variation between weighted and simple averages
        avg_difference = sector_index_variations['Difference'].mean()

        # Save results for rankings
        rankings_variation[sector] = avg_difference
        rankings_variance[sector] = variance
        rankings_cov_6m_beta[sector] = beta_cov_6m
        rankings_cov_5y_beta[sector] = beta_cov_5y

    # Create a DataFrame from the rankings
    rankings_df = pd.DataFrame({
        'Sector': list(rankings_variation.keys()),
        'Variation (Weighted vs Simple Avg)': list(rankings_variation.values()),
        'Variance of YoY Growth': list(rankings_variance.values()),
        'Covariance of 6M Beta Growth': list(rankings_cov_6m_beta.values()),
        'Covariance of 5Y Beta Growth': list(rankings_cov_5y_beta.values())
    })

    # Save the rankings DataFrame to CSV
    rankings_df.to_csv(output_file, index=False)
    print(f"\nRankings saved to {output_file}")

    # Print each ranking, highest value first
    _print_ranking("\nRanking of sectors based on variation between simple and weighted averages:", rankings_variation)
    _print_ranking("\nRanking of sectors based on variance of YoY growth over time:", rankings_variance)
    _print_ranking("\nRanking of sectors based on covariance of 6M Beta Growth:", rankings_cov_6m_beta)
    _print_ranking("\nRanking of sectors based on covariance of 5Y Beta Growth:", rankings_cov_5y_beta)

def main():
    """Rank the sectors from the per-sector CSV files in ``merged_sector_data``."""
    get_sector_rankings("merged_sector_data")

if __name__ == "__main__":
    main()

