In [None]:
import pandas as pd
from datetime import date
import plotly.express as px
from utilities import color_map

# S&P 500
- The S&P 500 is a US-based stock market index that dates back to 1957. The index tracks the value of 503 US-registered companies. Although ~500 companies do not even scratch the surface of the total number of publicly traded companies in the US, the companies listed in the S&P 500 index account for roughly 80% of the total US market capitalization; hence, the performance of these companies has a significant effect on the US economy.
- Tickers are abbreviations that link back to a given company.

In [2]:
# Provenance: the commented-out block below scrapes the S&P 500 constituents table
# from Wikipedia. It is kept for reference only; the analysis reads the cached CSV.
# NOTE(review): presumably `snp_market_cap.csv` was produced from this scrape combined
# with the Yahoo Finance market-cap download further down -- confirm.

# url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

# # Read the table from the Wikipedia page
# tables = pd.read_html(url, header=0)
# df = tables[0]

# # Add year column
# df['Year_Added'] = pd.to_datetime(df['Date added']).dt.year

# # Keep the columns that are needed and discard the rest
# df = df[['Symbol', 'Security', 'GICS Sector', 'Year_Added']]

# # Rename columns in accordance with preference
# df.columns = ['company_ticker', 'company_name', 'sector', 'year_added']
# df.sample(5)

# Load the cached constituents + market-cap table
df = pd.read_csv('snp_market_cap.csv')

# Fixed seed so the preview shows the same rows on every Restart & Run All
df.sample(5, random_state=42)

Unnamed: 0,Ticker,Sector,MarketCap,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded,Year_Added
501,ZBH,Healthcare,18277200000.0,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927,2001
39,AAPL,Technology,3045708000000.0,Apple Inc.,Information Technology,"Technology Hardware, Storage & Peripherals","Cupertino, California",1982-11-30,320193,1977,1982
345,NRG,Utilities,30487830000.0,NRG Energy,Utilities,Independent Power Producers & Energy Traders,"Houston, Texas",2010-01-29,1013871,1992,2010
370,PCG,Utilities,34371720000.0,PG&E Corporation,Utilities,Multi-Utilities,"Oakland, California",2022-10-03,1004980,1905,2022
437,TEL,Technology,48826460000.0,TE Connectivity,Information Technology,Electronic Manufacturing Services,"Galway, Ireland",2011-10-17,1385157,2007,2011


- It would be interesting to find out which original companies (i.e., added to the index at its inception back in 1957) are still in the index today. Their presence indicates that these companies were able to withstand the volatility of the market and strategically evolve with the changing environment -- they are stable companies. Beyond identifying the companies themselves, it would also be interesting to see which sectors they belong to.

In [3]:
# Companies that joined the index in 1957 and are still constituents
# (53 of them in this dataset)
is_original_member = df['Year_Added'].eq(1957)
stocks_added_1957 = df.loc[is_original_member]

# Tally those companies per sector as a two-column frame ready for plotting
stocks_1957_counts = (
    stocks_added_1957['Sector']
    .value_counts()
    .rename_axis('Sector')
    .reset_index(name='Count')
)
stocks_1957_counts

Unnamed: 0,Sector,Count
0,Industrials,14
1,Consumer Defensive,11
2,Utilities,10
3,Energy,6
4,Healthcare,5
5,Consumer Cyclical,2
6,Financial Services,2
7,Technology,2
8,Basic Materials,1


In [None]:
# Donut chart of the original (1957) companies, broken down by sector
fig_1957 = px.pie(
    data_frame=stocks_1957_counts,
    names='Sector',
    values='Count',
    color='Sector',
    color_discrete_map=color_map,  # sector -> colour mapping imported from utilities
    hole=0.3,
    title='S&P 500 Original Stocks Added in 1957 By Sector (2025)',
)
# Label every slice outside the ring with its percentage and sector name
fig_1957.update_traces(textinfo='percent+label', textposition='outside')
fig_1957.update_layout(legend_title_text='Sector')
fig_1957.show()

# Optional export of the chart to a standalone HTML file
# fig_1957.write_html('original_stocks_1957.html')

- It would be interesting to see how the sectors are divided in the index.

In [139]:
# Number of constituents per sector, as a two-column frame for plotting
sector_composition = (
    df['Sector']
    .value_counts()
    .rename_axis('Sector')
    .reset_index(name='Count')
)

# Donut chart of the sector split
fig_sector = px.pie(
    data_frame=sector_composition,
    names='Sector',
    values='Count',
    color='Sector',
    color_discrete_map=color_map,  # sector -> colour mapping imported from utilities
    hole=0.3,
    title='Sector Composition of Current S&P 500 (2025)',
)
# Label every slice outside the ring with its percentage and sector name
fig_sector.update_traces(textinfo='percent+label', textposition='outside')
fig_sector.update_layout(legend_title_text='Sector')
fig_sector.show()

- It would be interesting to see how many companies are added each year per sector.

In [None]:
# entrants_df = df['Year_Added'].value_counts().reset_index(name='Count')
# Count constituents for every (year added, sector) pair
new_entrants_df = (
    df.groupby(['Year_Added', 'Sector'])
    .size()
    .reset_index(name='Count')
)

# Drop the 1957 rows before plotting
# (presumably so the inception cohort does not dominate the yearly bars -- confirm)
entrants_after_1957 = new_entrants_df[new_entrants_df['Year_Added'] != 1957]

# Stacked bars: one bar per year, one coloured segment per sector
fig_new_entrants = px.bar(
    data_frame=entrants_after_1957,
    x='Year_Added',
    y='Count',
    color='Sector',
    color_discrete_map=color_map,
    barmode='stack',
    # category_orders={"Year_Added": sorted(new_entrants_df["Year_Added"].unique())}, # ensure that year is chronological
    title='Annual Additions to S&P 500 by Sector',
)

fig_new_entrants.update_layout(
    yaxis_title='Count',
    xaxis_title='Year',
    # xaxis=dict(type='category')
)
fig_new_entrants.show()

- Now it will be cool to see what sector has the largest market cap

In [None]:
from utilities import download_batches_market_cap

In [24]:
# Download market-cap data for every ticker in `df` via the project helper
# (batch_size=100 is passed straight through to it).
# NOTE: slow -- the recorded run took about a minute and a half.
#
# The frame read from snp_market_cap.csv names its ticker column 'Ticker', while the
# Wikipedia-derived frame names it 'company_ticker'. Accept either, so this cell does
# not raise KeyError on a fresh kernel.
# NOTE(review): later cells still expect the Wikipedia-style columns ('sector', ...).
ticker_col = 'company_ticker' if 'company_ticker' in df.columns else 'Ticker'
snp500_market_cap = download_batches_market_cap(ticker_list=df[ticker_col], batch_size=100)

168it [01:34,  1.78it/s]                        


- Sector names and company counts per sector do not match between Wikipedia and Yahoo Finance. Using Yahoo Finance's categorization because it's more current.

In [None]:
# Create a dictionary with ticker as keys and sectors as values, using info from yfinance
sector_map = snp500_market_cap.set_index('Ticker')['Sector'].to_dict()

# Apply to the Wikipedia df by looking up each row's OWN ticker. Mapping the tickers of
# `snp500_market_cap` instead would only line up if both frames shared the same row
# order and index; any dropped or reordered ticker would mislabel companies.
# Tickers missing from the yfinance data end up as NaN.
ticker_col = 'company_ticker' if 'company_ticker' in df.columns else 'Ticker'
df['Sector'] = df[ticker_col].map(sector_map)

- I'm curious to see what sector categorizations do not match between Wikipedia and Yahoo.

In [None]:
# Boolean flag per company: True when the 'sector' and 'Sector' labels are identical.
# The False rows (labels that differ between the sources) are filtered out afterwards.
df['Match'] = df['sector'].eq(df['Sector'])
df

Unnamed: 0,company_ticker,company_name,sector,year_added,Sector,Match
0,MMM,3M,Industrials,1957,Industrials,True
1,AOS,A. O. Smith,Industrials,2017,Industrials,True
2,ABT,Abbott Laboratories,Health Care,1957,Healthcare,False
3,ABBV,AbbVie,Health Care,2012,Healthcare,False
4,ACN,Accenture,Information Technology,2011,Technology,False
...,...,...,...,...,...,...
498,XYL,Xylem Inc.,Industrials,2011,Industrials,True
499,YUM,Yum! Brands,Consumer Discretionary,1997,Consumer Cyclical,False
500,ZBRA,Zebra Technologies,Information Technology,2019,Technology,False
501,ZBH,Zimmer Biomet,Health Care,2001,Healthcare,False


In [None]:
# Keep only the rows whose two sector columns differ
sector_mismatch = ~df['Match']
df.loc[sector_mismatch]

- Merge Wikipedia data with market capitalization info from Yahoo

In [55]:
# Merge Wikipedia and yfinance market cap dfs to discover more things about new entrants

# The Wikipedia frame calls its ticker column 'company_ticker'; the yfinance frame calls
# it 'Ticker'. Align the name on a renamed copy so the join key exists on a fresh kernel.
# `rename` silently skips a missing key, so this also works if `df` already uses
# 'Ticker', and it leaves `df` itself untouched (the cell stays re-runnable).
wiki_df = df.rename(columns={"company_ticker": "Ticker"})
merged_df = pd.merge(left=snp500_market_cap, right=wiki_df, on=['Ticker', 'Sector'])
merged_df.head(2)

Unnamed: 0,Ticker,Sector,MarketCap,company_name,sector,year_added,Match
0,MMM,Industrials,78714490000.0,3M,Industrials,1957,True
1,AOS,Industrials,9143292000.0,A. O. Smith,Industrials,2017,True


In [None]:
# See which sectors are adding the most market cap to the index

# Total market cap per sector, converted to billions and ordered largest-first
grouped = (
    merged_df
    .groupby(['Sector'])['MarketCap']
    .sum()
    .reset_index()
    .assign(MarketCap=lambda totals: totals['MarketCap'] / 1e9)
    .sort_values(by='MarketCap', ascending=False)
)

# Bar chart: one bar per sector, coloured with the sector colour mapping
fig = px.bar(
    data_frame=grouped,
    x='Sector',
    y='MarketCap',
    color='Sector',
    color_discrete_map=color_map,
    labels={"MarketCap": "Market Cap (Billion USD)", "Sector": "Sector"},
    title='S&P 500 Market Capitalization by Sector (June 2025)',
)

fig.show()