In [1]:
import datetime as dt
import pandas as pd
import numpy as np
from datetime import datetime


In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
data = pd.read_html(url)

In [3]:
sp500 = data[0].iloc[:, [0,1,6,7]]
sp500.columns = ['ticker', 'name', 'date' , 'cik']

# Get rows where date is missing or not formatted correctly.
mask = sp500['date'].str.strip().str.fullmatch('\d{4}-\d{2}-\d{2}')
mask.loc[mask.isnull()] = False
mask = mask == False
sp500[mask].head()

Unnamed: 0,ticker,name,date,cik
29,AMD,AMD,,2488
51,T,AT&T,1983-11-30 (1957-03-04),732717
126,ED,Con Edison,,1047862
131,GLW,Corning,,24741
140,DHR,Danaher,,313616


In [4]:
current = sp500.copy()
current.loc[mask, 'date'] = '1900-01-01'
current.loc[:, 'date'] = pd.to_datetime(current['date'])
current.loc[:, 'cik'] = current['cik'].apply(str).str.zfill(10)

In [5]:
# Get the adjustments dataframe and rename columns
adjustments = data[1]
columns = ['date', 'ticker_added','name_added', 'ticker_removed', 'name_removed', 'reason']
adjustments.columns = columns

# Create additions dataframe.
additions = adjustments[~adjustments['ticker_added'].isnull()][['date','ticker_added', 'name_added']]
additions.columns = ['date','ticker','name']
additions['action'] = 'added'

# Create removals dataframe.
removals = adjustments[~adjustments['ticker_removed'].isnull()][['date','ticker_removed','name_removed']]
removals.columns = ['date','ticker','name']
removals['action'] = 'removed'

# Merge the additions and removals into one dataframe.
historical = pd.concat([additions, removals])
historical.head()

Unnamed: 0,date,ticker,name,action
0,"April 4, 2022",CPT,Camden,added
1,"March 2, 2022",MOH,Molina Healthcare,added
2,"February 15, 2022",NDSN,Nordson,added
4,"February 2, 2022",CEG,Constellation Energy,added
5,"December 20, 2021",SBNY,Signature Bank,added


In [10]:
"""
missing = current[~current['ticker'].isin(historical['ticker'])].copy()
missing['action'] = 'added'
missing = missing[['date','ticker','name','action', 'cik']]
missing.loc[:, 'cik'] = current['cik'].apply(str).str.zfill(10)
missing.head()
"""

"\nmissing = current[~current['ticker'].isin(historical['ticker'])].copy()\nmissing['action'] = 'added'\nmissing = missing[['date','ticker','name','action', 'cik']]\nmissing.loc[:, 'cik'] = current['cik'].apply(str).str.zfill(10)\nmissing.head()\n"

In [11]:
#sp500_history = pd.concat([historical, missing])
sp500_history = historical
sp500_history = sp500_history.sort_values(by=['date','ticker'], ascending=[False, True])
sp500_history = sp500_history.drop_duplicates(subset=['date','ticker'])
sp500_history

Unnamed: 0,date,ticker,name,action
121,"September 8, 2016",CHTR,Charter Communications,added
121,"September 8, 2016",EMC,EMC Corporation,removed
122,"September 6, 2016",MTD,Mettler Toledo,added
122,"September 6, 2016",TYC,Tyco International,removed
217,"September 5, 2012",LYB,LyondellBasell,added
...,...,...,...,...
134,"April 18, 2016",ULTA,Ulta Beauty,added
42,"April 1, 2020",ARNC,Arconic,removed
42,"April 1, 2020",HWM,Howmet Aerospace,added
241,"April 1, 2011",BLK,BlackRock,added


In [12]:
starting = list(sp500['ticker'].values)

In [13]:
sp500_history

Unnamed: 0,date,ticker,name,action
121,"September 8, 2016",CHTR,Charter Communications,added
121,"September 8, 2016",EMC,EMC Corporation,removed
122,"September 6, 2016",MTD,Mettler Toledo,added
122,"September 6, 2016",TYC,Tyco International,removed
217,"September 5, 2012",LYB,LyondellBasell,added
...,...,...,...,...
134,"April 18, 2016",ULTA,Ulta Beauty,added
42,"April 1, 2020",ARNC,Arconic,removed
42,"April 1, 2020",HWM,Howmet Aerospace,added
241,"April 1, 2011",BLK,BlackRock,added


In [14]:
type(sp500_history[['date']].values[-1][0])

str

In [15]:
def format_date(x):
    if(type(x)==pd._libs.tslibs.timestamps.Timestamp):
        return(x.strftime('%Y-%m-%d'))
    else:
        return(datetime.strptime(x, '%B %d, %Y'))
    #return datetime.strptime(x, '%B %d, %Y')

#sp500_history['date'].transform(format_date)

sp500_history_sorted = sp500_history.set_index(sp500_history['date'].transform(format_date)).sort_index(ascending=False)


In [16]:
sp500_history_sorted

Unnamed: 0_level_0,date,ticker,name,action
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-04-04,"April 4, 2022",CPT,Camden,added
2022-04-04,"April 4, 2022",PBCT,People's United Financial,removed
2022-03-02,"March 2, 2022",MOH,Molina Healthcare,added
2022-03-02,"March 2, 2022",INFO,IHS Markit,removed
2022-02-15,"February 15, 2022",NDSN,Nordson,added
...,...,...,...,...
2000-06-07,"June 7, 2000",SBUX,Starbucks,added
1999-12-07,"December 7, 1999",YHOO,Yahoo!,added
1999-12-07,"December 7, 1999",LDW,Laidlaw,removed
1997-06-17,"June 17, 1997",CCI,Countrywide Credit Industries,added


In [17]:
sp500_history_sorted['ticker'][0]

'CPT'

In [18]:
starting = list(sp500['ticker'].values)

In [19]:
sp500_history_sorted

Unnamed: 0_level_0,date,ticker,name,action
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-04-04,"April 4, 2022",CPT,Camden,added
2022-04-04,"April 4, 2022",PBCT,People's United Financial,removed
2022-03-02,"March 2, 2022",MOH,Molina Healthcare,added
2022-03-02,"March 2, 2022",INFO,IHS Markit,removed
2022-02-15,"February 15, 2022",NDSN,Nordson,added
...,...,...,...,...
2000-06-07,"June 7, 2000",SBUX,Starbucks,added
1999-12-07,"December 7, 1999",YHOO,Yahoo!,added
1999-12-07,"December 7, 1999",LDW,Laidlaw,removed
1997-06-17,"June 17, 1997",CCI,Countrywide Credit Industries,added


In [22]:
starting

['MMM',
 'AOS',
 'ABT',
 'ABBV',
 'ABMD',
 'ACN',
 'ATVI',
 'ADM',
 'ADBE',
 'ADP',
 'AAP',
 'AES',
 'AFL',
 'A',
 'AIG',
 'APD',
 'AKAM',
 'ALK',
 'ALB',
 'ARE',
 'ALGN',
 'ALLE',
 'LNT',
 'ALL',
 'GOOGL',
 'GOOG',
 'MO',
 'AMZN',
 'AMCR',
 'AMD',
 'AEE',
 'AAL',
 'AEP',
 'AXP',
 'AMT',
 'AWK',
 'AMP',
 'ABC',
 'AME',
 'AMGN',
 'APH',
 'ADI',
 'ANSS',
 'ANTM',
 'AON',
 'APA',
 'AAPL',
 'AMAT',
 'APTV',
 'ANET',
 'AIZ',
 'T',
 'ATO',
 'ADSK',
 'AZO',
 'AVB',
 'AVY',
 'BKR',
 'BALL',
 'BAC',
 'BBWI',
 'BAX',
 'BDX',
 'WRB',
 'BRK.B',
 'BBY',
 'BIO',
 'BIIB',
 'BLK',
 'BK',
 'BA',
 'BKNG',
 'BWA',
 'BXP',
 'BSX',
 'BMY',
 'AVGO',
 'BR',
 'BF.B',
 'CHRW',
 'CDNS',
 'CPB',
 'COF',
 'CAH',
 'KMX',
 'CCL',
 'CARR',
 'CTLT',
 'CAT',
 'CBOE',
 'CBRE',
 'CDW',
 'CE',
 'CNC',
 'CNP',
 'CERN',
 'CF',
 'SCHW',
 'CHTR',
 'CVX',
 'CMG',
 'CB',
 'CHD',
 'CI',
 'CINF',
 'CTAS',
 'CSCO',
 'C',
 'CFG',
 'CTXS',
 'CLX',
 'CME',
 'CMS',
 'KO',
 'CTSH',
 'CL',
 'CMCSA',
 'CMA',
 'CAG',
 'COP',
 'ED',
 'STZ

In [26]:
np.sort(list(sp500['ticker'].values))

array(['A', 'AAL', 'AAP', 'AAPL', 'ABBV', 'ABC', 'ABMD', 'ABT', 'ACN',
       'ADBE', 'ADI', 'ADM', 'ADP', 'ADSK', 'AEE', 'AEP', 'AES', 'AFL',
       'AIG', 'AIZ', 'AJG', 'AKAM', 'ALB', 'ALGN', 'ALK', 'ALL', 'ALLE',
       'AMAT', 'AMCR', 'AMD', 'AME', 'AMGN', 'AMP', 'AMT', 'AMZN', 'ANET',
       'ANSS', 'ANTM', 'AON', 'AOS', 'APA', 'APD', 'APH', 'APTV', 'ARE',
       'ATO', 'ATVI', 'AVB', 'AVGO', 'AVY', 'AWK', 'AXP', 'AZO', 'BA',
       'BAC', 'BALL', 'BAX', 'BBWI', 'BBY', 'BDX', 'BEN', 'BF.B', 'BIIB',
       'BIO', 'BK', 'BKNG', 'BKR', 'BLK', 'BMY', 'BR', 'BRK.B', 'BRO',
       'BSX', 'BWA', 'BXP', 'C', 'CAG', 'CAH', 'CARR', 'CAT', 'CB',
       'CBOE', 'CBRE', 'CCI', 'CCL', 'CDAY', 'CDNS', 'CDW', 'CE', 'CEG',
       'CERN', 'CF', 'CFG', 'CHD', 'CHRW', 'CHTR', 'CI', 'CINF', 'CL',
       'CLX', 'CMA', 'CMCSA', 'CME', 'CMG', 'CMI', 'CMS', 'CNC', 'CNP',
       'COF', 'COO', 'COP', 'COST', 'CPB', 'CPRT', 'CPT', 'CRL', 'CRM',
       'CSCO', 'CSX', 'CTAS', 'CTLT', 'CTRA', 'CTSH', 'CTVA', 'C

In [24]:
#np.where(np.array(list(sp500['ticker'].values))=='Q')

(array([], dtype=int64),)

(array([], dtype=int64),)

In [23]:
starting = list(sp500['ticker'].values)

#missing Q

for d in (np.unique(sp500_history_sorted.index)[::-1]):
    #print(d)
    d_subset = sp500_history_sorted.loc[[d]].sort_values(by='action',ascending=True)
    print(d_subset)
    for d_ in range(0,len(d_subset)):
        d_data = d_subset.iloc[[d_]]

        ticker = d_data['ticker'][0]
        print(ticker)
        #print(starting)
        if(d_data['action'][0]=='added'):        
            #print("removing",ticker)
            starting.pop(np.where(np.array(starting)==ticker)[0][0])              
            
        else:
            #print(starting)
            #print("adding", ticker)
            starting.extend([ticker])
    print(len(starting))
    #print(starting)
sp500['ticker'].values

                     date ticker                       name   action
date                                                                
2022-04-04  April 4, 2022    CPT                     Camden    added
2022-04-04  April 4, 2022   PBCT  People's United Financial  removed
CPT
PBCT
504
                     date ticker               name   action
date                                                        
2022-03-02  March 2, 2022    MOH  Molina Healthcare    added
2022-03-02  March 2, 2022   INFO         IHS Markit  removed
MOH
INFO
504
                         date ticker     name   action
date                                                  
2022-02-15  February 15, 2022   NDSN  Nordson    added
2022-02-15  February 15, 2022   XLNX   Xilinx  removed
NDSN
XLNX
504
                        date ticker name   action
date                                             
2022-02-03  February 3, 2022    GPS  Gap  removed
GPS
505
                        date ticker                  name acti

IndexError: index 0 is out of bounds for axis 0 with size 0