In [1]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import statistics
from math import isnan

In [14]:
# Load the S&P 500 constituent-change history read from the wiki CSV export
# and normalise it to the columns: effective_date, ticker, wiki_name, type.
sp500_history1 = pd.read_csv('./sp500_history_wiki.csv')
sp500_history1['date'] = pd.to_datetime(sp500_history1['date'])
sp500_history1 = sp500_history1[['date', 'value', 'name', 'variable']]

start_date = datetime.strptime('2016-1-1', '%Y-%m-%d')
end_date = datetime.strptime('2020-12-31', '%Y-%m-%d')

# Keep rows dated strictly after start_date and on or before end_date.
mask = (sp500_history1['date'] > start_date) & (sp500_history1['date'] <= end_date)
sp500_history1 = sp500_history1.loc[mask]

# Sort by date, then by change type. sort_values returns a new frame, so the
# result has to be assigned back for the ordering to take effect.
sp500_history1 = sp500_history1.sort_values(['date', 'variable'], ascending=[True, True])

# Rename columns by name (not by position) so the labels stay correct even if
# the column selection above is ever reordered.
sp500_history1 = sp500_history1.rename(
    columns={
        'date': 'effective_date',
        'value': 'ticker',
        'name': 'wiki_name',
        'variable': 'type',
    }
)

sp500_history1 = sp500_history1.drop_duplicates()
sp500_history1 = sp500_history1.reset_index(drop=True)

# Map the raw change labels onto the ADDED / DELETED labels used by the
# announcement data, so both frames can be joined on 'type'.
sp500_history1 = sp500_history1.replace(to_replace = 'added_ticker', value = 'ADDED')
sp500_history1 = sp500_history1.replace(to_replace = 'removed_ticker', value = 'DELETED')
sp500_history1

Unnamed: 0,effective_date,ticker,wiki_name,type
0,2016-01-05,WLTW,Willis Towers Watson,ADDED
1,2016-01-05,FOSL,Fossil Group,DELETED
2,2016-01-19,EXR,Extra Space Storage,ADDED
3,2016-01-19,CB,Chubb Corp,DELETED
4,2016-02-01,CFG,Citizens Financial Group,ADDED
...,...,...,...,...
200,2020-04-03,OTIS,Otis Worldwide,ADDED
201,2020-04-06,RTN,Raytheon Company,DELETED
202,2020-04-06,M,"Macy's, Inc.",DELETED
203,2020-05-12,DXCM,Dexcom,ADDED


In [21]:
# Load the rebalance announcements workbook, parse its three date columns and
# keep only the columns needed for the comparison with the wiki history.
sp500_history2 = pd.read_excel('./sp500_rebalance_announcements.xlsx')
sp500_history2 = sp500_history2.assign(
    implementation_date=pd.to_datetime(sp500_history2['implementation_date']),
    effective_date=pd.to_datetime(sp500_history2['effective_date']),
    announcement_date=pd.to_datetime(sp500_history2['announcement_date']),
)[['announcement_date', 'implementation_date', 'effective_date', 'ticker', 'name', 'type']]

start_date = datetime(2015, 12, 20)
end_date = datetime(2020, 12, 31)

# Announcement window: strictly after start_date, up to and including end_date.
mask = (
    sp500_history2['announcement_date'].gt(start_date)
    & sp500_history2['announcement_date'].le(end_date)
)

# Filter, order by announcement date then change type, drop exact duplicate
# rows and renumber the index from zero.
sp500_history2 = (
    sp500_history2[mask]
    .sort_values(['announcement_date', 'type'])
    .drop_duplicates()
    .reset_index(drop=True)
)
sp500_history2

Unnamed: 0,announcement_date,implementation_date,effective_date,ticker,name,type
0,2015-12-22,2015-12-28,NaT,CHD,Church & Dwight,ADDED
1,2015-12-22,2015-12-28,NaT,ALTR,Altera,DELETED
2,2015-12-28,2016-01-04,NaT,TW,Willis Towers Watson,ADDED
3,2015-12-28,2016-01-04,NaT,FOSL,Fossil Group,DELETED
4,2016-01-13,2016-01-15,NaT,EXR,Extra Space Storage,ADDED
...,...,...,...,...,...,...
205,2020-05-06,NaT,2020-05-12,DPZ,Domino’s Pizza,ADDED
206,2020-05-06,NaT,2020-05-12,AGN,Allergan,DELETED
207,2020-05-06,NaT,2020-05-12,CPRI,Capri Holdings,DELETED
208,2020-05-18,NaT,2020-05-22,WST,West Pharmaceutical Services,ADDED


In [22]:
# Full outer join of the wiki history with the announcement data on
# (ticker, type). The shared non-key column 'effective_date' is suffixed
# '_x' for the wiki side and '_y' for the announcement side, and the
# '_merge' indicator column records which source(s) each row came from.
outer_join = sp500_history1.merge(
    sp500_history2,
    on=['ticker', 'type'],
    how='outer',
    suffixes=('_x', '_y'),
    indicator=True,
)

In [23]:
# Order the joined rows by ticker, then change type. Reassign the sorted frame
# rather than sorting in place, so the frame built by the merge cell is not
# mutated behind the reader's back.
outer_join = outer_join.sort_values(['ticker', 'type'])

In [25]:
# Write the joined comparison table to an Excel workbook for manual review.
# The DataFrame index is written too, since to_excel's index option is left at its default.
outer_join.to_excel(excel_writer='differences_between_wiki_and_announcement_data.xlsx')

In [26]:
# Find rows present in both sources where the announcement data has an
# effective date and it differs from the effective date in the wiki data.
# .notna() replaces the unary minus on a boolean Series, which only worked as
# a logical NOT through a pandas special case and obscures the intent.
mask = (
    (outer_join['_merge'] == 'both')
    & outer_join['effective_date_y'].notna()
    & (outer_join['effective_date_x'] != outer_join['effective_date_y'])
)
outer_join_masked = outer_join.loc[mask]

In [27]:
# Display the rows selected by the mask above: entries matched in both sources
# whose wiki and announcement effective dates disagree.
outer_join_masked

Unnamed: 0,effective_date_x,ticker,wiki_name,type,announcement_date,implementation_date,effective_date_y,name,_merge
100,2017-08-08,BHF,Brighthouse Financial Inc,ADDED,2017-07-31,NaT,2017-08-07,Brighthouse Financial,both
163,2019-04-02,BHF,Brighthouse Financial,DELETED,2019-03-26,NaT,2019-04-03,Brighthouse Financial,both
166,2019-06-03,FLR,Fluor Corporation,DELETED,2019-05-30,NaT,2019-06-04,Fluor,both
137,2018-09-14,WCG,WellCare,ADDED,2018-09-11,NaT,2018-09-17,WellCare Health Plans,both
138,2018-09-14,XL,XL Group,DELETED,2018-09-11,NaT,2018-09-17,XL Group,both
