In [1]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
from pandas_datareader import data as pdr
yf.pdr_override() # <== that's all it takes :-)
import numpy as np
import statistics

sp500_history = pd.read_csv('./sp500_history.csv')
sp500_history['date'] = pd.to_datetime(sp500_history['date']) 
sp500_history = sp500_history[['date', 'value', 'variable']]

start_date = datetime.strptime('2016-1-1', '%Y-%m-%d')
end_date = datetime.strptime('2020-12-31', '%Y-%m-%d')

# greater than the start date and smaller than the end date
mask = (sp500_history['date'] > start_date) & (sp500_history['date'] <= end_date)
sp500_history = sp500_history.loc[mask]

# Sort
sp500_history.sort_values(['date', 'variable'], ascending=[True, True])

  from pandas.util.testing import assert_frame_equal


Unnamed: 0,date,value,variable
261,2016-01-05,WLTW,added_ticker
260,2016-01-05,FOSL,removed_ticker
262,2016-01-19,EXR,added_ticker
263,2016-01-19,CB,removed_ticker
264,2016-02-01,FRT,added_ticker
...,...,...,...
453,2019-12-23,ZBRA,added_ticker
454,2019-12-23,STE,added_ticker
449,2019-12-23,AMG,removed_ticker
451,2019-12-23,MAC,removed_ticker


In [2]:
calendar = pd.read_excel('./Nasdaq_Trading_Calendar.xlsx', sheet_name=None)
rebal_dates = []
for year in calendar:
    sheet = calendar[year]
    mask = sheet['S&P Indexes Rebalance S&P 500, S&P 400, and S&P 600'] == 1
    year_rebal_dates = sheet.loc[mask]
    for date in year_rebal_dates['Date'].values:
        rebal_dates.append(date)


In [3]:
rebal_dates

[numpy.datetime64('2020-03-20T00:00:00.000000000'),
 numpy.datetime64('2020-06-19T00:00:00.000000000'),
 numpy.datetime64('2020-09-18T00:00:00.000000000'),
 numpy.datetime64('2020-12-18T00:00:00.000000000'),
 numpy.datetime64('2019-03-15T00:00:00.000000000'),
 numpy.datetime64('2019-06-21T00:00:00.000000000'),
 numpy.datetime64('2019-09-20T00:00:00.000000000'),
 numpy.datetime64('2019-12-20T00:00:00.000000000'),
 numpy.datetime64('2018-03-16T00:00:00.000000000'),
 numpy.datetime64('2018-06-15T00:00:00.000000000'),
 numpy.datetime64('2018-09-21T00:00:00.000000000'),
 numpy.datetime64('2018-12-21T00:00:00.000000000'),
 numpy.datetime64('2017-03-17T00:00:00.000000000'),
 numpy.datetime64('2017-06-16T00:00:00.000000000'),
 numpy.datetime64('2017-09-15T00:00:00.000000000'),
 numpy.datetime64('2017-12-15T00:00:00.000000000'),
 numpy.datetime64('2016-03-18T00:00:00.000000000'),
 numpy.datetime64('2016-06-17T00:00:00.000000000'),
 numpy.datetime64('2016-09-16T00:00:00.000000000'),
 numpy.datet

In [8]:
columns_to_create = ['original_date', 'total', 'mean', 'std', 'up', 'count', 'max', 'min', 'sharpe', 'uppct']

# output = pd.DataFrame(columns=columns_to_create)
# output.set_index(['date', 'add/delete'])

# output_add = pd.DataFrame(columns=columns_to_create)
# output_delete = pd.DataFrame(columns=columns_to_create)
regular_add = pd.DataFrame(columns=columns_to_create)
regular_delete = pd.DataFrame(columns=columns_to_create)
ad_hoc_add = pd.DataFrame(columns=columns_to_create)
ad_hoc_delete = pd.DataFrame(columns=columns_to_create)

# output_add['date'] = pd.to_datetime(output_add['date'])
# output_add.set_index('date')
# output_delete['date'] = pd.to_datetime(output_delete['date'])
# output_delete.set_index('date')

In [9]:
MAX_DATE_EXTENSION = 10
days_bef_aft = 10 # the range of days before and after effective date to pull from Yahoo finance
last_date = sp500_history.iloc[0, sp500_history.columns.get_loc('date')]
last_type = sp500_history.iloc[0, sp500_history.columns.get_loc('variable')]
last_changes = []
is_begin = True
is_regular = False

for idx, row in sp500_history.iterrows():
    
    # End date is exclusive, so need to increase by 1
    prices = pdr.get_data_yahoo(row['value'], start=row['date'] - timedelta(days=days_bef_aft), end=row['date'] + timedelta(days=days_bef_aft + 1))
    if len(prices) == 0:
        continue
        
    # Find the adjusted effective date 
    # which is one trading day before the effective (market opening) date
    adj_date = row['date'] - timedelta(days=1)
    date_extension = 0
    while not adj_date in prices.index and date_extension < MAX_DATE_EXTENSION:
        adj_date -= timedelta(days=1)
        date_extension += 1
    if date_extension >= MAX_DATE_EXTENSION:
        continue
    adj_date_close = prices.loc[adj_date]['Close']
    
    if adj_date in rebal_dates:
        is_regular = True
    else:
        is_regular = False
        
    # Find the entry date, which is one trading day before the adjusted effective date
    prev_date = adj_date - timedelta(days=1)
    date_extension = 0
    while not prev_date in prices.index and date_extension < MAX_DATE_EXTENSION:
        prev_date -= timedelta(days=1)
        date_extension += 1
    if date_extension >= MAX_DATE_EXTENSION:
        continue
    prev_close = prices.loc[prev_date]['Close']

    # Find the next opening price
#     next_date = row['date']
#     date_extension = 0
#     while not next_date in prices.index and date_extension < MAX_DATE_EXTENSION:
#         next_date += timedelta(days=1)
#         date_extension += 1
#     if date_extension >= MAX_DATE_EXTENSION:
#         continue    
#     next_open = prices.loc[next_date]['Open']

    price_change = adj_date_close - prev_close

    # Calculate mean and all
    if not is_begin and (last_date != adj_date or last_type != row['variable']):
        print(last_changes)
        count = len(last_changes)
        up = len(list(filter(lambda x: (x >= 0), last_changes))) 
        
        data = [up, count]

        if last_type == 'added_ticker':
            if is_regular:
                regular_add.loc[last_date, 'up':'count'] = data
            else:
                ad_hoc_add.loc[last_date, 'up':'count'] = data

        else:
            if is_regular:
                regular_delete.loc[last_date, 'up':'count'] = data
            else:
                ad_hoc_delete.loc[last_date, 'up':'count'] = data

        total = 0
        last_changes = []
    
    # Still in the same date
    else:
        if is_begin:
            total = 0
            is_begin = False
        else:
            total = last_row['total']
    
    total += price_change
    
    data = {'total': total, 'original_date': row['date']}
            
    if row['variable'] == 'added_ticker':
        if is_regular:
            if adj_date not in regular_add.index:
                regular_add = regular_add.append(pd.DataFrame(data, index = [pd.to_datetime(adj_date)]), ignore_index=False)
            else:
                regular_add.loc[adj_date, 'total'] = total
            
        else:
            if adj_date not in ad_hoc_add.index:
                ad_hoc_add = ad_hoc_add.append(pd.DataFrame(data, index = [pd.to_datetime(adj_date)]), ignore_index=False)
            else:
                ad_hoc_add.loc[adj_date, 'total'] = total
                
    else:
        if is_regular:
            if adj_date not in regular_delete.index:
                regular_delete = regular_delete.append(pd.DataFrame(data, index = [pd.to_datetime(adj_date)]), ignore_index=False)
            else:
                regular_delete.loc[adj_date, 'total'] = total
            
        else:
            if adj_date not in ad_hoc_delete.index:
                ad_hoc_delete = ad_hoc_delete.append(pd.DataFrame(data, index = [pd.to_datetime(adj_date)]), ignore_index=False)
            else:
                ad_hoc_delete.loc[adj_date, 'total'] = total
                
    last_date = adj_date
    last_type = row['variable']
    last_changes.append(price_change)
    
    if last_type == 'added_ticker':
        if is_regular:
            last_row = regular_add.loc[last_date, :]
        else:
            last_row = ad_hoc_add.loc[last_date, :]
    
    else:
        if is_regular:
            last_row = regular_delete.loc[last_date, :]
        else:
            last_row = ad_hoc_delete.loc[last_date, :]
        
regular_add.index.names = ['date']
ad_hoc_add.index.names = ['date']
regular_delete.index.names = ['date']
ad_hoc_delete.index.names = ['date']



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[-2.270000457763672]
[*********************100%***********************]  1 of 1 completed
[-3.6821136474609375]
[*********************100%***********************]  1 of 1 completed
[-1.19000244140625]
[*********************100%***********************]  1 of 1 completed
[0.0]
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- BRCM: No data found for this date range, symbol may be delisted
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- PCP: Data doesn't exist for startDate = 1453392000, endDate = 1455206400
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[3.0, -0.48999977111816406]
[*********************100%***********************]  1 of 1 completed
[*********************100%******

[*********************100%***********************]  1 of 1 completed
[0.1564006805419922]
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- MJN: No data found for this date range, symbol may be delisted
[*********************100%***********************]  1 of 1 completed
[0.7600021362304688]
[*********************100%***********************]  1 of 1 completed
[-0.589996337890625]
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- YHOO: No data found for this date range, symbol may be delisted
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[3.5, 1.239990234375, 0.6699981689453125, -0.43999481201171875]
[*********************100%***********************]  1 of 1 completed

1 Failed do

[*********************100%***********************]  1 of 1 completed
[1.8699989318847656]
[*********************100%***********************]  1 of 1 completed
[1.9500007629394531]
[*********************100%***********************]  1 of 1 completed
[-1.8299999237060547]
[*********************100%***********************]  1 of 1 completed
[-1.0400009155273438]
[*********************100%***********************]  1 of 1 completed
[-0.4099998474121094]
[*********************100%***********************]  1 of 1 completed
[-0.19999980926513672]
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- LLL: No data found, symbol may be delisted
[*********************100%***********************]  1 of 1 completed
[5.08001708984375]
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- RHT: No data found, symbol may be delisted
[*********************100%***********************]  1 of 1 completed
[1.1899948120117188]
[********

In [22]:
'''
Calculate population statistics
for regular add
'''
regular_add_sum = regular_add.sum(axis=0)
regular_add_sum['mean'] = regular_add_sum['total'] / regular_add_sum['count']
if regular_add_sum['count'] > 1:
    regular_add_sum['std'] = statistics.stdev(regular_add['total'])
else:
    regular_add_sum['std'] = 0
regular_add_sum['max'] = max(regular_add['total'])
regular_add_sum['min'] = max(regular_add['total'])
if regular_add_sum['std'] == 0:
    regular_add_sum['sharpe'] = regular_add_sum['mean'] / 0.00001 * np.sqrt(252)
else:
    regular_add_sum['sharpe'] = regular_add_sum['mean'] / regular_add_sum['std'] * np.sqrt(252)
regular_add_sum['uppct'] = regular_add_sum['up'] / regular_add_sum['count']

# data = {'mean': [mean]}
# regular_add = regular_add.append(pd.DataFrame(data), ignore_index=True)


In [23]:
regular_add_sum

total      5.939976
mean       0.349410
std             NaN
up        11.000000
count     17.000000
max             NaN
min             NaN
sharpe          NaN
uppct      0.647059
dtype: float64

In [25]:
statistics.stdev(regular_add['total'])

nan

In [26]:
regular_add['total']

0          NaN
1     0.229992
2          NaN
3     4.969994
4          NaN
5     0.250000
6          NaN
7     3.409996
8          NaN
9    -1.169998
10   -1.820000
11         NaN
12   -0.740005
13    0.809998
14         NaN
Name: total, dtype: float64

In [29]:
writer = pd.ExcelWriter('sp500_regular_add.xlsx', engine='xlsxwriter')
regular_add.to_excel(writer, sheet_name='Data')
regular_add_sum.to_excel(writer, sheet_name='Population Statistics')
writer.save()
# regular_add.to_csv('sp500_regular_add.csv')
# ad_hoc_add.to_csv('sp500_ad_hoc_add.csv')
# regular_delete.to_csv('sp500_regular_delete.csv')
# ad_hoc_delete.to_csv('sp500_ad_hoc_delete.csv')