In [7]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
from pandas_datareader import data as pdr
yf.pdr_override() # <== that's all it takes :-)
import numpy as np
import statistics
from pprint import pprint
from math import isnan

sp500_history = pd.read_csv('./sp500_history.csv')
sp500_history['date'] = pd.to_datetime(sp500_history['date']) 
sp500_history = sp500_history[['date', 'value', 'variable']]

start_date = datetime.strptime('2016-1-1', '%Y-%m-%d')
end_date = datetime.strptime('2020-12-31', '%Y-%m-%d')

# greater than the start date and smaller than the end date
mask = (sp500_history['date'] > start_date) & (sp500_history['date'] <= end_date)
sp500_history = sp500_history.loc[mask]

# Sort
sp500_history.sort_values(['date', 'variable'], ascending=[True, True])

Unnamed: 0,date,value,variable
261,2016-01-05,WLTW,added_ticker
260,2016-01-05,FOSL,removed_ticker
262,2016-01-19,EXR,added_ticker
263,2016-01-19,CB,removed_ticker
264,2016-02-01,FRT,added_ticker
...,...,...,...
453,2019-12-23,ZBRA,added_ticker
454,2019-12-23,STE,added_ticker
449,2019-12-23,AMG,removed_ticker
451,2019-12-23,MAC,removed_ticker


In [21]:
sp500_history

Unnamed: 0,date,value,variable
260,2016-01-05,FOSL,removed_ticker
261,2016-01-05,WLTW,added_ticker
262,2016-01-19,EXR,added_ticker
263,2016-01-19,CB,removed_ticker
264,2016-02-01,FRT,added_ticker
...,...,...,...
450,2019-12-23,LYV,added_ticker
451,2019-12-23,MAC,removed_ticker
452,2019-12-23,TRIP,removed_ticker
453,2019-12-23,ZBRA,added_ticker


In [8]:
calendar = pd.read_excel('./Nasdaq_Trading_Calendar.xlsx', sheet_name=None)
rebal_dates = []
for year in calendar:
    sheet = calendar[year]
    mask = sheet['S&P Indexes Rebalance S&P 500, S&P 400, and S&P 600'] == 1
    year_rebal_dates = sheet.loc[mask]
    for date in year_rebal_dates['Date'].values:
        rebal_dates.append(date)
pprint(rebal_dates)

[numpy.datetime64('2020-03-20T00:00:00.000000000'),
 numpy.datetime64('2020-06-19T00:00:00.000000000'),
 numpy.datetime64('2020-09-18T00:00:00.000000000'),
 numpy.datetime64('2020-12-18T00:00:00.000000000'),
 numpy.datetime64('2019-03-15T00:00:00.000000000'),
 numpy.datetime64('2019-06-21T00:00:00.000000000'),
 numpy.datetime64('2019-09-20T00:00:00.000000000'),
 numpy.datetime64('2019-12-20T00:00:00.000000000'),
 numpy.datetime64('2018-03-16T00:00:00.000000000'),
 numpy.datetime64('2018-06-15T00:00:00.000000000'),
 numpy.datetime64('2018-09-21T00:00:00.000000000'),
 numpy.datetime64('2018-12-21T00:00:00.000000000'),
 numpy.datetime64('2017-03-17T00:00:00.000000000'),
 numpy.datetime64('2017-06-16T00:00:00.000000000'),
 numpy.datetime64('2017-09-15T00:00:00.000000000'),
 numpy.datetime64('2017-12-15T00:00:00.000000000'),
 numpy.datetime64('2016-03-18T00:00:00.000000000'),
 numpy.datetime64('2016-06-17T00:00:00.000000000'),
 numpy.datetime64('2016-09-16T00:00:00.000000000'),
 numpy.datet

In [29]:
# rebal_type is 'regular' or 'ad_hoc'
# add_delete is 'add' or 'delete'
# entry_date and exit_date are in terms of number of days before (-) or after (+) effective date
# entry_time and exit_time are 'Open' or 'Close'
strategy_attributes = ['rebal_type', 'add_delete', 'entry_date', 'entry_time', 'exit_date', 'exit_time']

output_columns = ['eff_date', 'original_date']
output_columns.extend(strategy_attributes)
output_columns.extend(['total', 'up', 'count'])

df_output = pd.DataFrame(columns=output_columns)

In [10]:
def update_output_table(df_output, eff_date, original_date, rebal_type, add_delete, 
                        entry_date, entry_time, exit_date, exit_time, price_change):
    if price_change > 0:
        up = 1
    else:
        up = 0

    conditions = ((df_output['eff_date']==eff_date) & 
                 (df_output['rebal_type']==rebal_type) & 
                 (df_output['add_delete']==add_delete) &
                (df_output['entry_date']==entry_date) & 
                (df_output['entry_time']==entry_time) & 
                (df_output['exit_date']==exit_date) & 
                (df_output['exit_time']==exit_time))

    # new row
    if not conditions.any():
        data = [{'eff_date': eff_date, 'rebal_type': rebal_type, 'add_delete': add_delete, 
                 'entry_date': entry_date, 'entry_time': entry_time, 
                 'exit_date': exit_date, 'exit_time': exit_time, 
                 'total': price_change, 'up': up, 'count': 1, 'original_date': original_date}]
        df_output = df_output.append(pd.DataFrame(data), ignore_index=True)

    # updating existing row
    else:
        df_output.loc[conditions, 'total'] += price_change
        df_output.loc[conditions, 'up'] += up
        df_output.loc[conditions, 'count'] += 1
    
    return df_output

In [11]:
def find_price_at_index(prices, eff_date_index, index_offset, time_of_day):
    '''
    Returns -1 if index not valid
    '''
    adj_index = eff_date_index + index_offset
    if adj_index < 0 or adj_index >= len(prices):
#         print("adjusted index out of range")
#         print(adj_index)
        return -1
    price = prices.iloc[adj_index][time_of_day]
    if isnan(price):
#         print("price at adjusted index invalid")
#         print(price)
        return -1
    return price

In [28]:
def find_price_change(prices, eff_date_index, entry_date, entry_time, exit_date, exit_time):
    '''
    Returns percentage change in price between entry and exit
    Or NaN if anything invalid
    '''
    entry_price = find_price_at_index(prices, eff_date_index, entry_date, entry_time)
    if entry_price < 0 or isnan(entry_price):
#         print("entry price invalid")
#         print(entry_price)
        return float('NaN')
    exit_price = find_price_at_index(prices, eff_date_index, exit_date, exit_time)
    if exit_price < 0 or isnan(exit_price):
#         print ("exit price invalid")
#         print(exit_price)
        return float('NaN')
    return exit_price / entry_price - 1 

In [30]:
days_bef_aft = 20 # the range of days before and after effective date to pull from Yahoo finance

exit_time = 'Close'

times_of_day = ['Open', 'Close']

for idx, row in sp500_history.iterrows():
    
    # End date is exclusive, so need to increase by 1
    prices = pdr.get_data_yahoo(row['value'], start=row['date'] - timedelta(days=days_bef_aft), end=row['date'] + timedelta(days=days_bef_aft + 1))
    if len(prices) == 0:
        continue
        
    # Find the adjusted effective date 
    # which is one trading day before the wikipedia effective (i.e. market opening) date
    try:
        eff_date_index = prices.index.get_loc(row['date'] - timedelta(days=1),method='pad')
    except:
        print("Error: No date before original date was found among yahoo prices")
        continue
    if eff_date_index < 0:
#         print('effective date not in range')
#         print(prices)
#         print(row['date'])
        continue
    eff_date = prices.iloc[eff_date_index].name
#     print("sanity check:")
#     print("eff date:")
#     print(eff_date)
#     print("original date:")
#     print(row['date'])
        
    if eff_date in rebal_dates:
        rebal_type = 'regular'
#         print("regular!")
#         print(prices)
#         print(eff_date)
    else:
        rebal_type = 'ad_hoc'

    if row['variable'] == 'added_ticker':
        add_delete = 'add'
#         print("add!")
    else:
#         print("delete!")
        add_delete = 'delete'
        
    '''
    Loop of different entry and exit dates
    '''
    entry_date = max(-4, -eff_date_index)
    exit_date = 0
    while entry_date < 0:
        for time_of_day in times_of_day:
            entry_time = time_of_day
            
            price_change = find_price_change(prices, eff_date_index, entry_date, entry_time, exit_date, exit_time)
            if isnan(price_change):
#                 print("price change is invalid")
#                 print(price_change)
                continue
            
#             if price_change == 0:
#                 print('WHY EQUAL')
#                 print(prices)
#                 print(entry_price)
#                 print(exit_price)
            
            df_output = update_output_table(df_output, eff_date, row['date'], rebal_type, add_delete, 
                        entry_date, entry_time, exit_date, exit_time, price_change)

        entry_date += 1
    
    entry_date = 0
    exit_date = min(4, len(prices) - 1)
    while exit_date > 0:
        for time_of_day in times_of_day:
            entry_time = time_of_day
            
            price_change = find_price_change(prices, eff_date_index, entry_date, entry_time, exit_date, exit_time)
            if isnan(price_change):
#                 print("price change is invalid")
#                 print(price_change)
                continue
            
            df_output = update_output_table(df_output, eff_date, row['date'], rebal_type, add_delete, 
                        entry_date, entry_time, exit_date, exit_time, price_change)

        exit_date -= 1

[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2016-01-04 00:00:00
original date:
2016-01-05 00:00:00
delete!
[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2016-01-04 00:00:00
original date:
2016-01-05 00:00:00
add!
[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2016-01-15 00:00:00
original date:
2016-01-19 00:00:00
add!
[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2016-01-15 00:00:00
original date:
2016-01-19 00:00:00
delete!
[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2016-01-29 00:00:00
original date:
2016-02-01 00:00:00
add!
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- BRCM: No data found for this date range, symbol may be delisted
[*********************100%***********************]  1 of 1 completed

1 Fa

[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2017-03-17 00:00:00
original date:
2017-03-20 00:00:00
regular!
                 Open       High        Low      Close  Adj Close    Volume
Date                                                                       
2017-02-27  78.500000  79.430000  78.040001  79.410004  76.171280    400600
2017-02-28  79.150002  79.199997  77.589996  78.559998  75.355934    679700
2017-03-01  79.809998  81.919998  79.809998  81.040001  77.734795    862200
2017-03-02  81.129997  81.129997  79.400002  79.510002  76.267204    522800
2017-03-03  79.470001  80.089996  78.940002  79.790001  76.535789    373500
2017-03-06  79.099998  79.660004  78.690002  79.169998  75.941063    457000
2017-03-07  79.129997  79.320000  78.440002  78.690002  75.480659    481800
2017-03-08  79.839996  80.099998  78.639999  78.669998  75.461449    426900
2017-03-09  79.059998  79.620003  78.760002  79.220001  75.989029    415400
2017-03

[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2017-06-16 00:00:00
original date:
2017-06-19 00:00:00
regular!
                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2017-05-30  125.820000  126.919998  125.730003  126.779999  126.779999   
2017-05-31  127.000000  127.730003  125.970001  126.330002  126.330002   
2017-06-01  126.919998  127.889999  125.440002  127.860001  127.860001   
2017-06-02  128.649994  129.529999  128.460007  129.490005  129.490005   
2017-06-05  129.199997  129.850006  128.850006  129.539993  129.539993   
2017-06-06  128.889999  129.619995  128.029999  128.399994  128.399994   
2017-06-07  128.929993  129.259995  127.739998  128.339996  128.339996   
2017-06-08  128.529999  128.750000  126.000000  126.959999  126.959999   
2017-06-09  127.300003  127.660004  122.889999  124.519997  124.519997   
2017-06-12  123.639999  124.1

[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2018-03-16 00:00:00
original date:
2018-03-19 00:00:00
regular!
                 Open       High        Low      Close  Adj Close    Volume
Date                                                                       
2018-02-26  32.180000  32.360001  31.730000  32.349998  29.345791   1336900
2018-02-27  32.259998  32.590000  31.820000  31.840000  28.883154   1203100
2018-02-28  31.990000  32.299999  31.530001  31.580000  28.647299   1787900
2018-03-01  25.270000  25.830000  22.570000  24.110001  21.871006  15341400
2018-03-02  23.450001  25.170000  23.320000  25.010000  22.687426   8128000
2018-03-05  24.969999  25.500000  24.750000  25.070000  22.741856   3512100
2018-03-06  25.090000  25.110001  24.299999  24.650000  22.360857   2502100
2018-03-07  24.620001  25.400000  24.500000  25.139999  22.805353   2325300
2018-03-08  24.990000  25.150000  24.590000  24.920000  22.605785   1635400
2018-03

[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2018-03-16 00:00:00
original date:
2018-03-19 00:00:00
regular!
                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2018-02-26  249.259995  251.750000  246.589996  251.710007  251.710007   
2018-02-27  250.399994  254.330002  247.479996  247.669998  247.669998   
2018-02-28  248.860001  257.299988  248.860001  248.979996  248.979996   
2018-03-01  249.710007  254.869995  247.830002  250.179993  250.179993   
2018-03-02  247.490005  256.940002  244.850006  255.860001  255.860001   
2018-03-05  253.449997  261.859985  248.970001  260.119995  260.119995   
2018-03-06  261.649994  266.230011  257.149994  266.109985  266.109985   
2018-03-07  261.690002  267.070007  260.079987  266.429993  266.429993   
2018-03-08  267.769989  270.000000  261.260010  265.160004  265.160004   
2018-03-09  268.630005  271.7

[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2018-06-06 00:00:00
original date:
2018-06-07 00:00:00
add!
[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2018-06-15 00:00:00
original date:
2018-06-18 00:00:00
regular!
                 Open       High        Low      Close  Adj Close    Volume
Date                                                                       
2018-05-29  14.980000  15.370000  14.740000  15.100000  14.829293   6019400
2018-05-30  15.230000  15.710000  15.060000  15.690000  15.408715   6386600
2018-05-31  15.580000  16.090000  15.470000  15.840000  15.556027  20335300
2018-06-01  15.840000  16.490000  15.790000  16.379999  16.086344  10602800
2018-06-04  16.379999  16.580000  15.550000  15.620000  15.339971   7505300
2018-06-05  15.600000  15.910000  15.380000  15.830000  15.546205   4524000
2018-06-06  15.970000  16.059999  15.620000  15.780000  15.497102   4710500
2018-0

[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2018-09-28 00:00:00
original date:
2018-10-01 00:00:00
add!
[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2018-10-10 00:00:00
original date:
2018-10-11 00:00:00
add!
[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2018-10-10 00:00:00
original date:
2018-10-11 00:00:00
delete!
[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2018-11-05 00:00:00
original date:
2018-11-06 00:00:00
add!
[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2018-11-05 00:00:00
original date:
2018-11-06 00:00:00
delete!
[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2018-11-12 00:00:00
original date:
2018-11-13 00:00:00
delete!
[*********************100%***********************]  1 of 1 completed
sani

[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2019-11-20 00:00:00
original date:
2019-11-21 00:00:00
delete!
adjusted index out of range
18
exit price invalid
-1
price change is invalid
nan
adjusted index out of range
18
exit price invalid
-1
price change is invalid
nan
[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2019-12-04 00:00:00
original date:
2019-12-05 00:00:00
delete!
adjusted index out of range
17
exit price invalid
-1
price change is invalid
nan
adjusted index out of range
17
exit price invalid
-1
price change is invalid
nan
adjusted index out of range
16
exit price invalid
-1
price change is invalid
nan
adjusted index out of range
16
exit price invalid
-1
price change is invalid
nan
adjusted index out of range
15
exit price invalid
-1
price change is invalid
nan
adjusted index out of range
15
exit price invalid
-1
price change is invalid
nan
adjusted index out of range
14
exit pric

[*********************100%***********************]  1 of 1 completed
sanity check:
eff date:
2019-12-20 00:00:00
original date:
2019-12-23 00:00:00
regular!
                 Open       High        Low      Close  Adj Close    Volume
Date                                                                       
2019-12-02  28.559999  28.680000  28.129999  28.480000  28.480000   2566600
2019-12-03  28.040001  28.250000  27.879999  27.910000  27.910000   3548900
2019-12-04  27.959999  28.959999  27.959999  28.549999  28.549999   2493500
2019-12-05  28.620001  29.190001  28.530001  29.030001  29.030001   2240100
2019-12-06  29.330000  29.600000  28.959999  29.040001  29.040001   2228300
2019-12-09  28.959999  29.049999  28.650000  28.840000  28.840000   4139200
2019-12-10  28.850000  29.180000  28.730000  28.969999  28.969999   2138000
2019-12-11  29.000000  29.170000  28.799999  28.940001  28.940001   1863000
2019-12-12  29.030001  29.620001  28.840000  29.450001  29.450001   2210600
2019-12

In [31]:
def get_uniques(input_list):
    # insert the list to the set 
    unique_set = set(input_list) 
    # convert the set to the list 
    unique_list = (list(unique_set)) 
    return unique_list

In [32]:
'''
Calculate population statistics
'''

stats_df_columns = []
stats_df_columns.extend(strategy_attributes)
stats_df_columns.extend(['total', 'up', 'count', 'mean', 'std', 'max', 'min', 'sharpe', 'uppct'])

stats_df = pd.DataFrame(columns=stats_df_columns)

# get the unique values for each strategy attribute
strategy_attribute_unique_values = map(lambda strategy_attribute: get_uniques(df_output[strategy_attribute]), strategy_attributes)   

import itertools
# find strategies consisting of different combinations of unique values for each strategy attribute
strategies = list(itertools.product(*strategy_attribute_unique_values)) 

for strategy in strategies:
    # find the relevant rows for each strategy 
    relevant_rows = df_output.loc[(df_output[strategy_attributes]==strategy).all(axis=1), :]
    if len(relevant_rows) == 0:
        continue
    
    relevant_rows_sum = relevant_rows.loc[:, 'total':].sum(axis=0)
    mean = relevant_rows_sum['total'] / relevant_rows_sum['count']
    if relevant_rows_sum['count'] > 1:
        std = statistics.stdev(relevant_rows['total'])
    else:
        std = 0
    max_val = max(relevant_rows['total'])
    min_val = min(relevant_rows['total'])
    if std == 0:
        sharpe = mean / 0.00001 * np.sqrt(252)
    else:
        sharpe = mean / std * np.sqrt(252)
    uppct = relevant_rows_sum['up'] / relevant_rows_sum['count']
    strategy_dict = dict(zip(strategy_attributes, strategy)) 
    data = {'total': relevant_rows_sum['total'], 'up': relevant_rows_sum['up'], 'count': relevant_rows_sum['count'], 
            'mean': mean, 'std': std, 'max': max_val, 'min': min_val, 'sharpe': sharpe, 'uppct': uppct} 
    data.update(strategy_dict)
    stats_df = stats_df.append(pd.DataFrame([data]), ignore_index=True)


In [33]:
'''
Writing to excel
'''
writer = pd.ExcelWriter('sp500_analysis.xlsx', engine='xlsxwriter')
df_output.to_excel(writer, sheet_name='data')
stats_df.to_excel(writer, sheet_name='strategy_stats')
writer.save()