In [1]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
from pandas_datareader import data as pdr
yf.pdr_override() # <== that's all it takes :-)
import numpy as np
import statistics
from pprint import pprint
from math import isnan

# Load the S&P 500 constituent-change history, parse the dates and keep only
# the three columns used by the rest of the notebook.
sp500_history = (
    pd.read_csv('./sp500_history.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    [['date', 'value', 'variable']]
)

# Study window: strictly after the start date, up to and including the end date.
start_date = datetime(2016, 1, 1)
end_date = datetime(2020, 12, 31)

mask = sp500_history['date'].gt(start_date) & sp500_history['date'].le(end_date)
sp500_history = sp500_history.loc[mask]

# Display the filtered history ordered by date, then by change type.
# The sorted copy is only shown as the cell output; sp500_history itself is not reordered.
sp500_history.sort_values(by=['date', 'variable'], ascending=True)

  from pandas.util.testing import assert_frame_equal


Unnamed: 0,date,value,variable
261,2016-01-05,WLTW,added_ticker
260,2016-01-05,FOSL,removed_ticker
262,2016-01-19,EXR,added_ticker
263,2016-01-19,CB,removed_ticker
264,2016-02-01,FRT,added_ticker
...,...,...,...
453,2019-12-23,ZBRA,added_ticker
454,2019-12-23,STE,added_ticker
449,2019-12-23,AMG,removed_ticker
451,2019-12-23,MAC,removed_ticker


In [2]:
# sheet_name=None loads every sheet of the workbook into a dict keyed by sheet name.
calendar = pd.read_excel('./Nasdaq_Trading_Calendar.xlsx', sheet_name=None)

# Collect, across all sheets, the dates whose rebalance column is flagged with 1.
rebal_dates = []
for year, sheet in calendar.items():
    is_rebal_day = sheet['S&P Indexes Rebalance S&P 500, S&P 400, and S&P 600'] == 1
    rebal_dates.extend(sheet.loc[is_rebal_day, 'Date'].values)
pprint(rebal_dates)

[numpy.datetime64('2020-03-20T00:00:00.000000000'),
 numpy.datetime64('2020-06-19T00:00:00.000000000'),
 numpy.datetime64('2020-09-18T00:00:00.000000000'),
 numpy.datetime64('2020-12-18T00:00:00.000000000'),
 numpy.datetime64('2019-03-15T00:00:00.000000000'),
 numpy.datetime64('2019-06-21T00:00:00.000000000'),
 numpy.datetime64('2019-09-20T00:00:00.000000000'),
 numpy.datetime64('2019-12-20T00:00:00.000000000'),
 numpy.datetime64('2018-03-16T00:00:00.000000000'),
 numpy.datetime64('2018-06-15T00:00:00.000000000'),
 numpy.datetime64('2018-09-21T00:00:00.000000000'),
 numpy.datetime64('2018-12-21T00:00:00.000000000'),
 numpy.datetime64('2017-03-17T00:00:00.000000000'),
 numpy.datetime64('2017-06-16T00:00:00.000000000'),
 numpy.datetime64('2017-09-15T00:00:00.000000000'),
 numpy.datetime64('2017-12-15T00:00:00.000000000'),
 numpy.datetime64('2016-03-18T00:00:00.000000000'),
 numpy.datetime64('2016-06-17T00:00:00.000000000'),
 numpy.datetime64('2016-09-16T00:00:00.000000000'),
 numpy.datet

In [9]:
# Attributes that together identify one strategy:
#   rebal_type              - 'regular' or 'ad_hoc'
#   add_delete              - 'add' or 'delete'
#   entry_date / exit_date  - number of days before (-) or after (+) the effective date
#   entry_time / exit_time  - 'Open' or 'Close'
strategy_attributes = [
    'rebal_type',
    'add_delete',
    'entry_date',
    'entry_time',
    'exit_date',
    'exit_time',
]

# Result table layout: the two date columns, the strategy attributes, then the running aggregates.
output_columns = ['eff_date', 'original_date'] + strategy_attributes + ['total', 'up', 'count']

# Start from an empty table; rows are added by update_output_table.
df_output = pd.DataFrame(columns=output_columns)

In [4]:
def update_output_table(df_output, eff_date, original_date, rebal_type, add_delete, 
                        entry_date, entry_time, exit_date, exit_time, price_change):
    """Record one price change in the aggregate results table.

    A row is identified by ``eff_date`` plus the six strategy attributes
    (``rebal_type``, ``add_delete``, ``entry_date``, ``entry_time``,
    ``exit_date``, ``exit_time``). ``original_date`` is stored on new rows but
    is not part of the match.

    Parameters
    ----------
    df_output : pandas.DataFrame
        Table with at least the columns named above plus ``total``, ``up``
        and ``count``.
    eff_date, original_date, rebal_type, add_delete, entry_date, entry_time,
    exit_date, exit_time
        Values stored in (and, except ``original_date``, matched against) the
        columns of the same name.
    price_change : number
        Value added to ``total``; it counts as an "up" move when it is > 0.

    Returns
    -------
    pandas.DataFrame
        When no matching row exists, a new DataFrame with one extra row
        (``count`` = 1). Otherwise the same ``df_output`` object, updated in
        place: ``total`` += price_change, ``up`` += 0 or 1, ``count`` += 1.
    """
    up = 1 if price_change > 0 else 0

    conditions = ((df_output['eff_date'] == eff_date) &
                  (df_output['rebal_type'] == rebal_type) &
                  (df_output['add_delete'] == add_delete) &
                  (df_output['entry_date'] == entry_date) &
                  (df_output['entry_time'] == entry_time) &
                  (df_output['exit_date'] == exit_date) &
                  (df_output['exit_time'] == exit_time))

    # new row
    if not conditions.any():
        new_row = pd.DataFrame([{'eff_date': eff_date, 'rebal_type': rebal_type, 'add_delete': add_delete,
                                 'entry_date': entry_date, 'entry_time': entry_time,
                                 'exit_date': exit_date, 'exit_time': exit_time,
                                 'total': price_change, 'up': up, 'count': 1,
                                 'original_date': original_date}])

        if df_output.empty:
            # Nothing to concatenate with: return the single row laid out in the
            # table's column order (plus any columns the table did not have yet).
            extra_columns = [col for col in new_row.columns if col not in df_output.columns]
            return new_row.reindex(columns=list(df_output.columns) + extra_columns)

        # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
        return pd.concat([df_output, new_row], ignore_index=True, sort=False)

    # updating existing row
    df_output.loc[conditions, 'total'] += price_change
    df_output.loc[conditions, 'up'] += up
    df_output.loc[conditions, 'count'] += 1

    return df_output

In [10]:
days_bef_aft = 20 # the range of days before and after effective date to pull from Yahoo finance

exit_time = 'Close'

times_of_day = ['Open', 'Close']

# For every index change, download the surrounding price window and record the
# price change from each candidate entry point to the exit point in df_output.
for idx, row in sp500_history.iterrows():

    # End date is exclusive, so need to increase by 1
    prices = pdr.get_data_yahoo(row['value'],
                                start=row['date'] - timedelta(days=days_bef_aft),
                                end=row['date'] + timedelta(days=days_bef_aft + 1))
    if len(prices) == 0:
        continue

    # Adjusted effective date: the last row of `prices` dated on or before the
    # listed date (method='pad').
    # NOTE(review): the original comment described this as "one trading day before
    # the wikipedia effective date"; 'pad' returns the listed date itself whenever
    # it is present in `prices` -- confirm which is intended.
    # get_indexer returns -1 when every price row is later than the listed date,
    # whereas Index.get_loc(..., method='pad') raised KeyError in that case and
    # no longer accepts `method` in pandas >= 2.0.
    eff_date_index = int(prices.index.get_indexer([row['date']], method='pad')[0])
    if eff_date_index < 0:
        continue
    eff_date = prices.index[eff_date_index]

    # These depend only on the current row, so compute them once per ticker.
    rebal_type = 'regular' if eff_date in rebal_dates else 'ad_hoc'
    add_delete = 'add' if row['variable'] == 'added_ticker' else 'delete'

    # Find the exit price (the same for every entry point of this row)
    exit_date = 0
    exit_date_index = eff_date_index + exit_date
    if exit_date_index >= len(prices):
        continue
    exit_price = prices.iloc[exit_date_index][exit_time]
    if isnan(exit_price):
        continue

    # Loop of different entry dates: up to four rows before the effective date,
    # limited to the rows actually available, so the index below is never negative.
    for entry_date in range(max(-4, -eff_date_index), 0):
        entry_date_index = eff_date_index + entry_date

        for entry_time in times_of_day:
            # Find the entry price
            entry_price = prices.iloc[entry_date_index][entry_time]
            if isnan(entry_price):
                continue

            price_change = exit_price / entry_price - 1 # percentage change in price between entry and exit

            df_output = update_output_table(df_output, eff_date, row['date'], rebal_type, add_delete,
                                            entry_date, entry_time, exit_date, exit_time, price_change)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- BRCM: No data found for this date range, symbol may be delisted
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- PCP: Data doesn't exist for startDate = 1452528000, endDate = 1456070400
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
WHY EQUAL
               Open     High      Low    Close  Adj Close  Volume
Date                                                 

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- ADT: Data doesn't exist for startDate = 1460476800, endDate = 1464019200
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- SNDK: No data found for this date range, symbol may be delisted
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- TWC: No data found for this date range, symbol may be delisted
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- ARG: Data doesn't exist for startDate = 1462204800, endDate = 1465747200
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download

WHY EQUAL
               Open     High      Low    Close  Adj Close  Volume
Date                                                             
2016-09-01  28800.0  28800.0  28800.0  28800.0    28800.0       0
2016-09-02  28800.0  28800.0  28800.0  28800.0    28800.0       0
2016-09-06  30000.0  30000.0  30000.0  30000.0    30000.0       0
2016-09-07  28000.0  28000.0  28000.0  28000.0    28000.0      20
2016-09-08  28000.0  28000.0  28000.0  28000.0    28000.0       0
2016-09-09  28000.0  28000.0  28000.0  28000.0    28000.0      50
2016-09-12  28000.0  28000.0  28000.0  28000.0    28000.0       0
2016-09-13  28000.0  28000.0  28000.0  28000.0    28000.0       0
2016-09-14  28000.0  28000.0  28000.0  28000.0    28000.0       0
2016-09-15  28000.0  28000.0  28000.0  28000.0    28000.0      70
2016-09-16  28000.0  28000.0  28000.0  28000.0    28000.0       0
2016-09-19  28000.0  28000.0  28000.0  28000.0    28000.0      70
2016-09-20  28000.0  28000.0  28000.0  28000.0    28000.0       0


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- STJ: No data found for this date range, symbol may be delisted
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- SE: Data doesn't exist for startDate = 1486483200, endDate = 1490025600
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 com

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- MJN: No data found for this date range, symbol may be delisted
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*****************

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- WYN: No data found for this date range, symbol may be delisted
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
WHY EQUAL
                 Open        High        Low      Close  Adj Close    Volume


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
WHY EQUAL
                 Open       High        Low      Close  Adj Close    Volume
Date                                                                       
2018-10-16  43.900002  43.980000  43.730000  43.810001  43.810001   3980616
2018-10-17  43.799999  44.000000  43.740002  43.980000  43.980000   4942684
2018-10-18  43.980000  44.005001  43.849998  43.900002  43.900002   4021622
2018-10-19  44.000000  44.080002  43.889999  43.959999  43.959999   3833727
2018-10-22  43.880001  44.009998  43.869999  43.919998  43.919998   2270704
2018-10-23  43.849998  44.040001  43.820000  43.880001  43.880001   4754687
2018-10-24  43.900002  43.939999  43.590000  43.610001  43.610001   8195073
2018-10-25  43.799999  43.980000  43.700001  43.900002  43.900002   3181145
2018-10-26  43.790001  44.049999  43.720001  43.959999  43.959999   5029549
2018-10-29  44.1

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
WHY EQUAL
                  Open        High        Low       Close   Adj Close  \
Date                                                                    
2018-12-03  101.510002  101.730003  99.500000  100.639999  100.639999   
2018-12-04  100.639999  100.949997  98.730003   98.870003   98.870003   
2018-12-06   97.470001   99.096901  96.029999   97.980003   97.980003   
2018-12-07   97.699997   98.570000  95.529999   95.849998   95.849998   
2018-12-10   96.279999   96.589996  94.550003   96.010002   96.010002   
2018-12-11   97.080002   97.540001  95.699997   96.419998   96.41

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
WHY EQUAL
                 Open       High        Low      Close  Adj Close    Volume
Date                                                                       
2019-02-06  21.100000  21.309999  20.950001  21.010000  20.154078   2851000
2019-02-07  20.879999  20.950001  20.340000  20.549999  19.712818   3740200
2019-02-08  19.830000  20.180000  18.490000  18.690001  17.928593  14091400
2019-02-11  18.530001  18.740000  17.879999  18.650000  17.890221   9734000
2019-02-12  18.820000  18.889999  18.379999  18.520000  17.765518   5095900
2019-02-13  18.530001  18.799999  18.450001 

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [6]:
def get_uniques(input_list):
    """Return the distinct elements of ``input_list`` as a list.

    The elements pass through a ``set``, so they must be hashable and the
    order of the returned list is whatever the set yields.
    """
    return list(set(input_list))

In [7]:
'''
Calculate population statistics
'''
import itertools

stats_df_columns = strategy_attributes + ['total', 'up', 'count', 'mean', 'std', 'max', 'min', 'sharpe', 'uppct']

# get the unique values for each strategy attribute
strategy_attribute_unique_values = [get_uniques(df_output[strategy_attribute])
                                    for strategy_attribute in strategy_attributes]

# find strategies consisting of different combinations of unique values for each strategy attribute
strategies = list(itertools.product(*strategy_attribute_unique_values))

# Collect one record per strategy and build the frame once at the end
# (DataFrame.append was removed in pandas 2.0, and growing a frame row by row is quadratic).
stats_records = []
for strategy in strategies:
    # find the relevant rows for each strategy
    relevant_rows = df_output.loc[(df_output[strategy_attributes] == strategy).all(axis=1), :]
    if len(relevant_rows) == 0:
        continue

    # Select the aggregate columns by name rather than by position.
    relevant_rows_sum = relevant_rows[['total', 'up', 'count']].sum(axis=0)
    mean = relevant_rows_sum['total'] / relevant_rows_sum['count']

    # statistics.stdev needs at least two data points, i.e. two rows. Testing the
    # summed 'count' instead would raise StatisticsError for a single row whose count is > 1.
    if len(relevant_rows) > 1:
        std = statistics.stdev(relevant_rows['total'])
    else:
        std = 0
    max_val = relevant_rows['total'].max()
    min_val = relevant_rows['total'].min()

    # A zero std is replaced by 0.00001 so the ratio stays finite.
    if std == 0:
        sharpe = mean / 0.00001 * np.sqrt(252)
    else:
        sharpe = mean / std * np.sqrt(252)
    uppct = relevant_rows_sum['up'] / relevant_rows_sum['count']

    record = dict(zip(strategy_attributes, strategy))
    record.update({'total': relevant_rows_sum['total'], 'up': relevant_rows_sum['up'],
                   'count': relevant_rows_sum['count'], 'mean': mean, 'std': std,
                   'max': max_val, 'min': min_val, 'sharpe': sharpe, 'uppct': uppct})
    stats_records.append(record)

stats_df = pd.DataFrame(stats_records, columns=stats_df_columns)


In [8]:
'''
Writing to excel
'''
# ExcelWriter.save() was removed in pandas 2.0. The context manager writes and
# closes the workbook on exit, including when one of the to_excel calls raises.
with pd.ExcelWriter('sp500_analysis.xlsx', engine='xlsxwriter') as writer:
    df_output.to_excel(writer, sheet_name='data')
    stats_df.to_excel(writer, sheet_name='strategy_stats')