In [1]:
from equities_util import get_data, daily_risk_free_rate, TRADING_YR, window_addend
from tqdm import tqdm
import polars as pl
import pandas as pd

In [2]:
df = get_data(tickers='SPY', connected=False, use_polars = True)

# Windowing in DF

In [3]:
df = pd.DataFrame({
    "date": range(10,18),
    "close": [100, 102, 103, 101, 99, 103, 104, 107],    
})

df["win_2_0"] = ''
df["win_2_1"] = ''
df["win_2_2"] = ''
df['win_2_6'] = ''

df["win_7_0"] = ''
df["win_7_4"] = ''
df

Unnamed: 0,date,close,win_2_0,win_2_1,win_2_2,win_2_6,win_7_0,win_7_4
0,10,100,,,,,,
1,11,102,,,,,,
2,12,103,,,,,,
3,13,101,,,,,,
4,14,99,,,,,,
5,15,103,,,,,,
6,16,104,,,,,,
7,17,107,,,,,,


In [4]:
df.loc[0:1, 'win_2_0'] = 'X'
df.loc[1:2, 'win_2_1'] = 'X'
df.loc[2:3, 'win_2_2'] = 'X'
df.loc[6:7, 'win_2_6'] = 'X'

df.loc[0:6, 'win_7_0'] = 'X'
df.loc[1:8, 'win_7_4'] = 'X'

df

Unnamed: 0,date,close,win_2_0,win_2_1,win_2_2,win_2_6,win_7_0,win_7_4
0,10,100,X,,,,X,
1,11,102,X,X,,,X,X
2,12,103,,X,X,,X,X
3,13,101,,,X,,X,X
4,14,99,,,,,X,X
5,15,103,,,,,X,X
6,16,104,,,,X,X,X
7,17,107,,,,X,,X


# Automated Windowing (Intuitive)

In [5]:
window_sizes = list(range(2, len(df) + 1 ))
window_sizes, len(window_sizes)

([2, 3, 4, 5, 6, 7, 8], 7)

In [6]:
len(df)

8

In [7]:
window_indices = []

for window_size in window_sizes:
    
    for start_idx in range(0, len(df) - window_size + 1 ):
        
        end_idx = start_idx + ( window_size - 1)
        window_indices.append((window_size, start_idx, end_idx))
        # print(window_size, start_idx, end_idx)
        
    # print('')
len(window_indices)

28

In [8]:
df = pd.DataFrame({
    "date": range(10,18),
    "close": [100, 102, 103, 101, 99, 103, 104, 107],    
})

for window_index_tuple in window_indices:
    start_idx = window_index_tuple[1]
    end_idx = window_index_tuple[2]

    col_name = f"win_{window_index_tuple[0]}_{start_idx}"
    df[col_name] = ''*len(df)
    df.loc[start_idx:end_idx, col_name] = 'X'

df

Unnamed: 0,date,close,win_2_0,win_2_1,win_2_2,win_2_3,win_2_4,win_2_5,win_2_6,win_3_0,...,win_5_0,win_5_1,win_5_2,win_5_3,win_6_0,win_6_1,win_6_2,win_7_0,win_7_1,win_8_0
0,10,100,X,,,,,,,X,...,X,,,,X,,,X,,X
1,11,102,X,X,,,,,,X,...,X,X,,,X,X,,X,X,X
2,12,103,,X,X,,,,,X,...,X,X,X,,X,X,X,X,X,X
3,13,101,,,X,X,,,,,...,X,X,X,X,X,X,X,X,X,X
4,14,99,,,,X,X,,,,...,X,X,X,X,X,X,X,X,X,X
5,15,103,,,,,X,X,,,...,,X,X,X,X,X,X,X,X,X
6,16,104,,,,,,X,X,,...,,,X,X,,X,X,X,X,X
7,17,107,,,,,,,X,,...,,,,X,,,X,,X,X


In [9]:
results = []
df = pd.DataFrame({
    "date": range(10,18),
    "close": [100, 102, 103, 101, 99, 103, 104, 107],    
})

for window_index_tuple in window_indices:
    start_idx = window_index_tuple[1]
    end_idx = window_index_tuple[2]

    col_name = f"win_{window_index_tuple[0]}_{start_idx}"
    df[col_name] = ''*len(df)
    df.loc[start_idx:end_idx, col_name] = ( df.loc[start_idx:end_idx, 'close'].pct_change() + 1 ).cumprod() - 1

    results.append({
        "win_sz" : window_index_tuple[0],
        "start_date" : df.loc[start_idx]['date'],
        "end_date" : df.loc[end_idx]['date'],
        "gains" : df.loc[end_idx][col_name]
    })

df

Unnamed: 0,date,close,win_2_0,win_2_1,win_2_2,win_2_3,win_2_4,win_2_5,win_2_6,win_3_0,...,win_5_0,win_5_1,win_5_2,win_5_3,win_6_0,win_6_1,win_6_2,win_7_0,win_7_1,win_8_0
0,10,100,,,,,,,,,...,,,,,,,,,,
1,11,102,0.02,,,,,,,0.02,...,0.02,,,,0.02,,,0.02,,0.02
2,12,103,,0.009804,,,,,,0.03,...,0.03,0.009804,,,0.03,0.009804,,0.03,0.009804,0.03
3,13,101,,,-0.019417,,,,,,...,0.01,-0.009804,-0.019417,,0.01,-0.009804,-0.019417,0.01,-0.009804,0.01
4,14,99,,,,-0.019802,,,,,...,-0.01,-0.029412,-0.038835,-0.019802,-0.01,-0.029412,-0.038835,-0.01,-0.029412,-0.01
5,15,103,,,,,0.040404,,,,...,,0.009804,0.0,0.019802,0.03,0.009804,0.0,0.03,0.009804,0.03
6,16,104,,,,,,0.009709,,,...,,,0.009709,0.029703,,0.019608,0.009709,0.04,0.019608,0.04
7,17,107,,,,,,,0.028846,,...,,,,0.059406,,,0.038835,,0.04902,0.07


In [10]:
pd.DataFrame(results).sort_values(["gains", "win_sz"], ascending=[False, True]).head(3)

Unnamed: 0,win_sz,start_date,end_date,gains
17,4,14,17,0.080808
27,8,10,17,0.07
21,5,13,17,0.059406


# Time Complexity

In [11]:
# 10 years
df = range(252 * 10)

window_sizes = list(range(2, len(df) + 1 ))

len(window_sizes) * len(df) // 2

3173940

# A Better Way

In [12]:
df = pd.DataFrame({
    "date": range(10,20),
    "close": [100, 102, 103, 101, 99, 103, 104, 107, 111, 97],    
})

df

Unnamed: 0,date,close
0,10,100
1,11,102
2,12,103
3,13,101
4,14,99
5,15,103
6,16,104
7,17,107
8,18,111
9,19,97


In [13]:
window_sizes = list(range(2, len(df) + 1))

len(window_sizes) * len(df) // 2

45

# Calc All Starting Days - n calculations

In [14]:
window_sizes = list(range(2, len(df) + 1 ))
len(window_sizes)

9

In [15]:
len(df)

10

In [16]:
window_sizes

[2, 3, 4, 5, 6, 7, 8, 9, 10]

In [17]:
for start_idx in range(0,len(df)-1):

    gains = (( df.loc[start_idx:, 'close'].pct_change() + 1 ).cumprod() - 1).round(2)
    df.loc[start_idx:, f'gains_{start_idx}'] = gains
    gains.dropna()
df

Unnamed: 0,date,close,gains_0,gains_1,gains_2,gains_3,gains_4,gains_5,gains_6,gains_7,gains_8
0,10,100,,,,,,,,,
1,11,102,0.02,,,,,,,,
2,12,103,0.03,0.01,,,,,,,
3,13,101,0.01,-0.01,-0.02,,,,,,
4,14,99,-0.01,-0.03,-0.04,-0.02,,,,,
5,15,103,0.03,0.01,0.0,0.02,0.04,,,,
6,16,104,0.04,0.02,0.01,0.03,0.05,0.01,,,
7,17,107,0.07,0.05,0.04,0.06,0.08,0.04,0.03,,
8,18,111,0.11,0.09,0.08,0.1,0.12,0.08,0.07,0.04,
9,19,97,-0.03,-0.05,-0.06,-0.04,-0.02,-0.06,-0.07,-0.09,-0.13


In [1]:
results = []

for start_idx in range(0,len(df)-1):

    gains = ( df.loc[start_idx:, 'close'].pct_change() + 1 ).cumprod() - 1
    
    df.loc[start_idx:, f'gains_{start_idx}'] = gains
    
    gains.dropna()
    
    for window_size in window_sizes:
        
        if start_idx+window_size-1 >= len(df):
            continue                
        
        print("indices", start_idx, start_idx+window_size-1, " sz", window_size, end=" ")
        
        last_gains = gains.loc[start_idx+window_size-1].item()
        
        print(last_gains)

        results.append({
            "win_sz" : window_size,
            "start_date" : df.loc[start_idx]['date'].item(),
            "end_date" : df.loc[start_idx+window_size-1]['date'].item(),
            "gains" : gains.loc[start_idx+window_size-1].item()
        })
        
    print('')
        
    

NameError: name 'df' is not defined

In [19]:
pd.DataFrame(results).sort_values(["gains", "win_sz"], ascending=[False, True]).head(3)

Unnamed: 0,win_sz,start_date,end_date,gains
33,5,14.0,18.0,0.121212
7,9,10.0,18.0,0.11
28,6,13.0,18.0,0.09901


# Test on SPY

In [2]:
def get_data(tickers: str = None):
    import yfinance as yf
    import pandas as pd

    df = yf.download(tickers,
                     period='max',
                     interval='1d',
                     # start="2020-12-01",
                     # end="2023-12-31",
                     group_by="ticker",
                     back_adjust=True,
                     progress=False)

    df = df.stack(level=0, future_stack=True).reset_index()
    df.columns = [col.lower() for col in df.columns]
    df = df.dropna()
    df = df[["date", "ticker", "close"]]
    df["date"] = pd.to_datetime(df["date"].dt.date)
    df = df.sort_values(["ticker", "date"], ascending=[True, True])
    df = df.reset_index(drop=True)
    return df

spy_df = get_data(tickers = ['SPY'])
spy_df

Unnamed: 0,date,ticker,close
0,1993-01-29,SPY,43.937500
1,1993-02-01,SPY,44.250000
2,1993-02-02,SPY,44.343750
3,1993-02-03,SPY,44.812500
4,1993-02-04,SPY,45.000000
...,...,...,...
8009,2024-11-19,SPY,590.299988
8010,2024-11-20,SPY,590.500000
8011,2024-11-21,SPY,593.669983
8012,2024-11-22,SPY,595.510010


In [7]:
TRADING_YR = 252
window_addend = TRADING_YR // 2

In [9]:
window_sizes = list(range(TRADING_YR, len(spy_df), window_addend))
window_sizes.append(len(spy_df))
print(window_sizes)

[252, 378, 504, 630, 756, 882, 1008, 1134, 1260, 1386, 1512, 1638, 1764, 1890, 2016, 2142, 2268, 2394, 2520, 2646, 2772, 2898, 3024, 3150, 3276, 3402, 3528, 3654, 3780, 3906, 4032, 4158, 4284, 4410, 4536, 4662, 4788, 4914, 5040, 5166, 5292, 5418, 5544, 5670, 5796, 5922, 6048, 6174, 6300, 6426, 6552, 6678, 6804, 6930, 7056, 7182, 7308, 7434, 7560, 7686, 7812, 7938, 8014]


### Risk Free Rate

In [14]:
# 10 year
0.04304

0.04304

In [15]:
daily_risk_free_rate = 0.0000167222

((1 + daily_risk_free_rate) ** (252*10)) - 1

0.043040068291641864

---
### 35 seconds - pct_change 

In [16]:
from tqdm import tqdm

results = []

spy_df['pct_change'] = spy_df['close'].pct_change()

for start_idx in tqdm(range(0,len(spy_df)-1)):
    
    # roi
    gains  = ( 1 +  spy_df.loc[start_idx:, 'pct_change'] ).cumprod() - 1
    
    # mdd
    mdd    = ((spy_df.loc[start_idx:, 'close'] - (spy_df.loc[start_idx:, 'close'].cummax())) / spy_df.loc[start_idx:, 'close'].cummax()).cummin()
    
    # sharpe
    sharpe = (spy_df.loc[start_idx:, 'pct_change'] - daily_risk_free_rate).expanding().mean() / spy_df.loc[start_idx:, 'pct_change'].expanding().std()
    
    for window_size in window_sizes:
        
        if start_idx+window_size-1 >= len(spy_df):
            continue                
            
        last_gains = gains.loc[start_idx+window_size-1].item()

        results.append({
            "win_sz" : window_size,
            "start_date" : spy_df.loc[start_idx]['date'],
            "end_date" : spy_df.loc[start_idx+window_size-1]['date'],
            "roi" : gains.loc[start_idx+window_size-1].item(),
            "mdd": mdd.loc[start_idx+window_size-1].item(),
            "sharpe": sharpe.loc[start_idx+window_size-1].item()
        })

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8013/8013 [00:47<00:00, 167.65it/s]


In [33]:
pd.DataFrame(results).to_csv("SPY_backtest_results.csv", index=False)

---
### 33, 36 seconds

In [18]:
results = []

df['pct_change'] = df['close'].pct_change()

for start_idx in tqdm(range(0,len(df)-1)):
    
    # close
    close_col = df.loc[start_idx:, 'close']

    # pct_change
    pct_change = df.loc[start_idx:, 'pct_change']
    
    # roi
    gains  = ( 1 +  pct_change ).cumprod() - 1
    
    # mdd
    mdd    = ((close_col - ( close_col.cummax() )) / close_col.cummax()).cummin()
    
    # sharpe
    sharpe = (pct_change - daily_risk_free_rate).expanding().mean() / pct_change.expanding().std()
    
    for window_size in window_sizes:
        
        if start_idx+window_size-1 >= len(df):
            continue                
            
        last_gains = gains.loc[start_idx+window_size-1].item()

        results.append({
            "win_sz" : window_size,
            "start_date" : df.loc[start_idx]['date'],
            "end_date" : df.loc[start_idx+window_size-1]['date'],
            "gains" : gains.loc[start_idx+window_size-1].item()
        })

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8002/8002 [00:35<00:00, 227.41it/s]


# Polars

In [24]:
df = get_data(tickers='SPY', connected=False, use_polars = True)
df.head(3)

date,close
date,f64
1993-01-29,43.9375
1993-02-01,44.25
1993-02-02,44.34375


In [25]:
window_sizes = list(range(TRADING_YR, len(df), window_addend))
window_sizes.append(len(df))
window_sizes[:3],window_sizes[-4:]

([252, 378, 504], [7686, 7812, 7938, 8003])

In [None]:
results = []

df['pct_change'] = df['close'].pct_change()

for start_idx in tqdm(range(0,len(df)-1)):
    
    # close
    close_col = df.loc[start_idx:, 'close']

    # pct_change
    pct_change = df.loc[start_idx:, 'pct_change']
    
    # roi
    gains  = ( 1 +  pct_change ).cumprod() - 1
    
    # mdd
    mdd    = ((close_col - ( close_col.cummax() )) / close_col.cummax()).cummin()
    
    # sharpe
    sharpe = (pct_change - daily_risk_free_rate).expanding().mean() / pct_change.expanding().std()
    
    for window_size in window_sizes:
        
        if start_idx+window_size-1 >= len(df):
            continue                
            
        last_gains = gains.loc[start_idx+window_size-1].item()

        results.append({
            "win_sz" : window_size,
            "start_date" : df.loc[start_idx]['date'],
            "end_date" : df.loc[start_idx+window_size-1]['date'],
            "gains" : gains.loc[start_idx+window_size-1].item()
        })

In [54]:
results = []

df = df.with_columns(pl.col("close").pct_change().alias("pct_change"))

for start_idx in tqdm(range(0,len(df)-1)):
    df_verticals = df.tail(len(df) - start_idx).with_columns
    (
        ( ( 1 + pl.col("pct_change") ).cum_prod() - 1 ).alias(f"cumulative_roi_{start_idx}"),
        
        (pl.col("close") - (pl.col("close").cum_max())).alias(f"rolling_max_drawdown_{start_idx}"),    

        ((pl.col("pct_change") - daily_risk_free_rate).cumulative_eval(pl.element().mean())
         / pl.col("pct_change").cumulative_eval(pl.element().std())
        ).alias(f"sharpe_ratio_{i}")
    )
    
    display(df_verticals)
    
    for window_size in window_sizes:
        
        if start_idx+window_size-1 >= len(df):
            continue                
            
        last_gains = gains.loc[start_idx+window_size-1].item()

        results.append({
            "win_sz" : window_size,
            "start_date" : df.loc[start_idx]['date'],
            "end_date" : df.loc[start_idx+window_size-1]['date'],
            "gains" : gains.loc[start_idx+window_size-1].item()
        })
    
    break

  0%|                                                                                                                                   | 0/8002 [00:00<?, ?it/s]

<bound method DataFrame.with_columns of shape: (8_003, 3)
┌────────────┬────────────┬────────────┐
│ date       ┆ close      ┆ pct_change │
│ ---        ┆ ---        ┆ ---        │
│ date       ┆ f64        ┆ f64        │
╞════════════╪════════════╪════════════╡
│ 1993-01-29 ┆ 43.9375    ┆ null       │
│ 1993-02-01 ┆ 44.25      ┆ 0.007112   │
│ 1993-02-02 ┆ 44.34375   ┆ 0.002119   │
│ 1993-02-03 ┆ 44.8125    ┆ 0.010571   │
│ 1993-02-04 ┆ 45.0       ┆ 0.004184   │
│ …          ┆ …          ┆ …          │
│ 2024-11-04 ┆ 569.809998 ┆ -0.002154  │
│ 2024-11-05 ┆ 576.700012 ┆ 0.012092   │
│ 2024-11-06 ┆ 591.039978 ┆ 0.024866   │
│ 2024-11-07 ┆ 595.609985 ┆ 0.007732   │
│ 2024-11-08 ┆ 598.190002 ┆ 0.004332   │
└────────────┴────────────┴────────────┘>

  0%|                                                                                                                                   | 0/8002 [00:00<?, ?it/s]


KeyError: 251

In [None]:
results = []

df = df.with_columns(pl.col("close").pct_change().alias("pct_change"))

for start_idx in tqdm(range(0,len(df)-1)):
    
    df = df.join(
        df.tail(len(df) - i).with_columns(
            (pl.col("ret_pls_one").cum_prod() - 1).alias(f"cumulative_roi_{start_idx}"),
            (pl.col("close") - (pl.col("close").cum_max())).alias(f"rolling_max_drawdown_{start_idx}"),
            ((pl.col("pct_change") - risk_free_rate).cumulative_eval(pl.element().mean())
             / (pl.col("pct_change") - risk_free_rate).cumulative_eval(pl.element().std())
            ).alias(f"sharpe_ratio_{i}")
            
        ).select(
            ["date", 
             f"cumulative_roi_{start_idx}", 
             f"rolling_max_drawdown_{start_idx}", 
             f"sharpe_ratio_{start_idx}"
            ])
        , on=["date"], how="left")

In [None]:
for start_idx in tqdm(range(0,len(df)-1)):
    
    # close
    close_col = df.loc[start_idx:, 'close']

In [26]:
i = 0
# df.slice(start_idx, end_idx)
df.with_columns(
    
    # returns
    (
        (
            pl.col("close").pct_change() + 1
        ).cum_prod() - 1
    ).alias(f"cumulative_roi_{i}"),
    
    # mdd
    (
        pl.col("close") - (
            pl.col("close").cum_max()
        )
    ).alias(f"rolling_max_drawdown_{i}"),
    
    # sharpe
    (
        ( pl.col("close").pct_change() - daily_risk_free_rate ).cumulative_eval( pl.element().mean() ) /
        pl.col("close").pct_change().cumulative_eval( pl.element().std()  )
    ).alias(f"sharpe_ratio_{i}")
)

date,close,cumulative_roi_0,rolling_max_drawdown_0,sharpe_ratio_0
date,f64,f64,f64,f64
1993-01-29,43.9375,,0.0,
1993-02-01,44.25,0.007112,0.0,
1993-02-02,44.34375,0.009246,0.0,1.30236
1993-02-03,44.8125,0.019915,0.0,1.549413
1993-02-04,45.0,0.024182,0.0,1.627637
…,…,…,…,…
2024-11-04,569.809998,11.968649,-14.780029,0.031712
2024-11-05,576.700012,12.125463,-7.890015,0.031836
2024-11-06,591.039978,12.451834,0.0,0.03209
2024-11-07,595.609985,12.555846,0.0,0.032169
