In [5]:
import sf_quant as sf
import polars as pl
import datetime as dt
import numpy as np
import scipy as sp



In [6]:
start = dt.date(1996, 1, 1)
end = dt.date(2024, 12, 31)

columns = [
    'date',
    'barrid',
    'price',
    'return',
    'specific_return',
]

df = pl.read_parquet("../russell_3000_daily.parquet")

In [7]:
#Helper program to find stocks that have returns over the whole range of dates. I need to simulate the signals somehow.

startDate = dt.date(2013, 1, 1) 
endDate = dt.date(2018, 1, 1)

unique_barrids = df.filter(
    pl.col("date").is_between(startDate, endDate)
).select(pl.col("barrid")).unique()

barrid = unique_barrids.sample(1)[0, 0]
print(barrid)

print(df.filter(
    pl.col("date").is_between(startDate, endDate),
    (pl.col("barrid") == barrid),
).select(
    pl.col("date").alias("date"),
    pl.col("return").alias("return")
))


USAMUH1
shape: (1_114, 2)
┌────────────┬─────────┐
│ date       ┆ return  │
│ ---        ┆ ---     │
│ date       ┆ f64     │
╞════════════╪═════════╡
│ 2013-07-31 ┆ 0.0     │
│ 2013-08-01 ┆ 2.1384  │
│ 2013-08-02 ┆ 0.551   │
│ 2013-08-05 ┆ -0.5479 │
│ 2013-08-06 ┆ -1.8182 │
│ …          ┆ …       │
│ 2017-12-22 ┆ 0.0     │
│ 2017-12-26 ┆ -0.0867 │
│ 2017-12-27 ┆ 0.0434  │
│ 2017-12-28 ┆ 0.564   │
│ 2017-12-29 ┆ -0.5177 │
└────────────┴─────────┘


In [8]:
barrid1 = "USAQ392"
barrid2 = "USAZ6Q1"
barrid3 = "USAROU1"


training_data = df.filter(
    pl.col("date").is_between(startDate, endDate),
    (pl.col("barrid") == barrid1) | (pl.col("barrid") == barrid2) | (pl.col("barrid") == barrid3)
).pivot("barrid", index="date", values="return").select(
    pl.col("date"),
    (pl.col(barrid1)/100).alias("returns1"),
    (pl.col(barrid2)/100).alias("returns2"),
    (pl.col(barrid3)/100).alias("returns3")
)

print(training_data)

#cum_returns = trainingdata.select(pl.col("date"), np.log(pl.col("return") + 1).cum_sum().alias("cumulative_returns"))

#print(cum_returns.select(pl.col("date"), (np.exp(pl.col("cumulative_returns"))-1).alias("Cumulative_Returns")))


shape: (1_259, 4)
┌────────────┬───────────┬───────────┬───────────┐
│ date       ┆ returns1  ┆ returns2  ┆ returns3  │
│ ---        ┆ ---       ┆ ---       ┆ ---       │
│ date       ┆ f64       ┆ f64       ┆ f64       │
╞════════════╪═══════════╪═══════════╪═══════════╡
│ 2013-01-02 ┆ 0.005376  ┆ 0.022719  ┆ 0.014524  │
│ 2013-01-03 ┆ 0.032086  ┆ -0.008599 ┆ -0.003937 │
│ 2013-01-04 ┆ 0.036269  ┆ 0.010481  ┆ -0.017248 │
│ 2013-01-07 ┆ -0.0002   ┆ 0.010372  ┆ 0.001097  │
│ 2013-01-08 ┆ -0.002801 ┆ 0.007788  ┆ 0.011687  │
│ …          ┆ …         ┆ …         ┆ …         │
│ 2017-12-22 ┆ -0.033333 ┆ 0.000254  ┆ 0.007785  │
│ 2017-12-26 ┆ 0.0       ┆ 0.002284  ┆ 0.002107  │
│ 2017-12-27 ┆ 0.0       ┆ -0.000506 ┆ -0.001402 │
│ 2017-12-28 ┆ -0.002463 ┆ -0.003799 ┆ 0.009123  │
│ 2017-12-29 ┆ -0.001235 ┆ -0.005594 ┆ 0.004172  │
└────────────┴───────────┴───────────┴───────────┘


In [None]:
def cov_weights_error(cov_est, cov_real, exp_returns):
    # cov_est and cov_true are 3x3 lists, mu is vector
    #cov_est = np.array(cov_est, dtype=float)
    #cov_real = np.array(cov_real, dtype=float)
    #exp_returns = np.array(exp_returns, dtype=float)
    if (np.linalg.det(cov_est) == 0): return 0
    if (np.linalg.det(cov_real) == 0): return 0

    x_est = np.linalg.solve(cov_est, exp_returns)
    x_real = np.linalg.solve(cov_real, exp_returns)
    return np.linalg.norm((x_est / np.sum(x_est)) - (x_real / np.sum(x_real)))


In [None]:

def cov_est_error(r, b): #ratio for geometric weighting.
    error = 0.01 #approximately the contribution of the last day included in the average.
    
    num_r = int(np.floor_divide(np.log(error), np.log(r))) #num_rber of days computed in the average.
    weights_r = [(r**i) for i in range(1, num_r + 1)] #weights_r for the rolling sum. Note they are automatially normalized.
    
    num_b = int(np.floor_divide(np.log(error), np.log(b))) #num_rber of days computed in the average.
    weights_b = [(b**i) for i in range(1, num_b + 1)] #weights_r for the rolling sum. Note they are automatially normalized.

    exprs_cov_construction = [pl.col("date").alias("date")] #Expressions for constructing columns representing covariance matrix coefficients.
    for i in range(1, 4):
        for j in range(1, 4):
            exprs_cov_construction.append(
                ((pl.col(f"returns{i}") - pl.col(f"mu{i}")) *
                (pl.col(f"returns{j}") - pl.col(f"mu{j}"))).alias(f"cov{i}{j}")
            )

    exprs_cov_est = [] #Expressions for constructing estimated covariance matrix coefficients.
    for i in range(1, 4):
        for j in range(1, 4):
            exprs_cov_est.append(
                pl.col(f"cov{i}{j}").rolling_mean(num_r, weights_r).fill_null(strategy="backward").alias(f"cov_est{i}{j}")
            )
    
    exprs_cov_next = []
    for i in range(1, 4):
        for j in range(1, 4):
            exprs_cov_next.append(
                (pl.col(f"cov{i}{j}").shift(-1)).fill_null(strategy="forward").alias(f"cov_next{i}{j}")
            )

    cov_data = training_data.with_columns(
        pl.col("returns1").rolling_mean(num_b, weights_b).fill_null(strategy="backward").alias("mu1"),
        pl.col("returns2").rolling_mean(num_b, weights_b).fill_null(strategy="backward").alias("mu2"),
        pl.col("returns3").rolling_mean(num_b, weights_b).fill_null(strategy="backward").alias("mu3")
    ).with_columns( #Need the mu's to compute the covariance matrices.
        exprs_cov_construction
    ).with_columns( #Need the covariance matrix coefficients to compute the estimated future covariance matrix.
        exprs_cov_est
    ).with_columns(
        exprs_cov_next
    ).filter(
        pl.col("date").is_between(startDate + dt.timedelta(num_r + num_b), endDate)
        #rolling_mean introduces a null in a row when there are too many weights_r for the num_rber of available elements. 
        #this is just filtering out backfilled nulls from two rolling_mean steps.
    )

    #print(cov_data.select(pl.col("cov_next11"))) #USE THIS TO SEE NULL WHEN EXPRS_COV_NEXT HAS NO FORWARD FILL

    cov_est_error = cov_data.select(
        pl.struct([(f"cov_est{i}{j}") for i in range(1, 4) for j in range(1, 4)] + [(f"cov_next{i}{j}") for i in range(1, 4) for j in range(1, 4)] + [(f"mu{i}") for i in range(1, 4)]).map_elements(
            lambda row: cov_weights_error(np.array([[row[f"cov_est{i}{j}"] for i in range(1, 4)] for j in range(1, 4)]), np.array([[row[f"cov_next{i}{j}"] for i in range(1, 4)] for j in range(1, 4)]), np.array([row[f"mu{i}"] for i in range(1, 4)])),
            return_dtype=pl.Float64
        ).alias("cov_est_error")
    )
    

    return cov_est_error.select(pl.mean("cov_est_error"))[0, 0]


#np.linalg.solve(np.array([[pl.col(f"est_cov{i}{j}") for i in range(1, 4)] for j in range(1, 4)]), np.array([pl.col(f"exp_returns{i}") for i in range(1, 4)]))

In [11]:
print(f"Cov Error: {cov_est_error(0.95, 0.9)}")

#GET RID OF FILL_NULL IN EXPRS_COV_NEXT!!! 

Cov Error: 16.662558998429976


In [12]:
cov_errors = np.array([[cov_est_error(i/100.0, j/100.0) for j in range(70, 90)] for i in range(70, 90)])
min_args = np.argmin(cov_errors)
print(f"{min_args} : {np.min(cov_errors)}")

233 : 1.900957626132612


In [None]:
r=0.95
b=0.95

error = 0.01 #approximately the contribution of the last day included in the average.
    
num_r = int(np.floor_divide(np.log(error), np.log(r))) #num_rber of days computed in the average.
weights_r = [0] + [(r**i) for i in range(1, num_r)] #weights_r for the rolling sum. Note they are automatially normalized.
    
num_b = int(np.floor_divide(np.log(error), np.log(b))) #num_rber of days computed in the average.
weights_b = [0] + [(b**i) for i in range(1, num_b)] #weights_r for the rolling sum. Note they are automatially normalized.

exprs_cov_construction = [pl.col("date").alias("date")] #Expressions for constructing columns representing covariance matrix coefficients.
for i in range(1, 4):
    for j in range(1, 4):
        exprs_cov_construction.append(
            ((pl.col(f"returns{i}") - pl.col(f"mu{i}")) *
            (pl.col(f"returns{j}") - pl.col(f"mu{j}"))).alias(f"cov{i}{j}")
        )

#exprs_cov_est = [] #Expressions for constructing estimated covariance matrix coefficients.
#for i in range(1, 4):
#    for j in range(1, 4):
#        exprs_cov_est.append(
#            pl.col(f"cov{i}{j}").rolling_mean(num_r, weights_r).fill_null(strategy="backward").alias(f"cov_est{i}{j}")
#        )
    
exprs_cov_next = []
for i in range(1, 4):
    for j in range(1, 4):
        exprs_cov_next.append(
            (pl.col(f"cov{i}{j}").shift(-1)).fill_null(strategy="forward").alias(f"cov_next{i}{j}")
        )

training_data = training_data.with_columns(
    pl.col("returns1").rolling_mean(num_b, weights_b).fill_null(strategy="backward").alias("mu1"),
    pl.col("returns2").rolling_mean(num_b, weights_b).fill_null(strategy="backward").alias("mu2"),
    pl.col("returns3").rolling_mean(num_b, weights_b).fill_null(strategy="backward").alias("mu3"),
).filter(
    pl.col("date").is_between(startDate + dt.timedelta(num_b), endDate)
)

dates = training_data.select(
    pl.col("date").alias("date")
).to_numpy

returns = training_data.select(
    pl.col("returns1").alias("returns1"),
    pl.col("returns2").alias("returns2"),
    pl.col("returns3").alias("returns3")
).to_numpy()

alphas = training_data.select(
    pl.col("mu1").alias("mu1"),
    pl.col("mu2").alias("mu2"),
    pl.col("mu3").alias("mu3"),
).to_numpy()

len = np.shape(alphas)[0]

cov_matrix_real = list(map(lambda x : np.outer(x, x), np.array(returns) - np.array(alphas)))

cov_matrix_est = [cov_matrix_real[0]]
for i in range(1, len):
    cov_matrix_est.append((1-r)*cov_matrix_real[i] + r*cov_matrix_est[i-1])


print(cov_matrix_real)
print("Cov_matrix_est")

[array([[0.00275381, 0.0012174 , 0.00108017],
       [0.0012174 , 0.00053819, 0.00047752],
       [0.00108017, 0.00047752, 0.00042369]]), array([[ 2.59358893e-05,  1.61153670e-04, -3.11281722e-06],
       [ 1.61153670e-04,  1.00133467e-03, -1.93416124e-05],
       [-3.11281722e-06, -1.93416124e-05,  3.73599336e-07]]), array([[ 3.69993529e-07, -1.40339495e-05, -3.25971008e-05],
       [-1.40339495e-05,  5.32311305e-04,  1.23641640e-03],
       [-3.25971008e-05,  1.23641640e-03,  2.87186369e-03]]), array([[2.43568992e-05, 4.60861887e-05, 4.55144370e-06],
       [4.60861887e-05, 8.72006233e-05, 8.61188002e-06],
       [4.55144370e-06, 8.61188002e-06, 8.50503983e-07]]), array([[ 3.10390598e-05,  6.41204670e-05, -5.80012330e-05],
       [ 6.41204670e-05,  1.32460013e-04, -1.19818904e-04],
       [-5.80012330e-05, -1.19818904e-04,  1.08384179e-04]]), array([[ 4.33067376e-04,  8.16472821e-04, -5.79102449e-05],
       [ 8.16472821e-04,  1.53931675e-03, -1.09179642e-04],
       [-5.79102449e-05