In [1]:
import sf_quant as sf
import polars as pl
import datetime as dt
import numpy as np
import scipy as sp



In [2]:
start = dt.date(1996, 1, 1)
end = dt.date(2024, 12, 31)

columns = [
    'date',
    'barrid',
    'price',
    'return',
    'specific_return',
]

df = pl.read_parquet("russell_3000_daily.parquet")

df.write_parquet("russell_3000_daily.parquet")

In [3]:
#Helper program to find stocks that have returns over the whole range of dates. I need to simulate the signals somehow.

startDate = dt.date(2013, 1, 1) 
endDate = dt.date(2018, 1, 1)

unique_barrids = df.filter(
    pl.col("date").is_between(startDate, endDate)
).select(pl.col("barrid")).unique()

barrid = unique_barrids.sample(1)[0, 0]
print(barrid)

print(df.filter(
    pl.col("date").is_between(startDate, endDate),
    (pl.col("barrid") == barrid),
).select(
    pl.col("date").alias("date"),
    pl.col("return").alias("return")
))


USA97X1
shape: (632, 2)
┌────────────┬─────────┐
│ date       ┆ return  │
│ ---        ┆ ---     │
│ date       ┆ f64     │
╞════════════╪═════════╡
│ 2015-06-30 ┆ -1.1917 │
│ 2015-07-01 ┆ 3.4171  │
│ 2015-07-02 ┆ -0.3887 │
│ 2015-07-06 ┆ 2.0488  │
│ 2015-07-07 ┆ 0.956   │
│ …          ┆ …       │
│ 2017-12-22 ┆ -0.9314 │
│ 2017-12-26 ┆ 0.5443  │
│ 2017-12-27 ┆ 0.5413  │
│ 2017-12-28 ┆ 0.2447  │
│ 2017-12-29 ┆ -1.123  │
└────────────┴─────────┘


In [4]:
barrid1 = "USAQ392"
barrid2 = "USAZ6Q1"
barrid3 = "USAROU1"


training_data = df.filter(
    pl.col("date").is_between(startDate, endDate),
    (pl.col("barrid") == barrid1) | (pl.col("barrid") == barrid2) | (pl.col("barrid") == barrid3)
).pivot("barrid", index="date", values="return").select(
    pl.col("date"),
    (pl.col(barrid1)/100).alias("returns1"),
    (pl.col(barrid2)/100).alias("returns2"),
    (pl.col(barrid3)/100).alias("returns3")
)

print(training_data)

#cum_returns = trainingdata.select(pl.col("date"), np.log(pl.col("return") + 1).cum_sum().alias("cumulative_returns"))

#print(cum_returns.select(pl.col("date"), (np.exp(pl.col("cumulative_returns"))-1).alias("Cumulative_Returns")))


shape: (1_259, 4)
┌────────────┬───────────┬───────────┬───────────┐
│ date       ┆ returns1  ┆ returns2  ┆ returns3  │
│ ---        ┆ ---       ┆ ---       ┆ ---       │
│ date       ┆ f64       ┆ f64       ┆ f64       │
╞════════════╪═══════════╪═══════════╪═══════════╡
│ 2013-01-02 ┆ 0.005376  ┆ 0.022719  ┆ 0.014524  │
│ 2013-01-03 ┆ 0.032086  ┆ -0.008599 ┆ -0.003937 │
│ 2013-01-04 ┆ 0.036269  ┆ 0.010481  ┆ -0.017248 │
│ 2013-01-07 ┆ -0.0002   ┆ 0.010372  ┆ 0.001097  │
│ 2013-01-08 ┆ -0.002801 ┆ 0.007788  ┆ 0.011687  │
│ …          ┆ …         ┆ …         ┆ …         │
│ 2017-12-22 ┆ -0.033333 ┆ 0.000254  ┆ 0.007785  │
│ 2017-12-26 ┆ 0.0       ┆ 0.002284  ┆ 0.002107  │
│ 2017-12-27 ┆ 0.0       ┆ -0.000506 ┆ -0.001402 │
│ 2017-12-28 ┆ -0.002463 ┆ -0.003799 ┆ 0.009123  │
│ 2017-12-29 ┆ -0.001235 ┆ -0.005594 ┆ 0.004172  │
└────────────┴───────────┴───────────┴───────────┘


In [47]:
def cov_error(cov_est, cov_real, exp_returns):
    # cov_est and cov_true are 3x3 lists, mu is vector
    #cov_est = np.array(cov_est, dtype=float)
    #cov_real = np.array(cov_real, dtype=float)
    #exp_returns = np.array(exp_returns, dtype=float)
    if (np.linalg.det(cov_est) == 0): return 0
    if (np.linalg.det(cov_real) == 0): return 0

    x_est = np.linalg.solve(cov_est, exp_returns)
    x_real = np.linalg.solve(cov_real, exp_returns)
    return np.linalg.norm((x_est / np.sum(x_est)) - (x_real / np.sum(x_real)))


In [88]:

def cov_est_error(r, b): #ratio for geometric weighting.
    error = 0.01 #approximately the contribution of the last day included in the average.
    
    num_r = int(np.floor_divide(np.log(error), np.log(r))) #num_rber of days computed in the average.
    weights_r = [(r**i) for i in range(1, num_r + 1)] #weights_r for the rolling sum. Note they are automatially normalized.
    
    num_b = int(np.floor_divide(np.log(error), np.log(b))) #num_rber of days computed in the average.
    weights_b = [(b**i) for i in range(1, num_b + 1)] #weights_r for the rolling sum. Note they are automatially normalized.

    exprs_cov_construction = [pl.col("date").alias("date")] #Expressions for constructing columns representing covariance matrix coefficients.
    for i in range(1, 4):
        for j in range(1, 4):
            exprs_cov_construction.append(
                ((pl.col(f"returns{i}") - pl.col(f"mu{i}")) *
                (pl.col(f"returns{j}") - pl.col(f"mu{j}"))).alias(f"cov{i}{j}")
            )

    exprs_cov_est = [] #Expressions for constructing estimated covariance matrix coefficients.
    for i in range(1, 4):
        for j in range(1, 4):
            exprs_cov_est.append(
                pl.col(f"cov{i}{j}").rolling_mean(num_r, weights_r).fill_null(strategy="backward").alias(f"cov_est{i}{j}")
            )
    
    exprs_cov_next = []
    for i in range(1, 4):
        for j in range(1, 4):
            exprs_cov_next.append(
                (pl.col(f"cov{i}{j}").shift(-1)).fill_null(strategy="forward").alias(f"cov_next{i}{j}")
            )

    cov_data = training_data.with_columns(
        pl.col("returns1").rolling_mean(num_b, weights_b).fill_null(strategy="backward").alias("mu1"),
        pl.col("returns2").rolling_mean(num_b, weights_b).fill_null(strategy="backward").alias("mu2"),
        pl.col("returns3").rolling_mean(num_b, weights_b).fill_null(strategy="backward").alias("mu3")
    ).with_columns( #Need the mu's to compute the covariance matrices.
        exprs_cov_construction
    ).with_columns( #Need the covariance matrix coefficients to compute the estimated future covariance matrix.
        exprs_cov_est
    ).with_columns(
        exprs_cov_next
    ).filter(
        pl.col("date").is_between(startDate + dt.timedelta(num_r + num_b), endDate)
        #rolling_mean introduces a null in a row when there are too many weights_r for the num_rber of available elements. 
        #this is just filtering out backfilled nulls from two rolling_mean steps.
    )

    #print(cov_data.select(pl.col("cov_next11"))) #USE THIS TO SEE NULL WHEN EXPRS_COV_NEXT HAS NO FORWARD FILL

    cov_est_error = cov_data.select(
        pl.struct([(f"cov_est{i}{j}") for i in range(1, 4) for j in range(1, 4)] + [(f"cov_next{i}{j}") for i in range(1, 4) for j in range(1, 4)] + [(f"mu{i}") for i in range(1, 4)]).map_elements(
            lambda row: cov_error(np.array([[row[f"cov_est{i}{j}"] for i in range(1, 4)] for j in range(1, 4)]), np.array([[row[f"cov_next{i}{j}"] for i in range(1, 4)] for j in range(1, 4)]), np.array([row[f"mu{i}"] for i in range(1, 4)])),
            return_dtype=pl.Float64
        ).alias("cov_est_error")
    )
    

    return cov_est_error.select(pl.mean("cov_est_error"))[0, 0]


#np.linalg.solve(np.array([[pl.col(f"est_cov{i}{j}") for i in range(1, 4)] for j in range(1, 4)]), np.array([pl.col(f"exp_returns{i}") for i in range(1, 4)]))

In [89]:
print(f"Cov Error: {cov_est_error(0.95, 0.9)}")

#GET RID OF FILL_NULL IN EXPRS_COV_NEXT!!! 

Cov Error: 16.662558998429976


In [None]:
cov_errors = np.array([[cov_est_error(i/100.0, j/100.0) for j in range(70, 90)] for i in range(70, 90)])
min_args = np.argmin(cov_errors)
print(f"{min_args} : {np.min(cov_errors)}")

233 : 1.900957626132612


In [97]:
reader = np.array([[[i/100.0, j/100.0] for j in range (70, 90)] for i in range(70, 90)])
print(reader[min_args])

IndexError: index 233 is out of bounds for axis 0 with size 20