In [1]:
import pandas as pd
import numpy as np
import datetime as DT
import pandas as pd
from pypfopt import EfficientFrontier
from google.cloud import bigquery
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Normalization
# from pypfopt import risk_models
# from pypfopt import expected_returns

2024-06-06 21:36:03.002530: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-06 21:36:03.515826: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-06 21:36:05.601255: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import os

gcp_project = os.environ['GCP_PROJECT']

In [None]:
# Simplified query for this backtesting exercise

PROJECT = "le-wagon-hedge-fund"
DATASET = "data_alpaca_20240604"
TABLE = "SP500_Historical_Weekly"

query = f"""
    SELECT *
    FROM {PROJECT}.{DATASET}.{TABLE}
    """

client = bigquery.Client(project=gcp_project)
query_job = client.query(query)
result = query_job.result()
df = result.to_dataframe()

In [3]:
def get_all_data():
    # start_date=datetime.strptime(start_date,'%Y-%m-%d')
    # end_date=datetime.strptime(end_date,'%Y-%m-%d')
    PROJECT = "le-wagon-hedge-fund"
    DATASET = "data_alpaca_20240604"
    TABLE = "SP500_Historical_Weekly"
    query = f"""

    SELECT *
    FROM {PROJECT}.{DATASET}.{TABLE}
    """
    client = bigquery.Client(project=gcp_project)
    query_job = client.query(query)
    result = query_job.result()
    df = result.to_dataframe()
    return df

In [4]:
def get_data(start_date, end_date):
    # start_date=datetime.strptime(start_date,'%Y-%m-%d')
    # end_date=datetime.strptime(end_date,'%Y-%m-%d')
    PROJECT = "le-wagon-hedge-fund"
    DATASET = "data_alpaca_20240604"
    TABLE = "SP500_Historical_Weekly"
    query = f"""

    SELECT *
    FROM {PROJECT}.{DATASET}.{TABLE}
    WHERE (DATE(timestamp) BETWEEN '{start_date}' AND '{end_date}')
    """
    client = bigquery.Client(project=gcp_project)
    query_job = client.query(query)
    result = query_job.result()
    df = result.to_dataframe()
    return df

In [5]:
def features_from_data(df):
    time_df = df.pivot(index='timestamp',columns='symbol',values='close')
    returns_df = time_df.pct_change()#.dropna()
    # Removing all stocks that have more than 20 missing observations
    s = returns_df.isna().sum()>20
    to_ban = list(s[s].index)
    returns_df = returns_df[returns_df.columns[~returns_df.columns.isin(to_ban)]]
    # Imputing
    returns_df = returns_df.fillna(returns_df.mean())

    X = returns_df.iloc[:-1]
    y = returns_df.iloc[-1]

    X = X.to_numpy().reshape(X.shape[1],X.shape[0],1)
    y = y.to_numpy()

    X_pred = returns_df.to_numpy()
    X_pred = X_pred.reshape(X_pred.shape[1],X_pred.shape[0],1)

    return X, y, X_pred, returns_df.cov(), list(returns_df.columns)

In [9]:
def initialize_model_LSTM(X):
    # 1- RNN Architecture
    normalizer = Normalization()
    normalizer.adapt(X)
    model = Sequential()
    model.add(normalizer)
    model.add(layers.LSTM(units=20, activation='tanh'))
    model.add(layers.Dense(10, activation="relu"))
    model.add(layers.Dense(1, activation="linear"))

    # 2- Compilation
    model.compile(loss='mse',
                optimizer='rmsprop',
                metrics=['mae']) # very high lr so we can converge with such a small dataset

    return model

In [10]:
def fitting_model(X,y):
    model = initialize_model_LSTM(X)
    es = EarlyStopping(patience=5, restore_best_weights=True)
    history = model.fit(X, y.reshape(-1,), validation_split=.2, batch_size=32, epochs=20, verbose=1, callbacks=[es])
    return model

In [11]:
def predicting(X, model):
    y_pred = model.predict(X)
    return y_pred

In [12]:
def covariance(df):
    return df.cov()

In [7]:
def model_try(df):
    time_df = df.pivot(index='timestamp',columns='symbol',values='close')
    returns_df = time_df.pct_change().dropna()
    expected_returns=pd.DataFrame(returns_df.mean(), columns=['expected_return'])
    cov_df=returns_df.cov()
    return expected_returns, cov_df

In [75]:
def making_portfolio(tickers,expected_returns, cov_df):
    ef = EfficientFrontier(expected_returns,cov_df, solver='ECOS') #Had to change the solver to ECOS as the other wouldn't work. Look into this.
    ef.tickers = tickers
    raw_weights = ef.max_sharpe(risk_free_rate=0.001)
    cleaned_weights = ef.clean_weights()
    return pd.DataFrame(list(cleaned_weights.items()), columns=['ticker','weight']).set_index('ticker')

In [14]:
def portfolio_returns(weights: pd.DataFrame, start_date: str, end_date: str):
    # Finding the returns for all stocks between start and end date
    df=get_data(start_date,end_date)
    time_df = df.pivot(index='timestamp',columns='symbol',values='close')

    #Resetting index for the time_df
    time_df['clean_date']=time_df.index
    time_df['clean_date']=time_df['clean_date'].apply(lambda x: DT.datetime.strptime(x, '%Y-%m-%d %H:%M:%S+00:00'))\
        .apply(lambda x: f'{x.year}-{x.month:02d}-{x.day:02d}')
    time_df = time_df.set_index('clean_date')

    ret = time_df.loc[f'{end_date}']/time_df.loc[f'{start_date}']-1

    # Calculating portfolio return
    port_return = (weights.weight * ret).sum()

    return port_return


In [1]:
#TODO: Make this code more efficient by not querying every time but rather saving data locally while running

def backtesting(as_of_date, n_periods, period_type='W'):
    as_of = DT.datetime.strptime(as_of_date, '%Y-%m-%d').date()
    starting_point = as_of - DT.timedelta(days=7 * n_periods)
    starting_point_str = f'{starting_point.year}-{starting_point.month:02d}-{starting_point.day:02d}'
    port_return = 1
    weekly_returns = []

    # Training the model with data until the starting point

    df = get_data('2016-01-04',as_of_date)
    df = df[df.timestamp.apply(lambda x: DT.datetime.strptime(x, '%Y-%m-%d %H:%M:%S+00:00').date())<starting_point]

    X, y, X_pred, cov_df, tickers = features_from_data(df)
    model = fitting_model(X,y)

    # Calculating portfolio returns
    while starting_point < as_of:
        one_week_ahead = starting_point + DT.timedelta(days=7)
        week_start_str = f'{starting_point.year}-{starting_point.month:02d}-{starting_point.day:02d}'
        week_end_str = f'{one_week_ahead.year}-{one_week_ahead.month:02d}-{one_week_ahead.day:02d}'

        df = df[df.timestamp.apply(lambda x: DT.datetime.strptime(x, '%Y-%m-%d %H:%M:%S+00:00').date())<starting_point]
        X, y, X_pred, cov_df, tickers = features_from_data(df)
        y_pred = predicting(X_pred, model)
        cleaned_weights = making_portfolio(tickers,y_pred.reshape(-1), cov_df)
        weekly_return = portfolio_returns(cleaned_weights,week_start_str,week_end_str)
        weekly_returns.append(weekly_return)
        port_return *= (1+weekly_return)
        starting_point += DT.timedelta(days=7)

    port_return -= 1

    return port_return, weekly_returns, cleaned_weights

In [74]:
backtesting('2024-05-27',4)

  returns_df = time_df.pct_change()#.dropna()


Epoch 1/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 347ms/step - loss: 0.0020 - mae: 0.0313 - val_loss: 0.0015 - val_mae: 0.0265
Epoch 2/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 276ms/step - loss: 0.0016 - mae: 0.0273 - val_loss: 0.0013 - val_mae: 0.0259
Epoch 3/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 280ms/step - loss: 0.0015 - mae: 0.0264 - val_loss: 0.0014 - val_mae: 0.0261
Epoch 4/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 286ms/step - loss: 0.0014 - mae: 0.0261 - val_loss: 0.0015 - val_mae: 0.0258
Epoch 5/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 283ms/step - loss: 0.0014 - mae: 0.0253 - val_loss: 0.0014 - val_mae: 0.0263
Epoch 6/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 275ms/step - loss: 0.0015 - mae: 0.0273 - val_loss: 0.0014 - val_mae: 0.0255
Epoch 7/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 289ms/step - 

  returns_df = time_df.pct_change()#.dropna()


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 158ms/step
                                     CVXPY                                     
                                     v1.5.1                                    
(CVXPY) Jun 06 10:23:54 PM: Your problem has 492 variables, 985 constraints, and 0 parameters.
(CVXPY) Jun 06 10:24:05 PM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Jun 06 10:24:05 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Jun 06 10:24:05 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
(CVXPY) Jun 06 10:24:05 PM: Your problem is compiled with the CPP canonicalization backend.
-------------------------------------------------------------------------------
                                  Compilation                                  
--------------------------------------------------------------------------

    You specified your problem should be solved by ECOS. Starting in
    CXVPY 1.6.0, ECOS will no longer be installed by default with CVXPY.
    Please either add an explicit dependency on ECOS or switch to our new
    default solver, Clarabel, by either not specifying a solver argument
    or specifying ``solver=cp.CLARABEL``.
    


(CVXPY) Jun 06 10:24:06 PM: Applying reduction ECOS
(CVXPY) Jun 06 10:24:06 PM: Finished problem compilation (took 3.107e-01 seconds).
-------------------------------------------------------------------------------
                                Numerical solver                               
-------------------------------------------------------------------------------
(CVXPY) Jun 06 10:24:06 PM: Invoking solver ECOS  to obtain a solution.

ECOS 2.0.10 - (C) embotech GmbH, Zurich Switzerland, 2012-15. Web: www.embotech.com/ECOS

It     pcost       dcost      gap   pres   dres    k/t    mu     step   sigma     IR    |   BT
 0  +0.000e+00  -2.161e-01  +5e+03  9e-01  5e+00  1e+00  5e+00    ---    ---    1  1  - |  -  - 
 1  +3.924e+01  +4.077e+01  +9e+02  4e-01  2e+00  2e+00  9e-01  0.9890  2e-01   1  1  1 |  0  0
 2  +1.827e+02  +1.868e+02  +1e+02  2e-01  1e+00  4e+00  1e-01  0.8671  1e-02   1  1  1 |  0  0
 3  +2.368e+02  +2.426e+02  +6e+01  1e-01  1e+00  6e+00  6e-02  0.6362  1e-01 

  returns_df = time_df.pct_change()#.dropna()


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 89ms/step
                                     CVXPY                                     
                                     v1.5.1                                    
(CVXPY) Jun 06 10:24:20 PM: Your problem has 492 variables, 985 constraints, and 0 parameters.
(CVXPY) Jun 06 10:24:32 PM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Jun 06 10:24:32 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Jun 06 10:24:32 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
(CVXPY) Jun 06 10:24:32 PM: Your problem is compiled with the CPP canonicalization backend.
-------------------------------------------------------------------------------
                                  Compilation                                  
---------------------------------------------------------------------------

    You specified your problem should be solved by ECOS. Starting in
    CXVPY 1.6.0, ECOS will no longer be installed by default with CVXPY.
    Please either add an explicit dependency on ECOS or switch to our new
    default solver, Clarabel, by either not specifying a solver argument
    or specifying ``solver=cp.CLARABEL``.
    


(CVXPY) Jun 06 10:24:33 PM: Applying reduction CvxAttr2Constr
(CVXPY) Jun 06 10:24:33 PM: Applying reduction ConeMatrixStuffing
(CVXPY) Jun 06 10:24:33 PM: Applying reduction ECOS
(CVXPY) Jun 06 10:24:33 PM: Finished problem compilation (took 7.956e-01 seconds).
-------------------------------------------------------------------------------
                                Numerical solver                               
-------------------------------------------------------------------------------
(CVXPY) Jun 06 10:24:33 PM: Invoking solver ECOS  to obtain a solution.

ECOS 2.0.10 - (C) embotech GmbH, Zurich Switzerland, 2012-15. Web: www.embotech.com/ECOS

It     pcost       dcost      gap   pres   dres    k/t    mu     step   sigma     IR    |   BT
 0  +0.000e+00  -2.161e-01  +5e+03  9e-01  5e+00  1e+00  5e+00    ---    ---    1  1  - |  -  - 
 1  +3.924e+01  +4.077e+01  +9e+02  4e-01  2e+00  2e+00  9e-01  0.9890  2e-01   1  1  1 |  0  0
 2  +1.827e+02  +1.868e+02  +1e+02  2e-01  1e+

  returns_df = time_df.pct_change()#.dropna()


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 84ms/step
                                     CVXPY                                     
                                     v1.5.1                                    
(CVXPY) Jun 06 10:24:49 PM: Your problem has 492 variables, 985 constraints, and 0 parameters.
(CVXPY) Jun 06 10:25:01 PM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Jun 06 10:25:01 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Jun 06 10:25:01 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
(CVXPY) Jun 06 10:25:01 PM: Your problem is compiled with the CPP canonicalization backend.
-------------------------------------------------------------------------------
                                  Compilation                                  
---------------------------------------------------------------------------

    You specified your problem should be solved by ECOS. Starting in
    CXVPY 1.6.0, ECOS will no longer be installed by default with CVXPY.
    Please either add an explicit dependency on ECOS or switch to our new
    default solver, Clarabel, by either not specifying a solver argument
    or specifying ``solver=cp.CLARABEL``.
    


(CVXPY) Jun 06 10:25:02 PM: Applying reduction CvxAttr2Constr
(CVXPY) Jun 06 10:25:02 PM: Applying reduction ConeMatrixStuffing
(CVXPY) Jun 06 10:25:02 PM: Applying reduction ECOS
(CVXPY) Jun 06 10:25:02 PM: Finished problem compilation (took 1.280e+00 seconds).
-------------------------------------------------------------------------------
                                Numerical solver                               
-------------------------------------------------------------------------------
(CVXPY) Jun 06 10:25:02 PM: Invoking solver ECOS  to obtain a solution.

ECOS 2.0.10 - (C) embotech GmbH, Zurich Switzerland, 2012-15. Web: www.embotech.com/ECOS

It     pcost       dcost      gap   pres   dres    k/t    mu     step   sigma     IR    |   BT
 0  +0.000e+00  -2.161e-01  +5e+03  9e-01  5e+00  1e+00  5e+00    ---    ---    1  1  - |  -  - 
 1  +3.924e+01  +4.077e+01  +9e+02  4e-01  2e+00  2e+00  9e-01  0.9890  2e-01   1  1  1 |  0  0
 2  +1.827e+02  +1.868e+02  +1e+02  2e-01  1e+

  returns_df = time_df.pct_change()#.dropna()


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 82ms/step
                                     CVXPY                                     
                                     v1.5.1                                    
(CVXPY) Jun 06 10:25:14 PM: Your problem has 492 variables, 985 constraints, and 0 parameters.
(CVXPY) Jun 06 10:25:24 PM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Jun 06 10:25:24 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Jun 06 10:25:24 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
(CVXPY) Jun 06 10:25:24 PM: Your problem is compiled with the CPP canonicalization backend.
-------------------------------------------------------------------------------
                                  Compilation                                  
---------------------------------------------------------------------------

    You specified your problem should be solved by ECOS. Starting in
    CXVPY 1.6.0, ECOS will no longer be installed by default with CVXPY.
    Please either add an explicit dependency on ECOS or switch to our new
    default solver, Clarabel, by either not specifying a solver argument
    or specifying ``solver=cp.CLARABEL``.
    


(CVXPY) Jun 06 10:25:25 PM: Applying reduction ECOS
(CVXPY) Jun 06 10:25:25 PM: Finished problem compilation (took 3.640e-01 seconds).
-------------------------------------------------------------------------------
                                Numerical solver                               
-------------------------------------------------------------------------------
(CVXPY) Jun 06 10:25:25 PM: Invoking solver ECOS  to obtain a solution.

ECOS 2.0.10 - (C) embotech GmbH, Zurich Switzerland, 2012-15. Web: www.embotech.com/ECOS

It     pcost       dcost      gap   pres   dres    k/t    mu     step   sigma     IR    |   BT
 0  +0.000e+00  -2.161e-01  +5e+03  9e-01  5e+00  1e+00  5e+00    ---    ---    1  1  - |  -  - 
 1  +3.924e+01  +4.077e+01  +9e+02  4e-01  2e+00  2e+00  9e-01  0.9890  2e-01   1  1  1 |  0  0
 2  +1.827e+02  +1.868e+02  +1e+02  2e-01  1e+00  4e+00  1e-01  0.8671  1e-02   1  1  1 |  0  0
 3  +2.368e+02  +2.426e+02  +6e+01  1e-01  1e+00  6e+00  6e-02  0.6362  1e-01 

(0.9987784409824042,
 [0.003278842882865191,
  0.0008712730403988974,
  -0.006253484284270804,
  0.0009068515290405028])