In [1]:
import pandas as pd
import bisect
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, SGDRegressor
# Load the fund/factor panel, discard the "Unnamed: 0" and portfolio-number
# columns, and order the rows by date (the per-fund shifts used later are
# positional, so row order matters).
df = (
    pd.read_csv("df_ff_factors_010225.csv")
    .drop(columns=["Unnamed: 0", "crsp_portno"])
    .sort_values(by='date')
)
df.columns

Index(['crsp_fundno', 'date', 'mth_return', 'exp_ratio', 'turn_ratio',
       'normalised_flow', 'gdp_to_debt_ratio', 'gdp_growth_rate', 'unm_rate',
       'infl_rate', 'mktrf', 'smb', 'hml', 'rmw', 'cma', 'rf', 'umd',
       'excess_return', 'rolling_sharpe', 'mkt_return', 'rolling_alpha_3f',
       'rolling_alpha_4f', 'rolling_alpha_5f', 'shortrun_momentum'],
      dtype='object')

In [2]:
# Generate Lagged Dataset
def create_lagged_dataset(dataset, lag, target_var, id):
    """Build a frame of per-group lagged copies of every column.

    Every column of ``dataset`` is shifted by 1..``lag`` rows within the
    groups defined by the ``id`` column and stored as ``<column>_L<n>``.
    The un-shifted ``target_var`` column is kept too, placed directly before
    its own lags. Rows with any missing value are dropped afterwards.

    The shift is positional, so the rows of each group must already be in
    the intended (e.g. chronological) order.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input panel; it is copied, never modified.
    lag : int
        Largest lag to generate.
    target_var : str
        Column that is additionally kept un-lagged.
    id : str
        Name of the grouping column.

    Returns
    -------
    pandas.DataFrame
        Lagged columns plus the current ``target_var``, without NaN rows.
    """
    frame = dataset.copy()

    def shifted(column, n):
        # Lag `column` by n rows inside each `id` group.
        return frame.groupby(id)[column].shift(n)

    data_join = {}
    for column in frame.columns:
        if column == target_var:
            data_join[column] = frame[column]
        data_join.update({f'{column}_L{n}': shifted(column, n) for n in range(1, lag + 1)})

    result = pd.concat(list(data_join.values()), axis=1, ignore_index=True)
    result.columns = list(data_join.keys())
    return result.dropna()

# Generate Stepped Dataset for Training
## ``step`` is the forecast horizon in months, e.g. step=2 targets the value 2 months ahead.
## With step=1 nothing moves: the already-lagged features forecast the current target.
def create_stepped_dataset(dataset, step, target_var, id):
    """Split a lagged dataset into features and a horizon-shifted target.

    The target column is pulled ``step - 1`` rows forward within each ``id``
    group, so each feature row is paired with the target ``step - 1`` rows
    later. Rows whose shifted target is missing are discarded.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Lagged dataset containing ``target_var`` and the ``id`` column; copied, not modified.
    step : int
        Forecast horizon; 1 leaves the target in place.
    target_var : str
        Name of the target column.
    id : str
        Name of the grouping column.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``X`` without the target column, and a one-column ``y`` named
        ``target_var``; both share the same index.
    """
    offset = 1 - step
    stepped = dataset.copy()
    stepped['shifted_target'] = stepped.groupby(id)[target_var].shift(offset)

    # Rows at the end of each group have no future target to pair with.
    stepped = stepped[stepped['shifted_target'].notna()]

    y = stepped[['shifted_target']].rename(columns={'shifted_target': target_var})
    X = stepped.drop(columns=[target_var, 'shifted_target'])
    return X, y

# Smoke test of the two helpers on the full panel: lag-1 copies of every column
# grouped by fund, then the step-1 target frame (element [1] of the (X, y) pair).
# After lagging, the fund identifier is only available as 'crsp_fundno_L1'.
x = create_lagged_dataset(df, lag = 1, target_var='rolling_sharpe', id = 'crsp_fundno')
test_step = create_stepped_dataset(x, step=1, target_var='rolling_sharpe', id = 'crsp_fundno_L1')[1]
test_step

Unnamed: 0,rolling_sharpe
43632,2.216452
43633,3.058133
13748,-0.536935
43634,1.604677
13749,0.429084
...,...
3031,1.619481
1960,0.691406
2113,0.691738
1842,0.676394


In [3]:
import numpy as np
# Correlation matrix between the 5-factor rolling alpha and the rolling Sharpe ratio.
corr = np.corrcoef(df['rolling_alpha_5f'], df['rolling_sharpe'])
corr # recorded output below is ~0.164 (an earlier note here said 0.204) --> can use!

array([[1.        , 0.16425066],
       [0.16425066, 1.        ]])

3-Factor Model: mktrf, SMB, HML
4-Factor Model: mktrf, SMB, HML, UMD
5-Factor Model: mktrf, SMB, HML, RMW, CMA

In [6]:
# Factor-specific views of the panel.
# The earlier `df.drop(...)` assignment to df_3_factor was a dead store: it was
# overwritten on the very next line, so only the explicit column subset is kept.
# NOTE(review): unlike the 4- and 5-factor frames, this subset keeps `mth_return`
# and omits `crsp_fundno` and `rolling_sharpe` - confirm the asymmetry is intended.
df_3_factor = df[['date','mth_return','exp_ratio','turn_ratio','mktrf', 'smb', 'hml']]
df_4_factor = df.drop(columns=['mkt_return','cma','rmw','mth_return','rf'])
df_5_factor = df.drop(columns=['mkt_return','umd','mth_return','rf'])
df_4_factor

Unnamed: 0,crsp_fundno,date,exp_ratio,turn_ratio,normalised_flow,gdp_to_debt_ratio,gdp_growth_rate,unm_rate,infl_rate,mktrf,smb,hml,umd,excess_return,rolling_sharpe,rolling_alpha_3f,rolling_alpha_4f,rolling_alpha_5f,shortrun_momentum
43631,32553.0,1993-08-31,0.0162,0.15,0.009529,64.101,3.5225,6.8,2.90,0.0371,0.0010,0.0013,0.0265,0.068910,3.248937,0.068910,0.068910,0.068910,0.012866
43632,32553.0,1993-09-30,0.0162,0.15,0.014881,64.101,3.5225,6.7,2.90,-0.0012,0.0298,-0.0031,0.0337,-0.004108,2.216452,-0.004108,-0.004108,-0.004108,-0.009113
43633,32553.0,1993-10-29,0.0162,0.15,0.030641,64.669,3.5225,6.8,2.90,0.0141,0.0189,-0.0276,-0.0272,0.040962,3.058133,0.040962,0.040962,0.040962,0.000383
13747,12051.0,1993-10-29,0.0188,0.77,0.070030,64.669,3.5225,6.8,2.90,0.0141,0.0189,-0.0276,-0.0272,0.034690,5.744340,0.034690,0.034690,0.034690,0.013103
13748,12051.0,1993-11-30,0.0188,0.77,0.000259,64.669,3.5225,6.6,2.90,-0.0189,-0.0127,-0.0074,-0.0474,-0.075999,-0.536935,-0.075999,-0.075999,-0.075999,-0.026033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3031,4610.0,2024-06-28,0.0217,0.23,-0.650182,120.040,2.5427,4.1,3.35,0.0070,-0.0568,-0.0247,0.0508,0.078518,1.619481,0.001428,0.002329,0.003201,-0.025985
1960,4330.0,2024-07-31,0.0077,0.31,-0.022800,120.731,2.5427,4.2,3.35,0.0070,-0.0568,-0.0247,0.0508,0.028784,0.691406,0.004765,0.005643,0.010978,-0.024241
2113,4333.0,2024-07-31,0.0076,0.31,-0.402964,120.731,2.5427,4.2,3.35,0.0070,-0.0568,-0.0247,0.0508,0.028779,0.691738,0.004733,0.005601,0.010953,-0.024228
1842,4327.0,2024-07-31,0.0102,0.31,-0.032107,120.731,2.5427,4.2,3.35,0.0070,-0.0568,-0.0247,0.0508,0.028569,0.676394,0.004506,0.005384,0.010697,-0.024451


In [4]:
def create_naive(dataset, step, target_var, id='crsp_fundno'):
    """Add a naive (persistence) forecast column to a copy of ``dataset``.

    The forecast for each row is the value of ``target_var`` found ``step``
    rows earlier within the same ``id`` group; it is stored in a new column
    named ``<target_var>_naive``. The first ``step`` rows of every group have
    no earlier value and therefore hold NaN.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input panel; it is copied, never modified. Rows of each group must
        already be in the intended order because the shift is positional.
    step : int
        Number of rows to look back.
    target_var : str
        Column to forecast.
    id : str, default 'crsp_fundno'
        Grouping column. The default keeps the previous hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataset`` with the extra ``<target_var>_naive`` column.
    """
    naive = dataset.copy()
    naive[f'{target_var}_naive'] = naive.groupby(id)[target_var].shift(step)
    return naive

# Generates next date
def generate_next_date(list_of_dates, date):
    """Return the earliest entry of ``list_of_dates`` that is strictly after ``date``.

    When no entry is later than ``date`` the result is the minimum of an
    empty selection (NaT for a pandas datetime Series).
    """
    is_later = list_of_dates > date
    later_dates = list_of_dates[is_later]
    return later_dates.min()


def process_factor_model(X_factor, y_factor, train_end, test_date):
    """Split one factor dataset by ``date_L1`` and standardise the features.

    Training rows are those with ``date_L1 <= train_end``; test rows are those
    with ``date_L1 == test_date``. The ``date_L1`` column itself is removed
    from the features. A ``StandardScaler`` is fitted on the training features
    only and then applied to both splits.

    Parameters
    ----------
    X_factor : pandas.DataFrame
        Features including a ``date_L1`` column.
    y_factor : pandas.DataFrame
        Targets sharing the index of ``X_factor``.
    train_end, test_date
        Values comparable with the entries of ``date_L1``.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test)``; the targets are
        looked up in ``y_factor`` by the index labels of the matching
        feature rows.
    """
    lag_dates = X_factor['date_L1']
    features = X_factor.drop(columns='date_L1')

    X_train = features[lag_dates <= train_end]
    X_test = features[lag_dates == test_date]

    # Fit on the training window only, then reuse its statistics for the test rows.
    scaler = StandardScaler().fit(X_train)

    return (
        scaler.transform(X_train),
        scaler.transform(X_test),
        y_factor.loc[X_train.index],
        y_factor.loc[X_test.index],
    )

# create_naive(df, step = 1, target_var='rolling_sharpe')

In [8]:
# Training Cycle
def overall_function(dataset, outcome):
    """Expanding-window, one-date-ahead comparison of all forecasting models.

    Three feature sets (3-, 4- and 5-factor) are derived from ``dataset``,
    lagged by one row per fund, and split by date. The initial training
    window ends at the observed ``date_L1`` closest to the 70% quantile of
    the 3-factor dates. On each iteration the models are fitted on all rows
    up to ``train_end``, predictions are made for the next available date,
    and that date is then added to the training window.

    Models evaluated per test date:
      * naive: the lag-1 value of ``outcome``;
      * linear regression on each factor set;
      * feed-forward neural network (MLPRegressor) on each factor set.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Panel with at least ``date``, ``crsp_fundno``, ``outcome`` and the
        factor columns dropped below. The caller's frame is not modified.
    outcome : str
        Name of the column to forecast.

    Returns
    -------
    pandas.DataFrame
        One row per test observation with columns ``outcome``, ``naive``,
        ``3factlinear``, ``4factlinear``, ``5factlinear``, ``3f_ffnn``,
        ``4f_ffnn``, ``5f_ffnn``. Empty (columns only) when there is no date
        after the initial training window. A prediction is NaN for rows that
        are missing from that factor set's test split.
    """
    # Convert dates on a new frame: assigning into `dataset` directly would
    # silently change the caller's DataFrame.
    dataset = dataset.assign(date=pd.to_datetime(dataset['date'])).sort_values(by='date')

    # Columns excluded from each factor specification (keyed by factor count).
    dropped_columns = {
        3: ['mkt_return','umd','cma','rmw','mth_return','rf','rolling_alpha_4f','rolling_alpha_5f'],
        4: ['mkt_return','cma','rmw','mth_return','rf','rolling_alpha_3f','rolling_alpha_5f'],
        5: ['mkt_return','umd','mth_return','rf','rolling_alpha_3f','rolling_alpha_4f'],
    }
    factors = list(dropped_columns)
    # The naive forecast is the lag-1 value of whatever outcome was requested.
    naive_column = f'{outcome}_L1'
    result_columns = (
        [outcome, 'naive']
        + [f'{k}factlinear' for k in factors]
        + [f'{k}f_ffnn' for k in factors]
    )

    # Creating Lagged and Stepped Datasets
    X_by_factor, y_by_factor = {}, {}
    for k in factors:
        lagged = create_lagged_dataset(
            dataset.drop(columns=dropped_columns[k]), lag=1, target_var=outcome, id='crsp_fundno'
        )
        X_by_factor[k], y_by_factor[k] = create_stepped_dataset(
            lagged, step=1, target_var=outcome, id='crsp_fundno_L1'
        )

    # The walk-forward calendar is taken from the 3-factor dataset.
    list_of_dates = pd.to_datetime(X_by_factor[3]['date_L1'])
    percentile_70 = list_of_dates.quantile(0.7)
    train_end = list_of_dates.loc[(list_of_dates - percentile_70).abs().idxmin()]
    df_end = list_of_dates.max()
    for k in factors:
        # The fund id was only needed for grouping; it is not a model feature.
        X_by_factor[k] = X_by_factor[k].drop(columns='crsp_fundno_L1')

    results = []

    while train_end != df_end:

        test_date = generate_next_date(list_of_dates, train_end)
        if pd.isna(test_date):
            break

        # Process data for modeling: (X_train, X_test, y_train, y_test) per factor set.
        splits = {
            k: process_factor_model(X_by_factor[k], y_by_factor[k], train_end, test_date)
            for k in factors
        }

        # Model 1: Naive Model
        X_3f = X_by_factor[3]
        naive_pred = X_3f.loc[X_3f['date_L1'] == test_date, naive_column].values

        print('Naive Fitted')

        # Result rows follow the 3-factor test split (same rows as naive_pred).
        df_in_loop = splits[3][3].copy()
        df_in_loop['naive'] = naive_pred

        # Model 2: Linear Regression
        linear = LinearRegression()
        for k in factors:
            X_train, X_test, y_train, y_test = splits[k]
            linear.fit(X_train, y_train.values.ravel())
            print(f'Linear Regression {k}-Factor Fitted')
            # Index the predictions by their own test rows so they align by
            # label even if this factor set kept different rows than 3-factor.
            df_in_loop[f'{k}factlinear'] = pd.Series(linear.predict(X_test), index=y_test.index)

        # Model 3: Feedforward Neural Network
        model = MLPRegressor(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', max_iter=200, random_state=42)
        for k in factors:
            X_train, X_test, y_train, y_test = splits[k]
            model.fit(X_train, y_train.values.ravel())
            print(f'FFNN {k}-Factor Fitted')
            df_in_loop[f'{k}f_ffnn'] = pd.Series(model.predict(X_test), index=y_test.index)

        # Model 4: Basic RNN (not implemented)

        # Model 5: Random Forest Regression (not implemented)

        # Add results into loop
        results.append(df_in_loop)
        train_end = test_date
        num_remaining_dates = list_of_dates[list_of_dates > test_date].nunique()
        print(f'{num_remaining_dates} dates remaining')

    if not results:
        # No test date after the training window: pd.concat([]) would raise.
        return pd.DataFrame(columns=result_columns)

    combined_df = pd.concat(results, ignore_index=True)
    return combined_df

In [24]:
import torch
import pandas as pd

# Step 1: give every fund a dense integer code (stored on df as 'fund_index').
df['fund_index'], unique_funds = pd.factorize(df['crsp_fundno'])
num_funds = len(unique_funds)  # Total unique funds

# Step 2: one embedding row per fund.
embedding_dim = 10  # Example embedding size
fund_embedding = torch.nn.Embedding(num_funds, embedding_dim)

# Step 3: look up the embedding of every observation's fund. Nothing in this
# cell trains the layer, so these are its initial weights.
fund_ids_tensor = torch.tensor(df['fund_index'].values, dtype=torch.long)
embeddings = fund_embedding(fund_ids_tensor).detach().numpy()

In [61]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# Step 1: integer-encode the funds for the embedding layer.
df['fund_index'], unique_funds = pd.factorize(df['crsp_fundno'])
num_funds = len(unique_funds)  # Number of unique funds

# Covariates: every column except the identifiers, the date, the target and
# the 3-/4-factor rolling alphas.
df_features = df.drop(columns=['crsp_fundno','date','rolling_sharpe','fund_index','rolling_alpha_3f','rolling_alpha_4f'])

# Step 2: Convert Data to Tensors
fund_ids_tensor = torch.tensor(df['fund_index'].to_numpy(), dtype=torch.long)  # Fund IDs
# Non-finite covariates (NaN, +inf, -inf) are replaced by 0.
features_tensor = torch.nan_to_num(
    torch.tensor(df_features.to_numpy(), dtype=torch.float32),
    nan=0.0, posinf=0.0, neginf=0.0,
)
# Targets as a column vector so they match the network output.
targets_tensor = torch.tensor(df['rolling_sharpe'].to_numpy(), dtype=torch.float32).reshape(-1, 1)

class FundNN(nn.Module):
    """Feed-forward regressor over a learned fund embedding plus covariates.

    The fund id is mapped to an ``embedding_dim``-sized vector, concatenated
    with the ``input_dim`` covariates, and passed through two ReLU hidden
    layers (64 and 32 units) to a single linear output.
    """

    def __init__(self, num_funds, embedding_dim, input_dim):
        super().__init__()
        hidden_1, hidden_2 = 64, 32
        self.embedding = nn.Embedding(num_funds, embedding_dim)  # one row per fund
        self.fc1 = nn.Linear(embedding_dim + input_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 1)  # regression head
        self.relu = nn.ReLU()

    def forward(self, fund_id, features):
        """Return one prediction per row for integer ``fund_id`` and 2-D ``features``."""
        fund_vector = self.embedding(fund_id)
        # Join the embedding and the covariates along the feature axis.
        hidden = torch.cat((fund_vector, features), dim=1)
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        return self.fc3(hidden)

# Build the network and train it on the tensors prepared above.
embedding_dim = 10  # Fund embedding size
input_dim = features_tensor.shape[1]  # Number of additional covariates (19 in the recorded run)
print(input_dim)
model = FundNN(num_funds, embedding_dim, input_dim)

criterion = nn.MSELoss()  # Loss function for regression
optimizer = optim.Adam(model.parameters(), lr=0.01)
# optimizer = optim.SGD(model.parameters(), lr=0.005)

# Full-batch training: each epoch is a single gradient step over all rows.
# NOTE(review): the model is fitted and later used for prediction on the same
# rows; there is no hold-out split in this cell.
num_epochs = 500
for epoch in range(num_epochs):
    optimizer.zero_grad()
    predictions = model(fund_ids_tensor, features_tensor)  # Forward pass
    loss = criterion(predictions, targets_tensor)  # Compute loss
    loss.backward()  # Backpropagation
    optimizer.step()  # Update weights
    
    # Report the training loss every 10th epoch.
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# epochs_list = [300, 400, 500]
# lr_list = [0.001, 0.0025, 0.0075, 0.01]

# # Store the results of MSE for each combination of epochs and learning rates
# best_mse = float('inf')
# best_epochs = None
# best_lr = None

# for num_epochs in epochs_list:
#     for lr in lr_list:
#         # Initialize model and optimizer
#         model = FundNN(num_funds, embedding_dim, input_dim)  # Ensure model is re-initialized
#         optimizer = optim.Adam(model.parameters(), lr=lr)
        
#         # Training loop for the current combination of epochs and learning rate
#         for epoch in range(num_epochs):
#             optimizer.zero_grad()
#             predictions = model(fund_ids_tensor, features_tensor)  # Forward pass
#             loss = criterion(predictions, targets_tensor)  # Compute loss
#             loss.backward()  # Backpropagation
#             optimizer.step()  # Update weights
            
#             if (epoch + 1) % 10 == 0:
#                 print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
        
#         # After training, calculate the MSE on the validation or test set
#         with torch.no_grad():
#             predictions = model(fund_ids_tensor, features_tensor)
#             mse = mean_squared_error(targets_tensor.numpy(), predictions.numpy())  # Compute MSE
            
#             # Store the best MSE and corresponding hyperparameters
#             if mse < best_mse:
#                 best_mse = mse
#                 best_epochs = num_epochs
#                 best_lr = lr

# After the loop, print the best combination of hyperparameters
# print(f"Best MSE: {best_mse:.4f} achieved with Epochs = {best_epochs} and Learning Rate = {best_lr}")

# Step 6: Make Predictions
# Inference only, so autograd tracking is switched off instead of building a
# graph and detaching it afterwards. These are in-sample predictions (same
# rows the model was trained on).
with torch.no_grad():
    preds = model(fund_ids_tensor, features_tensor).numpy()
# The target is rolling_sharpe, not returns, so the label says so.
print("Predicted Rolling Sharpe:", preds)

19
Epoch [10/500], Loss: 7.8674
Epoch [20/500], Loss: 7.7749
Epoch [30/500], Loss: 7.6941
Epoch [40/500], Loss: 7.6587
Epoch [50/500], Loss: 7.6298
Epoch [60/500], Loss: 7.5833
Epoch [70/500], Loss: 7.5200
Epoch [80/500], Loss: 7.4572
Epoch [90/500], Loss: 7.4180
Epoch [100/500], Loss: 7.3925
Epoch [110/500], Loss: 7.3706
Epoch [120/500], Loss: 7.3518
Epoch [130/500], Loss: 7.3343
Epoch [140/500], Loss: 7.3171
Epoch [150/500], Loss: 7.2991
Epoch [160/500], Loss: 7.2798
Epoch [170/500], Loss: 7.2588
Epoch [180/500], Loss: 7.2347
Epoch [190/500], Loss: 7.2073
Epoch [200/500], Loss: 7.1768
Epoch [210/500], Loss: 7.1438
Epoch [220/500], Loss: 7.1085
Epoch [230/500], Loss: 7.0761
Epoch [240/500], Loss: 7.0374
Epoch [250/500], Loss: 6.9969
Epoch [260/500], Loss: 6.9584
Epoch [270/500], Loss: 6.9182
Epoch [280/500], Loss: 6.8812
Epoch [290/500], Loss: 6.8476
Epoch [300/500], Loss: 6.7923
Epoch [310/500], Loss: 6.7521
Epoch [320/500], Loss: 6.7350
Epoch [330/500], Loss: 6.6817
Epoch [340/500],

In [60]:
# NOTE(review): in the visible notebook `best_lr` is assigned only inside the
# hyper-parameter search that is currently commented out in the training cell,
# so unless it is defined elsewhere this cell fails with NameError on a fresh
# Restart & Run All.
best_lr

0.01

In [9]:
# Run the full walk-forward comparison with the rolling Sharpe ratio as target.
# This refits every model once per test date, so it is the slow cell of the notebook.
test = overall_function(dataset = df, outcome = 'rolling_sharpe')
test

Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
64 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted




FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
63 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
62 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
61 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
60 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
59 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor F



FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
57 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
56 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
55 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
54 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
53 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor F



FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
50 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
49 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted




FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
48 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
47 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted




FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
46 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
45 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
44 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
43 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
42 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor F



FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
36 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
35 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
34 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
33 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
32 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor F



FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
10 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
9 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
8 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
7 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitted
FFNN 5-Factor Fitted
6 dates remaining
Naive Fitted
Linear Regression 3-Factor Fitted
Linear Regression 4-Factor Fitted
Linear Regression 5-Factor Fitted
FFNN 3-Factor Fitted
FFNN 4-Factor Fitte

Unnamed: 0,rolling_sharpe,naive,3factlinear,4factlinear,5factlinear,3f_ffnn,4f_ffnn,5f_ffnn
0,0.075880,-0.175525,1.363931,1.547669,1.631241,-0.942977,-1.716935,0.223819
1,0.620161,0.355801,1.524581,1.541747,1.609456,-0.063982,-0.602798,0.712604
2,-0.198237,-0.053156,1.562578,1.554960,1.605086,-0.158528,-0.915611,0.406972
3,-0.721097,-0.824169,1.208331,1.189372,1.320787,-0.803663,-1.628921,-0.216014
4,0.541852,0.341311,1.546185,1.534102,1.635006,-0.032638,-0.623966,0.661172
...,...,...,...,...,...,...,...,...
16200,0.622349,0.910776,0.681563,0.809343,0.730812,0.710803,0.729899,0.646106
16201,0.676394,0.622349,0.641445,0.732901,0.674258,0.676206,0.679003,0.873396
16202,0.691406,0.637319,0.653454,0.744909,0.687654,0.680181,0.698682,0.872134
16203,0.691738,0.637960,0.653550,0.744957,0.687835,0.679359,0.699114,0.872292


In [12]:
# Persist the walk-forward results. No `index` argument is passed, so the row
# index is written too (pandas default) and will come back as an unnamed first
# column when the file is read again.
test.to_csv('test_data.csv')

Evaluation

In [11]:
# NOTE(review): in this notebook `x` is assigned as the lagged dataset in the
# lag/step cell, but the table recorded below has the columns returned by
# overall_function - presumably it was produced from an earlier kernel state.
# Confirm which object is meant to be shown here.
x

Unnamed: 0,rolling_sharpe,naive,3factlinear,4factlinear,5factlinear,3f_ffnn,4f_ffnn,5f_ffnn
0,0.075880,-0.175525,1.360503,1.542696,1.625468,0.061885,-0.161848,0.132930
1,0.620161,0.355801,1.478327,1.492509,1.557203,0.560735,0.541244,0.392431
2,-0.198237,-0.053156,1.610368,1.605577,1.658477,0.141161,0.118760,-0.001731
3,-0.721097,-0.824169,1.191242,1.171290,1.301063,-0.531020,-0.794841,-0.773871
4,0.541852,0.341311,1.600140,1.591261,1.694895,0.416219,0.433857,0.328850
...,...,...,...,...,...,...,...,...
16200,0.622349,0.910776,0.734958,0.857291,0.781614,0.677044,0.799949,0.822002
16201,0.676394,0.622349,0.694536,0.780479,0.724728,1.018917,0.743271,0.735199
16202,0.691406,0.637319,0.708815,0.794529,0.740288,1.035343,0.757536,0.748236
16203,0.691738,0.637960,0.708991,0.794650,0.740545,1.036331,0.758097,0.748671


In [10]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate_model(y_true, y_pred, model_name):
    """Score one set of predictions against the true values.

    Returns a dict with the keys ``Model`` (the given ``model_name``),
    ``MAE``, ``MSE``, ``RMSE`` (square root of the MSE) and ``R2``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return {
        "Model": model_name,
        "MAE": mean_absolute_error(y_true, y_pred),
        "MSE": mse,
        "RMSE": np.sqrt(mse),
        "R2": r2_score(y_true, y_pred),
    }

def evaluate_all_models(results, target_var='rolling_sharpe', models=None):
    """Compute error metrics for every model column present in ``results``.

    Parameters
    ----------
    results : pandas.DataFrame
        Frame holding the true values in ``target_var`` and one prediction
        column per model.
    target_var : str, default 'rolling_sharpe'
        Column with the true values. The default keeps the previous
        hard-coded behaviour.
    models : sequence of str, optional
        Prediction columns to score, in output order. Defaults to the naive,
        FFNN and linear columns produced by the training cycle.

    Returns
    -------
    list of dict
        One ``evaluate_model`` result per requested model whose column exists
        in ``results``; missing columns are skipped silently.
    """
    if models is None:
        models = ['naive', '3f_ffnn', '4f_ffnn', '5f_ffnn','3factlinear', '4factlinear', '5factlinear']

    return [
        evaluate_model(results[target_var], results[model], model)
        for model in models
        if model in results.columns
    ]

In [11]:
# Score every model column of the walk-forward results against rolling_sharpe.
metrics_results = evaluate_all_models(test)
metrics_results

[{'Model': 'naive',
  'MAE': 0.42346372977362823,
  'MSE': 1.8246575442019421,
  'RMSE': 1.3507988540867002,
  'R2': -0.05124302986756968},
 {'Model': '3f_ffnn',
  'MAE': 1.0161727908357743,
  'MSE': 1.8295217137888924,
  'RMSE': 1.35259813462421,
  'R2': -0.054045432099497415},
 {'Model': '4f_ffnn',
  'MAE': 1.334735233223388,
  'MSE': 3.424856054874665,
  'RMSE': 1.850636662036788,
  'R2': -0.9731680979958552},
 {'Model': '5f_ffnn',
  'MAE': 0.9732048578157048,
  'MSE': 1.7955154468097552,
  'RMSE': 1.339968449930727,
  'R2': -0.034453344122644},
 {'Model': '3factlinear',
  'MAE': 0.6840248576471603,
  'MSE': 0.8503148538027516,
  'RMSE': 0.9221251833686962,
  'R2': 0.5101066684576324},
 {'Model': '4factlinear',
  'MAE': 0.6664161065771933,
  'MSE': 0.8287739667226197,
  'RMSE': 0.9103702360702594,
  'R2': 0.5225170560791943},
 {'Model': '5factlinear',
  'MAE': 0.6908348243372281,
  'MSE': 0.8738104123369083,
  'RMSE': 0.9347782690760993,
  'R2': 0.49657013267295147}]

Scratch Test Code

In [222]:
# from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
# from sklearn.metrics import r2_score
# X, y = create_stepped_dataset(create_lagged_dataset(df_3_factor, lag=1,target_var='mth_return'),step=1,target_var='mth_return')
# X = X.replace(np.inf, 0)
# X = X.replace(-np.inf, 0)
# # X

# list_of_dates = df['date'].unique()
# list_of_dates = pd.to_datetime(df['date']) # Converting to date format

# percentile_70 = list_of_dates.quantile(0.7)
# med_date = list_of_dates.loc[(list_of_dates - percentile_70).abs().idxmin()]
# next_date = list_of_dates[list_of_dates > med_date].min()

# X['date_L1'] = pd.to_datetime(X['date_L1'])
# X_train_3f = X[X['date_L1'] < med_date]
# y_train_3f = y[:len(X_train_3f)]
# X_test_3f = X[X['date_L1'] == next_date]
# y_test_3f = y[len(X_train_3f):(len(X_train_3f)+len(X_test_3f))]
# final = y_test_3f.copy()
# X_train_3f = X_train_3f.drop(columns='date_L1')
# X_test_3f = X_test_3f.drop(columns='date_L1')

# scaler = StandardScaler()

# X_train_3f = scaler.fit_transform(X_train_3f)
# X_test_3f = scaler.transform(X_test_3f)

# model = LinearRegression()
# model.fit(X_train_3f, y_train_3f)

# y_pred = model.predict(X_test_3f)

# model2 = MLPRegressor(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', max_iter=200)
# model2.fit(X_train_3f, y_train_3f)

# y_pred2 = model2.predict(X_test_3f)

# tscv = TimeSeriesSplit(n_splits = 5)
# ridge_cv = RidgeCV(cv = tscv)
# ridge_cv.fit(X_train_3f, y_train_3f)

# model3 = Ridge(alpha = ridge_cv.alpha_)
# model3.fit(X_train_3f, y_train_3f)
# y_pred3 = model3.predict(X_test_3f)

# lasso_cv = LassoCV(cv = tscv, random_state = 18, max_iter = 100000)
# lasso_cv.fit(X_train_3f, y_train_3f)

# # Create the Lasso model with the optimal alpha value
# lasso_model = Lasso(alpha = lasso_cv.alpha_)
# lasso_model.fit(X_train_3f, y_train_3f)
# y_pred4 = lasso_model.predict(X_test_3f)

# elasticnet_cv = ElasticNetCV(cv = tscv, max_iter = 100000)
# elasticnet_cv.fit(X_train_3f, y_train_3f)
# elasticnet_model = ElasticNet(alpha = elasticnet_cv.alpha_, l1_ratio = elasticnet_cv.l1_ratio_)
# elasticnet_model.fit(X_train_3f, y_train_3f)
        
# y_pred5 = elasticnet_cv.predict(X_test_3f)

# final['linear'] = y_pred
# final['mlp'] = y_pred2
# # final['ridge'] = y_pred3
# # final['lasso'] = y_pred4
# # final['elasticnet'] = y_pred5

# final
# X_test_3f
# y_pred - y_test_3f
# sum(abs((y_pred-y_test_3f)/y_test_3f))/len(y_test_3f)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Unnamed: 0,mth_return,linear,mlp,ridge,lasso,elasticnet
10946,0.065153,0.021697,0.028095,0.021687,0.020224,0.020162
39688,0.061436,0.025942,0.028074,0.025939,0.025101,0.025072
5363,0.016949,0.016051,0.011053,0.016035,0.014860,0.014763
47379,0.060768,0.019023,0.010773,0.019012,0.018111,0.018036
28685,0.039707,0.019021,0.012601,0.019010,0.018229,0.018155
...,...,...,...,...,...,...
47060,0.062213,0.026357,0.028813,0.026354,0.025445,0.025418
35538,0.063594,0.018598,0.017622,0.018585,0.017400,0.017319
38224,0.060625,0.027637,0.031756,0.027635,0.026746,0.026727
16851,0.013965,0.025182,0.028781,0.025177,0.024109,0.024072


In [229]:
from sklearn.metrics import r2_score
# NOTE(review): in the visible notebook `final` is built only inside the scratch
# cell above, which is entirely commented out, so unless it is defined
# elsewhere this cell fails with NameError on a fresh Restart & Run All.
r2_linear = r2_score(final['mth_return'], final['linear'])
r2_mlp = r2_score(final['mth_return'], final['mlp'])
# r2_ridge = r2_score(final['mth_return'], final['ridge'])
# r2_lasso = r2_score(final['mth_return'], final['lasso'])
# r2_elasticnet = r2_score(final['mth_return'], final['elasticnet'])
r2_mlp

-1.672832028647084