# **01. Import Libraries and Load Data**



In [None]:
# Install the WRDS client used below to query CRSP and Fama-French tables.
# NOTE(review): --no-deps skips installing wrds' own dependencies — presumably
# they are already available in this environment; confirm before a clean run.
pip install wrds --no-deps

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import wrds
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import kurtosis, skew
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score

## Connect to WRDS

In [None]:
# Establish a connection to WRDS; `db` is the handle every later cell uses via db.raw_sql(...)
# NOTE(review): presumably this prompts for (or reads stored) WRDS credentials — confirm for unattended runs.
db = wrds.Connection()

# **02. Data Collection**

## Select 50 Top Stocks

In [None]:
# Securities whose first record in crsp.dsf is on or before 2000-01-01
query_earliest_date = """
SELECT
    permno,
    MIN(date) as first_trade_date
FROM
    crsp.dsf
GROUP BY
    permno
HAVING
    MIN(date) <= '2000-01-01'
"""

earliest_dates = db.raw_sql(query_earliest_date)

# Securities with a crsp.dsf row on every distinct date between 2000-01-01
# and 2024-12-31 (i.e. no gaps over the whole sample window)
query_active_stocks = """
SELECT
    permno
FROM
    crsp.dsf
WHERE
    date BETWEEN '2000-01-01' AND '2024-12-31'
GROUP BY
    permno
HAVING
    COUNT(DISTINCT date) = (SELECT COUNT(DISTINCT date)
                            FROM crsp.dsf
                            WHERE date BETWEEN '2000-01-01' AND '2024-12-31')
"""

active_stocks = db.raw_sql(query_active_stocks)

# Keep only securities that satisfy both conditions
filtered_permnos = earliest_dates.merge(active_stocks, on='permno', how='inner')

# An empty list would render as "IN ()", which is invalid SQL; fail with a clear message instead
if filtered_permnos.empty:
    raise ValueError(
        "No permno is listed since 2000-01-01 and traded on every date through 2024-12-31"
    )

# Comma-separated permnos for the SQL IN clause (cast to int so a float dtype does not render as '10107.0')
permnos_str = ','.join(str(permno) for permno in filtered_permnos['permno'].astype(int))

# Market capitalisation, company name and sector information for the IT sector on 2024-12-31.
# - ABS(prc): CRSP stores a bid/ask average as a negative price, which would otherwise
#   produce a negative market cap and push the stock to the bottom of the ranking.
# - The namedt/nameendt condition keeps only the name record in force on the snapshot
#   date, so each permno contributes its current ticker/SIC/exchange rather than one
#   row per historical name (same condition the later price queries use).
query_main = f"""
SELECT
    a.permco,
    a.permno,
    a.date,
    a.shrout,
    ABS(a.prc) * a.shrout as market_cap,
    b.shrcd,
    b.exchcd,
    b.siccd,
    b.ncusip,
    b.comnam,
    b.ticker
FROM
    crsp.dsf AS a
JOIN
    crsp.dsenames AS b
ON
    a.permno = b.permno
    AND a.date >= b.namedt
    AND (a.date <= b.nameendt OR b.nameendt IS NULL)
WHERE
    (
        (b.siccd BETWEEN 3570 AND 3579) OR  -- computer and office equipment
        (b.siccd BETWEEN 3600 AND 3674) OR  -- electronic equipment, incl. semiconductors
        (b.siccd BETWEEN 7370 AND 7379) OR  -- IT-related services (programming, software, etc.)
        (b.siccd BETWEEN 4810 AND 4813)     -- telephone communications
    )
    AND a.permno IN ({permnos_str})
    AND a.date = '2024-12-31'
    AND b.exchcd IN (1, 3)
"""

# Execute query
crsp_data = db.raw_sql(query_main)

In [None]:
# Preview the first rows of the 2024-12-31 snapshot returned by query_main
# (permco, permno, date, shrout, market_cap, shrcd, exchcd, siccd, ncusip, comnam, ticker)
crsp_data.head()

In [None]:
# Report the row count and the number of distinct securities in the snapshot
n_rows = len(crsp_data)
n_stocks = len(set(crsp_data['permno']))
print("Original dataset size: ", n_rows)
print("Original number of stocks: ", n_stocks)

In [None]:
# Per-ticker summary statistics of market_cap
market_cap_by_ticker = crsp_data.groupby('ticker')['market_cap']
ticker_stats = market_cap_by_ticker.describe(percentiles=[.25, .5, .75])

# Higher moments and variance per ticker. The module-qualified scipy functions are
# used so the cell keeps working even if a bare `kurtosis` name is rebound elsewhere.
ticker_stats['kurtosis'] = market_cap_by_ticker.apply(lambda x: stats.kurtosis(x, nan_policy='omit'))
ticker_stats['skewness'] = market_cap_by_ticker.apply(lambda x: stats.skew(x, nan_policy='omit'))
ticker_stats['variance'] = market_cap_by_ticker.var()

# Keep only the statistics that are reported, with display-friendly column names
desired_stats = ticker_stats[['min', 'max', 'mean', 'std', 'variance', 'kurtosis', 'skewness']]
desired_stats = desired_stats.rename(columns={
    'min': 'Min',
    'max': 'Max',
    'mean': 'Mean',
    'std': 'STD',
    'variance': 'Variance',
    'kurtosis': 'Kurtosis',
    'skewness': 'Skewness'
})

# Get the top 50 tickers by market capitalization (mean)
top_50_tickers = desired_stats.sort_values(by='Mean', ascending=False).head(50)

# One bar chart per statistic on a 4x2 grid (7 panels used).
# Column names must be the *renamed* ones: 'Skewness'/'Kurtosis', not the
# lower-case originals, which no longer exist after the rename above.
plot_specs = [
    ('Min', 'skyblue'),
    ('Max', 'lightgreen'),
    ('Mean', 'cyan'),
    ('STD', 'pink'),
    ('Variance', 'lightcoral'),
    ('Skewness', 'lightblue'),
    ('Kurtosis', 'yellow'),
]

plt.figure(figsize=(25, 25))
for position, (stat_name, bar_color) in enumerate(plot_specs, start=1):
    plt.subplot(4, 2, position)
    sns.barplot(x=top_50_tickers.index, y=top_50_tickers[stat_name], color=bar_color)
    plt.xticks(rotation=90)
    plt.title(f'{stat_name} (Market Capitalisation)')

# Extra padding so the rotated tick labels do not collide with the panel below
plt.tight_layout(pad=5.0)
plt.show()

# Print the top 50 stats table
print("Top 50 Tickers Market Capitalisation Stats:")
display(top_50_tickers)


In [None]:
# Filter data for the latest date
latest_date = crsp_data['date'].max()
latest_data = crsp_data[crsp_data['date'] == latest_date]

# One row per (permco, permno): the row with the largest market cap, then the 50 largest overall.
# Sorting once and de-duplicating replaces groupby(...).apply(nlargest), which operates on the
# grouping columns (deprecated in recent pandas) and is far slower on many small groups.
# kind='stable' keeps the first-occurring row on ties, matching nlargest's keep='first'.
top_50_IT_stocks = (
    latest_data
    .sort_values(by='market_cap', ascending=False, kind='stable')
    .drop_duplicates(subset=['permco', 'permno'], keep='first')
    .head(50)
    .reset_index(drop=True)
)

In [None]:
# Show the selected stocks (at most 50 rows, largest market cap first)
print(top_50_IT_stocks)

## Word Cloud for Top 50 stocks (Tickers)

In [None]:
pip install matplotlib

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build one space-separated string from the non-missing tickers of the selected stocks
tickers = top_50_IT_stocks['ticker'].dropna().tolist()
tickers_string = ' '.join(tickers)

# Render the word cloud on a white 800x400 canvas
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(tickers_string)

# Show it without axes
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Rows of crsp_data where any of the key descriptive columns is missing
key_columns = ['market_cap', 'comnam', 'ncusip', 'ticker']
has_missing_key = crsp_data[key_columns].isna().any(axis=1)
missing_data = crsp_data[has_missing_key]

# Display the rows with missing data
print(missing_data)

In [None]:
# Before removing duplicates
print(f"Data size before removing duplicates: {crsp_data.shape}")

# Keep the first row for each (permno, date) pair; 'date' was listed twice in the key
crsp_data.drop_duplicates(subset=['permno', 'date'], keep='first', inplace=True)

# After removing duplicates
print(f"Data size after removing duplicates: {crsp_data.shape}")

## Collect Price and Return Data

In [None]:
# permno identifiers of the selected stocks
top_50_permnos = top_50_IT_stocks['permno'].tolist()

# Render them as "a, b, c" for interpolation into the SQL IN (...) clauses below
permnos_str = ', '.join(str(permno) for permno in top_50_permnos)

### Download train data

In [None]:
# Define the date range (training sample)
start_date = '2000-01-01'
end_date = '2015-12-31'

# Daily price/return rows from crsp.dsf for the selected permnos, joined to the
# crsp.dsenames record whose [namedt, nameendt] interval contains the trading date,
# so each row carries the company name and ticker in force on that date.
# The subquery pre-filters name records to those overlapping the date range.
query = f"""
SELECT
    a.permco,
    a.permno,
    b.comnam,
    b.ticker,
    a.date,
    a.prc,
    a.cfacpr,
    a.ret
FROM
    crsp.dsf AS a
JOIN
    (SELECT permno, comnam, ticker, namedt, nameendt
     FROM crsp.dsenames
     WHERE permno IN ({permnos_str}) -- filter for the top 50 stocks
       AND namedt <= '{end_date}'
       AND (nameendt IS NULL OR nameendt >= '{start_date}')) AS b
ON
    a.permno = b.permno
WHERE
    a.permno IN ({permnos_str})     -- filter for the top 50 stocks
    AND a.date BETWEEN '{start_date}' AND '{end_date}'
    AND a.date >= b.namedt
    AND (a.date <= b.nameendt OR b.nameendt IS NULL)
"""

# Execute query and order rows by company, then chronologically
crsp_train = db.raw_sql(query)
crsp_train.sort_values(by=['permco', 'date'], inplace=True)

In [None]:
# Count missing values per column of the training data
print(crsp_train.isna().sum())

In [None]:
# Keep only rows where both 'prc' and 'ret' are present
has_price_and_return = crsp_train[['prc', 'ret']].notna().all(axis=1)
crsp_train = crsp_train[has_price_and_return]

In [None]:
# Display the cleaned training data
crsp_train

## Merge the risk-free rate with stock returns (calculate excess returns)

In [None]:
# Query to fetch the daily risk-free rate for the period 2000-2015
query_risk_free = """
SELECT
    date,
    rf
FROM
    ff.factors_daily
WHERE
    date BETWEEN '2000-01-01' AND '2015-12-31'
"""
rf_data = db.raw_sql(query_risk_free)

# Both 'date' columns must share a dtype for the merge keys to match
crsp_train['date'] = pd.to_datetime(crsp_train['date'], errors='coerce')
rf_data['date'] = pd.to_datetime(rf_data['date'], errors='coerce')

# Left merge keeps every stock row; dates without a risk-free rate get rf = NaN
crsp_train = pd.merge(crsp_train, rf_data, how='left', on='date')

# CRSP `ret` is a holding-period return that already accounts for splits and
# distributions. `cfacpr` is a cumulative factor for adjusting *price levels*;
# dividing a return by it rescales the return by an arbitrary per-stock constant,
# so the return is used as-is. The column name is kept for downstream code.
crsp_train['adjusted_ret'] = crsp_train['ret']

# Excess return over the daily risk-free rate
# NOTE(review): assumes ff.factors_daily.rf is in decimal units like `ret` — confirm
crsp_train['excess_ret'] = crsp_train['adjusted_ret'] - crsp_train['rf']

# Clip abnormal returns to +100% and -100%
crsp_train['excess_ret'] = crsp_train['excess_ret'].clip(lower=-1.0, upper=1.0)

# Binary target for directional forecasting: 1 when the excess return is positive, else 0
crsp_train['directional_target'] = np.where(crsp_train['excess_ret'] > 0, 1, 0)

# Check the results for train data
crsp_train[['permco', 'permno', 'date', 'adjusted_ret', 'excess_ret']].head()

### Download test data (2016-2024)


In [None]:
# Define the date range (test sample)
start_date = '2016-01-01'
end_date = '2024-12-31'

# Daily price/return rows from crsp.dsf for the selected permnos, joined to the
# crsp.dsenames record whose [namedt, nameendt] interval contains the trading date,
# so each row carries the company name and ticker in force on that date.
# The subquery pre-filters name records to those overlapping the date range.
query = f"""
SELECT
    a.permco,
    a.permno,
    b.comnam,
    b.ticker,
    a.date,
    a.prc,
    a.cfacpr,
    a.ret
FROM
    crsp.dsf AS a
JOIN
    (SELECT permno, comnam, ticker, namedt, nameendt
     FROM crsp.dsenames
     WHERE permno IN ({permnos_str}) -- filter for the top 50 stocks
       AND namedt <= '{end_date}'
       AND (nameendt IS NULL OR nameendt >= '{start_date}')) AS b
ON
    a.permno = b.permno
WHERE
    a.permno IN ({permnos_str})       -- filter for the top 50 stocks
    AND a.date BETWEEN '{start_date}' AND '{end_date}'
    AND a.date >= b.namedt
    AND (a.date <= b.nameendt OR b.nameendt IS NULL)
"""
# Execute query and order rows by company, then chronologically
crsp_test = db.raw_sql(query)
crsp_test.sort_values(by=['permco', 'date'], inplace=True)

In [None]:
# Display the raw test data
crsp_test

In [None]:
# Count missing values per column of the test data
print(crsp_test.isna().sum())

### Calculate Excess Returns for Test Data


In [None]:
# Use the Fama French data to get the daily risk-free rate for the test period (2016-2024)
query_risk_free_test = """
SELECT
    date,
    rf
FROM
    ff.factors_daily
WHERE
    date BETWEEN '2016-01-01' AND '2024-12-31'
"""
rf_data_test = db.raw_sql(query_risk_free_test)

# Drop rows with a missing price or return, as is done for the training data.
# Otherwise a NaN return yields a NaN excess return, which the `> 0` test below
# silently labels as 0 ("down").
crsp_test = crsp_test.dropna(subset=['prc', 'ret'])

# Both 'date' columns must share a dtype for the merge keys to match
crsp_test['date'] = pd.to_datetime(crsp_test['date'], errors='coerce')
rf_data_test['date'] = pd.to_datetime(rf_data_test['date'], errors='coerce')

# Left merge keeps every stock row; dates without a risk-free rate get rf = NaN
crsp_test = pd.merge(crsp_test, rf_data_test, how='left', on='date')

# CRSP `ret` is a holding-period return that already accounts for splits and
# distributions. `cfacpr` is a cumulative factor for adjusting *price levels*;
# dividing a return by it rescales the return by an arbitrary per-stock constant,
# so the return is used as-is. The column name is kept for downstream code.
crsp_test['adjusted_ret'] = crsp_test['ret']

# Excess return over the daily risk-free rate
# NOTE(review): assumes ff.factors_daily.rf is in decimal units like `ret` — confirm
crsp_test['excess_ret'] = crsp_test['adjusted_ret'] - crsp_test['rf']

# Clip abnormal returns to +100% and -100%
crsp_test['excess_ret'] = crsp_test['excess_ret'].clip(lower=-1.0, upper=1.0)

# Binary target for directional forecasting: 1 when the excess return is positive, else 0
crsp_test['directional_target'] = np.where(crsp_test['excess_ret'] > 0, 1, 0)

# Check the results for test data
crsp_test[['permco', 'permno', 'date', 'adjusted_ret', 'excess_ret']].head()

Note: `adjusted_ret` and `excess_ret` in the output above are nearly identical because the daily risk-free rate (`rf`) is very close to zero in those years.

## Descriptive Statistics for Excess Returns


In [None]:
# Descriptive statistics for excess returns in the training dataset
train_excess = crsp_train["excess_ret"]
in_sample_stats = train_excess.describe()

# Skewness and (Fisher/excess) kurtosis on the non-missing values: describe() already
# ignores NaN, whereas scipy would return NaN if a single value were missing.
# The result is stored as `excess_kurtosis` so the `kurtosis` function imported
# from scipy.stats is not overwritten by a float.
train_excess_valid = train_excess.dropna()
skewness = stats.skew(train_excess_valid)
excess_kurtosis = stats.kurtosis(train_excess_valid)

# Print the statistics in the desired format
print("In-Sample Excess Return Stats:")
print(in_sample_stats)

# Print skewness and kurtosis
print(f"Skewness: {skewness:.4f}")
print(f"Kurtosis: {excess_kurtosis:.4f}")

# Display the dtype
print(f"Name: excess_ret, dtype: {crsp_train['excess_ret'].dtype}")

In [None]:
# Descriptive statistics for excess returns in the testing dataset
test_excess = crsp_test["excess_ret"]
out_sample_stats = test_excess.describe()

# Skewness and (Fisher/excess) kurtosis on the non-missing values: describe() already
# ignores NaN, whereas scipy would return NaN if a single value were missing.
# The result is stored as `excess_kurtosis` so the `kurtosis` function imported
# from scipy.stats is not overwritten by a float.
test_excess_valid = test_excess.dropna()
skewness = stats.skew(test_excess_valid)
excess_kurtosis = stats.kurtosis(test_excess_valid)

# Print the statistics in the desired format
print("Out-Sample Excess Return Stats:")
print(out_sample_stats)

# Print skewness and kurtosis
print(f"Skewness: {skewness:.4f}")
print(f"Kurtosis: {excess_kurtosis:.4f}")

# Display the dtype
print(f"Name: excess_ret, dtype: {crsp_test['excess_ret'].dtype}")

In [None]:
# Per-stock (permno) summary of excess returns for each sample
train_excess_by_stock = crsp_train.groupby('permno')['excess_ret']
test_excess_by_stock = crsp_test.groupby('permno')['excess_ret']
train_stats = train_excess_by_stock.describe()
test_stats = test_excess_by_stock.describe()

# Training period table
print("Descriptive Statistics for Excess Returns (Training Period):")
print(train_stats)

# Test period table
print("\nDescriptive Statistics for Excess Returns (Test Period):")
print(test_stats)

## Create Rolling Windows

In [None]:
def create_lag_features(df, lags):
    """Add per-stock trailing-mean features of ``excess_ret`` to ``df``.

    For every ``lag`` in ``lags`` a column ``lag_<lag>`` is added holding the mean
    of the previous ``lag`` observations of ``excess_ret`` for the same ``permno``
    (the current row is excluded, so the feature uses past data only). Fewer than
    ``lag`` past observations are averaged over what is available; the first row
    of each stock has no history and is NaN.

    Args:
        df: DataFrame with ``permno``, ``date`` and ``excess_ret`` columns.
        lags: iterable of window lengths in rows (e.g. ``[5, 21, 252, 512]``).

    Returns:
        The same ``df`` object, modified in place with the new columns.
    """
    # Chronological order within each stock; the original index is kept so the
    # results can be aligned back onto `df` whatever its row order is
    df_sorted = df.sort_values(by=["permno", "date"])
    stock_ids = df_sorted["permno"]

    # Shift by one row within each stock to avoid lookahead bias
    past_returns = df_sorted.groupby("permno")["excess_ret"].shift(1)

    for lag in lags:
        # Roll *within* each stock. Rolling over the ungrouped shifted series would
        # let a window at the start of one stock average in the tail of the previous one.
        rolling_mean = (
            past_returns.groupby(stock_ids)
            .rolling(window=lag, min_periods=1)
            .mean()
            .reset_index(level=0, drop=True)  # drop the permno level -> original row index
        )
        df[f"lag_{lag}"] = rolling_mean  # index-aligned assignment

    return df

# Window lengths (in rows) for the trailing-mean features
lag_days_list = [5, 21, 252, 512]

# Build the lag features for the training and test samples
crsp_train_lagged = create_lag_features(crsp_train, lag_days_list)
crsp_test_lagged = create_lag_features(crsp_test, lag_days_list)

# In the test sample, discard rows where any lag feature is missing
lag_feature_names = [f'lag_{lag}' for lag in lag_days_list]
crsp_test_lagged = crsp_test_lagged.dropna(subset=lag_feature_names)

# Inspect the first rows of both samples
print(crsp_train_lagged.head())
print(crsp_test_lagged.head())

# **03. Linear Models**

## OLS + H

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import HuberRegressor
from sklearn.model_selection import GridSearchCV

# Function to calculate MASE
def mase(y_true, y_pred):
    """Return the mean absolute error scaled by the mean absolute first difference of ``y_true``."""
    forecast_mae = np.mean(np.abs(y_true - y_pred))
    # In-sample naive benchmark: predict each value with the one before it
    naive_mae = np.mean(np.abs(np.diff(y_true)))
    return forecast_mae / naive_mae

# Directional accuracy (correct classification of direction)
def directional_accuracy(y_true, y_pred_binary):
    """Return the share of positions where the predicted label equals the true label."""
    hits = y_true == y_pred_binary
    return np.mean(hits)

# OLS+Huber model to predict excess returns
def ols_huber_model(crsp_train_lagged, crsp_test_lagged, lags=[5, 21, 252, 512], threshold=0.5):
    """Grid-search a HuberRegressor on the lag features and score it on the test sample.

    The regressors are the ``lag_<n>`` columns named by ``lags``; the fitted target
    is the 0/1 ``directional_target`` column. Continuous predictions above
    ``threshold`` are classified as 1, otherwise 0.

    Args:
        crsp_train_lagged: training DataFrame with the lag columns and ``directional_target``.
        crsp_test_lagged: test DataFrame with the same columns plus ``permno``,
            ``date`` and ``excess_ret``.
        lags: window lengths identifying the feature columns (the default list is never mutated).
        threshold: cut-off applied to the continuous prediction.

    Returns:
        ``(results_df, y_pred_binary, y_test, pred_df)`` — a one-row metrics table,
        the binary predictions, the true binary labels, and the per-row prediction table.

    Side effects:
        Writes ``ols_results_lag<lags>_full.csv`` and sets the
        ``predicted_excess_returns`` column on ``crsp_test_lagged``.
    """
    # Rows with any missing feature or label are excluded from fitting and scoring
    lag_columns = [f'lag_{lag}' for lag in lags]
    required_columns = lag_columns + ['directional_target']
    df_train = crsp_train_lagged.dropna(subset=required_columns)
    df_test = crsp_test_lagged.dropna(subset=required_columns)

    X_train = df_train[lag_columns]
    y_train = df_train['directional_target']
    X_test = df_test[lag_columns]
    y_test = df_test['directional_target']

    # Hyperparameter grid for Huber
    param_grid = {
        'epsilon': [1.1, 1.2, 1.35, 1.5],
        'alpha': [0.0001, 0.001, 0.01],
        'max_iter': [100, 500, 1000]
    }

    # NOTE(review): cv=5 uses sklearn's default (unshuffled) fold splitter on panel
    # time-series data — confirm this is the intended validation scheme.
    huber = HuberRegressor()
    grid_search = GridSearchCV(huber, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    # Continuous prediction of the 0/1 target, then thresholded into a class
    y_pred = best_model.predict(X_test)
    y_pred_binary = np.where(y_pred > threshold, 1, 0)

    # Classification metrics use the binary predictions; error metrics use the continuous ones
    accuracy = accuracy_score(y_test, y_pred_binary)
    up_accuracy = directional_accuracy(y_test[y_test == 1], y_pred_binary[y_test == 1])
    down_accuracy = directional_accuracy(y_test[y_test == 0], y_pred_binary[y_test == 0])
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    mase_value = mase(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results_df = pd.DataFrame([{
        'Model': 'OLS + H',
        'Lags': str(lags),
        'Directional Accuracy': accuracy,
        'Up Directional Accuracy': up_accuracy,
        'Down Directional Accuracy': down_accuracy,
        'R-squared': r2,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MASE': mase_value
    }])

    # NOTE(review): y_true is the raw excess return while y_pred is the model's
    # prediction of the 0/1 direction label — confirm this pairing is intended.
    pred_df = pd.DataFrame({
        'permno': df_test['permno'].values,
        'date': df_test['date'].values,
        'y_true': df_test['excess_ret'].values,
        'y_pred': y_pred
    })

    pred_df.to_csv(f"ols_results_lag{'_'.join(map(str, lags))}_full.csv", index=False)

    # Store predictions for portfolio construction, aligned on the scored rows' index.
    # A plain positional assignment fails with a length mismatch whenever dropna
    # removed rows from the test frame; unscored rows are left as NaN instead.
    crsp_test_lagged['predicted_excess_returns'] = pd.Series(y_pred, index=df_test.index)

    return results_df, y_pred_binary, y_test, pred_df

## Lasso

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Function to calculate MASE
def mase(y_true, y_pred):
    """Mean Absolute Scaled Error: forecast MAE over the MAE of a one-step naive forecast."""
    # Scale: average absolute change between consecutive true values
    scale = np.mean(np.abs(np.diff(y_true)))
    return np.mean(np.abs(y_true - y_pred)) / scale

# Directional accuracy
def directional_accuracy(y_true, y_pred_binary):
    """Fraction of predictions whose label matches the true label."""
    is_correct = (y_true == y_pred_binary)
    fraction_correct = np.mean(is_correct)
    return fraction_correct

# Lasso Model with Multiple Features (Lags) and Tuning
def lasso_model(crsp_train_lagged, crsp_test_lagged, lags=[5, 21, 252, 512], threshold=0.5):
    """Grid-search a Lasso regression on the lag features and score it on the test sample.

    The regressors are the ``lag_<n>`` columns named by ``lags``; the fitted target
    is the 0/1 ``directional_target`` column. Continuous predictions above
    ``threshold`` are classified as 1, otherwise 0.

    Args:
        crsp_train_lagged: training DataFrame with the lag columns and ``directional_target``.
        crsp_test_lagged: test DataFrame with the same columns plus ``permno``,
            ``date`` and ``excess_ret``.
        lags: window lengths identifying the feature columns (the default list is never mutated).
        threshold: cut-off applied to the continuous prediction.

    Returns:
        ``(results_df, y_pred_binary, y_test)`` — a one-row metrics table, the binary
        predictions and the true binary labels.

    Side effects:
        Writes ``lasso_results_lag<lags>_full.csv`` and sets the
        ``predicted_excess_returns`` column on ``crsp_test_lagged``.
    """
    # Rows with any missing feature or label are excluded from fitting and scoring
    lag_columns = [f'lag_{lag}' for lag in lags]
    required_columns = lag_columns + ['directional_target']
    df_train = crsp_train_lagged.dropna(subset=required_columns)
    df_test = crsp_test_lagged.dropna(subset=required_columns)

    X_train = df_train[lag_columns]
    y_train = df_train['directional_target']
    X_test = df_test[lag_columns]
    y_test = df_test['directional_target']

    # Tune the L1 penalty strength by 5-fold cross-validated MSE
    param_grid = {'alpha': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1]}
    grid_search = GridSearchCV(Lasso(max_iter=10000), param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    # Continuous prediction of the 0/1 target, then thresholded into a class
    y_pred = best_model.predict(X_test)
    y_pred_binary = np.where(y_pred > threshold, 1, 0)

    # Classification metrics use the binary predictions; error metrics use the continuous ones
    accuracy = accuracy_score(y_test, y_pred_binary)
    up_accuracy = directional_accuracy(y_test[y_test == 1], y_pred_binary[y_test == 1])
    down_accuracy = directional_accuracy(y_test[y_test == 0], y_pred_binary[y_test == 0])
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    mase_value = mase(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results_df = pd.DataFrame([{
        'Model': 'Lasso',
        'Lags': str(lags),
        'Directional Accuracy': accuracy,
        'Up Directional Accuracy': up_accuracy,
        'Down Directional Accuracy': down_accuracy,
        'R-squared': r2,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MASE': mase_value
    }])

    # NOTE(review): y_true is the raw excess return while y_pred is the model's
    # prediction of the 0/1 direction label — confirm this pairing is intended.
    pred_df = pd.DataFrame({
        'permno': df_test['permno'].values,
        'date': df_test['date'].values,
        'y_true': df_test['excess_ret'].values,
        'y_pred': y_pred
    })

    pred_df.to_csv(f"lasso_results_lag{'_'.join(map(str, lags))}_full.csv", index=False)

    # Store predictions for portfolio construction once, aligned on the scored rows'
    # index. A plain positional assignment fails with a length mismatch whenever
    # dropna removed rows from the test frame; unscored rows are left as NaN instead.
    crsp_test_lagged['predicted_excess_returns'] = pd.Series(y_pred, index=df_test.index)

    return results_df, y_pred_binary, y_test

## Ridge

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Function to calculate MASE
def mase(y_true, y_pred):
    """Ratio of the mean absolute prediction error to the mean absolute step of ``y_true``."""
    # numerator: model error; denominator: error of the "previous value" naive forecast
    return np.mean(np.abs(y_true - y_pred)) / np.mean(np.abs(np.diff(y_true)))

# Directional accuracy
def directional_accuracy(y_true, y_pred_binary):
    """Mean of the element-wise equality between true and predicted labels."""
    agreement = y_true == y_pred_binary
    return np.mean(agreement)

# Ridge Regression with Tuning
def ridge_model(crsp_train_lagged, crsp_test_lagged, lags=[5, 21, 252, 512], threshold=0.5):
    """Grid-search a Ridge regression on the lag features and score it on the test sample.

    The regressors are the ``lag_<n>`` columns named by ``lags``; the fitted target
    is the 0/1 ``directional_target`` column. Continuous predictions above
    ``threshold`` are classified as 1, otherwise 0.

    Args:
        crsp_train_lagged: training DataFrame with the lag columns and ``directional_target``.
        crsp_test_lagged: test DataFrame with the same columns plus ``permno``,
            ``date`` and ``excess_ret``.
        lags: window lengths identifying the feature columns (the default list is never mutated).
        threshold: cut-off applied to the continuous prediction (default 0.5,
            the value that was previously hard-coded).

    Returns:
        ``(results_df, y_pred_binary, y_test)`` — a one-row metrics table, the binary
        predictions and the true binary labels.

    Side effects:
        Writes ``ridge_results_lag<lags>_full.csv`` and sets the
        ``predicted_excess_returns`` column on ``crsp_test_lagged``.
    """
    # Rows with any missing feature or label are excluded from fitting and scoring
    lag_columns = [f'lag_{lag}' for lag in lags]
    required_columns = lag_columns + ['directional_target']
    df_train = crsp_train_lagged.dropna(subset=required_columns)
    df_test = crsp_test_lagged.dropna(subset=required_columns)

    X_train = df_train[lag_columns]
    y_train = df_train['directional_target']
    X_test = df_test[lag_columns]
    y_test = df_test['directional_target']

    # Tune the L2 penalty strength by 5-fold cross-validated MSE
    param_grid = {'alpha': [0.01, 0.1, 1.0, 10, 100]}
    grid_search_ridge = GridSearchCV(Ridge(), param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search_ridge.fit(X_train, y_train)

    best_model = grid_search_ridge.best_estimator_

    # Continuous prediction of the 0/1 target, then thresholded into a class
    y_pred = best_model.predict(X_test)
    y_pred_binary = np.where(y_pred > threshold, 1, 0)

    # Classification metrics use the binary predictions; error metrics use the continuous ones
    accuracy = accuracy_score(y_test, y_pred_binary)
    up_accuracy = directional_accuracy(y_test[y_test == 1], y_pred_binary[y_test == 1])
    down_accuracy = directional_accuracy(y_test[y_test == 0], y_pred_binary[y_test == 0])
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    mase_value = mase(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results_df = pd.DataFrame([{
        'Model': 'Ridge',
        'Lags': str(lags),
        'Directional Accuracy': accuracy,
        'Up Directional Accuracy': up_accuracy,
        'Down Directional Accuracy': down_accuracy,
        'R-squared': r2,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MASE': mase_value
    }])

    # NOTE(review): y_true is the raw excess return while y_pred is the model's
    # prediction of the 0/1 direction label — confirm this pairing is intended.
    pred_df = pd.DataFrame({
        'permno': df_test['permno'].values,
        'date': df_test['date'].values,
        'y_true': df_test['excess_ret'].values,
        'y_pred': y_pred
    })

    pred_df.to_csv(f"ridge_results_lag{'_'.join(map(str, lags))}_full.csv", index=False)

    # Store predictions for portfolio construction once, aligned on the scored rows'
    # index. Assigning against the full test frame fails with a length mismatch
    # whenever dropna removed rows; unscored rows are left as NaN instead.
    crsp_test_lagged['predicted_excess_returns'] = pd.Series(y_pred, index=df_test.index)

    return results_df, y_pred_binary, y_test

## ElasticNet Regression

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

# Function to calculate MASE
def mase(y_true, y_pred):
    """Scale the mean absolute forecast error by the naive one-step-ahead error on ``y_true``."""
    abs_residuals = np.absolute(y_true - y_pred)
    abs_steps = np.absolute(np.diff(y_true))  # |y[t] - y[t-1]|
    ratio = np.mean(abs_residuals) / np.mean(abs_steps)
    return ratio

# Directional accuracy
def directional_accuracy(y_true, y_pred_binary):
    """Proportion of entries for which prediction and truth coincide."""
    same_direction = (y_true == y_pred_binary)
    return np.mean(same_direction)

# ElasticNet Model with MSE loss and tuning
def elasticnet_model_with_tuning(crsp_train_lagged, crsp_test_lagged, lags=[5, 21, 252, 512], threshold=0.5):
    """Grid-search an ElasticNet regression on the lag features and score it on the test sample.

    The regressors are the ``lag_<n>`` columns named by ``lags``; the fitted target
    is the 0/1 ``directional_target`` column. Continuous predictions above
    ``threshold`` are classified as 1, otherwise 0.

    Args:
        crsp_train_lagged: training DataFrame with the lag columns and ``directional_target``.
        crsp_test_lagged: test DataFrame with the same columns plus ``permno``,
            ``date`` and ``excess_ret``.
        lags: window lengths identifying the feature columns (the default list is never mutated).
        threshold: cut-off applied to the continuous prediction (default 0.5,
            the value that was previously hard-coded).

    Returns:
        ``(results_df, y_pred_binary, y_test)`` — a one-row metrics table, the binary
        predictions and the true binary labels.

    Side effects:
        Writes ``enet_results_lag<lags>_full.csv`` and sets the
        ``predicted_excess_returns`` column on ``crsp_test_lagged``.
    """
    # Rows with any missing feature or label are excluded from fitting and scoring
    lag_columns = [f'lag_{lag}' for lag in lags]
    required_columns = lag_columns + ['directional_target']
    df_train = crsp_train_lagged.dropna(subset=required_columns)
    df_test = crsp_test_lagged.dropna(subset=required_columns)

    X_train = df_train[lag_columns]
    y_train = df_train['directional_target']
    X_test = df_test[lag_columns]
    y_test = df_test['directional_target']

    # Hyperparameter grid for ElasticNet
    param_grid = {
        'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1],
        'l1_ratio': [0.1, 0.3, 0.5, 0.7, 1.0]  # 1.0 = Lasso, 0 = Ridge
    }

    # Tune penalty strength and L1/L2 mix by 5-fold cross-validated MSE
    grid_search = GridSearchCV(ElasticNet(max_iter=10000), param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Continuous prediction of the 0/1 target, then thresholded into a class
    y_pred = best_model.predict(X_test)
    y_pred_binary = np.where(y_pred > threshold, 1, 0)

    # Classification metrics use the binary predictions; error metrics use the continuous ones
    accuracy = accuracy_score(y_test, y_pred_binary)
    up_accuracy = directional_accuracy(y_test[y_test == 1], y_pred_binary[y_test == 1])
    down_accuracy = directional_accuracy(y_test[y_test == 0], y_pred_binary[y_test == 0])
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    mase_value = mase(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results_df = pd.DataFrame([{
        'Model': 'ElasticNet',
        'Lags': str(lags),
        'Directional Accuracy': accuracy,
        'Up Directional Accuracy': up_accuracy,
        'Down Directional Accuracy': down_accuracy,
        'R-squared': r2,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MASE': mase_value
    }])

    # NOTE(review): y_true is the raw excess return while y_pred is the model's
    # prediction of the 0/1 direction label — confirm this pairing is intended.
    pred_df = pd.DataFrame({
        'permno': df_test['permno'].values,
        'date': df_test['date'].values,
        'y_true': df_test['excess_ret'].values,
        'y_pred': y_pred
    })

    pred_df.to_csv(f"enet_results_lag{'_'.join(map(str, lags))}_full.csv", index=False)

    # Store predictions for portfolio construction once, aligned on the scored rows'
    # index. Assigning against the full test frame fails with a length mismatch
    # whenever dropna removed rows; unscored rows are left as NaN instead.
    crsp_test_lagged['predicted_excess_returns'] = pd.Series(y_pred, index=df_test.index)

    return results_df, y_pred_binary, y_test

## Principal Components Regression (PCR)

In [None]:
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# Function to calculate MASE
def mase(y_true, y_pred):
    """Mean absolute error of the forecast relative to that of a lag-1 naive forecast."""
    residuals = y_true - y_pred
    steps = np.diff(y_true)  # successive changes in the true series
    model_mae, naive_mae = np.mean(np.abs(residuals)), np.mean(np.abs(steps))
    return model_mae / naive_mae

# Directional accuracy
def directional_accuracy(y_true, y_pred_binary):
    """Hit rate: average of the boolean comparison ``y_true == y_pred_binary``."""
    matched = y_true == y_pred_binary
    hit_rate = np.mean(matched)
    return hit_rate

def pcr_model(crsp_train_lagged, crsp_test_lagged, lags=[5, 21, 252, 512], threshold=0.5):
    """Grid-search a principal-components regression on the lag features and score it on the test sample.

    The pipeline is mean imputation -> PCA -> linear regression; the number of
    components is tuned. The regressors are the ``lag_<n>`` columns named by
    ``lags``; the fitted target is the 0/1 ``directional_target`` column.
    Continuous predictions above ``threshold`` are classified as 1, otherwise 0.

    Args:
        crsp_train_lagged: training DataFrame with the lag columns and ``directional_target``.
        crsp_test_lagged: test DataFrame with the same columns plus ``permno``,
            ``date`` and ``excess_ret``.
        lags: window lengths identifying the feature columns (the default list is never mutated).
        threshold: cut-off applied to the continuous prediction (default 0.5,
            the value that was previously hard-coded).

    Returns:
        ``(results_df, y_pred_binary, y_test)`` — a one-row metrics table, the binary
        predictions and the true binary labels.

    Side effects:
        Writes ``pcr_results_lag<lags>_full.csv`` and sets the
        ``predicted_excess_returns`` column on ``crsp_test_lagged``.
    """
    # Rows with any missing feature or label are excluded from fitting and scoring
    lag_columns = [f'lag_{lag}' for lag in lags]
    required_columns = lag_columns + ['directional_target']
    df_train = crsp_train_lagged.dropna(subset=required_columns)
    df_test = crsp_test_lagged.dropna(subset=required_columns)

    X_train = df_train[lag_columns]
    y_train = df_train['directional_target']
    X_test = df_test[lag_columns]
    y_test = df_test['directional_target']

    # PCA cannot extract more components than min(n_samples, n_features)
    n_samples, n_features = X_train.shape
    max_components = min(n_samples, n_features)

    # Candidate component counts, capped at the maximum and de-duplicated so the
    # same configuration is not cross-validated more than once
    candidate_components = sorted({min(k, max_components) for k in (1, 2, 3, 5)})
    param_grid = {'pca__n_components': candidate_components}

    pipeline = make_pipeline(
        SimpleImputer(strategy='mean'),
        PCA(),
        LinearRegression()
    )

    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    # Continuous prediction of the 0/1 target, then thresholded into a class
    y_pred = grid_search.best_estimator_.predict(X_test)
    y_pred_binary = np.where(y_pred > threshold, 1, 0)

    # Classification metrics use the binary predictions; error metrics use the continuous ones
    accuracy = accuracy_score(y_test, y_pred_binary)
    up_accuracy = directional_accuracy(y_test[y_test == 1], y_pred_binary[y_test == 1])
    down_accuracy = directional_accuracy(y_test[y_test == 0], y_pred_binary[y_test == 0])
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    mase_value = mase(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results_df = pd.DataFrame([{
        'Model': 'PCR',
        'Lags': str(lags),
        'Directional Accuracy': accuracy,
        'Up Directional Accuracy': up_accuracy,
        'Down Directional Accuracy': down_accuracy,
        'R-squared': r2,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MASE': mase_value
    }])

    # NOTE(review): y_true is the raw excess return while y_pred is the model's
    # prediction of the 0/1 direction label — confirm this pairing is intended.
    pred_df = pd.DataFrame({
        'permno': df_test['permno'].values,
        'date': df_test['date'].values,
        'y_true': df_test['excess_ret'].values,
        'y_pred': y_pred
    })

    pred_df.to_csv(f"pcr_results_lag{'_'.join(map(str, lags))}_full.csv", index=False)

    # Store predictions for portfolio construction once, aligned on the scored rows'
    # index. Assigning against the full test frame fails with a length mismatch
    # whenever dropna removed rows; unscored rows are left as NaN instead.
    crsp_test_lagged['predicted_excess_returns'] = pd.Series(y_pred, index=df_test.index)

    return results_df, y_pred_binary, y_test

## Generalized Linear Model (GLM)

In [None]:
from sklearn.linear_model import LogisticRegression

# Function to calculate MASE
def mase(y_true, y_pred):
    """Return the mean absolute scaled error (MASE) of y_pred against y_true.

    The mean absolute prediction error is divided by the mean absolute
    difference between consecutive y_true values (np.diff), i.e. the error
    of a one-step naive forecast over the same sequence.
    """
    model_mae = np.mean(np.abs(y_true - y_pred))
    naive_mae = np.mean(np.abs(np.diff(y_true)))
    return model_mae / naive_mae

# Directional accuracy
def directional_accuracy(y_true, y_pred_binary):
    """Return the fraction of positions where y_pred_binary equals y_true."""
    matches = y_true == y_pred_binary
    return np.mean(matches)

# Generalized Linear Model (GLM)
def glm_model(crsp_train_lagged, crsp_test_lagged, lags=[5, 21, 252, 512]):
    """Tune a logistic regression on lagged features and score it on the test set.

    Rows with a missing value in any requested lag column or in
    ``directional_target`` are excluded from both training and evaluation.

    Args:
        crsp_train_lagged: DataFrame providing the ``lag_<n>`` columns and
            ``directional_target`` used for training.
        crsp_test_lagged: DataFrame with the same columns plus ``permno``,
            ``date`` and ``excess_ret``. It is modified in place: a
            ``predicted_excess_returns`` column is written, holding the
            predicted class label (stored as float) for evaluated rows and
            NaN for rows that were excluded.
        lags: Lag lengths selecting the ``lag_<n>`` feature columns. The
            default list is never mutated.

    Returns:
        Tuple ``(results_df, y_pred_binary, y_test)``: a one-row DataFrame of
        metrics, the predictions thresholded at 0.5, and the test targets.

    Side effects:
        Writes ``glm_results_lag<lags joined by '_'>_full.csv`` to the
        working directory.
    """
    # Prepare combined data
    lag_columns = [f'lag_{lag}' for lag in lags]
    required = lag_columns + ['directional_target']
    df_train = crsp_train_lagged.dropna(subset=required)

    # Positional mask of test rows with complete data (same rows dropna would
    # keep); retained so predictions can be written back to matching rows.
    test_mask = crsp_test_lagged[required].notna().all(axis=1).to_numpy()
    df_test = crsp_test_lagged.loc[test_mask]

    X_train = df_train[lag_columns]
    y_train = df_train['directional_target']
    X_test = df_test[lag_columns]
    y_test = df_test['directional_target']

    # Hyperparameter grid for GLM
    param_grid = {
        'C': [0.1, 1, 10],
        'max_iter': [100, 500, 1000]
    }

    # Logistic Regression model (GLM)
    # NOTE(review): the search is scored with MSE on predicted class labels;
    # assuming 0/1 labels this equals the misclassification rate - confirm
    # that this is the intended selection criterion.
    glm = LogisticRegression(solver='liblinear')
    grid_search = GridSearchCV(glm, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # predict() returns class labels (not probabilities) from the best model
    y_pred = grid_search.best_estimator_.predict(X_test)
    y_pred_binary = np.where(y_pred > 0.5, 1, 0)

    # Metrics
    accuracy = accuracy_score(y_test, y_pred_binary)
    up_accuracy = directional_accuracy(y_test[y_test == 1], y_pred_binary[y_test == 1])
    down_accuracy = directional_accuracy(y_test[y_test == 0], y_pred_binary[y_test == 0])
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    mase_value = mase(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Return a DataFrame with metrics
    results_df = pd.DataFrame([{
        'Model': 'GLM - Logistic Regression',
        'Lags': str(lags),
        'Directional Accuracy': accuracy,
        'Up Directional Accuracy': up_accuracy,
        'Down Directional Accuracy': down_accuracy,
        'R-squared': r2,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MASE': mase_value
    }])

    # Save results
    pred_df = pd.DataFrame({
        'permno': df_test['permno'].values,      # Stock identifier
        'date': df_test['date'].values,          # Date of each observation
        'y_true': df_test['excess_ret'].values,  # Actual excess returns
        'y_pred': y_pred                         # Predicted class labels
    })

    pred_df.to_csv(f"glm_results_lag{'_'.join(map(str, lags))}_full.csv", index=False)

    # Store predictions for later use in portfolio construction.
    # Fix: y_pred only covers the rows kept in df_test, so assigning it to the
    # whole frame failed with a length mismatch whenever rows were dropped.
    predicted = np.full(len(crsp_test_lagged), np.nan)
    predicted[test_mask] = y_pred
    crsp_test_lagged['predicted_excess_returns'] = predicted

    return results_df, y_pred_binary, y_test

# **04. Nonlinear Models**

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Function to calculate MASE
def mase(y_true, y_pred):
    """Return the mean absolute scaled error (MASE) of y_pred against y_true.

    Numerator: mean absolute prediction error. Denominator: mean absolute
    difference between consecutive y_true values (a one-step naive forecast).
    """
    model_mae = np.mean(np.abs(y_true - y_pred))
    naive_mae = np.mean(np.abs(np.diff(y_true)))
    return model_mae / naive_mae

# Directional accuracy
def directional_accuracy(y_true, y_pred_binary):
    """Return the share of elements of y_pred_binary that equal y_true."""
    matches = y_true == y_pred_binary
    return np.mean(matches)

# Tuning Random Forest globally
def random_forest_model_with_tuning(X_train, y_train):
    """Run a randomized hyperparameter search for a RandomForestRegressor.

    Ten candidate settings are sampled (random_state=42) and scored with
    5-fold cross-validation on negative mean squared error, using all
    available cores (n_jobs=-1).

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.

    Returns:
        The ``best_estimator_`` of the fitted search.
    """
    search_space = dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20, 30],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    )
    search_settings = dict(
        n_iter=10,
        cv=5,
        scoring='neg_mean_squared_error',
        random_state=42,
        n_jobs=-1,
    )

    tuner = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=search_space,
        **search_settings,
    )
    tuner.fit(X_train, y_train)
    return tuner.best_estimator_

# Global Random Forest Model
def random_forest_model(crsp_train_lagged, crsp_test_lagged, lags=[5, 21, 252, 512]):
    """Tune a random forest on lagged features and score it on the test set.

    Unlike the directional models, the regression target here is the
    continuous ``excess_ret``; directions are derived by the sign of the
    actual and predicted values. Rows with a missing value in any requested
    lag column or in ``excess_ret`` are excluded from training and evaluation.

    Args:
        crsp_train_lagged: DataFrame providing the ``lag_<n>`` columns and
            ``excess_ret`` used for training.
        crsp_test_lagged: DataFrame with the same columns plus ``permno`` and
            ``date``. It is modified in place: a ``predicted_excess_returns``
            column is written, holding the prediction for evaluated rows and
            NaN for rows that were excluded.
        lags: Lag lengths selecting the ``lag_<n>`` feature columns. The
            default list is never mutated.

    Returns:
        Tuple ``(results_df, y_pred_binary, y_test)``: a one-row DataFrame of
        metrics, the predictions binarised at 0, and the continuous test
        targets.

    Side effects:
        Writes ``rf_results_lag<lags joined by '_'>_full.csv`` to the
        working directory.
    """
    # Prepare combined data
    lag_columns = [f'lag_{lag}' for lag in lags]
    required = lag_columns + ['excess_ret']
    df_train = crsp_train_lagged.dropna(subset=required)

    # Positional mask of test rows with complete data (same rows dropna would
    # keep); retained so predictions can be written back to matching rows.
    test_mask = crsp_test_lagged[required].notna().all(axis=1).to_numpy()
    df_test = crsp_test_lagged.loc[test_mask]

    X_train = df_train[lag_columns]
    y_train = df_train['excess_ret']
    X_test = df_test[lag_columns]
    y_test = df_test['excess_ret']

    # Mean imputation fitted on the training features only, then applied to
    # the test features (the imputer outputs plain arrays).
    imputer = SimpleImputer(strategy='mean')
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    # Tune and train the model
    model = random_forest_model_with_tuning(X_train_imputed, y_train)
    y_pred = model.predict(X_test_imputed)

    # Direction is the sign of the return: 1 when > 0, else 0
    y_pred_binary = np.where(y_pred > 0, 1, 0)
    y_test_binary = np.where(y_test > 0, 1, 0)

    # Metrics: directional scores use the binarised values, error metrics use
    # the continuous predictions.
    accuracy = accuracy_score(y_test_binary, y_pred_binary)
    up_accuracy = directional_accuracy(y_test_binary[y_test_binary == 1], y_pred_binary[y_test_binary == 1])
    down_accuracy = directional_accuracy(y_test_binary[y_test_binary == 0], y_pred_binary[y_test_binary == 0])
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    mase_value = mase(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Return a DataFrame with metrics
    results_df = pd.DataFrame([{
        'Model': 'Random Forest',
        'Lags': str(lags),
        'Directional Accuracy': accuracy,
        'Up Directional Accuracy': up_accuracy,
        'Down Directional Accuracy': down_accuracy,
        'R-squared': r2,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MASE': mase_value
    }])

    # Save results to a DataFrame
    # NOTE(review): the other model functions in this file name the
    # actual-value column 'y_true'; kept as 'excess_ret' here so existing
    # readers of this CSV do not break.
    pred_df = pd.DataFrame({
        'permno': df_test['permno'].values,          # Stock identifier
        'date': df_test['date'].values,              # Date of each observation
        'excess_ret': df_test['excess_ret'].values,  # Actual excess returns
        'y_pred': y_pred                             # Predicted excess returns
    })

    # Save the predictions to a CSV file
    pred_df.to_csv(f"rf_results_lag{'_'.join(map(str, lags))}_full.csv", index=False)

    # Store predicted excess returns for later use in portfolio construction.
    # Fix: y_pred only covers the rows kept in df_test, so assigning it to the
    # whole frame failed with a length mismatch whenever rows were dropped.
    predicted = np.full(len(crsp_test_lagged), np.nan)
    predicted[test_mask] = y_pred
    crsp_test_lagged['predicted_excess_returns'] = predicted

    # NOTE(review): y_test is the continuous excess return, whereas the second
    # element is binary; callers comparing the two need y_test > 0 - confirm.
    return results_df, y_pred_binary, y_test

## Gradient Boosted Regression Trees (GBRT)


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Function to calculate MASE
def mase(y_true, y_pred):
    """Return the mean absolute scaled error (MASE) of y_pred against y_true.

    The model's mean absolute error is scaled by the mean absolute change
    between consecutive y_true values, which is the error a one-step naive
    forecast would make on the same sequence.
    """
    model_mae = np.mean(np.abs(y_true - y_pred))
    naive_mae = np.mean(np.abs(np.diff(y_true)))
    return model_mae / naive_mae

# Utility: Directional accuracy
def directional_accuracy(y_true, y_pred_binary):
    """Return the fraction of positions where y_pred_binary equals y_true."""
    matches = y_true == y_pred_binary
    return np.mean(matches)

# GBRT with hyperparameter tuning (with Huber loss)
def gbrt_model_with_tuning(X_train, y_train):
    """Grid-search a GradientBoostingRegressor that uses the Huber loss.

    Every combination of the listed n_estimators, learning_rate and
    max_depth values is scored with 5-fold cross-validation on negative
    mean squared error.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.

    Returns:
        The ``best_estimator_`` of the fitted search.
    """
    search_grid = dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 5, 7],
        loss=['huber'],
    )

    tuner = GridSearchCV(
        GradientBoostingRegressor(),
        search_grid,
        cv=5,
        scoring='neg_mean_squared_error',
    )
    tuner.fit(X_train, y_train)
    return tuner.best_estimator_

# Global GBRT Model
def gbrt_model(crsp_train_lagged, crsp_test_lagged, lags=[5, 21, 252, 512]):
    """Tune a gradient-boosted regressor on lagged features and score it on the test set.

    The regressor is fitted on ``directional_target`` and its continuous
    output is thresholded at 0.5 to obtain a direction. Rows with a missing
    value in any requested lag column or in ``directional_target`` are
    excluded from both training and evaluation.

    Args:
        crsp_train_lagged: DataFrame providing the ``lag_<n>`` columns and
            ``directional_target`` used for training.
        crsp_test_lagged: DataFrame with the same columns plus ``permno``,
            ``date`` and ``excess_ret``. It is modified in place: a
            ``predicted_excess_returns`` column is written, holding the model
            output for evaluated rows and NaN for rows that were excluded.
        lags: Lag lengths selecting the ``lag_<n>`` feature columns. The
            default list is never mutated.

    Returns:
        Tuple ``(results_df, y_pred_binary, y_test)``: a one-row DataFrame of
        metrics, the predictions thresholded at 0.5, and the test targets.

    Side effects:
        Writes ``gbrt_results_lag<lags joined by '_'>_full.csv`` to the
        working directory.
    """
    # Prepare combined data
    lag_columns = [f'lag_{lag}' for lag in lags]
    required = lag_columns + ['directional_target']
    df_train = crsp_train_lagged.dropna(subset=required)

    # Positional mask of test rows with complete data (same rows dropna would
    # keep); retained so predictions can be written back to matching rows.
    test_mask = crsp_test_lagged[required].notna().all(axis=1).to_numpy()
    df_test = crsp_test_lagged.loc[test_mask]

    X_train = df_train[lag_columns]
    y_train = df_train['directional_target']
    X_test = df_test[lag_columns]
    y_test = df_test['directional_target']

    # Tune and train
    model = gbrt_model_with_tuning(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_binary = (y_pred > 0.5).astype(int)

    # Metrics
    accuracy = accuracy_score(y_test, y_pred_binary)
    up_accuracy = directional_accuracy(y_test[y_test == 1], y_pred_binary[y_test == 1])
    down_accuracy = directional_accuracy(y_test[y_test == 0], y_pred_binary[y_test == 0])
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    mase_value = mase(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Return a DataFrame with metrics
    results_df = pd.DataFrame([{
        'Model': 'GBRT',
        'Lags': str(lags),
        'Directional Accuracy': accuracy,
        'Up Directional Accuracy': up_accuracy,
        'Down Directional Accuracy': down_accuracy,
        'R-squared': r2,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MASE': mase_value
    }])

    # Save results
    pred_df = pd.DataFrame({
        'permno': df_test['permno'].values,      # Stock identifier
        'date': df_test['date'].values,          # Date of each observation
        'y_true': df_test['excess_ret'].values,  # Actual excess returns
        'y_pred': y_pred                         # Raw model output
    })

    pred_df.to_csv(f"gbrt_results_lag{'_'.join(map(str, lags))}_full.csv", index=False)

    # Store predictions for later use in portfolio construction.
    # Fix: y_pred only covers the rows kept in df_test, so assigning it to the
    # whole frame failed with a length mismatch whenever rows were dropped.
    predicted = np.full(len(crsp_test_lagged), np.nan)
    predicted[test_mask] = y_pred
    crsp_test_lagged['predicted_excess_returns'] = predicted

    return results_df, y_pred_binary, y_test


## Neural Networks

### NN1 (One Hidden Layer, 32 Neurons)

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping

# Function to calculate MASE
def mase(y_true, y_pred):
    """Return the mean absolute scaled error (MASE) of y_pred against y_true.

    The mean absolute prediction error is divided by the mean absolute
    difference between consecutive y_true values (np.diff), i.e. the error
    of a one-step naive forecast over the same sequence.
    """
    model_mae = np.mean(np.abs(y_true - y_pred))
    naive_mae = np.mean(np.abs(np.diff(y_true)))
    return model_mae / naive_mae

# Directional accuracy
def directional_accuracy(y_true, y_pred_binary):
    """Return the share of elements of y_pred_binary that equal y_true."""
    matches = y_true == y_pred_binary
    return np.mean(matches)

# Define NN1 architecture
def create_nn_1model(input_dim, learning_rate=0.0001, l2_reg=0.0001):
    """Build and compile the NN1 network: one 32-unit ReLU hidden layer.

    Args:
        input_dim: Number of input features.
        learning_rate: Learning rate passed to the Adam optimizer.
        l2_reg: L2 penalty applied to the hidden layer's kernel.

    Returns:
        A compiled Sequential model with a single linear output unit and
        mean-squared-error loss.
    """
    layer_stack = [
        Input(shape=(input_dim,)),
        Dense(32, activation='relu', kernel_regularizer=regularizers.l2(l2_reg)),
        Dense(1, activation='linear'),
    ]

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)

    network.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')
    return network

# Global Neural Network Model (NN1)
def neural_network_nn1(crsp_train_lagged, crsp_test_lagged, lags=[5, 21, 252, 512]):
    """Train the NN1 network on lagged features and evaluate it on the test set.

    The network is fitted on ``directional_target`` and its continuous output
    is thresholded at 0.5 to obtain a direction. Rows with a missing value in
    any requested lag column or in ``directional_target`` are excluded from
    both training and evaluation.

    Args:
        crsp_train_lagged: DataFrame providing the ``lag_<n>`` columns and
            ``directional_target`` used for training.
        crsp_test_lagged: DataFrame with the same columns plus ``permno``,
            ``date`` and ``excess_ret``. It is modified in place: a
            ``predicted_excess_returns`` column is written, holding the model
            output for evaluated rows and NaN for rows that were excluded.
        lags: Lag lengths selecting the ``lag_<n>`` feature columns. The
            default list is never mutated.

    Returns:
        Tuple ``(results_df, y_pred_binary, y_test)``: a one-row DataFrame of
        metrics, the predictions thresholded at 0.5, and the test targets.

    Side effects:
        Writes ``nn1_results_lag<lags joined by '_'>_full.csv`` to the
        working directory.
    """
    # Prepare combined data
    lag_columns = [f'lag_{lag}' for lag in lags]
    required = lag_columns + ['directional_target']
    df_train = crsp_train_lagged.dropna(subset=required)

    # Positional mask of test rows with complete data (same rows dropna would
    # keep); retained so predictions can be written back to matching rows.
    test_mask = crsp_test_lagged[required].notna().all(axis=1).to_numpy()
    df_test = crsp_test_lagged.loc[test_mask]

    X_train = df_train[lag_columns]
    y_train = df_train['directional_target']
    X_test = df_test[lag_columns]
    y_test = df_test['directional_target']

    model = create_nn_1model(input_dim=X_train.shape[1])

    # Callbacks
    # NOTE(review): min_lr equals create_nn_1model's default learning rate, so
    # ReduceLROnPlateau may never actually lower the rate - confirm intent.
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', mode='min', factor=0.5, patience=5, min_lr=0.0001)

    # Fit model, holding out the last 20% of the training rows for validation
    model.fit(X_train, y_train, validation_split=0.2, epochs=100, batch_size=32,
              callbacks=[early_stopping, reduce_lr], verbose=0)

    # Predict
    y_pred = model.predict(X_test).flatten()
    y_pred_binary = (y_pred > 0.5).astype(int)

    # Metrics
    accuracy = accuracy_score(y_test, y_pred_binary)
    up_accuracy = directional_accuracy(y_test[y_test == 1], y_pred_binary[y_test == 1])
    down_accuracy = directional_accuracy(y_test[y_test == 0], y_pred_binary[y_test == 0])
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    mase_value = mase(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Return a DataFrame with metrics
    results_df = pd.DataFrame([{
        'Model': 'NN1',
        'Lags': str(lags),
        'Directional Accuracy': accuracy,
        'Up Directional Accuracy': up_accuracy,
        'Down Directional Accuracy': down_accuracy,
        'R-squared': r2,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MASE': mase_value
    }])

    # Save results
    pred_df = pd.DataFrame({
        'permno': df_test['permno'].values,      # Stock identifier
        'date': df_test['date'].values,          # Date of each observation
        'y_true': df_test['excess_ret'].values,  # Actual excess returns
        'y_pred': y_pred                         # Raw model output
    })

    pred_df.to_csv(f"nn1_results_lag{'_'.join(map(str, lags))}_full.csv", index=False)

    # Store predictions for later use in portfolio construction.
    # Fix: y_pred only covers the rows kept in df_test, so assigning it to the
    # whole frame failed with a length mismatch whenever rows were dropped.
    predicted = np.full(len(crsp_test_lagged), np.nan)
    predicted[test_mask] = y_pred
    crsp_test_lagged['predicted_excess_returns'] = predicted

    return results_df, y_pred_binary, y_test


### NN2 (Two Hidden Layers, 32 and 16 Neurons)

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

# Function to calculate MASE
def mase(y_true, y_pred):
    """Return the mean absolute scaled error (MASE) of y_pred against y_true.

    Numerator: mean absolute prediction error. Denominator: mean absolute
    difference between consecutive y_true values (a one-step naive forecast).
    """
    model_mae = np.mean(np.abs(y_true - y_pred))
    naive_mae = np.mean(np.abs(np.diff(y_true)))
    return model_mae / naive_mae

# Directional accuracy
def directional_accuracy(y_true, y_pred_binary):
    """Return the fraction of positions where y_pred_binary equals y_true."""
    matches = y_true == y_pred_binary
    return np.mean(matches)

# Define NN2 architecture
def create_nn_2model(input_dim, learning_rate=0.00001, l2_reg=0.0001):
    """Build and compile the NN2 network: ReLU hidden layers of 32 and 16 units.

    Args:
        input_dim: Number of input features.
        learning_rate: Learning rate passed to the Adam optimizer.
        l2_reg: L2 penalty applied to each hidden layer's kernel.

    Returns:
        A compiled Sequential model with a single linear output unit and
        mean-squared-error loss.
    """
    network = Sequential()
    network.add(Input(shape=(input_dim,)))
    for width in (32, 16):
        network.add(Dense(width, activation='relu', kernel_regularizer=regularizers.l2(l2_reg)))
    network.add(Dense(1, activation='linear'))

    network.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')
    return network

# Global Neural Network Model (NN2)
def neural_network_nn2(crsp_train_lagged, crsp_test_lagged, lags=[5, 21, 252, 512]):
    """Train the NN2 network on lagged features and evaluate it on the test set.

    The network is fitted on ``directional_target`` and its continuous output
    is thresholded at 0.5 to obtain a direction. Rows with a missing value in
    any requested lag column or in ``directional_target`` are excluded from
    both training and evaluation.

    Args:
        crsp_train_lagged: DataFrame providing the ``lag_<n>`` columns and
            ``directional_target`` used for training.
        crsp_test_lagged: DataFrame with the same columns plus ``permno``,
            ``date`` and ``excess_ret``. It is modified in place: a
            ``predicted_excess_returns`` column is written, holding the model
            output for evaluated rows and NaN for rows that were excluded.
        lags: Lag lengths selecting the ``lag_<n>`` feature columns. The
            default list is never mutated.

    Returns:
        Tuple ``(results_df, y_pred_binary, y_test)``: a one-row DataFrame of
        metrics, the predictions thresholded at 0.5, and the test targets.

    Side effects:
        Writes ``nn2_results_lag<lags joined by '_'>_full.csv`` to the
        working directory.
    """
    # Prepare combined data
    lag_columns = [f'lag_{lag}' for lag in lags]
    required = lag_columns + ['directional_target']
    df_train = crsp_train_lagged.dropna(subset=required)

    # Positional mask of test rows with complete data (same rows dropna would
    # keep); retained so predictions can be written back to matching rows.
    test_mask = crsp_test_lagged[required].notna().all(axis=1).to_numpy()
    df_test = crsp_test_lagged.loc[test_mask]

    X_train = df_train[lag_columns]
    y_train = df_train['directional_target']
    X_test = df_test[lag_columns]
    y_test = df_test['directional_target']

    # Fix: build the two-hidden-layer NN2 network; this previously called
    # create_nn_1model, so "NN2" results were really a second NN1 run.
    model = create_nn_2model(input_dim=X_train.shape[1])

    # Callbacks
    # NOTE(review): min_lr is above create_nn_2model's default learning rate,
    # so ReduceLROnPlateau may never actually lower the rate - confirm intent.
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', mode='min', factor=0.5, patience=5, min_lr=0.0001)

    # Fit model, holding out the last 20% of the training rows for validation
    model.fit(X_train, y_train, validation_split=0.2, epochs=100, batch_size=32,
              callbacks=[early_stopping, reduce_lr], verbose=0)

    # Predict
    y_pred = model.predict(X_test).flatten()
    y_pred_binary = (y_pred > 0.5).astype(int)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred_binary)
    up_accuracy = directional_accuracy(y_test[y_test == 1], y_pred_binary[y_test == 1])
    down_accuracy = directional_accuracy(y_test[y_test == 0], y_pred_binary[y_test == 0])
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    mase_value = mase(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Return a DataFrame with metrics
    results_df = pd.DataFrame([{
        'Model': 'NN2',
        'Lags': str(lags),
        'Directional Accuracy': accuracy,
        'Up Directional Accuracy': up_accuracy,
        'Down Directional Accuracy': down_accuracy,
        'R-squared': r2,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MASE': mase_value
    }])

    # Save results
    pred_df = pd.DataFrame({
        'permno': df_test['permno'].values,      # Stock identifier
        'date': df_test['date'].values,          # Date of each observation
        'y_true': df_test['excess_ret'].values,  # Actual excess returns
        'y_pred': y_pred                         # Raw model output
    })

    pred_df.to_csv(f"nn2_results_lag{'_'.join(map(str, lags))}_full.csv", index=False)

    # Store predictions for later use in portfolio construction.
    # Fix: y_pred only covers the rows kept in df_test, so assigning it to the
    # whole frame failed with a length mismatch whenever rows were dropped.
    predicted = np.full(len(crsp_test_lagged), np.nan)
    predicted[test_mask] = y_pred
    crsp_test_lagged['predicted_excess_returns'] = predicted

    return results_df, y_pred_binary, y_test

### NN3 (Three Hidden Layers, 32, 16, and 8 Neurons)


In [None]:
# Function to calculate MASE
def mase(y_true, y_pred):
    """Return the mean absolute scaled error (MASE) of y_pred against y_true.

    The model's mean absolute error is scaled by the mean absolute change
    between consecutive y_true values, which is the error a one-step naive
    forecast would make on the same sequence.
    """
    model_mae = np.mean(np.abs(y_true - y_pred))
    naive_mae = np.mean(np.abs(np.diff(y_true)))
    return model_mae / naive_mae

# Directional accuracy
def directional_accuracy(y_true, y_pred_binary):
    """Return the share of elements of y_pred_binary that equal y_true."""
    matches = y_true == y_pred_binary
    return np.mean(matches)

# Define NN3 architecture
def create_nn_3model(input_dim, learning_rate=0.00001, l2_reg=0.0001):
    """Build and compile the NN3 network: ReLU hidden layers of 32, 16 and 8 units.

    Args:
        input_dim: Number of input features.
        learning_rate: Learning rate passed to the Adam optimizer.
        l2_reg: L2 penalty applied to each hidden layer's kernel.

    Returns:
        A compiled Sequential model with a single linear output unit and
        mean-squared-error loss.
    """
    network = Sequential()
    network.add(Input(shape=(input_dim,)))
    for width in (32, 16, 8):
        network.add(Dense(width, activation='relu', kernel_regularizer=regularizers.l2(l2_reg)))
    network.add(Dense(1, activation='linear'))

    network.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')
    return network

# Global Neural Network Model (NN3)
def neural_network_nn3(crsp_train_lagged, crsp_test_lagged, lags=[5, 21, 252, 512]):
    """Train the NN3 network on lagged features and evaluate it on the test set.

    The network is fitted on ``directional_target`` and its continuous output
    is thresholded at 0.5 to obtain a direction. Rows with a missing value in
    any requested lag column or in ``directional_target`` are excluded from
    both training and evaluation.

    Args:
        crsp_train_lagged: DataFrame providing the ``lag_<n>`` columns and
            ``directional_target`` used for training.
        crsp_test_lagged: DataFrame with the same columns plus ``permno``,
            ``date`` and ``excess_ret``. It is modified in place: a
            ``predicted_excess_returns`` column is written, holding the model
            output for evaluated rows and NaN for rows that were excluded.
        lags: Lag lengths selecting the ``lag_<n>`` feature columns. The
            default list is never mutated.

    Returns:
        Tuple ``(results_df, y_pred_binary, y_test)``: a one-row DataFrame of
        metrics, the predictions thresholded at 0.5, and the test targets.

    Side effects:
        Writes ``nn3_results_lag<lags joined by '_'>_full.csv`` to the
        working directory.
    """
    # Prepare combined data
    lag_columns = [f'lag_{lag}' for lag in lags]
    required = lag_columns + ['directional_target']
    df_train = crsp_train_lagged.dropna(subset=required)

    # Positional mask of test rows with complete data (same rows dropna would
    # keep); retained so predictions can be written back to matching rows.
    test_mask = crsp_test_lagged[required].notna().all(axis=1).to_numpy()
    df_test = crsp_test_lagged.loc[test_mask]

    X_train = df_train[lag_columns]
    y_train = df_train['directional_target']
    X_test = df_test[lag_columns]
    y_test = df_test['directional_target']

    # Fix: build the three-hidden-layer NN3 network; this previously called
    # create_nn_1model, so "NN3" results were really another NN1 run.
    model = create_nn_3model(input_dim=X_train.shape[1])

    # Callbacks
    # NOTE(review): min_lr is above create_nn_3model's default learning rate,
    # so ReduceLROnPlateau may never actually lower the rate - confirm intent.
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', mode='min', factor=0.5, patience=5, min_lr=0.0001)

    # Fit model, holding out the last 20% of the training rows for validation
    model.fit(X_train, y_train, validation_split=0.2, epochs=100, batch_size=32,
              callbacks=[early_stopping, reduce_lr], verbose=0)

    # Predict
    y_pred = model.predict(X_test).flatten()
    y_pred_binary = (y_pred > 0.5).astype(int)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred_binary)
    up_accuracy = directional_accuracy(y_test[y_test == 1], y_pred_binary[y_test == 1])
    down_accuracy = directional_accuracy(y_test[y_test == 0], y_pred_binary[y_test == 0])
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    mase_value = mase(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Return a DataFrame with metrics
    results_df = pd.DataFrame([{
        'Model': 'NN3',
        'Lags': str(lags),
        'Directional Accuracy': accuracy,
        'Up Directional Accuracy': up_accuracy,
        'Down Directional Accuracy': down_accuracy,
        'R-squared': r2,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MASE': mase_value
    }])

    # Save results
    pred_df = pd.DataFrame({
        'permno': df_test['permno'].values,      # Stock identifier
        'date': df_test['date'].values,          # Date of each observation
        'y_true': df_test['excess_ret'].values,  # Actual excess returns
        'y_pred': y_pred                         # Raw model output
    })

    pred_df.to_csv(f"nn3_results_lag{'_'.join(map(str, lags))}_full.csv", index=False)

    # Store predictions for later use in portfolio construction.
    # Fix: y_pred only covers the rows kept in df_test, so assigning it to the
    # whole frame failed with a length mismatch whenever rows were dropped.
    predicted = np.full(len(crsp_test_lagged), np.nan)
    predicted[test_mask] = y_pred
    crsp_test_lagged['predicted_excess_returns'] = predicted

    return results_df, y_pred_binary, y_test

### NN4 (Four Hidden Layers, 32, 16, 8, and 4 Neurons)

In [None]:
# Function to calculate MASE
def mase(y_true, y_pred):
    """Return the mean absolute scaled error (MASE) of y_pred against y_true.

    The mean absolute prediction error is divided by the mean absolute
    difference between consecutive y_true values (np.diff), i.e. the error
    of a one-step naive forecast over the same sequence.
    """
    model_mae = np.mean(np.abs(y_true - y_pred))
    naive_mae = np.mean(np.abs(np.diff(y_true)))
    return model_mae / naive_mae

# Directional accuracy
def directional_accuracy(y_true, y_pred_binary):
    """Return the fraction of positions where y_pred_binary equals y_true."""
    matches = y_true == y_pred_binary
    return np.mean(matches)

# Define NN4 architecture
def create_nn_4model(input_dim, learning_rate=0.00001, l2_reg=0.0001):
    """Build and compile the NN4 network: ReLU hidden layers of 32, 16, 8 and 4 units.

    Args:
        input_dim: Number of input features.
        learning_rate: Learning rate passed to the Adam optimizer.
        l2_reg: L2 penalty applied to each hidden layer's kernel.

    Returns:
        A compiled Sequential model with a single linear output unit and
        mean-squared-error loss.
    """
    network = Sequential()
    network.add(Input(shape=(input_dim,)))
    for width in (32, 16, 8, 4):
        network.add(Dense(width, activation='relu', kernel_regularizer=regularizers.l2(l2_reg)))
    network.add(Dense(1, activation='linear'))

    network.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')
    return network

# Global Neural Network Model (NN4)
def neural_network_nn4(crsp_train_lagged, crsp_test_lagged, lags=[5, 21, 252, 512]):
    """Train the NN4 network on lagged features and evaluate it on the test set.

    The network is fitted on ``directional_target`` and its continuous output
    is thresholded at 0.5 to obtain a direction. Rows with a missing value in
    any requested lag column or in ``directional_target`` are excluded from
    both training and evaluation.

    Args:
        crsp_train_lagged: DataFrame providing the ``lag_<n>`` columns and
            ``directional_target`` used for training.
        crsp_test_lagged: DataFrame with the same columns plus ``permno``,
            ``date`` and ``excess_ret``. It is modified in place: a
            ``predicted_excess_returns`` column is written, holding the model
            output for evaluated rows and NaN for rows that were excluded.
        lags: Lag lengths selecting the ``lag_<n>`` feature columns. The
            default list is never mutated.

    Returns:
        Tuple ``(results_df, y_pred_binary, y_test)``: a one-row DataFrame of
        metrics, the predictions thresholded at 0.5, and the test targets.

    Side effects:
        Writes ``nn4_results_lag<lags joined by '_'>_full.csv`` to the
        working directory.
    """
    # Prepare combined data
    lag_columns = [f'lag_{lag}' for lag in lags]
    required = lag_columns + ['directional_target']
    df_train = crsp_train_lagged.dropna(subset=required)

    # Positional mask of test rows with complete data (same rows dropna would
    # keep); retained so predictions can be written back to matching rows.
    test_mask = crsp_test_lagged[required].notna().all(axis=1).to_numpy()
    df_test = crsp_test_lagged.loc[test_mask]

    X_train = df_train[lag_columns]
    y_train = df_train['directional_target']
    X_test = df_test[lag_columns]
    y_test = df_test['directional_target']

    # Fix: build the four-hidden-layer NN4 network; this previously called
    # create_nn_1model, so "NN4" results were really another NN1 run.
    model = create_nn_4model(input_dim=X_train.shape[1])

    # Callbacks
    # NOTE(review): min_lr is above create_nn_4model's default learning rate,
    # so ReduceLROnPlateau may never actually lower the rate - confirm intent.
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', mode='min', factor=0.5, patience=5, min_lr=0.0001)

    # Fit model, holding out the last 20% of the training rows for validation
    model.fit(X_train, y_train, validation_split=0.2, epochs=100, batch_size=32,
              callbacks=[early_stopping, reduce_lr], verbose=0)

    # Predict
    y_pred = model.predict(X_test).flatten()
    y_pred_binary = (y_pred > 0.5).astype(int)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred_binary)
    up_accuracy = directional_accuracy(y_test[y_test == 1], y_pred_binary[y_test == 1])
    down_accuracy = directional_accuracy(y_test[y_test == 0], y_pred_binary[y_test == 0])
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    mase_value = mase(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Return a DataFrame with metrics
    results_df = pd.DataFrame([{
        'Model': 'NN4',
        'Lags': str(lags),
        'Directional Accuracy': accuracy,
        'Up Directional Accuracy': up_accuracy,
        'Down Directional Accuracy': down_accuracy,
        'R-squared': r2,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MASE': mase_value
    }])

    # Save results
    pred_df = pd.DataFrame({
        'permno': df_test['permno'].values,      # Stock identifier
        'date': df_test['date'].values,          # Date of each observation
        'y_true': df_test['excess_ret'].values,  # Actual excess returns
        'y_pred': y_pred                         # Raw model output
    })

    pred_df.to_csv(f"nn4_results_lag{'_'.join(map(str, lags))}_full.csv", index=False)

    # Store predictions for later use in portfolio construction.
    # Fix: y_pred only covers the rows kept in df_test, so assigning it to the
    # whole frame failed with a length mismatch whenever rows were dropped.
    predicted = np.full(len(crsp_test_lagged), np.nan)
    predicted[test_mask] = y_pred
    crsp_test_lagged['predicted_excess_returns'] = predicted

    return results_df, y_pred_binary, y_test

### NN5 (Five Hidden Layers, 32, 16, 8, 4, and 2 Neurons)


In [None]:
# Function to calculate MASE
def mase(y_true, y_pred):
    """Return the mean absolute scaled error (MASE) of y_pred against y_true.

    Numerator: mean absolute prediction error. Denominator: mean absolute
    difference between consecutive y_true values (a one-step naive forecast).
    """
    model_mae = np.mean(np.abs(y_true - y_pred))
    naive_mae = np.mean(np.abs(np.diff(y_true)))
    return model_mae / naive_mae

# Directional accuracy
def directional_accuracy(y_true, y_pred_binary):
    """Return the share of elements of y_pred_binary that equal y_true."""
    matches = y_true == y_pred_binary
    return np.mean(matches)

# Define NN5 architecture
def create_nn_5model(input_dim, learning_rate=0.00001, l2_reg=0.0001):
    """Build and compile the NN5 network: ReLU hidden layers of 32, 16, 8, 4 and 2 units.

    Args:
        input_dim: Number of input features.
        learning_rate: Learning rate passed to the Adam optimizer.
        l2_reg: L2 penalty applied to each hidden layer's kernel.

    Returns:
        A compiled Sequential model with a single linear output unit and
        mean-squared-error loss.
    """
    network = Sequential()
    network.add(Input(shape=(input_dim,)))
    for width in (32, 16, 8, 4, 2):
        network.add(Dense(width, activation='relu', kernel_regularizer=regularizers.l2(l2_reg)))
    network.add(Dense(1, activation='linear'))

    network.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')
    return network

# Global Neural Network Model (NN5)
def neural_network_nn5(crsp_train_lagged, crsp_test_lagged, lags=[5, 21, 252, 512]):
    """Train the NN5 network on lagged features and evaluate it on the test set.

    The network is fitted on ``directional_target`` and its continuous output
    is thresholded at 0.5 to obtain a direction. Rows with a missing value in
    any requested lag column or in ``directional_target`` are excluded from
    both training and evaluation.

    Args:
        crsp_train_lagged: DataFrame providing the ``lag_<n>`` columns and
            ``directional_target`` used for training.
        crsp_test_lagged: DataFrame with the same columns plus ``permno``,
            ``date`` and ``excess_ret``. It is modified in place: a
            ``predicted_excess_returns`` column is written, holding the model
            output for evaluated rows and NaN for rows that were excluded.
        lags: Lag lengths selecting the ``lag_<n>`` feature columns. The
            default list is never mutated.

    Returns:
        Tuple ``(results_df, y_pred_binary, y_test)``: a one-row DataFrame of
        metrics, the predictions thresholded at 0.5, and the test targets.

    Side effects:
        Writes ``nn5_results_lag<lags joined by '_'>_full.csv`` to the
        working directory.
    """
    # Prepare combined data
    lag_columns = [f'lag_{lag}' for lag in lags]
    required = lag_columns + ['directional_target']
    df_train = crsp_train_lagged.dropna(subset=required)

    # Positional mask of test rows with complete data (same rows dropna would
    # keep); retained so predictions can be written back to matching rows.
    test_mask = crsp_test_lagged[required].notna().all(axis=1).to_numpy()
    df_test = crsp_test_lagged.loc[test_mask]

    X_train = df_train[lag_columns]
    y_train = df_train['directional_target']
    X_test = df_test[lag_columns]
    y_test = df_test['directional_target']

    # Fix: build the five-hidden-layer NN5 network; this previously called
    # create_nn_1model, so "NN5" results were really another NN1 run.
    model = create_nn_5model(input_dim=X_train.shape[1])

    # Callbacks
    # NOTE(review): min_lr is above create_nn_5model's default learning rate,
    # so ReduceLROnPlateau may never actually lower the rate - confirm intent.
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', mode='min', factor=0.5, patience=5, min_lr=0.0001)

    # Fit model, holding out the last 20% of the training rows for validation
    model.fit(X_train, y_train, validation_split=0.2, epochs=100, batch_size=32,
              callbacks=[early_stopping, reduce_lr], verbose=0)

    # Predict
    y_pred = model.predict(X_test).flatten()
    y_pred_binary = (y_pred > 0.5).astype(int)

    # Metrics
    accuracy = accuracy_score(y_test, y_pred_binary)
    up_accuracy = directional_accuracy(y_test[y_test == 1], y_pred_binary[y_test == 1])
    down_accuracy = directional_accuracy(y_test[y_test == 0], y_pred_binary[y_test == 0])
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    mase_value = mase(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Return a DataFrame with metrics
    # Fix: the row was labelled 'NN2', which made NN5 results indistinguishable
    # from NN2 once the per-model frames are combined.
    results_df = pd.DataFrame([{
        'Model': 'NN5',
        'Lags': str(lags),
        'Directional Accuracy': accuracy,
        'Up Directional Accuracy': up_accuracy,
        'Down Directional Accuracy': down_accuracy,
        'R-squared': r2,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MASE': mase_value
    }])

    # Save results
    pred_df = pd.DataFrame({
        'permno': df_test['permno'].values,      # Stock identifier
        'date': df_test['date'].values,          # Date of each observation
        'y_true': df_test['excess_ret'].values,  # Actual excess returns
        'y_pred': y_pred                         # Raw model output
    })

    pred_df.to_csv(f"nn5_results_lag{'_'.join(map(str, lags))}_full.csv", index=False)

    # Store predictions for later use in portfolio construction.
    # Fix: y_pred only covers the rows kept in df_test, so assigning it to the
    # whole frame failed with a length mismatch whenever rows were dropped.
    predicted = np.full(len(crsp_test_lagged), np.nan)
    predicted[test_mask] = y_pred
    crsp_test_lagged['predicted_excess_returns'] = predicted

    return results_df, y_pred_binary, y_test

# **05. Run Rolling Forecast**

In [None]:
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

def rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='ols', lag=5, alpha=1.0):
    """Run a single forecasting model and return its evaluation results.

    Parameters
    ----------
    crsp_train_lagged, crsp_test_lagged : pandas.DataFrame
        Training and test frames. For the ``'ols'`` baseline they must contain
        the ``lag_5``, ``lag_21``, ``lag_252``, ``lag_512`` columns and the
        ``directional_target`` column.
    model_type : str
        Name of a direct model (``'ols_h'``, ``'lasso'``, ``'ridge'``,
        ``'elasticnet'``, ``'pcr'``, ``'glm'``, ``'randomforest'``, ``'gbrt'``,
        ``'nn1'`` .. ``'nn5'``) or ``'ols'`` for a plain least-squares baseline.
    lag : int or list
        Forwarded unchanged to the direct models. The ``'ols'`` baseline always
        uses the fixed lag set above and only records ``lag`` in its output.
    alpha : float
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    The selected direct model's own return value, or a one-row DataFrame of
    metrics for the ``'ols'`` baseline.

    Raises
    ------
    ValueError
        If ``model_type`` is not a supported model name.
    """
    # DIRECT MODELS
    # Lambdas keep the name lookup lazy, so only the requested model function
    # has to be defined when this is called.
    direct_models = {
        'ols_h': lambda: ols_huber_model(crsp_train_lagged, crsp_test_lagged, lag),
        'lasso': lambda: lasso_model(crsp_train_lagged, crsp_test_lagged, lag),
        'ridge': lambda: ridge_model(crsp_train_lagged, crsp_test_lagged, lag),
        'elasticnet': lambda: elasticnet_model_with_tuning(crsp_train_lagged, crsp_test_lagged, lag),
        'pcr': lambda: pcr_model(crsp_train_lagged, crsp_test_lagged, lag),
        'glm': lambda: glm_model(crsp_train_lagged, crsp_test_lagged, lag),
        'randomforest': lambda: random_forest_model(crsp_train_lagged, crsp_test_lagged, lag),
        'gbrt': lambda: gbrt_model(crsp_train_lagged, crsp_test_lagged, lag),
        'nn1': lambda: neural_network_nn1(crsp_train_lagged, crsp_test_lagged, lag),
        'nn2': lambda: neural_network_nn2(crsp_train_lagged, crsp_test_lagged, lag),
        'nn3': lambda: neural_network_nn3(crsp_train_lagged, crsp_test_lagged, lag),
        'nn4': lambda: neural_network_nn4(crsp_train_lagged, crsp_test_lagged, lag),
        'nn5': lambda: neural_network_nn5(crsp_train_lagged, crsp_test_lagged, lag),
    }

    # Dispatch first: the direct models prepare their own inputs, so the
    # feature frames below are only needed for the baseline path.
    if model_type in direct_models:
        return direct_models[model_type]()

    if model_type != 'ols':
        supported = sorted(direct_models) + ['ols']
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported}"
        )

    # Previously this path referenced an undefined `model`; the default
    # model_type='ols' is served by an ordinary least-squares fit.
    from sklearn.linear_model import LinearRegression
    model = LinearRegression()

    # Prepare full-dataset input
    lag_features = [f'lag_{i}' for i in [5, 21, 252, 512]]
    required = lag_features + ['directional_target']
    df_train = crsp_train_lagged.dropna(subset=required)
    df_test = crsp_test_lagged.dropna(subset=required)

    X_train = df_train[lag_features]
    y_train = df_train['directional_target']
    X_test = df_test[lag_features]
    y_test = df_test['directional_target']

    # Fit and predict
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_binary = np.where(y_pred > 0.5, 1, 0)

    # Metrics
    accuracy = accuracy_score(y_test, y_pred_binary)

    # Accuracy within the up days and within the down days, matching how the
    # neural-network models in this file report these two columns.
    up_mask = (y_test == 1).to_numpy()
    down_mask = (y_test == 0).to_numpy()
    up_directional_accuracy = (
        np.mean(y_pred_binary[up_mask] == 1) if up_mask.any() else np.nan
    )
    down_directional_accuracy = (
        np.mean(y_pred_binary[down_mask] == 0) if down_mask.any() else np.nan
    )

    n = len(y_test)
    p = X_train.shape[1]
    r2_value = r2_score(y_test, y_pred)
    # Adjusted R^2 is undefined without residual degrees of freedom.
    dof = n - p - 1
    adjusted_r2 = 1 - (1 - r2_value) * (n - 1) / dof if dof > 0 else np.nan
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    mase_value = mase(y_test, y_pred)

    # Store result
    return pd.DataFrame([{
        'Model': model_type.upper(),
        'Directional Accuracy': accuracy,
        'Up Directional Accuracy': up_directional_accuracy,
        'Down Directional Accuracy': down_directional_accuracy,
        'R-squared': adjusted_r2,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MASE': mase_value,
        'Lag': lag
    }])

# **06. Rename Market Cap**

In [None]:
# Attach each test observation's own market cap (same stock, same date).
# Joining on permno alone pairs every test row with every dated market-cap row
# of that stock, and the follow-up join on permco/date multiplies rows again.
# NOTE(review): assumes crsp_data has a 'date' column whose dtype matches
# crsp_test_lagged['date'] — confirm before relying on the values.
market_cap_lookup = (
    crsp_data[['permno', 'date', 'market_cap']]
    .drop_duplicates(subset=['permno', 'date'])
    .rename(columns={'market_cap': 'market_cap_merged'})
)

# Drop a stale column first so re-running the cell does not create _x/_y copies.
merged_df = crsp_test_lagged.drop(columns='market_cap_merged', errors='ignore').merge(
    market_cap_lookup,
    on=['permno', 'date'],
    how='left',
    validate='many_to_one',
)

# Same row count as before the merge, with 'market_cap_merged' added
crsp_test_lagged = merged_df.copy()

# **07. Results**

## OLS

In [None]:
# OLS + Huber using lag 5
ols_h_lag5 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='ols_h',
    lag=[5],
)
print("OLS + H Results (Lag 5):")
display(ols_h_lag5)

In [None]:
# OLS + Huber using lag 21
ols_h_lag21 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='ols_h',
    lag=[21],
)
print("OLS + H Results (Lag 21):")
display(ols_h_lag21)

In [None]:
# OLS + Huber using lag 252
ols_h_lag252 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='ols_h',
    lag=[252],
)
print("OLS + H Results (Lag 252):")
display(ols_h_lag252)

In [None]:
# OLS + Huber using lag 512
ols_h_lag512 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='ols_h',
    lag=[512],
)
print("OLS + H Results (Lag 512):")
display(ols_h_lag512)

In [None]:
# lag 512 (this cell duplicated the previous one): show the result that was
# already computed instead of fitting the identical model a second time
print("OLS + H Results (Lag 512):")
display(ols_h_lag512)

## OLS+H Portfolio

### Lag 5

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost (0.001); the ``row`` argument is ignored."""
    return 0.001  # 10 bps for both small and large cap stocks

# The cost does not depend on the row, so broadcast the scalar rather than
# invoking a Python function once per row through DataFrame.apply.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

def ols_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the OLS + Huber model and keep its predictions under a lag-5 column.

    Parameters
    ----------
    crsp_train_lagged, crsp_test_lagged : pandas.DataFrame
        Frames passed straight to ``ols_huber_model``.
    lags : list, optional
        Lags forwarded to ``ols_huber_model``; defaults to ``[5]``.

    Returns
    -------
    pandas.DataFrame
        ``crsp_test_lagged`` itself (modified in place) with the added column
        ``'ols_5_predicted_excess_returns'``.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if lags is None:
        lags = [5]

    # The returned metrics are not needed here; this function reads the
    # 'predicted_excess_returns' column from crsp_test_lagged afterwards, so
    # the model call is expected to have written it there.
    ols_huber_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy into a lag-specific column so a later model run cannot overwrite it
    crsp_test_lagged['ols_5_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Get predicted excess returns using OLS+Huber model
crsp_test_lagged = ols_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, prediction_col='ols_5_predicted_excess_returns', quantile=0.1):
    """Compute long, short and long-short log returns for one date's stocks.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for a single date with the columns ``prediction_col``,
        ``'adjusted_ret'``, ``'transaction_cost'`` and ``'market_cap_merged'``.
    prediction_col : str
        Column used to rank stocks; defaults to the lag-5 OLS predictions.
    quantile : float
        Fraction of the group placed in each leg (default 0.1, i.e. 10%).

    Returns
    -------
    dict
        Twelve entries: equal- and value-weighted long, short and long-short
        log returns, each with and without transaction cost. Values are NaN
        when a leg is empty (``int(quantile * len(group)) == 0``).
    """
    n_positions = int(quantile * len(group))
    # Long position (highest predictions) and short position (lowest predictions)
    top_positive = group.nlargest(n_positions, prediction_col)
    top_negative = group.nsmallest(n_positions, prediction_col)

    def _leg_stats(leg):
        # Equal-weighted and market-cap-weighted mean log return, plus mean cost
        log_returns = np.log1p(leg['adjusted_ret'])
        market_caps = leg['market_cap_merged']
        equal_weighted = log_returns.mean()
        value_weighted = (log_returns * market_caps).sum() / market_caps.sum()
        return equal_weighted, value_weighted, leg['transaction_cost'].mean()

    long_equal, long_value, long_cost = _leg_stats(top_positive)
    short_equal, short_value, short_cost = _leg_stats(top_negative)

    # Long legs earn the return; the cost is subtracted
    equal_long_log_return_with_cost = long_equal - long_cost
    equal_long_log_return_without_cost = long_equal
    value_long_log_return_with_cost = long_value - long_cost
    value_long_log_return_without_cost = long_value

    # Short legs earn the negated return; the cost is still subtracted
    equal_short_log_return_with_cost = -short_equal - short_cost
    equal_short_log_return_without_cost = -short_equal
    value_short_log_return_with_cost = -(short_value + short_cost)
    value_short_log_return_without_cost = -short_value

    # Long-short is the sum of the two legs
    return {
        'equal_long_log_return_with_cost': equal_long_log_return_with_cost,
        'equal_short_log_return_with_cost': equal_short_log_return_with_cost,
        'equal_long_short_log_return_with_cost': equal_long_log_return_with_cost + equal_short_log_return_with_cost,
        'equal_long_log_return_without_cost': equal_long_log_return_without_cost,
        'equal_short_log_return_without_cost': equal_short_log_return_without_cost,
        'equal_long_short_log_return_without_cost': equal_long_log_return_without_cost + equal_short_log_return_without_cost,
        'value_long_log_return_with_cost': value_long_log_return_with_cost,
        'value_short_log_return_with_cost': value_short_log_return_with_cost,
        'value_long_short_log_return_with_cost': value_long_log_return_with_cost + value_short_log_return_with_cost,
        'value_long_log_return_without_cost': value_long_log_return_without_cost,
        'value_short_log_return_without_cost': value_short_log_return_without_cost,
        'value_long_short_log_return_without_cost': value_long_log_return_without_cost + value_short_log_return_without_cost
    }

# Compute cumulative returns for each date with daily rebalancing
# Output column -> key of the matching daily return in compute_returns' result
_ols5_return_keys = {
    'cum_EL_return_5_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_5_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_5_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_5_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_5_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_5_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_5_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_5_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_5_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_5_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_5_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_5_without_cost': 'value_long_short_log_return_without_cost',
}

cumulative_log_returns_by_date_ols5 = {'date': []}
cumulative_log_returns_by_date_ols5.update({column: [] for column in _ols5_return_keys})

# Running sums of daily log returns, all starting at 0
_ols5_running = dict.fromkeys(_ols5_return_keys, 0)

# One groupby pass replaces re-filtering the whole frame for every date.
# sort=False visits dates in order of first appearance, as the original loop
# did; rows with a missing date are skipped by groupby.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 5
    returns = compute_returns(group)

    cumulative_log_returns_by_date_ols5['date'].append(date)
    for column, return_key in _ols5_return_keys.items():
        _ols5_running[column] += returns[return_key]
        cumulative_log_returns_by_date_ols5[column].append(_ols5_running[column])

# Keep the final totals available under their original module-level names
cum_EL_return_5_with_cost = _ols5_running['cum_EL_return_5_with_cost']
cum_ES_return_5_with_cost = _ols5_running['cum_ES_return_5_with_cost']
cum_ELS_return_5_with_cost = _ols5_running['cum_ELS_return_5_with_cost']
cum_VL_return_5_with_cost = _ols5_running['cum_VL_return_5_with_cost']
cum_VS_return_5_with_cost = _ols5_running['cum_VS_return_5_with_cost']
cum_VLS_return_5_with_cost = _ols5_running['cum_VLS_return_5_with_cost']

cum_EL_return_5_without_cost = _ols5_running['cum_EL_return_5_without_cost']
cum_ES_return_5_without_cost = _ols5_running['cum_ES_return_5_without_cost']
cum_ELS_return_5_without_cost = _ols5_running['cum_ELS_return_5_without_cost']
cum_VL_return_5_without_cost = _ols5_running['cum_VL_return_5_without_cost']
cum_VS_return_5_without_cost = _ols5_running['cum_VS_return_5_without_cost']
cum_VLS_return_5_without_cost = _ols5_running['cum_VLS_return_5_without_cost']

# Convert to DataFrame for lag 5
cumulative_log_returns_ols_lag_5 = pd.DataFrame(cumulative_log_returns_by_date_ols5)

# Display the cumulative returns DataFrame for lag 5
display(cumulative_log_returns_ols_lag_5.head())

#### With / Without Transaction Cost

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound the per-period returns and return the total return."""
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Scale the compounded return of the sample to a ``periods``-long year."""
    total_return = np.prod(1 + daily_returns) - 1
    # Number of "years" covered by the sample is len / periods
    exponent = periods / len(daily_returns)
    annual_growth = (1 + total_return) ** exponent
    return annual_growth - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized ratio of mean to std of returns above the risk-free rate."""
    # Spread the annual risk-free rate evenly over the periods in a year
    per_period_rf = risk_free_rate / periods
    excess_returns = daily_returns - per_period_rf
    mean_excess = excess_returns.mean()
    std_excess = excess_returns.std()
    return np.sqrt(periods) * mean_excess / std_excess

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    return np.std(daily_returns) * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest relative drop of compounded wealth below its running peak."""
    wealth = np.cumprod(1 + daily_returns)
    running_peak = np.maximum.accumulate(wealth)
    # Zero at a new peak, negative while below the previous peak
    relative_drop = (wealth - running_peak) / running_peak
    return np.min(relative_drop)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return the returns with ``transaction_cost`` subtracted from every period."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Return the returns scaled by ``1 - transaction_cost_percentage``."""
    retained_fraction = 1 - transaction_cost_percentage
    return daily_returns * retained_fraction

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_5_with_cost', 'cum_ES_return_5_with_cost', 'cum_ELS_return_5_with_cost',
    'cum_VL_return_5_with_cost', 'cum_VS_return_5_with_cost', 'cum_VLS_return_5_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_ols5_c = cumulative_log_returns_ols_lag_5[portfolio].values

    # Daily returns recovered from the cumulative log returns. These series
    # already have the transaction cost deducted in compute_returns, so no
    # further cost is applied here (subtracting it again double-counted it),
    # and every metric below is computed on this same net series.
    # NOTE(review): np.diff drops the first day, and the differences are log
    # returns that the helpers compound as simple returns — confirm intended.
    daily_returns = np.diff(cumulative_returns_ols5_c)

    row = {
        'Portfolio': portfolio,
        'Annualized Return': annualized_return(daily_returns),
        'Sharpe Ratio': sharpe_ratio(daily_returns),
        'Volatility': calculate_volatility(daily_returns),
        'Standard Deviation': np.std(daily_returns),
        'Max Drawdown': maximum_drawdown(daily_returns),
        'Cumulative Return': cumulative_return(daily_returns),
    }

    # Store results
    for column, value in row.items():
        metrics[column].append(value)

# Convert the results into a DataFrame for analysis
metrics_ols5_c = pd.DataFrame(metrics)
display(metrics_ols5_c)

# Do same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_5_without_cost', 'cum_ES_return_5_without_cost', 'cum_ELS_return_5_without_cost',
    'cum_VL_return_5_without_cost', 'cum_VS_return_5_without_cost', 'cum_VLS_return_5_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    column: []
    for column in (
        'Portfolio',
        'Annualized Return',
        'Sharpe Ratio',
        'Volatility',
        'Standard Deviation',
        'Max Drawdown',
        'Cumulative Return',
    )
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_ols5_wc = cumulative_log_returns_ols_lag_5[portfolio].values

    # Day-over-day differences of the cumulative log return series
    # NOTE(review): np.diff drops the first day, and the differences are log
    # returns that the helpers compound as simple returns — confirm intended.
    daily_returns = np.diff(cumulative_returns_ols5_wc)

    # Metrics calculations
    cum_return_without_cost = cumulative_return(daily_returns)
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)
    std_dev = np.std(daily_returns)

    # Store results
    row = {
        'Portfolio': portfolio,
        'Annualized Return': ann_return,
        'Sharpe Ratio': sharpe,
        'Volatility': vol,
        'Standard Deviation': std_dev,
        'Max Drawdown': max_draw,
        'Cumulative Return': cum_return_without_cost,
    }
    for column, value in row.items():
        metrics_wc[column].append(value)

# Convert the results into a DataFrame for analysis
metrics_ols5_wc = pd.DataFrame(metrics_wc)
display(metrics_ols5_wc)

### Lag 21

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost (0.001); the ``row`` argument is ignored."""
    return 0.001  # 10 bps for both small and large cap stocks

# The cost does not depend on the row, so broadcast the scalar rather than
# invoking a Python function once per row through DataFrame.apply.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `ols_huber_model` to predict excess returns
def ols_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the OLS + Huber model and keep its predictions under a lag-21 column.

    Parameters
    ----------
    crsp_train_lagged, crsp_test_lagged : pandas.DataFrame
        Frames passed straight to ``ols_huber_model``.
    lags : list, optional
        Lags forwarded to ``ols_huber_model``; defaults to ``[21]``.

    Returns
    -------
    pandas.DataFrame
        ``crsp_test_lagged`` itself (modified in place) with the added column
        ``'ols_21_predicted_excess_returns'``.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if lags is None:
        lags = [21]

    # The returned metrics are not needed here; this function reads the
    # 'predicted_excess_returns' column from crsp_test_lagged afterwards, so
    # the model call is expected to have written it there.
    ols_huber_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy into a lag-specific column so a later model run cannot overwrite it
    crsp_test_lagged['ols_21_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Get predicted excess returns using OLS+Huber model
crsp_test_lagged = ols_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, prediction_col='ols_21_predicted_excess_returns', quantile=0.1):
    """Compute long, short and long-short log returns for one date's stocks.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for a single date with the columns ``prediction_col``,
        ``'adjusted_ret'``, ``'transaction_cost'`` and ``'market_cap_merged'``.
    prediction_col : str
        Column used to rank stocks; defaults to the lag-21 OLS predictions.
    quantile : float
        Fraction of the group placed in each leg (default 0.1, i.e. 10%).

    Returns
    -------
    dict
        Twelve entries: equal- and value-weighted long, short and long-short
        log returns, each with and without transaction cost. Values are NaN
        when a leg is empty (``int(quantile * len(group)) == 0``).
    """
    n_positions = int(quantile * len(group))
    # Long position (highest predictions) and short position (lowest predictions)
    top_positive = group.nlargest(n_positions, prediction_col)
    top_negative = group.nsmallest(n_positions, prediction_col)

    def _leg_stats(leg):
        # Equal-weighted and market-cap-weighted mean log return, plus mean cost
        log_returns = np.log1p(leg['adjusted_ret'])
        market_caps = leg['market_cap_merged']
        equal_weighted = log_returns.mean()
        value_weighted = (log_returns * market_caps).sum() / market_caps.sum()
        return equal_weighted, value_weighted, leg['transaction_cost'].mean()

    long_equal, long_value, long_cost = _leg_stats(top_positive)
    short_equal, short_value, short_cost = _leg_stats(top_negative)

    # Long legs earn the return; the cost is subtracted
    equal_long_log_return_with_cost = long_equal - long_cost
    equal_long_log_return_without_cost = long_equal
    value_long_log_return_with_cost = long_value - long_cost
    value_long_log_return_without_cost = long_value

    # Short legs earn the negated return; the cost is still subtracted
    equal_short_log_return_with_cost = -short_equal - short_cost
    equal_short_log_return_without_cost = -short_equal
    value_short_log_return_with_cost = -(short_value + short_cost)
    value_short_log_return_without_cost = -short_value

    # Long-short is the sum of the two legs
    return {
        'equal_long_log_return_with_cost': equal_long_log_return_with_cost,
        'equal_short_log_return_with_cost': equal_short_log_return_with_cost,
        'equal_long_short_log_return_with_cost': equal_long_log_return_with_cost + equal_short_log_return_with_cost,
        'equal_long_log_return_without_cost': equal_long_log_return_without_cost,
        'equal_short_log_return_without_cost': equal_short_log_return_without_cost,
        'equal_long_short_log_return_without_cost': equal_long_log_return_without_cost + equal_short_log_return_without_cost,
        'value_long_log_return_with_cost': value_long_log_return_with_cost,
        'value_short_log_return_with_cost': value_short_log_return_with_cost,
        'value_long_short_log_return_with_cost': value_long_log_return_with_cost + value_short_log_return_with_cost,
        'value_long_log_return_without_cost': value_long_log_return_without_cost,
        'value_short_log_return_without_cost': value_short_log_return_without_cost,
        'value_long_short_log_return_without_cost': value_long_log_return_without_cost + value_short_log_return_without_cost
    }

# Compute cumulative returns for each date with daily rebalancing
# Output column -> key of the matching daily return in compute_returns' result
_ols21_return_keys = {
    'cum_EL_return_21_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_21_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_21_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_21_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_21_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_21_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_21_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_21_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_21_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_21_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_21_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_21_without_cost': 'value_long_short_log_return_without_cost',
}

cumulative_log_returns_by_date_ols21 = {'date': []}
cumulative_log_returns_by_date_ols21.update({column: [] for column in _ols21_return_keys})

# Running sums of daily log returns, all starting at 0
_ols21_running = dict.fromkeys(_ols21_return_keys, 0)

# One groupby pass replaces re-filtering the whole frame for every date.
# sort=False visits dates in order of first appearance, as the original loop
# did; rows with a missing date are skipped by groupby.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 21
    returns = compute_returns(group)

    cumulative_log_returns_by_date_ols21['date'].append(date)
    for column, return_key in _ols21_return_keys.items():
        _ols21_running[column] += returns[return_key]
        cumulative_log_returns_by_date_ols21[column].append(_ols21_running[column])

# Keep the final totals available under their original module-level names
cum_EL_return_21_with_cost = _ols21_running['cum_EL_return_21_with_cost']
cum_ES_return_21_with_cost = _ols21_running['cum_ES_return_21_with_cost']
cum_ELS_return_21_with_cost = _ols21_running['cum_ELS_return_21_with_cost']
cum_VL_return_21_with_cost = _ols21_running['cum_VL_return_21_with_cost']
cum_VS_return_21_with_cost = _ols21_running['cum_VS_return_21_with_cost']
cum_VLS_return_21_with_cost = _ols21_running['cum_VLS_return_21_with_cost']

cum_EL_return_21_without_cost = _ols21_running['cum_EL_return_21_without_cost']
cum_ES_return_21_without_cost = _ols21_running['cum_ES_return_21_without_cost']
cum_ELS_return_21_without_cost = _ols21_running['cum_ELS_return_21_without_cost']
cum_VL_return_21_without_cost = _ols21_running['cum_VL_return_21_without_cost']
cum_VS_return_21_without_cost = _ols21_running['cum_VS_return_21_without_cost']
cum_VLS_return_21_without_cost = _ols21_running['cum_VLS_return_21_without_cost']

# Convert to DataFrame for lag 21
cumulative_log_returns_ols_lag_21 = pd.DataFrame(cumulative_log_returns_by_date_ols21)

# Display the cumulative returns DataFrame for lag 21
display(cumulative_log_returns_ols_lag_21.head())

#### With / Without Transaction Cost

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound the per-period returns and return the total return."""
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Scale the compounded return of the sample to a ``periods``-long year."""
    total_return = np.prod(1 + daily_returns) - 1
    # Number of "years" covered by the sample is len / periods
    exponent = periods / len(daily_returns)
    annual_growth = (1 + total_return) ** exponent
    return annual_growth - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized ratio of mean to std of returns above the risk-free rate."""
    # Spread the annual risk-free rate evenly over the periods in a year
    per_period_rf = risk_free_rate / periods
    excess_returns = daily_returns - per_period_rf
    mean_excess = excess_returns.mean()
    std_excess = excess_returns.std()
    return np.sqrt(periods) * mean_excess / std_excess

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    return np.std(daily_returns) * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest relative drop of compounded wealth below its running peak."""
    wealth = np.cumprod(1 + daily_returns)
    running_peak = np.maximum.accumulate(wealth)
    # Zero at a new peak, negative while below the previous peak
    relative_drop = (wealth - running_peak) / running_peak
    return np.min(relative_drop)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return the returns with ``transaction_cost`` subtracted from every period."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Return the returns scaled by ``1 - transaction_cost_percentage``."""
    retained_fraction = 1 - transaction_cost_percentage
    return daily_returns * retained_fraction

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_21_with_cost', 'cum_ES_return_21_with_cost', 'cum_ELS_return_21_with_cost',
    'cum_VL_return_21_with_cost', 'cum_VS_return_21_with_cost', 'cum_VLS_return_21_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_ols21_c = cumulative_log_returns_ols_lag_21[portfolio].values

    # Daily returns recovered from the cumulative log returns. These series
    # already have the transaction cost deducted in compute_returns, so no
    # further cost is applied here (subtracting it again double-counted it),
    # and every metric below is computed on this same net series.
    # NOTE(review): np.diff drops the first day, and the differences are log
    # returns that the helpers compound as simple returns — confirm intended.
    daily_returns = np.diff(cumulative_returns_ols21_c)

    row = {
        'Portfolio': portfolio,
        'Annualized Return': annualized_return(daily_returns),
        'Sharpe Ratio': sharpe_ratio(daily_returns),
        'Volatility': calculate_volatility(daily_returns),
        'Standard Deviation': np.std(daily_returns),
        'Max Drawdown': maximum_drawdown(daily_returns),
        'Cumulative Return': cumulative_return(daily_returns),
    }

    # Store results
    for column, value in row.items():
        metrics[column].append(value)

# Convert the results into a DataFrame for analysis
metrics_ols21_c = pd.DataFrame(metrics)
display(metrics_ols21_c)


# Do same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_21_without_cost', 'cum_ES_return_21_without_cost', 'cum_ELS_return_21_without_cost',
    'cum_VL_return_21_without_cost', 'cum_VS_return_21_without_cost', 'cum_VLS_return_21_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    column: []
    for column in (
        'Portfolio',
        'Annualized Return',
        'Sharpe Ratio',
        'Volatility',
        'Standard Deviation',
        'Max Drawdown',
        'Cumulative Return',
    )
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_ols21_wc = cumulative_log_returns_ols_lag_21[portfolio].values

    # Day-over-day differences of the cumulative log return series
    # NOTE(review): np.diff drops the first day, and the differences are log
    # returns that the helpers compound as simple returns — confirm intended.
    daily_returns = np.diff(cumulative_returns_ols21_wc)

    # Metrics calculations
    cum_return_without_cost = cumulative_return(daily_returns)
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)
    std_dev = np.std(daily_returns)

    # Store results
    row = {
        'Portfolio': portfolio,
        'Annualized Return': ann_return,
        'Sharpe Ratio': sharpe,
        'Volatility': vol,
        'Standard Deviation': std_dev,
        'Max Drawdown': max_draw,
        'Cumulative Return': cum_return_without_cost,
    }
    for column, value in row.items():
        metrics_wc[column].append(value)

# Convert the results into a DataFrame for analysis
metrics_ols21_wc = pd.DataFrame(metrics_wc)
display(metrics_ols21_wc)

### Lag 252

In [None]:
# Flat transaction cost of 10 bps (0.001) per position
def calculate_transaction_cost(row):
    """Return the transaction cost for one row as a decimal fraction.

    `row` is accepted so the function can be handed to
    `DataFrame.apply(..., axis=1)`; it is not inspected, and the same cost is
    returned for every row.
    """
    flat_cost = 0.001  # 10 bps for both small and large cap stocks
    return flat_cost

# Evaluate the cost function once per row and keep the result as a column
crsp_test_lagged['transaction_cost'] = crsp_test_lagged.apply(
    calculate_transaction_cost,
    axis='columns',
)

# Use the previously defined `ols_huber_model` to predict excess returns (lag 252)
def ols_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the OLS + Huber model and keep its predictions in a dedicated column.

    Parameters
    ----------
    crsp_train_lagged : DataFrame
        Passed to `ols_huber_model` as the training set.
    crsp_test_lagged : DataFrame
        Passed to `ols_huber_model` as the test set; modified in place.
    lags : list of int, optional
        Lags forwarded to `ols_huber_model`. Defaults to `[252]`.

    Returns
    -------
    DataFrame
        The same `crsp_test_lagged` object, with the column
        'ols_252_predicted_excess_returns' added.
    """
    # Build the default per call so no list object is shared between calls
    if lags is None:
        lags = [252]

    # NOTE(review): the model's return value is not used. This relies on
    # `ols_huber_model` writing 'predicted_excess_returns' into
    # crsp_test_lagged in place; if it does not, the column copied below is
    # left over from an earlier model run -- confirm against its definition.
    ols_huber_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the generic prediction column under a lag-specific name so that a
    # later model run cannot overwrite it
    crsp_test_lagged['ols_252_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Add the lag-252 OLS+Huber predictions to the test set
crsp_test_lagged = ols_252_predicted_excess_returns(
    crsp_train_lagged=crsp_train_lagged,
    crsp_test_lagged=crsp_test_lagged,
)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, signal_col='ols_252_predicted_excess_returns', fraction=0.1):
    """Compute one period's long, short and long-short portfolio log returns.

    Parameters
    ----------
    group : DataFrame
        Cross-section for a single date. Must contain `signal_col`,
        'adjusted_ret', 'market_cap_merged' and 'transaction_cost'.
    signal_col : str, optional
        Column used to rank the stocks. Defaults to the lag-252 OLS signal.
    fraction : float, optional
        Share of the cross-section placed in each leg (default 0.1 = 10%).

    Returns
    -------
    dict
        Twelve values keyed '<equal|value>_<long|short|long_short>_log_return_
        <with|without>_cost'. Short-leg returns are the negated mean log
        return of the bottom-ranked stocks.
    """
    # Always keep at least one stock per leg: with fewer than 1/fraction rows
    # int() truncates to 0, both legs are empty and every return becomes NaN.
    n_select = max(1, int(fraction * len(group)))
    long_leg = group.nlargest(n_select, signal_col)
    short_leg = group.nsmallest(n_select, signal_col)

    def _leg_stats(leg):
        # Equal-weighted mean, cap-weighted mean and mean cost of one leg
        log_ret = np.log1p(leg['adjusted_ret'])
        caps = leg['market_cap_merged']
        value_weighted = (log_ret * caps).sum() / caps.sum()
        return log_ret.mean(), value_weighted, leg['transaction_cost'].mean()

    long_equal, long_value, long_cost = _leg_stats(long_leg)
    short_equal, short_value, short_cost = _leg_stats(short_leg)

    # Equal-weighted legs
    equal_long_log_return_without_cost = long_equal
    equal_long_log_return_with_cost = long_equal - long_cost
    equal_short_log_return_without_cost = -short_equal
    equal_short_log_return_with_cost = -short_equal - short_cost

    # Value-weighted legs (weights proportional to market cap)
    value_long_log_return_without_cost = long_value
    value_long_log_return_with_cost = long_value - long_cost
    value_short_log_return_without_cost = -short_value
    value_short_log_return_with_cost = -(short_value + short_cost)

    # Long-short is the sum of both legs, so the with-cost version pays the
    # cost of the long leg and of the short leg
    return {
        'equal_long_log_return_with_cost': equal_long_log_return_with_cost,
        'equal_short_log_return_with_cost': equal_short_log_return_with_cost,
        'equal_long_short_log_return_with_cost': equal_long_log_return_with_cost + equal_short_log_return_with_cost,
        'equal_long_log_return_without_cost': equal_long_log_return_without_cost,
        'equal_short_log_return_without_cost': equal_short_log_return_without_cost,
        'equal_long_short_log_return_without_cost': equal_long_log_return_without_cost + equal_short_log_return_without_cost,
        'value_long_log_return_with_cost': value_long_log_return_with_cost,
        'value_short_log_return_with_cost': value_short_log_return_with_cost,
        'value_long_short_log_return_with_cost': value_long_log_return_with_cost + value_short_log_return_with_cost,
        'value_long_log_return_without_cost': value_long_log_return_without_cost,
        'value_short_log_return_without_cost': value_short_log_return_without_cost,
        'value_long_short_log_return_without_cost': value_long_log_return_without_cost + value_short_log_return_without_cost
    }

# Compute cumulative returns for each date with daily rebalancing
# (one list per portfolio; every trading day appends one running total to each)
cumulative_log_returns_by_date_ols252 = {
    'date': [],
    'cum_EL_return_252_with_cost': [],
    'cum_ES_return_252_with_cost': [],
    'cum_ELS_return_252_with_cost': [],
    'cum_VL_return_252_with_cost': [],
    'cum_VS_return_252_with_cost': [],
    'cum_VLS_return_252_with_cost': [],
    'cum_EL_return_252_without_cost': [],
    'cum_ES_return_252_without_cost': [],
    'cum_ELS_return_252_without_cost': [],
    'cum_VL_return_252_without_cost': [],
    'cum_VS_return_252_without_cost': [],
    'cum_VLS_return_252_without_cost': []
}

# Initialize cumulative returns for lag 252
cum_EL_return_252_with_cost = 0
cum_ES_return_252_with_cost = 0
cum_ELS_return_252_with_cost = 0
cum_VL_return_252_with_cost = 0
cum_VS_return_252_with_cost = 0
cum_VLS_return_252_with_cost = 0

cum_EL_return_252_without_cost = 0
cum_ES_return_252_without_cost = 0
cum_ELS_return_252_without_cost = 0
cum_VL_return_252_without_cost = 0
cum_VS_return_252_without_cost = 0
cum_VLS_return_252_without_cost = 0

# Walk through the test set one trading day at a time. groupby(sort=False)
# yields the dates in order of first appearance (the order .unique() gives) but
# splits the frame in a single pass instead of re-filtering it for every date.
# Rows with a missing date form no group and are therefore skipped.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 252
    returns = compute_returns(group)

    # Log returns are additive over time, so the running totals are plain sums
    cum_EL_return_252_with_cost += returns['equal_long_log_return_with_cost']
    cum_ES_return_252_with_cost += returns['equal_short_log_return_with_cost']
    cum_ELS_return_252_with_cost += returns['equal_long_short_log_return_with_cost']
    cum_VL_return_252_with_cost += returns['value_long_log_return_with_cost']
    cum_VS_return_252_with_cost += returns['value_short_log_return_with_cost']
    cum_VLS_return_252_with_cost += returns['value_long_short_log_return_with_cost']

    cum_EL_return_252_without_cost += returns['equal_long_log_return_without_cost']
    cum_ES_return_252_without_cost += returns['equal_short_log_return_without_cost']
    cum_ELS_return_252_without_cost += returns['equal_long_short_log_return_without_cost']
    cum_VL_return_252_without_cost += returns['value_long_log_return_without_cost']
    cum_VS_return_252_without_cost += returns['value_short_log_return_without_cost']
    cum_VLS_return_252_without_cost += returns['value_long_short_log_return_without_cost']

    # Append results for the day for lag 252 portfolios
    cumulative_log_returns_by_date_ols252['date'].append(date)
    cumulative_log_returns_by_date_ols252['cum_EL_return_252_with_cost'].append(cum_EL_return_252_with_cost)
    cumulative_log_returns_by_date_ols252['cum_ES_return_252_with_cost'].append(cum_ES_return_252_with_cost)
    cumulative_log_returns_by_date_ols252['cum_ELS_return_252_with_cost'].append(cum_ELS_return_252_with_cost)
    cumulative_log_returns_by_date_ols252['cum_VL_return_252_with_cost'].append(cum_VL_return_252_with_cost)
    cumulative_log_returns_by_date_ols252['cum_VS_return_252_with_cost'].append(cum_VS_return_252_with_cost)
    cumulative_log_returns_by_date_ols252['cum_VLS_return_252_with_cost'].append(cum_VLS_return_252_with_cost)
    cumulative_log_returns_by_date_ols252['cum_EL_return_252_without_cost'].append(cum_EL_return_252_without_cost)
    cumulative_log_returns_by_date_ols252['cum_ES_return_252_without_cost'].append(cum_ES_return_252_without_cost)
    cumulative_log_returns_by_date_ols252['cum_ELS_return_252_without_cost'].append(cum_ELS_return_252_without_cost)
    cumulative_log_returns_by_date_ols252['cum_VL_return_252_without_cost'].append(cum_VL_return_252_without_cost)
    cumulative_log_returns_by_date_ols252['cum_VS_return_252_without_cost'].append(cum_VS_return_252_without_cost)
    cumulative_log_returns_by_date_ols252['cum_VLS_return_252_without_cost'].append(cum_VLS_return_252_without_cost)

# Convert to DataFrame for lag 252
cumulative_log_returns_ols_lag_252 = pd.DataFrame(cumulative_log_returns_by_date_ols252)

# Display the cumulative returns DataFrame for lag 252
display(cumulative_log_returns_ols_lag_252.head())

#### With/Without Transaction Cost

In [None]:
# Total compounded return over the whole sample
def cumulative_return(daily_returns):
    """Compound a series of simple per-period returns into one total return.

    Returns the product of (1 + r) over all periods, minus 1.
    """
    growth_factor = np.prod(daily_returns + 1)
    return growth_factor - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Return the geometric per-year return implied by a series of simple returns.

    Parameters
    ----------
    daily_returns : array-like of float
        Simple per-period returns (a list, tuple, Series or ndarray).
    periods : int, optional
        Number of return periods per year (default 252).

    Returns
    -------
    float
        Total growth raised to the power `periods / len(daily_returns)`, minus 1.
        NaN when `daily_returns` is empty.
    """
    returns = np.asarray(daily_returns, dtype=float)

    # An empty sample has no defined holding period; without this guard the
    # exponent below divides by zero
    if len(returns) == 0:
        return np.nan

    total_growth = np.prod(1 + returns)
    return total_growth ** (periods / len(returns)) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of a series of per-period returns.

    Parameters
    ----------
    daily_returns : array-like of float
        Per-period returns (a list, tuple, Series or ndarray).
    risk_free_rate : float, optional
        Annual risk-free rate; it is divided by `periods` to get the
        per-period rate (default 0.01).
    periods : int, optional
        Number of return periods per year (default 252).

    Returns
    -------
    float
        sqrt(periods) * mean(excess) / std(excess), using the population
        standard deviation. NaN when the sample is empty or has zero
        dispersion, where the ratio is undefined.
    """
    returns = np.asarray(daily_returns, dtype=float)
    daily_rf = risk_free_rate / periods  # Assuming `periods` trading days per year
    excess_returns = returns - daily_rf

    if excess_returns.size == 0:
        return np.nan

    dispersion = excess_returns.std()
    # A constant series would otherwise divide by zero and return +/-inf
    if dispersion == 0:
        return np.nan

    return np.sqrt(periods) * excess_returns.mean() / dispersion

# Annualized volatility of the per-period returns
def calculate_volatility(daily_returns, periods=252):
    """Scale the standard deviation of `daily_returns` by sqrt(`periods`)."""
    annualization_factor = np.sqrt(periods)
    per_period_std = np.std(daily_returns)
    return per_period_std * annualization_factor

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded wealth curve.

    Parameters
    ----------
    daily_returns : array-like of float
        Simple per-period returns (a list, tuple, Series or ndarray).

    Returns
    -------
    float
        The most negative value of (wealth - running peak) / running peak,
        i.e. a number <= 0. Returns 0.0 for an empty sample.
    """
    returns = np.asarray(daily_returns, dtype=float)
    if returns.size == 0:
        return 0.0

    cum_returns = np.cumprod(1 + returns)

    # The starting capital of 1.0 counts as the first peak; otherwise a loss at
    # the very start of the sample would never register as a drawdown
    peak = np.maximum(np.maximum.accumulate(cum_returns), 1.0)

    drawdown = (cum_returns - peak) / peak
    return np.min(drawdown)

# Fixed (additive) transaction cost
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return `daily_returns` with `transaction_cost` subtracted from every entry."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Proportional (multiplicative) transaction cost
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Return `daily_returns` scaled by (1 - `transaction_cost_percentage`)."""
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_252_with_cost', 'cum_ES_return_252_with_cost', 'cum_ELS_return_252_with_cost',
    'cum_VL_return_252_with_cost', 'cum_VS_return_252_with_cost', 'cum_VLS_return_252_with_cost',
]

# One list per metric; each loop iteration appends one value to every list
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_ols252_c = cumulative_log_returns_ols_lag_252[portfolio].values

    # Recover the daily log returns from the running sum. The sum starts at 0
    # and its first entry is the first day's return, so prepend=0.0 keeps that
    # day (a plain np.diff silently drops it).
    daily_log_returns = np.diff(cumulative_returns_ols252_c, prepend=0.0)

    # The metric helpers compound their input as simple returns
    # (np.prod(1 + r)), so convert log returns to simple returns first.
    # The '*_with_cost' series is already net of the 10 bps cost deducted per
    # leg in compute_returns; deducting it again here would charge it twice.
    daily_returns_after_cost = np.expm1(daily_log_returns)

    # Calculate cumulative returns after transaction cost
    cum_return_after_cost = cumulative_return(daily_returns_after_cost)

    # Metrics calculations, all on the same net-of-cost return series
    ann_return = annualized_return(daily_returns_after_cost)
    sharpe = sharpe_ratio(daily_returns_after_cost)
    vol = calculate_volatility(daily_returns_after_cost)
    max_draw = maximum_drawdown(daily_returns_after_cost)

    # Standard Deviation of daily returns after cost
    std_dev = np.std(daily_returns_after_cost)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_ols252_c = pd.DataFrame(metrics)
display(metrics_ols252_c)


# Performance metrics for the lag-252 portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_252_without_cost', 'cum_ES_return_252_without_cost', 'cum_ELS_return_252_without_cost',
    'cum_VL_return_252_without_cost', 'cum_VS_return_252_without_cost', 'cum_VLS_return_252_without_cost',
]

# One list per metric; each loop iteration appends one value to every list
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_ols252_wc = cumulative_log_returns_ols_lag_252[portfolio].values

    # Recover the daily log returns from the running sum. The sum starts at 0
    # and its first entry is the first day's return, so prepend=0.0 keeps that
    # day (a plain np.diff silently drops it).
    daily_log_returns = np.diff(cumulative_returns_ols252_wc, prepend=0.0)

    # The metric helpers compound their input as simple returns
    # (np.prod(1 + r)), so convert log returns to simple returns first.
    daily_returns = np.expm1(daily_log_returns)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_ols252_wc = pd.DataFrame(metrics_wc)
display(metrics_ols252_wc)

### Lag 512

In [None]:
# Flat transaction cost of 10 bps (0.001) per position
def calculate_transaction_cost(row):
    """Return the transaction cost for one row as a decimal fraction.

    The argument exists only so the function fits `DataFrame.apply(...,
    axis=1)`; its content is ignored and every row gets the same cost.
    """
    cost_rate = 0.001  # 10 bps for both small and large cap stocks
    return cost_rate

# Evaluate the cost function once per row and keep the result as a column
crsp_test_lagged['transaction_cost'] = crsp_test_lagged.apply(
    calculate_transaction_cost,
    axis='columns',
)

# Use the previously defined `ols_huber_model` to predict excess returns (lag 512)
def ols_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the OLS + Huber model and keep its predictions in a dedicated column.

    Parameters
    ----------
    crsp_train_lagged : DataFrame
        Passed to `ols_huber_model` as the training set.
    crsp_test_lagged : DataFrame
        Passed to `ols_huber_model` as the test set; modified in place.
    lags : list of int, optional
        Lags forwarded to `ols_huber_model`. Defaults to `[512]`.

    Returns
    -------
    DataFrame
        The same `crsp_test_lagged` object, with the column
        'ols_512_predicted_excess_returns' added.
    """
    # Build the default per call so no list object is shared between calls
    if lags is None:
        lags = [512]

    # NOTE(review): the model's return value is not used. This relies on
    # `ols_huber_model` writing 'predicted_excess_returns' into
    # crsp_test_lagged in place; if it does not, the column copied below is
    # left over from an earlier model run -- confirm against its definition.
    ols_huber_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the generic prediction column under a lag-specific name so that a
    # later model run cannot overwrite it
    crsp_test_lagged['ols_512_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Add the lag-512 OLS+Huber predictions to the test set
crsp_test_lagged = ols_512_predicted_excess_returns(
    crsp_train_lagged=crsp_train_lagged,
    crsp_test_lagged=crsp_test_lagged,
)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, signal_col='ols_512_predicted_excess_returns', fraction=0.1):
    """Compute one period's long, short and long-short portfolio log returns.

    Parameters
    ----------
    group : DataFrame
        Cross-section for a single date. Must contain `signal_col`,
        'adjusted_ret', 'market_cap_merged' and 'transaction_cost'.
    signal_col : str, optional
        Column used to rank the stocks. Defaults to the lag-512 OLS signal.
    fraction : float, optional
        Share of the cross-section placed in each leg (default 0.1 = 10%).

    Returns
    -------
    dict
        Twelve values keyed '<equal|value>_<long|short|long_short>_log_return_
        <with|without>_cost'. Short-leg returns are the negated mean log
        return of the bottom-ranked stocks.
    """
    # Always keep at least one stock per leg: with fewer than 1/fraction rows
    # int() truncates to 0, both legs are empty and every return becomes NaN.
    n_select = max(1, int(fraction * len(group)))
    long_leg = group.nlargest(n_select, signal_col)
    short_leg = group.nsmallest(n_select, signal_col)

    def _leg_stats(leg):
        # Equal-weighted mean, cap-weighted mean and mean cost of one leg
        log_ret = np.log1p(leg['adjusted_ret'])
        caps = leg['market_cap_merged']
        value_weighted = (log_ret * caps).sum() / caps.sum()
        return log_ret.mean(), value_weighted, leg['transaction_cost'].mean()

    long_equal, long_value, long_cost = _leg_stats(long_leg)
    short_equal, short_value, short_cost = _leg_stats(short_leg)

    # Equal-weighted legs
    equal_long_log_return_without_cost = long_equal
    equal_long_log_return_with_cost = long_equal - long_cost
    equal_short_log_return_without_cost = -short_equal
    equal_short_log_return_with_cost = -short_equal - short_cost

    # Value-weighted legs (weights proportional to market cap)
    value_long_log_return_without_cost = long_value
    value_long_log_return_with_cost = long_value - long_cost
    value_short_log_return_without_cost = -short_value
    value_short_log_return_with_cost = -(short_value + short_cost)

    # Long-short is the sum of both legs, so the with-cost version pays the
    # cost of the long leg and of the short leg
    return {
        'equal_long_log_return_with_cost': equal_long_log_return_with_cost,
        'equal_short_log_return_with_cost': equal_short_log_return_with_cost,
        'equal_long_short_log_return_with_cost': equal_long_log_return_with_cost + equal_short_log_return_with_cost,
        'equal_long_log_return_without_cost': equal_long_log_return_without_cost,
        'equal_short_log_return_without_cost': equal_short_log_return_without_cost,
        'equal_long_short_log_return_without_cost': equal_long_log_return_without_cost + equal_short_log_return_without_cost,
        'value_long_log_return_with_cost': value_long_log_return_with_cost,
        'value_short_log_return_with_cost': value_short_log_return_with_cost,
        'value_long_short_log_return_with_cost': value_long_log_return_with_cost + value_short_log_return_with_cost,
        'value_long_log_return_without_cost': value_long_log_return_without_cost,
        'value_short_log_return_without_cost': value_short_log_return_without_cost,
        'value_long_short_log_return_without_cost': value_long_log_return_without_cost + value_short_log_return_without_cost
    }

# Compute cumulative returns for each date with daily rebalancing
# (one list per portfolio; every trading day appends one running total to each)
cumulative_log_returns_by_date_ols512 = {
    'date': [],
    'cum_EL_return_512_with_cost': [],
    'cum_ES_return_512_with_cost': [],
    'cum_ELS_return_512_with_cost': [],
    'cum_VL_return_512_with_cost': [],
    'cum_VS_return_512_with_cost': [],
    'cum_VLS_return_512_with_cost': [],
    'cum_EL_return_512_without_cost': [],
    'cum_ES_return_512_without_cost': [],
    'cum_ELS_return_512_without_cost': [],
    'cum_VL_return_512_without_cost': [],
    'cum_VS_return_512_without_cost': [],
    'cum_VLS_return_512_without_cost': []
}

# Initialize cumulative returns for lag 512
cum_EL_return_512_with_cost = 0
cum_ES_return_512_with_cost = 0
cum_ELS_return_512_with_cost = 0
cum_VL_return_512_with_cost = 0
cum_VS_return_512_with_cost = 0
cum_VLS_return_512_with_cost = 0

cum_EL_return_512_without_cost = 0
cum_ES_return_512_without_cost = 0
cum_ELS_return_512_without_cost = 0
cum_VL_return_512_without_cost = 0
cum_VS_return_512_without_cost = 0
cum_VLS_return_512_without_cost = 0

# Walk through the test set one trading day at a time. groupby(sort=False)
# yields the dates in order of first appearance (the order .unique() gives) but
# splits the frame in a single pass instead of re-filtering it for every date.
# Rows with a missing date form no group and are therefore skipped.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 512
    returns = compute_returns(group)

    # Log returns are additive over time, so the running totals are plain sums
    cum_EL_return_512_with_cost += returns['equal_long_log_return_with_cost']
    cum_ES_return_512_with_cost += returns['equal_short_log_return_with_cost']
    cum_ELS_return_512_with_cost += returns['equal_long_short_log_return_with_cost']
    cum_VL_return_512_with_cost += returns['value_long_log_return_with_cost']
    cum_VS_return_512_with_cost += returns['value_short_log_return_with_cost']
    cum_VLS_return_512_with_cost += returns['value_long_short_log_return_with_cost']

    cum_EL_return_512_without_cost += returns['equal_long_log_return_without_cost']
    cum_ES_return_512_without_cost += returns['equal_short_log_return_without_cost']
    cum_ELS_return_512_without_cost += returns['equal_long_short_log_return_without_cost']
    cum_VL_return_512_without_cost += returns['value_long_log_return_without_cost']
    cum_VS_return_512_without_cost += returns['value_short_log_return_without_cost']
    cum_VLS_return_512_without_cost += returns['value_long_short_log_return_without_cost']

    # Append results for the day for lag 512 portfolios
    cumulative_log_returns_by_date_ols512['date'].append(date)
    cumulative_log_returns_by_date_ols512['cum_EL_return_512_with_cost'].append(cum_EL_return_512_with_cost)
    cumulative_log_returns_by_date_ols512['cum_ES_return_512_with_cost'].append(cum_ES_return_512_with_cost)
    cumulative_log_returns_by_date_ols512['cum_ELS_return_512_with_cost'].append(cum_ELS_return_512_with_cost)
    cumulative_log_returns_by_date_ols512['cum_VL_return_512_with_cost'].append(cum_VL_return_512_with_cost)
    cumulative_log_returns_by_date_ols512['cum_VS_return_512_with_cost'].append(cum_VS_return_512_with_cost)
    cumulative_log_returns_by_date_ols512['cum_VLS_return_512_with_cost'].append(cum_VLS_return_512_with_cost)
    cumulative_log_returns_by_date_ols512['cum_EL_return_512_without_cost'].append(cum_EL_return_512_without_cost)
    cumulative_log_returns_by_date_ols512['cum_ES_return_512_without_cost'].append(cum_ES_return_512_without_cost)
    cumulative_log_returns_by_date_ols512['cum_ELS_return_512_without_cost'].append(cum_ELS_return_512_without_cost)
    cumulative_log_returns_by_date_ols512['cum_VL_return_512_without_cost'].append(cum_VL_return_512_without_cost)
    cumulative_log_returns_by_date_ols512['cum_VS_return_512_without_cost'].append(cum_VS_return_512_without_cost)
    cumulative_log_returns_by_date_ols512['cum_VLS_return_512_without_cost'].append(cum_VLS_return_512_without_cost)

# Convert to DataFrame for lag 512
cumulative_log_returns_ols_lag_512 = pd.DataFrame(cumulative_log_returns_by_date_ols512)

# Display the cumulative returns DataFrame for lag 512
display(cumulative_log_returns_ols_lag_512.head())

#### With/Without Transaction Cost

In [None]:
# Total compounded return over the whole sample
def cumulative_return(daily_returns):
    """Compound simple per-period returns: product of (1 + r), minus 1."""
    compounded = np.prod(daily_returns + 1)
    return compounded - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Return the geometric per-year return implied by a series of simple returns.

    Parameters
    ----------
    daily_returns : array-like of float
        Simple per-period returns (a list, tuple, Series or ndarray).
    periods : int, optional
        Number of return periods per year (default 252).

    Returns
    -------
    float
        Total growth raised to the power `periods / len(daily_returns)`, minus 1.
        NaN when `daily_returns` is empty.
    """
    returns = np.asarray(daily_returns, dtype=float)

    # An empty sample has no defined holding period; without this guard the
    # exponent below divides by zero
    if len(returns) == 0:
        return np.nan

    total_growth = np.prod(1 + returns)
    return total_growth ** (periods / len(returns)) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of a series of per-period returns.

    Parameters
    ----------
    daily_returns : array-like of float
        Per-period returns (a list, tuple, Series or ndarray).
    risk_free_rate : float, optional
        Annual risk-free rate; it is divided by `periods` to get the
        per-period rate (default 0.01).
    periods : int, optional
        Number of return periods per year (default 252).

    Returns
    -------
    float
        sqrt(periods) * mean(excess) / std(excess), using the population
        standard deviation. NaN when the sample is empty or has zero
        dispersion, where the ratio is undefined.
    """
    returns = np.asarray(daily_returns, dtype=float)
    daily_rf = risk_free_rate / periods  # Assuming `periods` trading days per year
    excess_returns = returns - daily_rf

    if excess_returns.size == 0:
        return np.nan

    dispersion = excess_returns.std()
    # A constant series would otherwise divide by zero and return +/-inf
    if dispersion == 0:
        return np.nan

    return np.sqrt(periods) * excess_returns.mean() / dispersion

# Annualized volatility of the per-period returns
def calculate_volatility(daily_returns, periods=252):
    """Return std(`daily_returns`) multiplied by sqrt(`periods`)."""
    scale = np.sqrt(periods)
    return np.std(daily_returns) * scale

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded wealth curve.

    Parameters
    ----------
    daily_returns : array-like of float
        Simple per-period returns (a list, tuple, Series or ndarray).

    Returns
    -------
    float
        The most negative value of (wealth - running peak) / running peak,
        i.e. a number <= 0. Returns 0.0 for an empty sample.
    """
    returns = np.asarray(daily_returns, dtype=float)
    if returns.size == 0:
        return 0.0

    cum_returns = np.cumprod(1 + returns)

    # The starting capital of 1.0 counts as the first peak; otherwise a loss at
    # the very start of the sample would never register as a drawdown
    peak = np.maximum(np.maximum.accumulate(cum_returns), 1.0)

    drawdown = (cum_returns - peak) / peak
    return np.min(drawdown)

# Fixed (additive) transaction cost
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Subtract the flat `transaction_cost` from each entry of `daily_returns`."""
    after_cost = daily_returns - transaction_cost
    return after_cost

# Proportional (multiplicative) transaction cost
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale each entry of `daily_returns` by (1 - `transaction_cost_percentage`)."""
    keep_factor = 1 - transaction_cost_percentage
    return daily_returns * keep_factor

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_512_with_cost', 'cum_ES_return_512_with_cost', 'cum_ELS_return_512_with_cost',
    'cum_VL_return_512_with_cost', 'cum_VS_return_512_with_cost', 'cum_VLS_return_512_with_cost',
]

# One list per metric; each loop iteration appends one value to every list
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_ols512_c = cumulative_log_returns_ols_lag_512[portfolio].values

    # Recover the daily log returns from the running sum. The sum starts at 0
    # and its first entry is the first day's return, so prepend=0.0 keeps that
    # day (a plain np.diff silently drops it).
    daily_log_returns = np.diff(cumulative_returns_ols512_c, prepend=0.0)

    # The metric helpers compound their input as simple returns
    # (np.prod(1 + r)), so convert log returns to simple returns first.
    # The '*_with_cost' series is already net of the 10 bps cost deducted per
    # leg in compute_returns; deducting it again here would charge it twice.
    daily_returns_after_cost = np.expm1(daily_log_returns)

    # Calculate cumulative returns after transaction cost
    cum_return_after_cost = cumulative_return(daily_returns_after_cost)

    # Metrics calculations, all on the same net-of-cost return series
    ann_return = annualized_return(daily_returns_after_cost)
    sharpe = sharpe_ratio(daily_returns_after_cost)
    vol = calculate_volatility(daily_returns_after_cost)
    max_draw = maximum_drawdown(daily_returns_after_cost)

    # Standard Deviation of daily returns after cost
    std_dev = np.std(daily_returns_after_cost)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_ols512_c = pd.DataFrame(metrics)
display(metrics_ols512_c)


# Performance metrics for the lag-512 portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_512_without_cost', 'cum_ES_return_512_without_cost', 'cum_ELS_return_512_without_cost',
    'cum_VL_return_512_without_cost', 'cum_VS_return_512_without_cost', 'cum_VLS_return_512_without_cost',
]

# One list per metric; each loop iteration appends one value to every list
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_ols512_wc = cumulative_log_returns_ols_lag_512[portfolio].values

    # Recover the daily log returns from the running sum. The sum starts at 0
    # and its first entry is the first day's return, so prepend=0.0 keeps that
    # day (a plain np.diff silently drops it).
    daily_log_returns = np.diff(cumulative_returns_ols512_wc, prepend=0.0)

    # The metric helpers compound their input as simple returns
    # (np.prod(1 + r)), so convert log returns to simple returns first.
    daily_returns = np.expm1(daily_log_returns)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_ols512_wc = pd.DataFrame(metrics_wc)
display(metrics_ols512_wc)

## Lasso

In [None]:
# Lasso, lag 5: run `rolling_forecast_model` with model_type='lasso' on the
# train/test frames and show what it returns.
# NOTE(review): the content of the returned object is determined by
# rolling_forecast_model, which is defined outside this cell -- confirm there.
lasso_lag5 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='lasso', lag=[5])
print("Lasso Results (Lag 5):")
display(lasso_lag5)

In [None]:
# Lasso, lag 21: run `rolling_forecast_model` with model_type='lasso' on the
# train/test frames and show what it returns.
# NOTE(review): the content of the returned object is determined by
# rolling_forecast_model, which is defined outside this cell -- confirm there.
lasso_lag21 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='lasso', lag=[21])
print("Lasso Results (Lag 21):")
display(lasso_lag21)

In [None]:
# Lasso, lag 252: run `rolling_forecast_model` with model_type='lasso' on the
# train/test frames and show what it returns.
# NOTE(review): the content of the returned object is determined by
# rolling_forecast_model, which is defined outside this cell -- confirm there.
lasso_lag252 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='lasso', lag=[252])
print("Lasso Results (Lag 252):")
display(lasso_lag252)

In [None]:
# Lasso, lag 512: run `rolling_forecast_model` with model_type='lasso' on the
# train/test frames and show what it returns.
# NOTE(review): the content of the returned object is determined by
# rolling_forecast_model, which is defined outside this cell -- confirm there.
lasso_lag512 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='lasso', lag=[512])
print("Lasso Results (Lag 512):")
display(lasso_lag512)

In [None]:
# Left-join market_cap from crsp_data onto the test set, matching on permno only
# NOTE(review): if crsp_data holds more than one row per permno (e.g. one per
# trading day), this join repeats every test row once per match -- confirm the
# key is unique in crsp_data or add the date to the join keys.
merged_df = pd.merge(
    crsp_test_lagged,
    crsp_data[['permno', 'market_cap']],
    how='left',
    on='permno',
)

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# In-sample Lasso R² per lag for three samples: all rows, the rows in the top
# market-cap quartile and the rows in the bottom market-cap quartile.
# Every model below is fit and scored on the same rows, so the reported R²
# values are in-sample fits, not out-of-sample forecast accuracy.

# Predefined valid lags
valid_lags = [5, 21, 252, 512]

# Create an empty list to store results for each lag
r2_results_per_lag = []

# Market-cap quartile cut-offs, computed over all rows of merged_df at once
# (pooled across dates, not per date)
top_25_percentile = merged_df['market_cap'].quantile(0.75)
bottom_25_percentile = merged_df['market_cap'].quantile(0.25)

# Filter top 25% and bottom 25% based on market cap
top_25_stocks = merged_df[merged_df['market_cap'] >= top_25_percentile]
bottom_25_stocks = merged_df[merged_df['market_cap'] <= bottom_25_percentile]

# Loop over each lag and calculate R² for each group (All, Top 25%, Bottom 25%)
for lag in valid_lags:
    forecast_col = f'lag_{lag}'  # Column used as the single model feature for this lag
    actual_col = 'excess_ret'  # Actual values column

    # Skip lags whose feature column is not present instead of failing
    if forecast_col not in merged_df.columns:
        print(f"Warning: {forecast_col} not found in merged_df. Skipping lag {lag}.")
        continue

    # Drop NaN values for both actual and forecasted columns before calculating R²
    merged_df_clean = merged_df.dropna(subset=[forecast_col, actual_col])

    # Prepare data for model fitting
    X_train = merged_df_clean[[forecast_col]]  # Use the lag as the feature
    y_train = merged_df_clean[actual_col]

    # Standardize the feature; this scaler (fit on all rows) is reused for the
    # two market-cap subsets below
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Fit the Lasso model with GridSearchCV for alpha tuning
    # (5-fold CV, selecting the alpha with the lowest mean squared error)
    param_grid = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 1000]}
    grid_search = GridSearchCV(Lasso(max_iter=10000), param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_scaled, y_train)

    best_model = grid_search.best_estimator_

    # Predict on the same rows the model was fit on
    merged_df_clean_copy = merged_df_clean.copy()  # Make a copy to avoid SettingWithCopyWarning
    merged_df_clean_copy.loc[:, 'predicted_ret'] = best_model.predict(X_train_scaled)

    # R² for all stocks (rows with NaN already dropped)
    all_r2 = r2_score(merged_df_clean_copy[actual_col], merged_df_clean_copy['predicted_ret'])

    # Top 25%: refit a separate Lasso on this subset, reusing the alpha chosen
    # on all rows, then score it on the same subset
    top_25_clean = top_25_stocks.dropna(subset=[forecast_col, actual_col]).copy()  # Explicit copy
    X_top_25 = top_25_clean[[forecast_col]]
    y_top_25 = top_25_clean[actual_col]
    X_top_25_scaled = scaler.transform(X_top_25)  # Scale the top 25% data
    top_25_model = Lasso(alpha=grid_search.best_params_['alpha'], max_iter=10000)
    top_25_model.fit(X_top_25_scaled, y_top_25)
    top_25_clean.loc[:, 'predicted_ret'] = top_25_model.predict(X_top_25_scaled)  # .loc[] to avoid warning
    top_25_r2 = r2_score(top_25_clean[actual_col], top_25_clean['predicted_ret'])

    # Bottom 25%: same procedure as for the top 25% subset
    bottom_25_clean = bottom_25_stocks.dropna(subset=[forecast_col, actual_col]).copy()  # Explicit copy
    X_bottom_25 = bottom_25_clean[[forecast_col]]
    y_bottom_25 = bottom_25_clean[actual_col]
    X_bottom_25_scaled = scaler.transform(X_bottom_25)  # Scale the bottom 25% data
    bottom_25_model = Lasso(alpha=grid_search.best_params_['alpha'], max_iter=10000)
    bottom_25_model.fit(X_bottom_25_scaled, y_bottom_25)
    bottom_25_clean.loc[:, 'predicted_ret'] = bottom_25_model.predict(X_bottom_25_scaled)  # .loc[] to avoid warning
    bottom_25_r2 = r2_score(bottom_25_clean[actual_col], bottom_25_clean['predicted_ret'])

    # Store the results in the list
    r2_results_per_lag.append({
        'Lag': lag,
        'All Stocks R²': all_r2,
        'Top 25% R²': top_25_r2,
        'Bottom 25% R²': bottom_25_r2
    })

# Convert the results into a DataFrame (one row per lag that was not skipped)
r2_results_df = pd.DataFrame(r2_results_per_lag)

# Display the results
display(r2_results_df)


## Lasso Portfolio

### Lag 5

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost charged on a position.

    The cost does not depend on the stock: ``row`` is accepted only so the
    function can be used with ``DataFrame.apply(axis=1)`` and is ignored.
    """
    flat_cost = 0.001  # 10 bps, same for small and large cap stocks
    return flat_cost

# The helper ignores its argument and always returns the same flat cost, so
# broadcast that single value to the whole column instead of calling the helper
# once per row through DataFrame.apply(axis=1).
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `lasso` to predict excess returns
def lasso_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the Lasso model for lag 5 and store its predictions in a lag-specific column.

    Parameters
    ----------
    crsp_train_lagged : DataFrame passed through to ``lasso_model`` as training data.
    crsp_test_lagged : DataFrame passed through to ``lasso_model`` as test data;
        it is modified in place and also returned.
    lags : list of lags forwarded to ``lasso_model``; defaults to ``[5]``.

    Returns
    -------
    ``crsp_test_lagged`` with a ``'lasso_5_predicted_excess_returns'`` column
    holding a copy of its ``'predicted_excess_returns'`` column.

    Raises
    ------
    KeyError
        If ``'predicted_excess_returns'`` is not present after the model ran.
    """
    # A list default would be shared between calls; build a fresh one instead.
    if lags is None:
        lags = [5]

    # The return value of lasso_model is not used: the predictions are read
    # from crsp_test_lagged itself.
    # NOTE(review): this presumes lasso_model writes 'predicted_excess_returns'
    # into crsp_test_lagged in place -- confirm against its definition.
    lasso_model(crsp_train_lagged, crsp_test_lagged, lags)

    if 'predicted_excess_returns' not in crsp_test_lagged.columns:
        raise KeyError(
            "'predicted_excess_returns' not found in crsp_test_lagged after running lasso_model"
        )

    # .values strips the index so the copy is positional, not index-aligned.
    crsp_test_lagged['lasso_5_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Get predicted excess returns using Lasso model; the function returns the same
# frame it was given, now carrying a 'lasso_5_predicted_excess_returns' column
crsp_test_lagged = lasso_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group):
    """Compute one day's long, short and long-short portfolio log returns (lag 5 signal).

    Parameters
    ----------
    group : DataFrame with the rows of a single date. Must contain the columns
        'lasso_5_predicted_excess_returns', 'adjusted_ret', 'transaction_cost'
        and 'market_cap_merged'.

    Returns
    -------
    dict with twelve entries: equal- and value-weighted long, short and
    long-short log returns, each with and without transaction cost. Short-leg
    returns are sign-flipped, so long-short is the sum of the two legs.
    All values are NaN when ``group`` is empty.
    """
    signal_col = 'lasso_5_predicted_excess_returns'

    # Top/bottom 10% of the cross-section. int() truncates to 0 for fewer than
    # 10 rows, which would leave both legs empty and turn every return into NaN
    # (and NaN then sticks in any running sum), so hold at least one stock per
    # leg whenever there is data.
    n_select = max(1, int(0.1 * len(group))) if len(group) else 0
    long_leg = group.nlargest(n_select, signal_col)
    short_leg = group.nsmallest(n_select, signal_col)

    def leg_stats(leg):
        # Equal-weighted mean log return, cap-weighted mean log return, mean cost.
        # NOTE(review): averaging log returns across stocks only approximates
        # the log return of the portfolio; kept as in the original methodology.
        log_ret = np.log1p(leg['adjusted_ret'])
        caps = leg['market_cap_merged']
        total_cap = caps.sum()
        equal_weighted = log_ret.mean()
        value_weighted = (log_ret * caps).sum() / total_cap if total_cap != 0 else np.nan
        return equal_weighted, value_weighted, leg['transaction_cost'].mean()

    long_equal, long_value, long_cost = leg_stats(long_leg)
    short_equal, short_value, short_cost = leg_stats(short_leg)

    # Long leg: the cost reduces the return
    equal_long_log_return_with_cost = long_equal - long_cost
    equal_long_log_return_without_cost = long_equal
    value_long_log_return_with_cost = long_value - long_cost
    value_long_log_return_without_cost = long_value

    # Short leg: profit is the negated return, and the cost still reduces it
    equal_short_log_return_with_cost = -short_equal - short_cost
    equal_short_log_return_without_cost = -short_equal
    value_short_log_return_with_cost = -(short_value + short_cost)
    value_short_log_return_without_cost = -short_value

    # Combine Long and Short to get Long-Short return
    equal_long_short_log_return_with_cost = equal_long_log_return_with_cost + equal_short_log_return_with_cost
    equal_long_short_log_return_without_cost = equal_long_log_return_without_cost + equal_short_log_return_without_cost
    value_long_short_log_return_with_cost = value_long_log_return_with_cost + value_short_log_return_with_cost
    value_long_short_log_return_without_cost = value_long_log_return_without_cost + value_short_log_return_without_cost

    return {
        'equal_long_log_return_with_cost': equal_long_log_return_with_cost,
        'equal_short_log_return_with_cost': equal_short_log_return_with_cost,
        'equal_long_short_log_return_with_cost': equal_long_short_log_return_with_cost,
        'equal_long_log_return_without_cost': equal_long_log_return_without_cost,
        'equal_short_log_return_without_cost': equal_short_log_return_without_cost,
        'equal_long_short_log_return_without_cost': equal_long_short_log_return_without_cost,
        'value_long_log_return_with_cost': value_long_log_return_with_cost,
        'value_short_log_return_with_cost': value_short_log_return_with_cost,
        'value_long_short_log_return_with_cost': value_long_short_log_return_with_cost,
        'value_long_log_return_without_cost': value_long_log_return_without_cost,
        'value_short_log_return_without_cost': value_short_log_return_without_cost,
        'value_long_short_log_return_without_cost': value_long_short_log_return_without_cost
    }

# Compute cumulative returns for each date with daily rebalancing.
# Each output column accumulates one entry of the dict returned by
# compute_returns(); the mapping below fixes both the pairing and the column order.
lasso5_column_sources = {
    'cum_EL_return_5_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_5_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_5_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_5_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_5_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_5_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_5_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_5_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_5_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_5_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_5_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_5_without_cost': 'value_long_short_log_return_without_cost',
}

cumulative_log_returns_by_date_lasso5 = {'date': []}
cumulative_log_returns_by_date_lasso5.update({column: [] for column in lasso5_column_sources})

# Running sums of daily log returns, all starting at 0
running_totals_lasso5 = dict.fromkeys(lasso5_column_sources, 0)

# One groupby pass instead of re-filtering the whole frame for every date.
# sort=False keeps the dates in order of first appearance, as unique() did.
# Rows with a missing date are left out by groupby.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 5
    returns = compute_returns(group)

    # Update the running totals and record the day's cumulative values
    cumulative_log_returns_by_date_lasso5['date'].append(date)
    for column, source_key in lasso5_column_sources.items():
        running_totals_lasso5[column] += returns[source_key]
        cumulative_log_returns_by_date_lasso5[column].append(running_totals_lasso5[column])

# Convert to DataFrame for lag 5 (the final totals are its last row)
cumulative_log_returns_lasso_lag_5 = pd.DataFrame(cumulative_log_returns_by_date_lasso5)

# Display the cumulative returns DataFrame for lag 5
display(cumulative_log_returns_lasso_lag_5.head())

#### With/Without Transaction Cost

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound a series of per-period returns into a single total return."""
    growth_factors = 1 + daily_returns
    return np.prod(growth_factors) - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of ``daily_returns``.

    ``periods`` is the number of return observations per year. An empty input
    raises ZeroDivisionError because the exponent divides by its length.
    """
    total_return = np.prod(1 + daily_returns) - 1
    years_exponent = periods / len(daily_returns)
    return (1 + total_return) ** years_exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized Sharpe ratio of ``daily_returns``.

    ``risk_free_rate`` is an annual rate, spread evenly over ``periods``
    observations per year. The mean and standard deviation come from the
    input's own ``.mean()`` / ``.std()`` methods.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days by default
    excess = daily_returns - per_period_rf
    annualization = np.sqrt(periods)
    return annualization * excess.mean() / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    return np.std(daily_returns) * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Largest peak-to-trough loss of the compounded return path.

    Returns a value <= 0: the minimum over time of the relative distance
    between the compounded wealth and its running maximum.
    """
    wealth = np.cumprod(1 + daily_returns)
    running_peak = np.maximum.accumulate(wealth)
    shortfall = (wealth - running_peak) / running_peak
    return np.min(shortfall)

# Apply fixed transaction cost to the daily returns (for other metrics)
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return ``daily_returns`` with a fixed ``transaction_cost`` deducted from every period."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns (for volatility and standard deviation)
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Return ``daily_returns`` scaled down by ``transaction_cost_percentage`` of each return."""
    retained_fraction = 1 - transaction_cost_percentage
    return daily_returns * retained_fraction

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_5_with_cost', 'cum_ES_return_5_with_cost', 'cum_ELS_return_5_with_cost',
    'cum_VL_return_5_with_cost', 'cum_VS_return_5_with_cost', 'cum_VLS_return_5_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_lasso5_c = cumulative_log_returns_lasso_lag_5[portfolio].values

    # Recover the daily log returns from the running sum. The running sum starts
    # at 0 and its first entry already is day 1's return, so prepend 0 to keep
    # that first observation (a plain diff would drop it).
    daily_log_returns = np.diff(cumulative_returns_lasso5_c, prepend=0.0)

    # The metric helpers compound with (1 + r), i.e. they expect simple returns,
    # so convert the log returns first.
    daily_returns = np.expm1(daily_log_returns)

    # The *_with_cost series are already net of the 10 bps cost (it is deducted
    # per leg when the daily portfolio returns are computed), so no further cost
    # is subtracted here; doing so would count it twice.
    cum_return_after_cost = cumulative_return(daily_returns)

    # Metrics calculations, all on the same net-of-cost series
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns after cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_lasso5_c = pd.DataFrame(metrics)
display(metrics_lasso5_c)


# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_5_without_cost', 'cum_ES_return_5_without_cost', 'cum_ELS_return_5_without_cost',
    'cum_VL_return_5_without_cost', 'cum_VS_return_5_without_cost', 'cum_VLS_return_5_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_lasso5_wc = cumulative_log_returns_lasso_lag_5[portfolio].values

    # Recover the daily log returns from the running sum. The running sum starts
    # at 0 and its first entry already is day 1's return, so prepend 0 to keep
    # that first observation (a plain diff would drop it).
    daily_log_returns = np.diff(cumulative_returns_lasso5_wc, prepend=0.0)

    # The metric helpers compound with (1 + r), i.e. they expect simple returns,
    # so convert the log returns first.
    daily_returns = np.expm1(daily_log_returns)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_lasso5_wc = pd.DataFrame(metrics_wc)
display(metrics_lasso5_wc)

### Lag 21

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost charged on a position.

    The cost does not depend on the stock: ``row`` is accepted only so the
    function can be used with ``DataFrame.apply(axis=1)`` and is ignored.
    """
    flat_cost = 0.001  # 10 bps, same for small and large cap stocks
    return flat_cost

# The helper ignores its argument and always returns the same flat cost, so
# broadcast that single value to the whole column instead of calling the helper
# once per row through DataFrame.apply(axis=1).
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `lasso` to predict excess returns
def lasso_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the Lasso model for lag 21 and store its predictions in a lag-specific column.

    Parameters
    ----------
    crsp_train_lagged : DataFrame passed through to ``lasso_model`` as training data.
    crsp_test_lagged : DataFrame passed through to ``lasso_model`` as test data;
        it is modified in place and also returned.
    lags : list of lags forwarded to ``lasso_model``; defaults to ``[21]``.

    Returns
    -------
    ``crsp_test_lagged`` with a ``'lasso_21_predicted_excess_returns'`` column
    holding a copy of its ``'predicted_excess_returns'`` column.

    Raises
    ------
    KeyError
        If ``'predicted_excess_returns'`` is not present after the model ran.
    """
    # A list default would be shared between calls; build a fresh one instead.
    if lags is None:
        lags = [21]

    # The return value of lasso_model is not used: the predictions are read
    # from crsp_test_lagged itself.
    # NOTE(review): this presumes lasso_model writes 'predicted_excess_returns'
    # into crsp_test_lagged in place -- confirm against its definition.
    lasso_model(crsp_train_lagged, crsp_test_lagged, lags)

    if 'predicted_excess_returns' not in crsp_test_lagged.columns:
        raise KeyError(
            "'predicted_excess_returns' not found in crsp_test_lagged after running lasso_model"
        )

    # .values strips the index so the copy is positional, not index-aligned.
    crsp_test_lagged['lasso_21_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Get predicted excess returns using Lasso model; the function returns the same
# frame it was given, now carrying a 'lasso_21_predicted_excess_returns' column.
# NOTE(review): the predictions are presumably continuous regression scores, not
# binary outcomes -- they are only used to rank stocks with nlargest/nsmallest below.
crsp_test_lagged = lasso_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group):
    """Compute one day's long, short and long-short portfolio log returns (lag 21 signal).

    Parameters
    ----------
    group : DataFrame with the rows of a single date. Must contain the columns
        'lasso_21_predicted_excess_returns', 'adjusted_ret', 'transaction_cost'
        and 'market_cap_merged'.

    Returns
    -------
    dict with twelve entries: equal- and value-weighted long, short and
    long-short log returns, each with and without transaction cost. Short-leg
    returns are sign-flipped, so long-short is the sum of the two legs.
    All values are NaN when ``group`` is empty.
    """
    signal_col = 'lasso_21_predicted_excess_returns'

    # Top/bottom 10% of the cross-section. int() truncates to 0 for fewer than
    # 10 rows, which would leave both legs empty and turn every return into NaN
    # (and NaN then sticks in any running sum), so hold at least one stock per
    # leg whenever there is data.
    n_select = max(1, int(0.1 * len(group))) if len(group) else 0
    long_leg = group.nlargest(n_select, signal_col)
    short_leg = group.nsmallest(n_select, signal_col)

    def leg_stats(leg):
        # Equal-weighted mean log return, cap-weighted mean log return, mean cost.
        # NOTE(review): averaging log returns across stocks only approximates
        # the log return of the portfolio; kept as in the original methodology.
        log_ret = np.log1p(leg['adjusted_ret'])
        caps = leg['market_cap_merged']
        total_cap = caps.sum()
        equal_weighted = log_ret.mean()
        value_weighted = (log_ret * caps).sum() / total_cap if total_cap != 0 else np.nan
        return equal_weighted, value_weighted, leg['transaction_cost'].mean()

    long_equal, long_value, long_cost = leg_stats(long_leg)
    short_equal, short_value, short_cost = leg_stats(short_leg)

    # Long leg: the cost reduces the return
    equal_long_log_return_with_cost = long_equal - long_cost
    equal_long_log_return_without_cost = long_equal
    value_long_log_return_with_cost = long_value - long_cost
    value_long_log_return_without_cost = long_value

    # Short leg: profit is the negated return, and the cost still reduces it
    equal_short_log_return_with_cost = -short_equal - short_cost
    equal_short_log_return_without_cost = -short_equal
    value_short_log_return_with_cost = -(short_value + short_cost)
    value_short_log_return_without_cost = -short_value

    # Combine Long and Short to get Long-Short return
    equal_long_short_log_return_with_cost = equal_long_log_return_with_cost + equal_short_log_return_with_cost
    equal_long_short_log_return_without_cost = equal_long_log_return_without_cost + equal_short_log_return_without_cost
    value_long_short_log_return_with_cost = value_long_log_return_with_cost + value_short_log_return_with_cost
    value_long_short_log_return_without_cost = value_long_log_return_without_cost + value_short_log_return_without_cost

    return {
        'equal_long_log_return_with_cost': equal_long_log_return_with_cost,
        'equal_short_log_return_with_cost': equal_short_log_return_with_cost,
        'equal_long_short_log_return_with_cost': equal_long_short_log_return_with_cost,
        'equal_long_log_return_without_cost': equal_long_log_return_without_cost,
        'equal_short_log_return_without_cost': equal_short_log_return_without_cost,
        'equal_long_short_log_return_without_cost': equal_long_short_log_return_without_cost,
        'value_long_log_return_with_cost': value_long_log_return_with_cost,
        'value_short_log_return_with_cost': value_short_log_return_with_cost,
        'value_long_short_log_return_with_cost': value_long_short_log_return_with_cost,
        'value_long_log_return_without_cost': value_long_log_return_without_cost,
        'value_short_log_return_without_cost': value_short_log_return_without_cost,
        'value_long_short_log_return_without_cost': value_long_short_log_return_without_cost
    }

# Compute cumulative returns for each date with daily rebalancing.
# Each output column accumulates one entry of the dict returned by
# compute_returns(); the mapping below fixes both the pairing and the column order.
lasso21_column_sources = {
    'cum_EL_return_21_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_21_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_21_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_21_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_21_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_21_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_21_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_21_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_21_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_21_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_21_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_21_without_cost': 'value_long_short_log_return_without_cost',
}

cumulative_log_returns_by_date_lasso21 = {'date': []}
cumulative_log_returns_by_date_lasso21.update({column: [] for column in lasso21_column_sources})

# Running sums of daily log returns, all starting at 0
running_totals_lasso21 = dict.fromkeys(lasso21_column_sources, 0)

# One groupby pass instead of re-filtering the whole frame for every date.
# sort=False keeps the dates in order of first appearance, as unique() did.
# Rows with a missing date are left out by groupby.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 21
    returns = compute_returns(group)

    # Update the running totals and record the day's cumulative values
    cumulative_log_returns_by_date_lasso21['date'].append(date)
    for column, source_key in lasso21_column_sources.items():
        running_totals_lasso21[column] += returns[source_key]
        cumulative_log_returns_by_date_lasso21[column].append(running_totals_lasso21[column])

# Convert to DataFrame for lag 21 (the final totals are its last row)
cumulative_log_returns_lasso_lag_21 = pd.DataFrame(cumulative_log_returns_by_date_lasso21)

# Display the cumulative returns DataFrame for lag 21
display(cumulative_log_returns_lasso_lag_21.head())

#### With/Without Transaction Cost

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound a series of per-period returns into a single total return."""
    growth_factors = 1 + daily_returns
    return np.prod(growth_factors) - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of ``daily_returns``.

    ``periods`` is the number of return observations per year. An empty input
    raises ZeroDivisionError because the exponent divides by its length.
    """
    total_return = np.prod(1 + daily_returns) - 1
    years_exponent = periods / len(daily_returns)
    return (1 + total_return) ** years_exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized Sharpe ratio of ``daily_returns``.

    ``risk_free_rate`` is an annual rate, spread evenly over ``periods``
    observations per year. The mean and standard deviation come from the
    input's own ``.mean()`` / ``.std()`` methods.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days by default
    excess = daily_returns - per_period_rf
    annualization = np.sqrt(periods)
    return annualization * excess.mean() / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    return np.std(daily_returns) * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Largest peak-to-trough loss of the compounded return path.

    Returns a value <= 0: the minimum over time of the relative distance
    between the compounded wealth and its running maximum.
    """
    wealth = np.cumprod(1 + daily_returns)
    running_peak = np.maximum.accumulate(wealth)
    shortfall = (wealth - running_peak) / running_peak
    return np.min(shortfall)

# Apply fixed transaction cost to the daily returns (for other metrics)
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return ``daily_returns`` with a fixed ``transaction_cost`` deducted from every period."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns (for volatility and standard deviation)
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Return ``daily_returns`` scaled down by ``transaction_cost_percentage`` of each return."""
    retained_fraction = 1 - transaction_cost_percentage
    return daily_returns * retained_fraction

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_21_with_cost', 'cum_ES_return_21_with_cost', 'cum_ELS_return_21_with_cost',
    'cum_VL_return_21_with_cost', 'cum_VS_return_21_with_cost', 'cum_VLS_return_21_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_lasso21_c = cumulative_log_returns_lasso_lag_21[portfolio].values

    # Recover the daily log returns from the running sum. The running sum starts
    # at 0 and its first entry already is day 1's return, so prepend 0 to keep
    # that first observation (a plain diff would drop it).
    daily_log_returns = np.diff(cumulative_returns_lasso21_c, prepend=0.0)

    # The metric helpers compound with (1 + r), i.e. they expect simple returns,
    # so convert the log returns first.
    daily_returns = np.expm1(daily_log_returns)

    # The *_with_cost series are already net of the 10 bps cost (it is deducted
    # per leg when the daily portfolio returns are computed), so no further cost
    # is subtracted here; doing so would count it twice.
    cum_return_after_cost = cumulative_return(daily_returns)

    # Metrics calculations, all on the same net-of-cost series
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns after cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_lasso21_c = pd.DataFrame(metrics)
display(metrics_lasso21_c)


# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_21_without_cost', 'cum_ES_return_21_without_cost', 'cum_ELS_return_21_without_cost',
    'cum_VL_return_21_without_cost', 'cum_VS_return_21_without_cost', 'cum_VLS_return_21_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_lasso21_wc = cumulative_log_returns_lasso_lag_21[portfolio].values

    # Recover the daily log returns from the running sum. The running sum starts
    # at 0 and its first entry already is day 1's return, so prepend 0 to keep
    # that first observation (a plain diff would drop it).
    daily_log_returns = np.diff(cumulative_returns_lasso21_wc, prepend=0.0)

    # The metric helpers compound with (1 + r), i.e. they expect simple returns,
    # so convert the log returns first.
    daily_returns = np.expm1(daily_log_returns)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_lasso21_wc = pd.DataFrame(metrics_wc)
display(metrics_lasso21_wc)

### Lag 252

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost charged on a position.

    The cost does not depend on the stock: ``row`` is accepted only so the
    function can be used with ``DataFrame.apply(axis=1)`` and is ignored.
    """
    flat_cost = 0.001  # 10 bps, same for small and large cap stocks
    return flat_cost

# The helper ignores its argument and always returns the same flat cost, so
# broadcast that single value to the whole column instead of calling the helper
# once per row through DataFrame.apply(axis=1).
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `lasso` to predict excess returns
def lasso_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the Lasso model for lag 252 and store its predictions in a lag-specific column.

    Parameters
    ----------
    crsp_train_lagged : DataFrame passed through to ``lasso_model`` as training data.
    crsp_test_lagged : DataFrame passed through to ``lasso_model`` as test data;
        it is modified in place and also returned.
    lags : list of lags forwarded to ``lasso_model``; defaults to ``[252]``.

    Returns
    -------
    ``crsp_test_lagged`` with a ``'lasso_252_predicted_excess_returns'`` column
    holding a copy of its ``'predicted_excess_returns'`` column.

    Raises
    ------
    KeyError
        If ``'predicted_excess_returns'`` is not present after the model ran.
    """
    # A list default would be shared between calls; build a fresh one instead.
    if lags is None:
        lags = [252]

    # The return value of lasso_model is not used: the predictions are read
    # from crsp_test_lagged itself.
    # NOTE(review): this presumes lasso_model writes 'predicted_excess_returns'
    # into crsp_test_lagged in place -- confirm against its definition.
    lasso_model(crsp_train_lagged, crsp_test_lagged, lags)

    if 'predicted_excess_returns' not in crsp_test_lagged.columns:
        raise KeyError(
            "'predicted_excess_returns' not found in crsp_test_lagged after running lasso_model"
        )

    # .values strips the index so the copy is positional, not index-aligned.
    crsp_test_lagged['lasso_252_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Get predicted excess returns using Lasso model; the function returns the same
# frame it was given, now carrying a 'lasso_252_predicted_excess_returns' column.
# NOTE(review): the predictions are presumably continuous regression scores, not
# binary outcomes -- they are only used to rank stocks with nlargest/nsmallest below.
crsp_test_lagged = lasso_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def _leg_log_returns(stocks):
    """Return (equal-weighted, value-weighted, mean cost) for one portfolio leg."""
    log_ret = np.log1p(stocks['adjusted_ret'])
    market_cap = stocks['market_cap_merged']
    equal_weighted = log_ret.mean()
    value_weighted = (log_ret * market_cap).sum() / market_cap.sum()
    return equal_weighted, value_weighted, stocks['transaction_cost'].mean()


def compute_returns(group, signal_col='lasso_252_predicted_excess_returns', fraction=0.1):
    """Compute one period's long, short and long-short portfolio log returns.

    Stocks are ranked on ``signal_col``; the top ``fraction`` form the long leg
    and the bottom ``fraction`` the short leg. Each leg is reported equal- and
    value-weighted (by ``market_cap_merged``), with and without the mean
    ``transaction_cost`` of the leg deducted.

    Args:
        group: DataFrame slice (the caller passes the rows of a single date)
            with columns ``signal_col``, ``adjusted_ret``,
            ``market_cap_merged`` and ``transaction_cost``.
        signal_col: Column holding the predicted excess returns to rank on.
        fraction: Share of ``group`` placed in each leg (0.1 = 10%).

    Returns:
        dict with twelve entries keyed
        ``{equal,value}_{long,short,long_short}_log_return_{with,without}_cost``.
        Short-leg values are the negated leg return, so a falling short
        position shows up as a gain. Long-short values are long + short, so
        the with-cost variant carries the cost of both legs.

    Note:
        ``int(fraction * len(group))`` truncates, so a group with fewer than
        ``1 / fraction`` rows selects no stocks and every entry is NaN.
    """
    n_select = int(fraction * len(group))
    long_leg = group.nlargest(n_select, signal_col)
    short_leg = group.nsmallest(n_select, signal_col)

    long_equal, long_value, long_cost = _leg_log_returns(long_leg)
    short_equal, short_value, short_cost = _leg_log_returns(short_leg)

    # Long leg: cost reduces the return.
    equal_long_with_cost = long_equal - long_cost
    value_long_with_cost = long_value - long_cost

    # Short leg: profit is the negated return, and cost still reduces it.
    equal_short_without_cost = -short_equal
    value_short_without_cost = -short_value
    equal_short_with_cost = equal_short_without_cost - short_cost
    value_short_with_cost = -(short_value + short_cost)

    return {
        'equal_long_log_return_with_cost': equal_long_with_cost,
        'equal_short_log_return_with_cost': equal_short_with_cost,
        'equal_long_short_log_return_with_cost': equal_long_with_cost + equal_short_with_cost,
        'equal_long_log_return_without_cost': long_equal,
        'equal_short_log_return_without_cost': equal_short_without_cost,
        'equal_long_short_log_return_without_cost': long_equal + equal_short_without_cost,
        'value_long_log_return_with_cost': value_long_with_cost,
        'value_short_log_return_with_cost': value_short_with_cost,
        'value_long_short_log_return_with_cost': value_long_with_cost + value_short_with_cost,
        'value_long_log_return_without_cost': long_value,
        'value_short_log_return_without_cost': value_short_without_cost,
        'value_long_short_log_return_without_cost': long_value + value_short_without_cost,
    }

# Running cumulative log returns per date (daily rebalancing), lag 252.
# Each output column is fed by one compute_returns() key; the insertion order
# below fixes the column order of the resulting DataFrame.
column_sources_lasso252 = {
    'cum_EL_return_252_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_252_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_252_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_252_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_252_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_252_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_252_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_252_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_252_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_252_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_252_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_252_without_cost': 'value_long_short_log_return_without_cost',
}

# One list per output column, preceded by the date column
cumulative_log_returns_by_date_lasso252 = {'date': []}
for column in column_sources_lasso252:
    cumulative_log_returns_by_date_lasso252[column] = []

# Every running total starts at zero
running_totals_lasso252 = dict.fromkeys(column_sources_lasso252, 0)

# Walk the dates in order of first appearance and accumulate that day's returns
for date in crsp_test_lagged['date'].unique():
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]
    returns = compute_returns(group)

    cumulative_log_returns_by_date_lasso252['date'].append(date)
    for column, source_key in column_sources_lasso252.items():
        running_totals_lasso252[column] += returns[source_key]
        cumulative_log_returns_by_date_lasso252[column].append(running_totals_lasso252[column])

# Expose the final totals under their individual module-level names
(
    cum_EL_return_252_with_cost,
    cum_ES_return_252_with_cost,
    cum_ELS_return_252_with_cost,
    cum_VL_return_252_with_cost,
    cum_VS_return_252_with_cost,
    cum_VLS_return_252_with_cost,
    cum_EL_return_252_without_cost,
    cum_ES_return_252_without_cost,
    cum_ELS_return_252_without_cost,
    cum_VL_return_252_without_cost,
    cum_VS_return_252_without_cost,
    cum_VLS_return_252_without_cost,
) = (running_totals_lasso252[column] for column in column_sources_lasso252)

# Tabulate the per-date running totals and preview the first rows
cumulative_log_returns_lasso_lag_252 = pd.DataFrame(cumulative_log_returns_by_date_lasso252)
display(cumulative_log_returns_lasso_lag_252.head())

#### With/Without Transaction Cost

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound a sequence of simple per-period returns into one total return.

    Args:
        daily_returns: Array-like of simple returns (0.01 means +1%). Lists,
            tuples, NumPy arrays and pandas Series are all accepted.

    Returns:
        ``prod(1 + r) - 1``; ``0.0`` for an empty input.
    """
    # Coerce first so plain Python sequences work (``1 + [...]`` raises TypeError).
    returns = np.asarray(daily_returns, dtype=float)
    return np.prod(1 + returns) - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualise the compounded growth of a series of simple returns.

    Args:
        daily_returns: Array-like of simple per-period returns.
        periods: Number of return periods in one year (252 by default).

    Returns:
        ``prod(1 + r) ** (periods / n) - 1`` where ``n`` is the number of
        observations; NaN when the input is empty, because the annualisation
        exponent is undefined for zero observations.
    """
    returns = np.asarray(daily_returns, dtype=float)
    n_obs = len(returns)
    if n_obs == 0:
        # Avoid the ZeroDivisionError from periods / 0; NaN marks "undefined".
        return np.nan
    growth = np.prod(1 + returns)
    return growth ** (periods / n_obs) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualised Sharpe ratio of a series of per-period returns.

    Args:
        daily_returns: Array-like of per-period returns.
        risk_free_rate: Annual risk-free rate; divided by ``periods`` to get
            the per-period rate subtracted from every observation.
        periods: Number of return periods in one year (252 by default).

    Returns:
        ``sqrt(periods) * mean(excess) / std(excess)`` using the population
        standard deviation (ddof=0).
    """
    # Coerce to ndarray so the result does not depend on the container:
    # ndarray.std() uses ddof=0 while pandas Series.std() uses ddof=1, and
    # plain lists do not support the subtraction below at all.
    returns = np.asarray(daily_returns, dtype=float)
    daily_rf = risk_free_rate / periods
    excess_returns = returns - daily_rf
    return np.sqrt(periods) * excess_returns.mean() / excess_returns.std()

# Annualised volatility of per-period returns
def calculate_volatility(daily_returns, periods=252):
    """Scale the per-period standard deviation up to an annual figure.

    Args:
        daily_returns: Array-like of per-period returns.
        periods: Number of return periods in one year (252 by default).

    Returns:
        ``np.std(daily_returns)`` multiplied by ``sqrt(periods)``.
    """
    per_period_sigma = np.std(daily_returns)
    annualisation_factor = np.sqrt(periods)
    return per_period_sigma * annualisation_factor

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough decline of the compounded equity curve.

    Args:
        daily_returns: Array-like of simple per-period returns.

    Returns:
        The most negative value of ``(equity - running_peak) / running_peak``,
        a number in ``[-1, 0]`` for returns above -100%; ``0.0`` when the
        curve never falls below a previous peak or the input is empty.
    """
    returns = np.asarray(daily_returns, dtype=float)
    # Start the curve at the initial capital of 1.0. Without this point the
    # first peak is the equity *after* day one, so a loss on the first day
    # (or any decline below the starting capital) is not counted.
    equity = np.concatenate(([1.0], np.cumprod(1 + returns)))
    running_peak = np.maximum.accumulate(equity)
    drawdown = (equity - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct a constant cost from every per-period return.

    Args:
        daily_returns: Array-like of per-period returns (list, ndarray or
            pandas Series).
        transaction_cost: Amount subtracted from each return (0.001 = 10 bps).

    Returns:
        ``daily_returns - transaction_cost`` element-wise. A Series stays a
        Series; other inputs come back as an ndarray.
    """
    # np.subtract broadcasts over plain lists too, where ``list - float`` fails.
    return np.subtract(daily_returns, transaction_cost)

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every per-period return by ``1 - transaction_cost_percentage``.

    This is a proportional haircut on the return itself, so it shrinks gains
    and losses alike rather than deducting a fixed amount.

    Args:
        daily_returns: Array-like of per-period returns (list, ndarray or
            pandas Series).
        transaction_cost_percentage: Fraction of each return removed.

    Returns:
        ``daily_returns * (1 - transaction_cost_percentage)`` element-wise.
        A Series stays a Series; other inputs come back as an ndarray.
    """
    # np.multiply broadcasts over plain lists too, where ``list * float`` fails.
    return np.multiply(daily_returns, 1 - transaction_cost_percentage)

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_252_with_cost', 'cum_ES_return_252_with_cost', 'cum_ELS_return_252_with_cost',
    'cum_VL_return_252_with_cost', 'cum_VS_return_252_with_cost', 'cum_VLS_return_252_with_cost',
]

# Metrics container: one list per output column, filled in portfolio order
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_lasso252_c = cumulative_log_returns_lasso_lag_252[portfolio].values

    # Recover daily log returns. The running total starts at 0, so prepend it:
    # a bare np.diff would silently drop the first trading day.
    daily_log_returns = np.diff(cumulative_returns_lasso252_c, prepend=0.0)

    # The metric helpers compound with prod(1 + r), i.e. they expect simple
    # returns, so convert from log returns before calling them.
    daily_returns = np.expm1(daily_log_returns)

    # No further cost deduction here: the *_with_cost series are already net of
    # the transaction cost subtracted per leg in compute_returns(). Subtracting
    # 10 bps again would count the cost twice.
    cum_return_after_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard deviation of the (un-annualised) daily returns after cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_lasso252_c = pd.DataFrame(metrics)
display(metrics_lasso252_c)


# Same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_252_without_cost', 'cum_ES_return_252_without_cost', 'cum_ELS_return_252_without_cost',
    'cum_VL_return_252_without_cost', 'cum_VS_return_252_without_cost', 'cum_VLS_return_252_without_cost',
]

# Metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_lasso252_wc = cumulative_log_returns_lasso_lag_252[portfolio].values

    # Recover daily log returns. The running total starts at 0, so prepend it:
    # a bare np.diff would silently drop the first trading day.
    daily_log_returns = np.diff(cumulative_returns_lasso252_wc, prepend=0.0)

    # The metric helpers compound with prod(1 + r), i.e. they expect simple
    # returns, so convert from log returns before calling them.
    daily_returns = np.expm1(daily_log_returns)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard deviation of the (un-annualised) daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_lasso252_wc = pd.DataFrame(metrics_wc)
display(metrics_lasso252_wc)

### Lag 512

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost charged on a position.

    Args:
        row: Ignored. Accepted only so the function can be used with
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        0.001, i.e. 10 basis points, for small and large cap stocks alike.
    """
    cost_in_bps = 10
    return cost_in_bps / 10_000

# The cost is the same for every row (calculate_transaction_cost ignores its
# argument), so broadcast the scalar instead of paying for a row-by-row apply().
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `lasso_model` to predict excess returns
def lasso_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the Lasso model and store its predictions under a lag-specific name.

    Args:
        crsp_train_lagged: Training frame, passed through to ``lasso_model``.
        crsp_test_lagged: Test frame. Modified in place: gains the column
            ``'lasso_512_predicted_excess_returns'``.
        lags: Lags passed through to ``lasso_model``. Defaults to ``[512]``;
            a fresh list is built per call so the default is never shared.

    Returns:
        The same ``crsp_test_lagged`` object, with the new column attached.
    """
    if lags is None:
        lags = [512]

    # Called for its side effect only; the return value is not used.
    # NOTE(review): this relies on lasso_model writing a
    # 'predicted_excess_returns' column onto crsp_test_lagged in place — confirm.
    lasso_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the generic column to a lag-specific name so that a later model run
    # overwriting 'predicted_excess_returns' does not clobber these predictions.
    crsp_test_lagged['lasso_512_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Attach the lag-512 Lasso predictions to the test set; compute_returns() below
# ranks stocks on the resulting 'lasso_512_predicted_excess_returns' column
crsp_test_lagged = lasso_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def _leg_log_returns(stocks):
    """Return (equal-weighted, value-weighted, mean cost) for one portfolio leg."""
    log_ret = np.log1p(stocks['adjusted_ret'])
    market_cap = stocks['market_cap_merged']
    equal_weighted = log_ret.mean()
    value_weighted = (log_ret * market_cap).sum() / market_cap.sum()
    return equal_weighted, value_weighted, stocks['transaction_cost'].mean()


def compute_returns(group, signal_col='lasso_512_predicted_excess_returns', fraction=0.1):
    """Compute one period's long, short and long-short portfolio log returns.

    Stocks are ranked on ``signal_col``; the top ``fraction`` form the long leg
    and the bottom ``fraction`` the short leg. Each leg is reported equal- and
    value-weighted (by ``market_cap_merged``), with and without the mean
    ``transaction_cost`` of the leg deducted.

    Args:
        group: DataFrame slice (the caller passes the rows of a single date)
            with columns ``signal_col``, ``adjusted_ret``,
            ``market_cap_merged`` and ``transaction_cost``.
        signal_col: Column holding the predicted excess returns to rank on.
        fraction: Share of ``group`` placed in each leg (0.1 = 10%).

    Returns:
        dict with twelve entries keyed
        ``{equal,value}_{long,short,long_short}_log_return_{with,without}_cost``.
        Short-leg values are the negated leg return, so a falling short
        position shows up as a gain. Long-short values are long + short, so
        the with-cost variant carries the cost of both legs.

    Note:
        ``int(fraction * len(group))`` truncates, so a group with fewer than
        ``1 / fraction`` rows selects no stocks and every entry is NaN.
    """
    n_select = int(fraction * len(group))
    long_leg = group.nlargest(n_select, signal_col)
    short_leg = group.nsmallest(n_select, signal_col)

    long_equal, long_value, long_cost = _leg_log_returns(long_leg)
    short_equal, short_value, short_cost = _leg_log_returns(short_leg)

    # Long leg: cost reduces the return.
    equal_long_with_cost = long_equal - long_cost
    value_long_with_cost = long_value - long_cost

    # Short leg: profit is the negated return, and cost still reduces it.
    equal_short_without_cost = -short_equal
    value_short_without_cost = -short_value
    equal_short_with_cost = equal_short_without_cost - short_cost
    value_short_with_cost = -(short_value + short_cost)

    return {
        'equal_long_log_return_with_cost': equal_long_with_cost,
        'equal_short_log_return_with_cost': equal_short_with_cost,
        'equal_long_short_log_return_with_cost': equal_long_with_cost + equal_short_with_cost,
        'equal_long_log_return_without_cost': long_equal,
        'equal_short_log_return_without_cost': equal_short_without_cost,
        'equal_long_short_log_return_without_cost': long_equal + equal_short_without_cost,
        'value_long_log_return_with_cost': value_long_with_cost,
        'value_short_log_return_with_cost': value_short_with_cost,
        'value_long_short_log_return_with_cost': value_long_with_cost + value_short_with_cost,
        'value_long_log_return_without_cost': long_value,
        'value_short_log_return_without_cost': value_short_without_cost,
        'value_long_short_log_return_without_cost': long_value + value_short_without_cost,
    }

# Running cumulative log returns per date (daily rebalancing), lag 512.
# Each output column is fed by one compute_returns() key; the insertion order
# below fixes the column order of the resulting DataFrame.
column_sources_lasso512 = {
    'cum_EL_return_512_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_512_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_512_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_512_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_512_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_512_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_512_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_512_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_512_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_512_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_512_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_512_without_cost': 'value_long_short_log_return_without_cost',
}

# One list per output column, preceded by the date column
cumulative_log_returns_by_date_lasso512 = {'date': []}
for column in column_sources_lasso512:
    cumulative_log_returns_by_date_lasso512[column] = []

# Every running total starts at zero
running_totals_lasso512 = dict.fromkeys(column_sources_lasso512, 0)

# Walk the dates in order of first appearance and accumulate that day's
# lag-512 portfolio returns
for date in crsp_test_lagged['date'].unique():
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]
    returns = compute_returns(group)

    cumulative_log_returns_by_date_lasso512['date'].append(date)
    for column, source_key in column_sources_lasso512.items():
        running_totals_lasso512[column] += returns[source_key]
        cumulative_log_returns_by_date_lasso512[column].append(running_totals_lasso512[column])

# Expose the final totals under their individual module-level names
(
    cum_EL_return_512_with_cost,
    cum_ES_return_512_with_cost,
    cum_ELS_return_512_with_cost,
    cum_VL_return_512_with_cost,
    cum_VS_return_512_with_cost,
    cum_VLS_return_512_with_cost,
    cum_EL_return_512_without_cost,
    cum_ES_return_512_without_cost,
    cum_ELS_return_512_without_cost,
    cum_VL_return_512_without_cost,
    cum_VS_return_512_without_cost,
    cum_VLS_return_512_without_cost,
) = (running_totals_lasso512[column] for column in column_sources_lasso512)

# Tabulate the per-date running totals and preview the first rows
cumulative_log_returns_lasso_lag_512 = pd.DataFrame(cumulative_log_returns_by_date_lasso512)
display(cumulative_log_returns_lasso_lag_512.head())

#### With/Without Transaction Cost

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound a sequence of simple per-period returns into one total return.

    Args:
        daily_returns: Array-like of simple returns (0.01 means +1%). Lists,
            tuples, NumPy arrays and pandas Series are all accepted.

    Returns:
        ``prod(1 + r) - 1``; ``0.0`` for an empty input.
    """
    # Coerce first so plain Python sequences work (``1 + [...]`` raises TypeError).
    returns = np.asarray(daily_returns, dtype=float)
    return np.prod(1 + returns) - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualise the compounded growth of a series of simple returns.

    Args:
        daily_returns: Array-like of simple per-period returns.
        periods: Number of return periods in one year (252 by default).

    Returns:
        ``prod(1 + r) ** (periods / n) - 1`` where ``n`` is the number of
        observations; NaN when the input is empty, because the annualisation
        exponent is undefined for zero observations.
    """
    returns = np.asarray(daily_returns, dtype=float)
    n_obs = len(returns)
    if n_obs == 0:
        # Avoid the ZeroDivisionError from periods / 0; NaN marks "undefined".
        return np.nan
    growth = np.prod(1 + returns)
    return growth ** (periods / n_obs) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualised Sharpe ratio of a series of per-period returns.

    Args:
        daily_returns: Array-like of per-period returns.
        risk_free_rate: Annual risk-free rate; divided by ``periods`` to get
            the per-period rate subtracted from every observation.
        periods: Number of return periods in one year (252 by default).

    Returns:
        ``sqrt(periods) * mean(excess) / std(excess)`` using the population
        standard deviation (ddof=0).
    """
    # Coerce to ndarray so the result does not depend on the container:
    # ndarray.std() uses ddof=0 while pandas Series.std() uses ddof=1, and
    # plain lists do not support the subtraction below at all.
    returns = np.asarray(daily_returns, dtype=float)
    daily_rf = risk_free_rate / periods
    excess_returns = returns - daily_rf
    return np.sqrt(periods) * excess_returns.mean() / excess_returns.std()

# Annualised volatility of per-period returns
def calculate_volatility(daily_returns, periods=252):
    """Scale the per-period standard deviation up to an annual figure.

    Args:
        daily_returns: Array-like of per-period returns.
        periods: Number of return periods in one year (252 by default).

    Returns:
        ``np.std(daily_returns)`` multiplied by ``sqrt(periods)``.
    """
    per_period_sigma = np.std(daily_returns)
    annualisation_factor = np.sqrt(periods)
    return per_period_sigma * annualisation_factor

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough decline of the compounded equity curve.

    Args:
        daily_returns: Array-like of simple per-period returns.

    Returns:
        The most negative value of ``(equity - running_peak) / running_peak``,
        a number in ``[-1, 0]`` for returns above -100%; ``0.0`` when the
        curve never falls below a previous peak or the input is empty.
    """
    returns = np.asarray(daily_returns, dtype=float)
    # Start the curve at the initial capital of 1.0. Without this point the
    # first peak is the equity *after* day one, so a loss on the first day
    # (or any decline below the starting capital) is not counted.
    equity = np.concatenate(([1.0], np.cumprod(1 + returns)))
    running_peak = np.maximum.accumulate(equity)
    drawdown = (equity - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct a constant cost from every per-period return.

    Args:
        daily_returns: Array-like of per-period returns (list, ndarray or
            pandas Series).
        transaction_cost: Amount subtracted from each return (0.001 = 10 bps).

    Returns:
        ``daily_returns - transaction_cost`` element-wise. A Series stays a
        Series; other inputs come back as an ndarray.
    """
    # np.subtract broadcasts over plain lists too, where ``list - float`` fails.
    return np.subtract(daily_returns, transaction_cost)

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every per-period return by ``1 - transaction_cost_percentage``.

    This is a proportional haircut on the return itself, so it shrinks gains
    and losses alike rather than deducting a fixed amount.

    Args:
        daily_returns: Array-like of per-period returns (list, ndarray or
            pandas Series).
        transaction_cost_percentage: Fraction of each return removed.

    Returns:
        ``daily_returns * (1 - transaction_cost_percentage)`` element-wise.
        A Series stays a Series; other inputs come back as an ndarray.
    """
    # np.multiply broadcasts over plain lists too, where ``list * float`` fails.
    return np.multiply(daily_returns, 1 - transaction_cost_percentage)

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_512_with_cost', 'cum_ES_return_512_with_cost', 'cum_ELS_return_512_with_cost',
    'cum_VL_return_512_with_cost', 'cum_VS_return_512_with_cost', 'cum_VLS_return_512_with_cost',
]

# Metrics container: one list per output column, filled in portfolio order
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_lasso512_c = cumulative_log_returns_lasso_lag_512[portfolio].values

    # Recover daily log returns. The running total starts at 0, so prepend it:
    # a bare np.diff would silently drop the first trading day.
    daily_log_returns = np.diff(cumulative_returns_lasso512_c, prepend=0.0)

    # The metric helpers compound with prod(1 + r), i.e. they expect simple
    # returns, so convert from log returns before calling them.
    daily_returns = np.expm1(daily_log_returns)

    # No further cost deduction here: the *_with_cost series are already net of
    # the transaction cost subtracted per leg in compute_returns(). Subtracting
    # 10 bps again would count the cost twice.
    cum_return_after_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard deviation of the (un-annualised) daily returns after cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_lasso512_c = pd.DataFrame(metrics)
display(metrics_lasso512_c)


# Same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_512_without_cost', 'cum_ES_return_512_without_cost', 'cum_ELS_return_512_without_cost',
    'cum_VL_return_512_without_cost', 'cum_VS_return_512_without_cost', 'cum_VLS_return_512_without_cost',
]

# Metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_lasso512_wc = cumulative_log_returns_lasso_lag_512[portfolio].values

    # Recover daily log returns. The running total starts at 0, so prepend it:
    # a bare np.diff would silently drop the first trading day.
    daily_log_returns = np.diff(cumulative_returns_lasso512_wc, prepend=0.0)

    # The metric helpers compound with prod(1 + r), i.e. they expect simple
    # returns, so convert from log returns before calling them.
    daily_returns = np.expm1(daily_log_returns)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard deviation of the (un-annualised) daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_lasso512_wc = pd.DataFrame(metrics_wc)
display(metrics_lasso512_wc)


## Ridge

In [None]:
# Ridge forecast with lag 5: run rolling_forecast_model on the train/test
# frames and show whatever it returns
ridge_lag5 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='ridge', lag=[5])
print("Ridge Results (Lag 5):")
display(ridge_lag5)

In [None]:
# Ridge forecast with lag 21: run rolling_forecast_model on the train/test
# frames and show whatever it returns
ridge_lag21 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='ridge', lag=[21])
print("Ridge Results (Lag 21):")
display(ridge_lag21)

In [None]:
# Ridge forecast with lag 252: run rolling_forecast_model on the train/test
# frames and show whatever it returns
ridge_lag252 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='ridge', lag=[252])
print("Ridge Results (Lag 252):")
display(ridge_lag252)

In [None]:
# Ridge forecast with lag 512: run rolling_forecast_model on the train/test
# frames and show whatever it returns
ridge_lag512 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='ridge', lag=[512])
print("Ridge Results (Lag 512):")
display(ridge_lag512)

### OOS R2

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Ridge R² per lag, for all rows and for the top / bottom market-cap quartiles.
# NOTE(review): every model below is scored on the same rows it was fitted on,
# so these R² values are in-sample even though the section is titled "OOS R2".

# Lags to evaluate; each one is looked up as a `lag_{n}` column of merged_df
valid_lags = [5, 21, 252, 512]

# One result dict per lag is appended here
r2_results_per_lag = []

# Market-cap quartile boundaries, computed over all rows of merged_df
top_25_percentile = merged_df['market_cap'].quantile(0.75)
bottom_25_percentile = merged_df['market_cap'].quantile(0.25)

# Rows at or above the 75th percentile / at or below the 25th percentile
top_25_stocks = merged_df[merged_df['market_cap'] >= top_25_percentile]
bottom_25_stocks = merged_df[merged_df['market_cap'] <= bottom_25_percentile]

# Loop over each lag and calculate R² for each group (All, Top 25%, Bottom 25%)
for lag in valid_lags:
    forecast_col = f'lag_{lag}'  # Single feature column for this lag
    actual_col = 'excess_ret'  # Target column

    # Skip lags whose feature column is not present in merged_df
    if forecast_col not in merged_df.columns:
        print(f"Warning: {forecast_col} not found in merged_df. Skipping lag {lag}.")
        continue

    # Keep only rows where both the feature and the target are present
    merged_df_clean = merged_df.dropna(subset=[forecast_col, actual_col])

    # Prepare data for model fitting
    X_train = merged_df_clean[[forecast_col]]  # Use the lag as the only feature
    y_train = merged_df_clean[actual_col]

    # Standardise the feature; this scaler (fitted on all rows) is reused for
    # the two quartile subsets below
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Tune alpha with 5-fold cross-validation on negative MSE
    param_grid = {'alpha': [0.01, 0.1, 1.0, 10, 100]}
    grid_search = GridSearchCV(Ridge(), param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_scaled, y_train)

    best_model = grid_search.best_estimator_

    # Predict on the same rows the model was fitted on
    merged_df_clean_copy = merged_df_clean.copy()  # Copy to avoid SettingWithCopyWarning
    merged_df_clean_copy.loc[:, 'predicted_ret'] = best_model.predict(X_train_scaled)

    # R² for all rows
    all_r2 = r2_score(merged_df_clean_copy[actual_col], merged_df_clean_copy['predicted_ret'])

    # Top 25%: refit a Ridge with the tuned alpha on this subset only, then
    # score it on the same subset
    top_25_clean = top_25_stocks.dropna(subset=[forecast_col, actual_col]).copy()  # Explicit copy
    X_top_25 = top_25_clean[[forecast_col]]
    y_top_25 = top_25_clean[actual_col]
    X_top_25_scaled = scaler.transform(X_top_25)  # Reuses the all-rows scaler
    top_25_model = Ridge(alpha=grid_search.best_params_['alpha'])
    top_25_model.fit(X_top_25_scaled, y_top_25)
    top_25_clean.loc[:, 'predicted_ret'] = top_25_model.predict(X_top_25_scaled)  # .loc[] to avoid warning
    top_25_r2 = r2_score(top_25_clean[actual_col], top_25_clean['predicted_ret'])

    # Bottom 25%: same procedure as for the top quartile
    bottom_25_clean = bottom_25_stocks.dropna(subset=[forecast_col, actual_col]).copy()  # Explicit copy
    X_bottom_25 = bottom_25_clean[[forecast_col]]
    y_bottom_25 = bottom_25_clean[actual_col]
    X_bottom_25_scaled = scaler.transform(X_bottom_25)  # Reuses the all-rows scaler
    bottom_25_model = Ridge(alpha=grid_search.best_params_['alpha'])
    bottom_25_model.fit(X_bottom_25_scaled, y_bottom_25)
    bottom_25_clean.loc[:, 'predicted_ret'] = bottom_25_model.predict(X_bottom_25_scaled)  # .loc[] to avoid warning
    bottom_25_r2 = r2_score(bottom_25_clean[actual_col], bottom_25_clean['predicted_ret'])

    # Store the results in the list
    r2_results_per_lag.append({
        'Lag': lag,
        'All Stocks R²': all_r2,
        'Top 25% R²': top_25_r2,
        'Bottom 25% R²': bottom_25_r2
    })

# Convert the results into a DataFrame (one row per evaluated lag)
r2_results_df = pd.DataFrame(r2_results_per_lag)

# Display the results
display(r2_results_df)


## Ridge Portfolio

### Lag 5

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost of 10 bps; `row` is accepted but never read."""
    flat_cost = 1e-3  # 10 bps, identical for small and large cap stocks
    return flat_cost

# Store the cost in a 'transaction_cost' column. apply(axis=1) calls
# calculate_transaction_cost once per row; it currently returns 0.001 for every row.
crsp_test_lagged['transaction_cost'] = crsp_test_lagged.apply(calculate_transaction_cost, axis=1)

# Use the previously defined `ridge_model` to predict excess returns
def ridge_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the Ridge model for the 5-day lag and keep its predictions under a lag-specific name.

    Parameters
    ----------
    crsp_train_lagged, crsp_test_lagged
        Training and test frames, passed unchanged to ``ridge_model``.
    lags : list of int, optional
        Lags handed to ``ridge_model``. Defaults to ``[5]``.

    Returns
    -------
    The same ``crsp_test_lagged`` object, with the values of its
    'predicted_excess_returns' column copied into 'ridge_5_predicted_excess_returns'.
    """
    # Build the default per call so no list object is shared between calls.
    if lags is None:
        lags = [5]

    # ridge_model is called for its side effect: it is expected to add the
    # 'predicted_excess_returns' column to crsp_test_lagged in place, so its
    # return value is not needed here.
    ridge_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the predictions to a lag-specific column for portfolio construction.
    crsp_test_lagged['ridge_5_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Run the lag-5 Ridge model and rebind crsp_test_lagged to the returned frame,
# which carries the 'ridge_5_predicted_excess_returns' column used below
crsp_test_lagged = ridge_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, prediction_col='ridge_5_predicted_excess_returns', fraction=0.1):
    """Compute long, short and long-short log returns for one cross-section.

    The `fraction` of rows with the highest `prediction_col` form the long leg and
    the same number of rows with the lowest values form the short leg. Each leg is
    evaluated equal-weighted and value-weighted (weights from 'market_cap_merged'),
    with and without the leg's mean 'transaction_cost' deducted.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single rebalancing date. Must contain `prediction_col`,
        'adjusted_ret', 'market_cap_merged' and 'transaction_cost'.
    prediction_col : str, optional
        Column used to rank the rows.
    fraction : float, optional
        Share of the rows placed in each leg (default 10%).

    Returns
    -------
    dict
        Twelve values keyed
        '{equal|value}_{long|short|long_short}_log_return_{with|without}_cost'.
        Short-leg values are sign-flipped, so each long-short value is the sum of
        the corresponding long and short values. An empty `group` yields NaN.
    """
    # int() truncates, so a cross-section with fewer than 1 / fraction rows would
    # produce empty legs and NaN returns; keep at least one name per leg.
    leg_size = max(1, int(fraction * len(group)))
    long_leg = group.nlargest(leg_size, prediction_col)
    short_leg = group.nsmallest(leg_size, prediction_col)

    def leg_summary(leg):
        """Return (equal-weighted log return, cap-weighted log return, mean cost) of a leg."""
        log_returns = np.log1p(leg['adjusted_ret'])
        market_caps = leg['market_cap_merged']
        equal_weighted = log_returns.mean()
        value_weighted = (log_returns * market_caps).sum() / market_caps.sum()
        return equal_weighted, value_weighted, leg['transaction_cost'].mean()

    long_equal, long_value, long_cost = leg_summary(long_leg)
    short_equal, short_value, short_cost = leg_summary(short_leg)

    # Long legs earn the leg return; the cost reduces it.
    equal_long_log_return_without_cost = long_equal
    equal_long_log_return_with_cost = long_equal - long_cost
    value_long_log_return_without_cost = long_value
    value_long_log_return_with_cost = long_value - long_cost

    # Short legs earn the negative of the leg return; the cost still reduces it.
    equal_short_log_return_without_cost = -short_equal
    equal_short_log_return_with_cost = -short_equal - short_cost
    value_short_log_return_without_cost = -short_value
    value_short_log_return_with_cost = -(short_value + short_cost)

    return {
        'equal_long_log_return_with_cost': equal_long_log_return_with_cost,
        'equal_short_log_return_with_cost': equal_short_log_return_with_cost,
        'equal_long_short_log_return_with_cost': equal_long_log_return_with_cost + equal_short_log_return_with_cost,
        'equal_long_log_return_without_cost': equal_long_log_return_without_cost,
        'equal_short_log_return_without_cost': equal_short_log_return_without_cost,
        'equal_long_short_log_return_without_cost': equal_long_log_return_without_cost + equal_short_log_return_without_cost,
        'value_long_log_return_with_cost': value_long_log_return_with_cost,
        'value_short_log_return_with_cost': value_short_log_return_with_cost,
        'value_long_short_log_return_with_cost': value_long_log_return_with_cost + value_short_log_return_with_cost,
        'value_long_log_return_without_cost': value_long_log_return_without_cost,
        'value_short_log_return_without_cost': value_short_log_return_without_cost,
        'value_long_short_log_return_without_cost': value_long_log_return_without_cost + value_short_log_return_without_cost,
    }

# Compute cumulative returns for each date with daily rebalancing.
# Output column -> key of the dict returned by compute_returns, in output-column
# order (the six with-cost series first, then the six without-cost series).
ridge5_column_sources = {
    f'cum_{weight_tag}{leg_tag}_return_5_{cost_tag}_cost': f'{weight}_{leg}_log_return_{cost_tag}_cost'
    for cost_tag in ('with', 'without')
    for weight_tag, weight in (('E', 'equal'), ('V', 'value'))
    for leg_tag, leg in (('L', 'long'), ('S', 'short'), ('LS', 'long_short'))
}

# One list per output column, with 'date' first
cumulative_log_returns_by_date_ridge5 = {'date': []}
cumulative_log_returns_by_date_ridge5.update((column, []) for column in ridge5_column_sources)

# Running sums of the daily log returns for lag 5, all starting at zero
ridge5_running_totals = dict.fromkeys(ridge5_column_sources, 0)

# Visit the dates in order of first appearance and accumulate the lag 5 portfolios
for date in crsp_test_lagged['date'].unique():
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]

    # Daily returns of the lag 5 portfolios on this date
    returns = compute_returns(group)

    # Record the date together with the updated running total of every series
    cumulative_log_returns_by_date_ridge5['date'].append(date)
    for column, source_key in ridge5_column_sources.items():
        ridge5_running_totals[column] += returns[source_key]
        cumulative_log_returns_by_date_ridge5[column].append(ridge5_running_totals[column])

# Expose the final running totals under their individual module-level names
(
    cum_EL_return_5_with_cost,
    cum_ES_return_5_with_cost,
    cum_ELS_return_5_with_cost,
    cum_VL_return_5_with_cost,
    cum_VS_return_5_with_cost,
    cum_VLS_return_5_with_cost,
    cum_EL_return_5_without_cost,
    cum_ES_return_5_without_cost,
    cum_ELS_return_5_without_cost,
    cum_VL_return_5_without_cost,
    cum_VS_return_5_without_cost,
    cum_VLS_return_5_without_cost,
) = ridge5_running_totals.values()

# Convert to DataFrame for lag 5
cumulative_log_returns_ridge_lag_5 = pd.DataFrame(cumulative_log_returns_by_date_ridge5)

# Display the cumulative returns DataFrame for lag 5
display(cumulative_log_returns_ridge_lag_5.head())

#### With/Without Transaction Cost

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound a series of per-period simple returns into a single total return."""
    growth_factors = 1 + daily_returns
    return np.prod(growth_factors) - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualise the compounded growth of a series of per-period simple returns.

    Parameters
    ----------
    daily_returns : array-like supporting ``1 + daily_returns`` and ``len``
        Per-period simple returns.
    periods : int, optional
        Number of periods per year used for annualisation.

    Returns
    -------
    float
        ``prod(1 + r) ** (periods / n) - 1``, or NaN when there are no observations.
    """
    n_obs = len(daily_returns)
    if n_obs == 0:
        # The exponent periods / 0 is undefined; report "no value" instead of raising.
        return np.nan
    total_growth = np.prod(1 + daily_returns)
    return total_growth ** (periods / n_obs) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualised Sharpe ratio of per-period returns over a constant risk-free rate.

    The annual `risk_free_rate` is spread evenly over `periods`; the mean and
    standard deviation come from the input's own ``.mean()`` / ``.std()`` methods.
    """
    per_period_rf = risk_free_rate / periods
    excess = daily_returns - per_period_rf
    mean_excess = excess.mean()
    dispersion = excess.std()
    return np.sqrt(periods) * mean_excess / dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Annualised volatility: per-period standard deviation scaled by sqrt(periods)."""
    per_period_std = np.std(daily_returns)
    annualisation = np.sqrt(periods)
    return per_period_std * annualisation

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded wealth path.

    Parameters
    ----------
    daily_returns : array-like of float
        Per-period simple returns.

    Returns
    -------
    float
        The most negative value of ``(wealth - running_peak) / running_peak``
        (0.0 when wealth never falls below a previous peak or the input is empty).
    """
    returns = np.asarray(daily_returns, dtype=float)
    if returns.size == 0:
        return 0.0
    # Start the wealth path at the initial capital of 1.0 so that a loss in the
    # very first period is measured against it instead of being ignored.
    wealth = np.concatenate(([1.0], np.cumprod(1 + returns)))
    running_peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct the same absolute `transaction_cost` from every per-period return."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every per-period return by ``1 - transaction_cost_percentage``.

    The scaling is applied to the return itself, so negative returns move toward
    zero as well.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_5_with_cost', 'cum_ES_return_5_with_cost', 'cum_ELS_return_5_with_cost',
    'cum_VL_return_5_with_cost', 'cum_VS_return_5_with_cost', 'cum_VLS_return_5_with_cost',
]

# Initialize metrics container (one list per output column)
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_ridge5_c = cumulative_log_returns_ridge_lag_5[portfolio].values

    # Recover the daily log returns. The cumulative series starts at the first
    # day's return (not at 0), so prepend 0 to keep that first observation.
    daily_log_returns = np.diff(cumulative_returns_ridge5_c, prepend=0.0)

    # The metric helpers compound with (1 + r), i.e. they expect simple returns,
    # so convert the log returns before passing them on.
    daily_returns = np.expm1(daily_log_returns)

    # No further cost adjustment here: the '*_with_cost' series already have the
    # 10 bps transaction cost deducted when the daily portfolio returns were
    # computed, and deducting it again would count it twice. All metrics below
    # are computed on this one net-of-cost series.
    cum_return_after_cost = cumulative_return(daily_returns)
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard deviation of the daily net-of-cost returns (not annualised)
    std_dev = np.std(daily_returns)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_ridge5_c = pd.DataFrame(metrics)
display(metrics_ridge5_c)


# The same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_5_without_cost', 'cum_ES_return_5_without_cost', 'cum_ELS_return_5_without_cost',
    'cum_VL_return_5_without_cost', 'cum_VS_return_5_without_cost', 'cum_VLS_return_5_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_ridge5_wc = cumulative_log_returns_ridge_lag_5[portfolio].values

    # Recover the daily log returns. The cumulative series starts at the first
    # day's return (not at 0), so prepend 0 to keep that first observation.
    daily_log_returns = np.diff(cumulative_returns_ridge5_wc, prepend=0.0)

    # The metric helpers compound with (1 + r), i.e. they expect simple returns,
    # so convert the log returns before passing them on.
    daily_returns = np.expm1(daily_log_returns)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard deviation of the daily returns (not annualised)
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_ridge5_wc = pd.DataFrame(metrics_wc)
display(metrics_ridge5_wc)

### Lag 21

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost of 10 bps; `row` is accepted but never read."""
    flat_cost = 1e-3  # 10 bps, identical for small and large cap stocks
    return flat_cost

# Store the cost in a 'transaction_cost' column. apply(axis=1) calls
# calculate_transaction_cost once per row; it currently returns 0.001 for every row.
crsp_test_lagged['transaction_cost'] = crsp_test_lagged.apply(calculate_transaction_cost, axis=1)

# Use the previously defined `ridge_model` to predict excess returns
def ridge_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the Ridge model for the 21-day lag and keep its predictions under a lag-specific name.

    Parameters
    ----------
    crsp_train_lagged, crsp_test_lagged
        Training and test frames, passed unchanged to ``ridge_model``.
    lags : list of int, optional
        Lags handed to ``ridge_model``. Defaults to ``[21]``.

    Returns
    -------
    The same ``crsp_test_lagged`` object, with the values of its
    'predicted_excess_returns' column copied into 'ridge_21_predicted_excess_returns'.
    """
    # Build the default per call so no list object is shared between calls.
    if lags is None:
        lags = [21]

    # ridge_model is called for its side effect: it is expected to add the
    # 'predicted_excess_returns' column to crsp_test_lagged in place, so its
    # return value is not needed here.
    ridge_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the predictions to a lag-specific column for portfolio construction.
    crsp_test_lagged['ridge_21_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Run the lag-21 Ridge model and rebind crsp_test_lagged to the returned frame,
# which carries the 'ridge_21_predicted_excess_returns' column used below
crsp_test_lagged = ridge_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, prediction_col='ridge_21_predicted_excess_returns', fraction=0.1):
    """Compute long, short and long-short log returns for one cross-section.

    The `fraction` of rows with the highest `prediction_col` form the long leg and
    the same number of rows with the lowest values form the short leg. Each leg is
    evaluated equal-weighted and value-weighted (weights from 'market_cap_merged'),
    with and without the leg's mean 'transaction_cost' deducted.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single rebalancing date. Must contain `prediction_col`,
        'adjusted_ret', 'market_cap_merged' and 'transaction_cost'.
    prediction_col : str, optional
        Column used to rank the rows.
    fraction : float, optional
        Share of the rows placed in each leg (default 10%).

    Returns
    -------
    dict
        Twelve values keyed
        '{equal|value}_{long|short|long_short}_log_return_{with|without}_cost'.
        Short-leg values are sign-flipped, so each long-short value is the sum of
        the corresponding long and short values. An empty `group` yields NaN.
    """
    # int() truncates, so a cross-section with fewer than 1 / fraction rows would
    # produce empty legs and NaN returns; keep at least one name per leg.
    leg_size = max(1, int(fraction * len(group)))
    long_leg = group.nlargest(leg_size, prediction_col)
    short_leg = group.nsmallest(leg_size, prediction_col)

    def leg_summary(leg):
        """Return (equal-weighted log return, cap-weighted log return, mean cost) of a leg."""
        log_returns = np.log1p(leg['adjusted_ret'])
        market_caps = leg['market_cap_merged']
        equal_weighted = log_returns.mean()
        value_weighted = (log_returns * market_caps).sum() / market_caps.sum()
        return equal_weighted, value_weighted, leg['transaction_cost'].mean()

    long_equal, long_value, long_cost = leg_summary(long_leg)
    short_equal, short_value, short_cost = leg_summary(short_leg)

    # Long legs earn the leg return; the cost reduces it.
    equal_long_log_return_without_cost = long_equal
    equal_long_log_return_with_cost = long_equal - long_cost
    value_long_log_return_without_cost = long_value
    value_long_log_return_with_cost = long_value - long_cost

    # Short legs earn the negative of the leg return; the cost still reduces it.
    equal_short_log_return_without_cost = -short_equal
    equal_short_log_return_with_cost = -short_equal - short_cost
    value_short_log_return_without_cost = -short_value
    value_short_log_return_with_cost = -(short_value + short_cost)

    return {
        'equal_long_log_return_with_cost': equal_long_log_return_with_cost,
        'equal_short_log_return_with_cost': equal_short_log_return_with_cost,
        'equal_long_short_log_return_with_cost': equal_long_log_return_with_cost + equal_short_log_return_with_cost,
        'equal_long_log_return_without_cost': equal_long_log_return_without_cost,
        'equal_short_log_return_without_cost': equal_short_log_return_without_cost,
        'equal_long_short_log_return_without_cost': equal_long_log_return_without_cost + equal_short_log_return_without_cost,
        'value_long_log_return_with_cost': value_long_log_return_with_cost,
        'value_short_log_return_with_cost': value_short_log_return_with_cost,
        'value_long_short_log_return_with_cost': value_long_log_return_with_cost + value_short_log_return_with_cost,
        'value_long_log_return_without_cost': value_long_log_return_without_cost,
        'value_short_log_return_without_cost': value_short_log_return_without_cost,
        'value_long_short_log_return_without_cost': value_long_log_return_without_cost + value_short_log_return_without_cost,
    }

# Compute cumulative returns for each date with daily rebalancing.
# Output column -> key of the dict returned by compute_returns, in output-column
# order (the six with-cost series first, then the six without-cost series).
ridge21_column_sources = {
    f'cum_{weight_tag}{leg_tag}_return_21_{cost_tag}_cost': f'{weight}_{leg}_log_return_{cost_tag}_cost'
    for cost_tag in ('with', 'without')
    for weight_tag, weight in (('E', 'equal'), ('V', 'value'))
    for leg_tag, leg in (('L', 'long'), ('S', 'short'), ('LS', 'long_short'))
}

# One list per output column, with 'date' first
cumulative_log_returns_by_date_ridge21 = {'date': []}
cumulative_log_returns_by_date_ridge21.update((column, []) for column in ridge21_column_sources)

# Running sums of the daily log returns for lag 21, all starting at zero
ridge21_running_totals = dict.fromkeys(ridge21_column_sources, 0)

# Visit the dates in order of first appearance and accumulate the lag 21 portfolios
for date in crsp_test_lagged['date'].unique():
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]

    # Daily returns of the lag 21 portfolios on this date
    returns = compute_returns(group)

    # Record the date together with the updated running total of every series
    cumulative_log_returns_by_date_ridge21['date'].append(date)
    for column, source_key in ridge21_column_sources.items():
        ridge21_running_totals[column] += returns[source_key]
        cumulative_log_returns_by_date_ridge21[column].append(ridge21_running_totals[column])

# Expose the final running totals under their individual module-level names
(
    cum_EL_return_21_with_cost,
    cum_ES_return_21_with_cost,
    cum_ELS_return_21_with_cost,
    cum_VL_return_21_with_cost,
    cum_VS_return_21_with_cost,
    cum_VLS_return_21_with_cost,
    cum_EL_return_21_without_cost,
    cum_ES_return_21_without_cost,
    cum_ELS_return_21_without_cost,
    cum_VL_return_21_without_cost,
    cum_VS_return_21_without_cost,
    cum_VLS_return_21_without_cost,
) = ridge21_running_totals.values()

# Convert to DataFrame for lag 21
cumulative_log_returns_ridge_lag_21 = pd.DataFrame(cumulative_log_returns_by_date_ridge21)

# Display the cumulative returns DataFrame for lag 21
display(cumulative_log_returns_ridge_lag_21.head())

#### With/Without Transaction Cost

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound a series of per-period simple returns into a single total return."""
    growth_factors = 1 + daily_returns
    return np.prod(growth_factors) - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualise the compounded growth of a series of per-period simple returns.

    Parameters
    ----------
    daily_returns : array-like supporting ``1 + daily_returns`` and ``len``
        Per-period simple returns.
    periods : int, optional
        Number of periods per year used for annualisation.

    Returns
    -------
    float
        ``prod(1 + r) ** (periods / n) - 1``, or NaN when there are no observations.
    """
    n_obs = len(daily_returns)
    if n_obs == 0:
        # The exponent periods / 0 is undefined; report "no value" instead of raising.
        return np.nan
    total_growth = np.prod(1 + daily_returns)
    return total_growth ** (periods / n_obs) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualised Sharpe ratio of per-period returns over a constant risk-free rate.

    The annual `risk_free_rate` is spread evenly over `periods`; the mean and
    standard deviation come from the input's own ``.mean()`` / ``.std()`` methods.
    """
    per_period_rf = risk_free_rate / periods
    excess = daily_returns - per_period_rf
    mean_excess = excess.mean()
    dispersion = excess.std()
    return np.sqrt(periods) * mean_excess / dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Annualised volatility: per-period standard deviation scaled by sqrt(periods)."""
    per_period_std = np.std(daily_returns)
    annualisation = np.sqrt(periods)
    return per_period_std * annualisation

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded wealth path.

    Parameters
    ----------
    daily_returns : array-like of float
        Per-period simple returns.

    Returns
    -------
    float
        The most negative value of ``(wealth - running_peak) / running_peak``
        (0.0 when wealth never falls below a previous peak or the input is empty).
    """
    returns = np.asarray(daily_returns, dtype=float)
    if returns.size == 0:
        return 0.0
    # Start the wealth path at the initial capital of 1.0 so that a loss in the
    # very first period is measured against it instead of being ignored.
    wealth = np.concatenate(([1.0], np.cumprod(1 + returns)))
    running_peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct the same absolute `transaction_cost` from every per-period return."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every per-period return by ``1 - transaction_cost_percentage``.

    The scaling is applied to the return itself, so negative returns move toward
    zero as well.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_21_with_cost', 'cum_ES_return_21_with_cost', 'cum_ELS_return_21_with_cost',
    'cum_VL_return_21_with_cost', 'cum_VS_return_21_with_cost', 'cum_VLS_return_21_with_cost',
]

# Initialize metrics container (one list per output column)
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_ridge21_c = cumulative_log_returns_ridge_lag_21[portfolio].values

    # Recover the daily log returns. The cumulative series starts at the first
    # day's return (not at 0), so prepend 0 to keep that first observation.
    daily_log_returns = np.diff(cumulative_returns_ridge21_c, prepend=0.0)

    # The metric helpers compound with (1 + r), i.e. they expect simple returns,
    # so convert the log returns before passing them on.
    daily_returns = np.expm1(daily_log_returns)

    # No further cost adjustment here: the '*_with_cost' series already have the
    # 10 bps transaction cost deducted when the daily portfolio returns were
    # computed, and deducting it again would count it twice. All metrics below
    # are computed on this one net-of-cost series.
    cum_return_after_cost = cumulative_return(daily_returns)
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard deviation of the daily net-of-cost returns (not annualised)
    std_dev = np.std(daily_returns)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_ridge21_c = pd.DataFrame(metrics)
display(metrics_ridge21_c)


# The same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_21_without_cost', 'cum_ES_return_21_without_cost', 'cum_ELS_return_21_without_cost',
    'cum_VL_return_21_without_cost', 'cum_VS_return_21_without_cost', 'cum_VLS_return_21_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_ridge21_wc = cumulative_log_returns_ridge_lag_21[portfolio].values

    # Recover the daily log returns. The cumulative series starts at the first
    # day's return (not at 0), so prepend 0 to keep that first observation.
    daily_log_returns = np.diff(cumulative_returns_ridge21_wc, prepend=0.0)

    # The metric helpers compound with (1 + r), i.e. they expect simple returns,
    # so convert the log returns before passing them on.
    daily_returns = np.expm1(daily_log_returns)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard deviation of the daily returns (not annualised)
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_ridge21_wc = pd.DataFrame(metrics_wc)
display(metrics_ridge21_wc)

### Lag 252

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost of 10 bps; `row` is accepted but never read."""
    flat_cost = 1e-3  # 10 bps, identical for small and large cap stocks
    return flat_cost

# Store the cost in a 'transaction_cost' column. apply(axis=1) calls
# calculate_transaction_cost once per row; it currently returns 0.001 for every row.
crsp_test_lagged['transaction_cost'] = crsp_test_lagged.apply(calculate_transaction_cost, axis=1)

# Use the previously defined `ridge_model` to predict excess returns
def ridge_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the Ridge model for the 252-day lag and keep its predictions under a lag-specific name.

    Parameters
    ----------
    crsp_train_lagged, crsp_test_lagged
        Training and test frames, passed unchanged to ``ridge_model``.
    lags : list of int, optional
        Lags handed to ``ridge_model``. Defaults to ``[252]``.

    Returns
    -------
    The same ``crsp_test_lagged`` object, with the values of its
    'predicted_excess_returns' column copied into 'ridge_252_predicted_excess_returns'.
    """
    # Build the default per call so no list object is shared between calls.
    if lags is None:
        lags = [252]

    # ridge_model is called for its side effect: it is expected to add the
    # 'predicted_excess_returns' column to crsp_test_lagged in place, so its
    # return value is not needed here.
    ridge_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the predictions to a lag-specific column for portfolio construction.
    crsp_test_lagged['ridge_252_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Run the lag-252 Ridge model and rebind crsp_test_lagged to the returned frame,
# which carries the 'ridge_252_predicted_excess_returns' column used below
crsp_test_lagged = ridge_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group):
    """Compute one cross-section's long, short and long-short log returns.

    The top and bottom ``int(0.1 * len(group))`` rows ranked by
    ``'ridge_252_predicted_excess_returns'`` form the long and short legs.
    Each leg is summarised equal-weighted and weighted by
    ``'market_cap_merged'``, with and without the leg's mean
    ``'transaction_cost'`` deducted.

    Args:
        group: DataFrame holding the columns named above plus ``'adjusted_ret'``.

    Returns:
        dict with twelve entries keyed
        ``'{equal|value}_{long|short|long_short}_log_return_{with|without}_cost'``.
    """
    signal = 'ridge_252_predicted_excess_returns'
    n_names = int(0.1 * len(group))

    def summarise(leg):
        # (equal-weighted mean, cap-weighted mean, mean cost) of the leg's log returns
        log_ret = np.log1p(leg['adjusted_ret'])
        caps = leg['market_cap_merged']
        cap_weighted = (log_ret * caps).sum() / caps.sum()
        return log_ret.mean(), cap_weighted, leg['transaction_cost'].mean()

    long_eq, long_vw, long_cost = summarise(group.nlargest(n_names, signal))
    short_eq, short_vw, short_cost = summarise(group.nsmallest(n_names, signal))

    # Short legs earn the negative of the underlying return; costs always reduce it
    el_net, el_gross = long_eq - long_cost, long_eq
    es_net, es_gross = -short_eq - short_cost, -short_eq
    vl_net, vl_gross = long_vw - long_cost, long_vw
    vs_net, vs_gross = -(short_vw + short_cost), -(short_vw)

    return {
        'equal_long_log_return_with_cost': el_net,
        'equal_short_log_return_with_cost': es_net,
        'equal_long_short_log_return_with_cost': el_net + es_net,
        'equal_long_log_return_without_cost': el_gross,
        'equal_short_log_return_without_cost': es_gross,
        'equal_long_short_log_return_without_cost': el_gross + es_gross,
        'value_long_log_return_with_cost': vl_net,
        'value_short_log_return_with_cost': vs_net,
        'value_long_short_log_return_with_cost': vl_net + vs_net,
        'value_long_log_return_without_cost': vl_gross,
        'value_short_log_return_without_cost': vs_gross,
        'value_long_short_log_return_without_cost': vl_gross + vs_gross,
    }

# Compute cumulative returns for each date with daily rebalancing
# Map every output column to the compute_returns() key it accumulates.
# Column order: the six with-cost series first, then the six without-cost series.
_legs_252 = {
    'EL': 'equal_long', 'ES': 'equal_short', 'ELS': 'equal_long_short',
    'VL': 'value_long', 'VS': 'value_short', 'VLS': 'value_long_short',
}
_column_to_key_252 = {
    f'cum_{tag}_return_252_{cost}': f'{leg}_log_return_{cost}'
    for cost in ('with_cost', 'without_cost')
    for tag, leg in _legs_252.items()
}

cumulative_log_returns_by_date_ridge252 = {'date': []}
cumulative_log_returns_by_date_ridge252.update({column: [] for column in _column_to_key_252})

# Running sums of the daily log returns, one per output column
_running_252 = dict.fromkeys(_column_to_key_252, 0)

# Iterate over each date to compute returns for lag 252 portfolios.
# groupby(sort=False) yields the dates in order of first appearance in a single
# pass, instead of re-scanning the whole frame with a boolean mask per date.
# Rows whose date is missing are skipped by groupby.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 252
    returns = compute_returns(group)

    # Update the running totals and record the day's cumulative values
    cumulative_log_returns_by_date_ridge252['date'].append(date)
    for column, key in _column_to_key_252.items():
        _running_252[column] += returns[key]
        cumulative_log_returns_by_date_ridge252[column].append(_running_252[column])

# Keep the final totals available under their individual names
(cum_EL_return_252_with_cost, cum_ES_return_252_with_cost, cum_ELS_return_252_with_cost,
 cum_VL_return_252_with_cost, cum_VS_return_252_with_cost, cum_VLS_return_252_with_cost,
 cum_EL_return_252_without_cost, cum_ES_return_252_without_cost, cum_ELS_return_252_without_cost,
 cum_VL_return_252_without_cost, cum_VS_return_252_without_cost,
 cum_VLS_return_252_without_cost) = _running_252.values()

# Convert to DataFrame for lag 252
cumulative_log_returns_ridge_lag_252 = pd.DataFrame(cumulative_log_returns_by_date_ridge252)

# Display the cumulative returns DataFrame for lag 252
display(cumulative_log_returns_ridge_lag_252.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period simple returns into a single total return.

    Args:
        daily_returns: Array-like supporting ``1 + daily_returns``.

    Returns:
        Product of ``(1 + r)`` over all periods, minus 1.
    """
    growth_factors = 1 + daily_returns
    return np.prod(growth_factors) - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded growth of a series of simple returns.

    Args:
        daily_returns: Array-like of per-period simple returns.
        periods: Number of periods per year (252 by default).

    Returns:
        ``prod(1 + r) ** (periods / n) - 1``, or NaN when the series is empty
        (there is no horizon to annualize over).
    """
    n_obs = len(daily_returns)
    if n_obs == 0:
        # Avoid ZeroDivisionError in the exponent below
        return np.nan

    total_growth = np.prod(1 + daily_returns)
    return total_growth ** (periods / n_obs) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized Sharpe ratio of a series of per-period returns.

    Args:
        daily_returns: Array-like exposing ``.mean()`` and ``.std()`` after
            subtraction of a scalar.
        risk_free_rate: Annual risk-free rate; divided by ``periods`` to get
            the per-period rate.
        periods: Number of periods per year.

    Returns:
        ``sqrt(periods) * mean(excess) / std(excess)``.
    """
    per_period_rf = risk_free_rate / periods
    excess = daily_returns - per_period_rf
    annualization = np.sqrt(periods)
    return annualization * excess.mean() / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Scale the standard deviation of per-period returns to an annual figure.

    Args:
        daily_returns: Array-like of per-period returns.
        periods: Number of periods per year.

    Returns:
        ``np.std(daily_returns)`` multiplied by ``sqrt(periods)``.
    """
    per_period_std = np.std(daily_returns)
    return per_period_std * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Largest peak-to-trough decline of the compounded wealth curve.

    The curve starts at 1.0 (the initial capital), so a loss in the very first
    period is measured against that starting value instead of being ignored.

    Args:
        daily_returns: Array-like of per-period simple returns.

    Returns:
        The most negative value of ``(wealth - running_peak) / running_peak``;
        0.0 when there are no returns or wealth never falls below a prior peak.
    """
    growth = np.cumprod(1 + np.asarray(daily_returns, dtype=float))
    # Prepend the starting capital so it counts as the first peak
    wealth = np.concatenate(([1.0], growth))
    running_peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct a constant cost from every per-period return.

    Args:
        daily_returns: Array-like supporting subtraction of a scalar.
        transaction_cost: Amount removed from each return (0.001 by default).

    Returns:
        ``daily_returns - transaction_cost``.
    """
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every per-period return by ``1 - transaction_cost_percentage``.

    Args:
        daily_returns: Array-like supporting multiplication by a scalar.
        transaction_cost_percentage: Fraction of each return that is removed.

    Returns:
        ``daily_returns * (1 - transaction_cost_percentage)``.
    """
    retained_fraction = 1 - transaction_cost_percentage
    return daily_returns * retained_fraction

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_252_with_cost', 'cum_ES_return_252_with_cost', 'cum_ELS_return_252_with_cost',
    'cum_VL_return_252_with_cost', 'cum_VS_return_252_with_cost', 'cum_VLS_return_252_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_ridge252_c = cumulative_log_returns_ridge_lag_252[portfolio].values

    # Recover daily simple returns from the cumulative log-return series:
    #   * prepend=0.0 keeps the first trading day, which a bare np.diff discards;
    #   * expm1 turns each daily log return into the simple return that the
    #     metric helpers compound via (1 + r).
    # The *_with_cost series were accumulated from returns that already had the
    # 10 bps transaction cost deducted, so no further cost is subtracted here
    # (doing so would charge it twice).
    daily_returns = np.expm1(np.diff(cumulative_returns_ridge252_c, prepend=0.0))

    # Cumulative return net of transaction cost
    cum_return_after_cost = cumulative_return(daily_returns)

    # Metrics calculations (all on the same net-of-cost daily series)
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard deviation of daily returns after cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_ridge252_c = pd.DataFrame(metrics)
display(metrics_ridge252_c)


# Same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_252_without_cost', 'cum_ES_return_252_without_cost', 'cum_ELS_return_252_without_cost',
    'cum_VL_return_252_without_cost', 'cum_VS_return_252_without_cost', 'cum_VLS_return_252_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_ridge252_wc = cumulative_log_returns_ridge_lag_252[portfolio].values

    # Recover daily simple returns from the cumulative log-return series:
    # prepend=0.0 keeps the first trading day (a bare np.diff discards it) and
    # expm1 converts log returns into the simple returns the helpers compound.
    daily_returns = np.expm1(np.diff(cumulative_returns_ridge252_wc, prepend=0.0))

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_ridge252_wc = pd.DataFrame(metrics_wc)
display(metrics_ridge252_wc)

### Lag 512

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat 10 bps transaction cost; ``row`` is accepted but not read.

    Args:
        row: Ignored. Present so the function remains usable as a row-wise
            ``DataFrame.apply`` callback.

    Returns:
        0.001.
    """
    return 0.001  # 10 bps for both small and large cap stocks

# Every row gets the same constant, so assign the scalar directly rather than
# calling the function once per row through apply(axis=1).
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `ridge_model` to predict excess returns
def ridge_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the Ridge model for lag 512 and store its predictions in a dedicated column.

    Args:
        crsp_train_lagged: Training frame forwarded to ``ridge_model``.
        crsp_test_lagged: Test frame forwarded to ``ridge_model``; it is modified
            in place by adding ``'ridge_512_predicted_excess_returns'``.
        lags: Lags forwarded to ``ridge_model``. Defaults to ``[512]``.

    Returns:
        The same ``crsp_test_lagged`` object with the new column added.

    Raises:
        KeyError: If ``crsp_test_lagged`` has no ``'predicted_excess_returns'``
            column after ``ridge_model`` has run.
    """
    # Build the default per call so no list object is shared between calls.
    if lags is None:
        lags = [512]

    # Only the side effect on the test frame is used; the return value is not.
    # NOTE(review): this presumes ridge_model writes 'predicted_excess_returns'
    # into crsp_test_lagged in place. Otherwise predictions from a previous lag
    # would be reused silently -- confirm.
    ridge_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Snapshot the predictions under a lag-specific name for portfolio construction
    crsp_test_lagged['ridge_512_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Get predicted excess returns using Ridge model
crsp_test_lagged = ridge_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group):
    """Return long, short and long-short log returns for one cross-section.

    Rows are ranked by ``'ridge_512_predicted_excess_returns'``; the highest
    and lowest ``int(0.1 * len(group))`` rows become the long and short legs.
    Returns are reported equal-weighted and ``'market_cap_merged'``-weighted,
    each with and without the leg's mean ``'transaction_cost'``.

    Args:
        group: DataFrame holding the columns named above plus ``'adjusted_ret'``.

    Returns:
        dict with twelve entries keyed
        ``'{equal|value}_{long|short|long_short}_log_return_{with|without}_cost'``.
    """
    ranking_column = 'ridge_512_predicted_excess_returns'
    bucket_size = int(0.1 * len(group))
    winners = group.nlargest(bucket_size, ranking_column)
    losers = group.nsmallest(bucket_size, ranking_column)

    # Per-leg building blocks
    winners_log = np.log1p(winners['adjusted_ret'])
    losers_log = np.log1p(losers['adjusted_ret'])
    winners_cost = winners['transaction_cost'].mean()
    losers_cost = losers['transaction_cost'].mean()
    winners_cap = winners['market_cap_merged']
    losers_cap = losers['market_cap_merged']

    # Cap-weighted average log return of each leg
    winners_vw = (winners_log * winners_cap).sum() / winners_cap.sum()
    losers_vw = (losers_log * losers_cap).sum() / losers_cap.sum()

    results = {}

    # Equal-weighted, net of cost
    results['equal_long_log_return_with_cost'] = winners_log.mean() - winners_cost
    results['equal_short_log_return_with_cost'] = -losers_log.mean() - losers_cost
    results['equal_long_short_log_return_with_cost'] = (
        results['equal_long_log_return_with_cost'] + results['equal_short_log_return_with_cost']
    )

    # Equal-weighted, gross
    results['equal_long_log_return_without_cost'] = winners_log.mean()
    results['equal_short_log_return_without_cost'] = -losers_log.mean()
    results['equal_long_short_log_return_without_cost'] = (
        results['equal_long_log_return_without_cost'] + results['equal_short_log_return_without_cost']
    )

    # Value-weighted, net of cost
    results['value_long_log_return_with_cost'] = winners_vw - winners_cost
    results['value_short_log_return_with_cost'] = -(losers_vw + losers_cost)
    results['value_long_short_log_return_with_cost'] = (
        results['value_long_log_return_with_cost'] + results['value_short_log_return_with_cost']
    )

    # Value-weighted, gross
    results['value_long_log_return_without_cost'] = winners_vw
    results['value_short_log_return_without_cost'] = -(losers_vw)
    results['value_long_short_log_return_without_cost'] = (
        results['value_long_log_return_without_cost'] + results['value_short_log_return_without_cost']
    )

    return results

# Compute cumulative returns for each date with daily rebalancing
# Map every output column to the compute_returns() key it accumulates.
# Column order: the six with-cost series first, then the six without-cost series.
_legs_512 = {
    'EL': 'equal_long', 'ES': 'equal_short', 'ELS': 'equal_long_short',
    'VL': 'value_long', 'VS': 'value_short', 'VLS': 'value_long_short',
}
_column_to_key_512 = {
    f'cum_{tag}_return_512_{cost}': f'{leg}_log_return_{cost}'
    for cost in ('with_cost', 'without_cost')
    for tag, leg in _legs_512.items()
}

cumulative_log_returns_by_date_ridge512 = {'date': []}
cumulative_log_returns_by_date_ridge512.update({column: [] for column in _column_to_key_512})

# Running sums of the daily log returns, one per output column
_running_512 = dict.fromkeys(_column_to_key_512, 0)

# Iterate over each date to compute returns for lag 512 portfolios.
# groupby(sort=False) yields the dates in order of first appearance in a single
# pass, instead of re-scanning the whole frame with a boolean mask per date.
# Rows whose date is missing are skipped by groupby.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 512
    returns = compute_returns(group)

    # Update the running totals and record the day's cumulative values
    cumulative_log_returns_by_date_ridge512['date'].append(date)
    for column, key in _column_to_key_512.items():
        _running_512[column] += returns[key]
        cumulative_log_returns_by_date_ridge512[column].append(_running_512[column])

# Keep the final totals available under their individual names
(cum_EL_return_512_with_cost, cum_ES_return_512_with_cost, cum_ELS_return_512_with_cost,
 cum_VL_return_512_with_cost, cum_VS_return_512_with_cost, cum_VLS_return_512_with_cost,
 cum_EL_return_512_without_cost, cum_ES_return_512_without_cost, cum_ELS_return_512_without_cost,
 cum_VL_return_512_without_cost, cum_VS_return_512_without_cost,
 cum_VLS_return_512_without_cost) = _running_512.values()

# Convert to DataFrame for lag 512
cumulative_log_returns_ridge_lag_512 = pd.DataFrame(cumulative_log_returns_by_date_ridge512)

# Display the cumulative returns DataFrame for lag 512
display(cumulative_log_returns_ridge_lag_512.head())

#### With/Without Transaction Cost

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Total return obtained by compounding each per-period simple return.

    Args:
        daily_returns: Array-like supporting ``1 + daily_returns``.

    Returns:
        ``prod(1 + r) - 1``.
    """
    total_growth = np.prod(1 + daily_returns)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded growth of a series of simple returns.

    Args:
        daily_returns: Array-like of per-period simple returns.
        periods: Number of periods per year (252 by default).

    Returns:
        ``prod(1 + r) ** (periods / n) - 1``; NaN for an empty series, which
        has no horizon to annualize over.
    """
    n_obs = len(daily_returns)
    if n_obs == 0:
        # Guard the division in the exponent below
        return np.nan

    total_growth = np.prod(1 + daily_returns)
    return total_growth ** (periods / n_obs) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized ratio of mean excess return to its standard deviation.

    Args:
        daily_returns: Array-like exposing ``.mean()`` and ``.std()`` after
            subtraction of a scalar.
        risk_free_rate: Annual risk-free rate, spread evenly over ``periods``.
        periods: Number of periods per year.

    Returns:
        ``sqrt(periods) * mean(excess) / std(excess)``.
    """
    rf_per_period = risk_free_rate / periods
    excess_over_rf = daily_returns - rf_per_period
    scale = np.sqrt(periods)
    return scale * excess_over_rf.mean() / excess_over_rf.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Annualize the dispersion of per-period returns.

    Args:
        daily_returns: Array-like of per-period returns.
        periods: Number of periods per year.

    Returns:
        ``np.std(daily_returns) * sqrt(periods)``.
    """
    dispersion = np.std(daily_returns)
    annualization = np.sqrt(periods)
    return dispersion * annualization

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Largest peak-to-trough decline of the compounded wealth curve.

    The curve is anchored at 1.0 (the initial capital) so that a loss in the
    first period counts as a drawdown from the starting value.

    Args:
        daily_returns: Array-like of per-period simple returns.

    Returns:
        The most negative value of ``(wealth - running_peak) / running_peak``;
        0.0 when there are no returns or wealth never falls below a prior peak.
    """
    growth = np.cumprod(1 + np.asarray(daily_returns, dtype=float))
    # Starting capital is the first candidate peak
    wealth = np.concatenate(([1.0], growth))
    running_peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns (for other metrics)
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Subtract the same cost from each per-period return.

    Args:
        daily_returns: Array-like supporting subtraction of a scalar.
        transaction_cost: Amount deducted from each return.

    Returns:
        ``daily_returns - transaction_cost``.
    """
    after_cost = daily_returns - transaction_cost
    return after_cost

# Apply percentage-based transaction cost to the daily returns (for volatility and standard deviation)
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Shrink each per-period return by the given fraction.

    Args:
        daily_returns: Array-like supporting multiplication by a scalar.
        transaction_cost_percentage: Fraction of each return that is removed.

    Returns:
        ``daily_returns * (1 - transaction_cost_percentage)``.
    """
    keep = 1 - transaction_cost_percentage
    return daily_returns * keep

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_512_with_cost', 'cum_ES_return_512_with_cost', 'cum_ELS_return_512_with_cost',
    'cum_VL_return_512_with_cost', 'cum_VS_return_512_with_cost', 'cum_VLS_return_512_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_ridge512_c = cumulative_log_returns_ridge_lag_512[portfolio].values

    # Recover daily simple returns from the cumulative log-return series:
    #   * prepend=0.0 keeps the first trading day, which a bare np.diff discards;
    #   * expm1 turns each daily log return into the simple return that the
    #     metric helpers compound via (1 + r).
    # The *_with_cost series were accumulated from returns that already had the
    # 10 bps transaction cost deducted, so no further cost is subtracted here
    # (doing so would charge it twice).
    daily_returns = np.expm1(np.diff(cumulative_returns_ridge512_c, prepend=0.0))

    # Cumulative return net of transaction cost
    cum_return_after_cost = cumulative_return(daily_returns)

    # Metrics calculations (all on the same net-of-cost daily series)
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard deviation of daily returns after cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_ridge512_c = pd.DataFrame(metrics)
display(metrics_ridge512_c)


# Same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_512_without_cost', 'cum_ES_return_512_without_cost', 'cum_ELS_return_512_without_cost',
    'cum_VL_return_512_without_cost', 'cum_VS_return_512_without_cost', 'cum_VLS_return_512_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_ridge512_wc = cumulative_log_returns_ridge_lag_512[portfolio].values

    # Recover daily simple returns from the cumulative log-return series:
    # prepend=0.0 keeps the first trading day (a bare np.diff discards it) and
    # expm1 converts log returns into the simple returns the helpers compound.
    daily_returns = np.expm1(np.diff(cumulative_returns_ridge512_wc, prepend=0.0))

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_ridge512_wc = pd.DataFrame(metrics_wc)
display(metrics_ridge512_wc)

## ElasticNet

In [None]:
# ElasticNet rolling forecast, lag 5
elasticnet_lag5 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='elasticnet',
    lag=[5],
)
print("ElasticNet Results (Lag 5):")
display(elasticnet_lag5)

In [None]:
# ElasticNet rolling forecast, lag 21
elasticnet_lag21 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='elasticnet',
    lag=[21],
)
print("ElasticNet Results (Lag 21):")
display(elasticnet_lag21)

In [None]:
# ElasticNet rolling forecast, lag 252
elasticnet_lag252 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='elasticnet',
    lag=[252],
)
print("ElasticNet Results (Lag 252):")
display(elasticnet_lag252)

In [None]:
# ElasticNet rolling forecast, lag 512
elasticnet_lag512 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='elasticnet',
    lag=[512],
)
print("ElasticNet Results (Lag 512):")
display(elasticnet_lag512)

### OOS R2

In [None]:
# Left-join the market_cap column of crsp_data onto crsp_test_lagged by stock ID (permno)
# NOTE(review): the join key is permno alone, so each test row is repeated once
# per matching crsp_data row -- confirm crsp_data holds one row per permno.
merged_df = pd.merge(
    crsp_test_lagged,
    crsp_data[['permno', 'market_cap']],
    on='permno',
    how='left',
)

## ElasticNet Portfolio

### Lag 5

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat 10 bps transaction cost, independent of ``row``.

    Args:
        row: Ignored. Retained so the function still fits the row-wise
            ``DataFrame.apply`` callback signature.

    Returns:
        0.001.
    """
    return 0.001  # 10 bps for both small and large cap stocks

# The value does not depend on the row, so broadcast the scalar to the whole
# column instead of looping over rows with apply(axis=1).
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `elasticnet_model_with_tuning` to predict excess returns
def enet_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the tuned ElasticNet model for lag 5 and store its predictions in a dedicated column.

    Args:
        crsp_train_lagged: Training frame forwarded to ``elasticnet_model_with_tuning``.
        crsp_test_lagged: Test frame forwarded to ``elasticnet_model_with_tuning``;
            it is modified in place by adding ``'enet_5_predicted_excess_returns'``.
        lags: Lags forwarded to the model. Defaults to ``[5]``.

    Returns:
        The same ``crsp_test_lagged`` object with the new column added.

    Raises:
        KeyError: If ``crsp_test_lagged`` has no ``'predicted_excess_returns'``
            column after the model has run.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if lags is None:
        lags = [5]

    # The model's return value is not needed here; the predictions are read
    # back from the test frame.
    # NOTE(review): this presumes elasticnet_model_with_tuning writes
    # 'predicted_excess_returns' into crsp_test_lagged in place. If it does not,
    # predictions left by an earlier model would be copied silently -- confirm.
    elasticnet_model_with_tuning(crsp_train_lagged, crsp_test_lagged, lags)

    # Snapshot the predictions under a model-specific name for portfolio construction
    crsp_test_lagged['enet_5_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Get predicted excess returns using the ElasticNet model
crsp_test_lagged = enet_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, pred_col='enet_5_predicted_excess_returns', fraction=0.1):
    """Return one period's long, short and long-short log returns.

    The rows of `group` are ranked by `pred_col`; the top `fraction` form the
    long leg and the bottom `fraction` the short leg. Each leg is aggregated
    both equal-weighted and value-weighted (weights from `market_cap_merged`),
    with and without the leg's mean `transaction_cost` deducted.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain `pred_col`, `adjusted_ret`, `market_cap_merged` and
        `transaction_cost`.
    pred_col : str, optional
        Column used for ranking. Defaults to the lag-5 ElasticNet predictions.
    fraction : float, optional
        Share of rows placed in each leg (default 0.1). The leg size is
        `int(fraction * len(group))`, i.e. rounded down.

    Returns
    -------
    dict
        Twelve floats keyed
        `{equal,value}_{long,short,long_short}_log_return_{with,without}_cost`.
        Short-leg figures are the negated leg return; the long-short figure is
        the sum of both legs, so its with-cost variant carries both legs' cost.

    Notes
    -----
    When the leg size rounds down to zero both legs are empty and the returned
    figures are NaN.
    """
    leg_size = int(fraction * len(group))
    long_leg = group.nlargest(leg_size, pred_col)
    short_leg = group.nsmallest(leg_size, pred_col)

    def _summarise_leg(leg):
        # log1p is evaluated once per leg and shared by both weighting schemes
        log_ret = np.log1p(leg['adjusted_ret'])
        cap = leg['market_cap_merged']
        equal_weighted = log_ret.mean()
        value_weighted = (log_ret * cap).sum() / cap.sum()
        return equal_weighted, value_weighted, leg['transaction_cost'].mean()

    eq_long, vw_long, cost_long = _summarise_leg(long_leg)
    eq_short, vw_short, cost_short = _summarise_leg(short_leg)

    # Long legs: the cost lowers the return
    equal_long_with_cost = eq_long - cost_long
    value_long_with_cost = vw_long - cost_long

    # Short legs: the position gains when the leg falls, and still pays the cost
    equal_short_without_cost = -eq_short
    equal_short_with_cost = -eq_short - cost_short
    value_short_without_cost = -vw_short
    value_short_with_cost = -(vw_short + cost_short)

    return {
        'equal_long_log_return_with_cost': equal_long_with_cost,
        'equal_short_log_return_with_cost': equal_short_with_cost,
        'equal_long_short_log_return_with_cost': equal_long_with_cost + equal_short_with_cost,
        'equal_long_log_return_without_cost': eq_long,
        'equal_short_log_return_without_cost': equal_short_without_cost,
        'equal_long_short_log_return_without_cost': eq_long + equal_short_without_cost,
        'value_long_log_return_with_cost': value_long_with_cost,
        'value_short_log_return_with_cost': value_short_with_cost,
        'value_long_short_log_return_with_cost': value_long_with_cost + value_short_with_cost,
        'value_long_log_return_without_cost': vw_long,
        'value_short_log_return_without_cost': value_short_without_cost,
        'value_long_short_log_return_without_cost': vw_long + value_short_without_cost,
    }

# Compute cumulative returns for each date with daily rebalancing.
# One list per output column; every loop iteration below appends one entry to
# each list. Column key: E/V = equal-/value-weighted, L/S/LS = long/short/
# long-short, 5 = the lag-5 model.
cumulative_log_returns_by_date_enet5  = {
    'date': [],
    'cum_EL_return_5_with_cost': [],
    'cum_ES_return_5_with_cost': [],
    'cum_ELS_return_5_with_cost': [],
    'cum_VL_return_5_with_cost': [],
    'cum_VS_return_5_with_cost': [],
    'cum_VLS_return_5_with_cost': [],
    'cum_EL_return_5_without_cost': [],
    'cum_ES_return_5_without_cost': [],
    'cum_ELS_return_5_without_cost': [],
    'cum_VL_return_5_without_cost': [],
    'cum_VS_return_5_without_cost': [],
    'cum_VLS_return_5_without_cost': []
}

# Initialize cumulative returns for lag 5 (running sums of the daily log returns)
cum_EL_return_5_with_cost = 0
cum_ES_return_5_with_cost = 0
cum_ELS_return_5_with_cost = 0
cum_VL_return_5_with_cost = 0
cum_VS_return_5_with_cost = 0
cum_VLS_return_5_with_cost = 0

cum_EL_return_5_without_cost = 0
cum_ES_return_5_without_cost = 0
cum_ELS_return_5_without_cost = 0
cum_VL_return_5_without_cost = 0
cum_VS_return_5_without_cost = 0
cum_VLS_return_5_without_cost = 0

# Iterate over each date to compute returns for lag 5 portfolios
# NOTE(review): `unique()` yields dates in order of first appearance, so the
# running totals follow the row order of crsp_test_lagged - confirm the frame
# is sorted by date before this cell runs.
# NOTE(review): a NaN from compute_returns (e.g. a date with too few rows to
# fill a leg) carries into every later running total.
for date in crsp_test_lagged['date'].unique():
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]

    # Compute returns for lag 5
    returns = compute_returns(group)

    # Update cumulative returns with daily values for lag 5
    cum_EL_return_5_with_cost += returns['equal_long_log_return_with_cost']
    cum_ES_return_5_with_cost += returns['equal_short_log_return_with_cost']
    cum_ELS_return_5_with_cost += returns['equal_long_short_log_return_with_cost']
    cum_VL_return_5_with_cost += returns['value_long_log_return_with_cost']
    cum_VS_return_5_with_cost += returns['value_short_log_return_with_cost']
    cum_VLS_return_5_with_cost += returns['value_long_short_log_return_with_cost']

    cum_EL_return_5_without_cost += returns['equal_long_log_return_without_cost']
    cum_ES_return_5_without_cost += returns['equal_short_log_return_without_cost']
    cum_ELS_return_5_without_cost += returns['equal_long_short_log_return_without_cost']
    cum_VL_return_5_without_cost += returns['value_long_log_return_without_cost']
    cum_VS_return_5_without_cost += returns['value_short_log_return_without_cost']
    cum_VLS_return_5_without_cost += returns['value_long_short_log_return_without_cost']

    # Append results for the day for lag 5 portfolios (the first stored value
    # is therefore already the first day's return, not zero)
    cumulative_log_returns_by_date_enet5['date'].append(date)
    cumulative_log_returns_by_date_enet5['cum_EL_return_5_with_cost'].append(cum_EL_return_5_with_cost)
    cumulative_log_returns_by_date_enet5['cum_ES_return_5_with_cost'].append(cum_ES_return_5_with_cost)
    cumulative_log_returns_by_date_enet5['cum_ELS_return_5_with_cost'].append(cum_ELS_return_5_with_cost)
    cumulative_log_returns_by_date_enet5['cum_VL_return_5_with_cost'].append(cum_VL_return_5_with_cost)
    cumulative_log_returns_by_date_enet5['cum_VS_return_5_with_cost'].append(cum_VS_return_5_with_cost)
    cumulative_log_returns_by_date_enet5['cum_VLS_return_5_with_cost'].append(cum_VLS_return_5_with_cost)
    cumulative_log_returns_by_date_enet5['cum_EL_return_5_without_cost'].append(cum_EL_return_5_without_cost)
    cumulative_log_returns_by_date_enet5['cum_ES_return_5_without_cost'].append(cum_ES_return_5_without_cost)
    cumulative_log_returns_by_date_enet5['cum_ELS_return_5_without_cost'].append(cum_ELS_return_5_without_cost)
    cumulative_log_returns_by_date_enet5['cum_VL_return_5_without_cost'].append(cum_VL_return_5_without_cost)
    cumulative_log_returns_by_date_enet5['cum_VS_return_5_without_cost'].append(cum_VS_return_5_without_cost)
    cumulative_log_returns_by_date_enet5['cum_VLS_return_5_without_cost'].append(cum_VLS_return_5_without_cost)

# Convert to DataFrame for lag 5 (one row per date, one column per portfolio)
cumulative_log_returns_enet_lag_5 = pd.DataFrame(cumulative_log_returns_by_date_enet5)

# Display the cumulative returns DataFrame for lag 5
display(cumulative_log_returns_enet_lag_5.head())

#### With/Without Transaction Cost

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period returns into a single total return.

    Computes prod(1 + r) - 1 over the entries of `daily_returns`.
    """
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualise the compounded return of `daily_returns`.

    Parameters
    ----------
    daily_returns : array-like supporting `1 + x` and `len`
        Per-period returns.
    periods : int, optional
        Number of periods in a year (default 252).

    Returns
    -------
    float
        prod(1 + r) ** (periods / n) - 1, where n is the number of
        observations; NaN when there are no observations.
    """
    n_obs = len(daily_returns)
    if n_obs == 0:
        # Nothing to annualise; without this guard the exponent periods / 0
        # raises ZeroDivisionError.
        return np.nan

    total_growth = np.prod(1 + daily_returns)
    return total_growth ** (periods / n_obs) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualised Sharpe ratio of per-period returns.

    The annual `risk_free_rate` is spread evenly over `periods` and subtracted
    from every return; the mean of the result is divided by its `.std()` and
    scaled by sqrt(periods). The `.std()` used is the input's own method.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    excess = daily_returns - per_period_rf
    mean_excess = excess.mean()
    excess_sd = excess.std()
    return np.sqrt(periods) * mean_excess / excess_sd

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Annualised volatility: `np.std` of the returns times sqrt(periods)."""
    per_period_sd = np.std(daily_returns)
    annualisation = np.sqrt(periods)
    return per_period_sd * annualisation

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Largest peak-to-trough loss of the compounded return path.

    Builds the path cumprod(1 + r), tracks its running maximum, and returns
    the most negative relative gap between the two (0 if the path never falls).
    """
    wealth = np.cumprod(1 + daily_returns)
    running_peak = np.maximum.accumulate(wealth)
    gaps = (wealth - running_peak) / running_peak
    return np.min(gaps)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return `daily_returns` with `transaction_cost` subtracted from every entry."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by (1 - transaction_cost_percentage).

    The cost is taken as a fraction of the return itself, so a zero return
    stays zero and a negative return moves toward zero.
    """
    kept_share = 1 - transaction_cost_percentage
    return daily_returns * kept_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_5_with_cost', 'cum_ES_return_5_with_cost', 'cum_ELS_return_5_with_cost',
    'cum_VL_return_5_with_cost', 'cum_VS_return_5_with_cost', 'cum_VLS_return_5_with_cost',
]

# Initialize metrics container (one list per output column)
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_enet5_c = cumulative_log_returns_enet_lag_5[portfolio].values

    # Recover the daily values from the running total. The first stored value
    # is already the first day's return (the total starts from 0), so prepend
    # that 0; a bare np.diff would silently drop the first day.
    daily_returns = np.diff(cumulative_returns_enet5_c, prepend=0.0)

    # NOTE(review): these columns already have the per-leg transaction cost
    # deducted inside compute_returns, so the adjustments below charge the
    # 10 bps a second time - confirm this is intended.
    # NOTE(review): the values are log returns, while the metric helpers
    # compound them as simple returns (prod(1 + r)) - confirm, or convert with
    # np.expm1 first.

    # Apply fixed transaction cost for other metrics
    daily_returns_after_cost_fixed = apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001)

    # Apply percentage-based transaction cost for volatility and standard deviation
    daily_returns_after_cost_percentage = apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001)

    # Calculate cumulative returns after fixed transaction cost
    cum_return_after_cost = cumulative_return(daily_returns_after_cost_fixed)

    # Metrics calculations
    ann_return = annualized_return(daily_returns_after_cost_fixed)
    sharpe = sharpe_ratio(daily_returns_after_cost_fixed)
    vol = calculate_volatility(daily_returns_after_cost_percentage)
    max_draw = maximum_drawdown(daily_returns_after_cost_fixed)

    # Standard Deviation of daily returns after cost
    std_dev = np.std(daily_returns_after_cost_percentage)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_enet5_c = pd.DataFrame(metrics)
display(metrics_enet5_c)


# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_5_without_cost', 'cum_ES_return_5_without_cost', 'cum_ELS_return_5_without_cost',
    'cum_VL_return_5_without_cost', 'cum_VS_return_5_without_cost', 'cum_VLS_return_5_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_enet5_wc = cumulative_log_returns_enet_lag_5[portfolio].values

    # Recover the daily log returns from the running total. The first stored
    # value is already the first day's return (the total starts from 0), so
    # prepend that 0; a bare np.diff would silently drop the first day.
    daily_returns = np.diff(cumulative_returns_enet5_wc, prepend=0.0)

    # NOTE(review): the values are log returns, while the metric helpers
    # compound them as simple returns (prod(1 + r)) - confirm, or convert with
    # np.expm1 first.

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_enet5_wc = pd.DataFrame(metrics_wc)
display(metrics_enet5_wc)

### Lag 21

In [None]:
# Flat transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the transaction cost for one row.

    The cost is a constant 0.001 (10 bps). `row` is accepted so the function
    can be handed to `DataFrame.apply(..., axis=1)`, but it is never inspected.
    """
    flat_cost = 0.001  # 10 bps for both small and large cap stocks
    return flat_cost

# calculate_transaction_cost ignores its argument and returns one constant, so
# assign that scalar directly (pandas broadcasts it down the column) instead of
# calling the function once per row through `apply(axis=1)`.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `elasticnet_model_with_tuning` to predict excess returns
def enet_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the tuned ElasticNet and store its predictions for the lag-21 portfolio.

    Parameters
    ----------
    crsp_train_lagged
        Training data, passed straight through to `elasticnet_model_with_tuning`.
    crsp_test_lagged : pandas.DataFrame
        Test data, passed to `elasticnet_model_with_tuning`. The column
        `enet_21_predicted_excess_returns` is added to (or overwritten on)
        this same object.
    lags : list, optional
        Lags forwarded to `elasticnet_model_with_tuning`. Defaults to `[21]`;
        `None` is used as the sentinel so no list is shared between calls.

    Returns
    -------
    pandas.DataFrame
        `crsp_test_lagged` itself (modified in place).
    """
    if lags is None:
        lags = [21]

    # NOTE(review): the model's return value is not used. The predictions are
    # read back from crsp_test_lagged['predicted_excess_returns'], which relies
    # on elasticnet_model_with_tuning writing that column onto the frame it is
    # given - confirm against its definition.
    elasticnet_model_with_tuning(crsp_train_lagged, crsp_test_lagged, lags)

    # Keep a lag-specific copy so a later run with other lags cannot clobber it
    crsp_test_lagged['enet_21_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Run the ElasticNet model and add the `enet_21_predicted_excess_returns` column,
# which compute_returns below uses to rank stocks into the long and short legs
crsp_test_lagged = enet_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, pred_col='enet_21_predicted_excess_returns', fraction=0.1):
    """Return one period's long, short and long-short log returns.

    The rows of `group` are ranked by `pred_col`; the top `fraction` form the
    long leg and the bottom `fraction` the short leg. Each leg is aggregated
    both equal-weighted and value-weighted (weights from `market_cap_merged`),
    with and without the leg's mean `transaction_cost` deducted.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain `pred_col`, `adjusted_ret`, `market_cap_merged` and
        `transaction_cost`.
    pred_col : str, optional
        Column used for ranking. Defaults to the lag-21 ElasticNet predictions.
    fraction : float, optional
        Share of rows placed in each leg (default 0.1). The leg size is
        `int(fraction * len(group))`, i.e. rounded down.

    Returns
    -------
    dict
        Twelve floats keyed
        `{equal,value}_{long,short,long_short}_log_return_{with,without}_cost`.
        Short-leg figures are the negated leg return; the long-short figure is
        the sum of both legs, so its with-cost variant carries both legs' cost.

    Notes
    -----
    When the leg size rounds down to zero both legs are empty and the returned
    figures are NaN.
    """
    leg_size = int(fraction * len(group))
    long_leg = group.nlargest(leg_size, pred_col)
    short_leg = group.nsmallest(leg_size, pred_col)

    def _summarise_leg(leg):
        # log1p is evaluated once per leg and shared by both weighting schemes
        log_ret = np.log1p(leg['adjusted_ret'])
        cap = leg['market_cap_merged']
        equal_weighted = log_ret.mean()
        value_weighted = (log_ret * cap).sum() / cap.sum()
        return equal_weighted, value_weighted, leg['transaction_cost'].mean()

    eq_long, vw_long, cost_long = _summarise_leg(long_leg)
    eq_short, vw_short, cost_short = _summarise_leg(short_leg)

    # Long legs: the cost lowers the return
    equal_long_with_cost = eq_long - cost_long
    value_long_with_cost = vw_long - cost_long

    # Short legs: the position gains when the leg falls, and still pays the cost
    equal_short_without_cost = -eq_short
    equal_short_with_cost = -eq_short - cost_short
    value_short_without_cost = -vw_short
    value_short_with_cost = -(vw_short + cost_short)

    return {
        'equal_long_log_return_with_cost': equal_long_with_cost,
        'equal_short_log_return_with_cost': equal_short_with_cost,
        'equal_long_short_log_return_with_cost': equal_long_with_cost + equal_short_with_cost,
        'equal_long_log_return_without_cost': eq_long,
        'equal_short_log_return_without_cost': equal_short_without_cost,
        'equal_long_short_log_return_without_cost': eq_long + equal_short_without_cost,
        'value_long_log_return_with_cost': value_long_with_cost,
        'value_short_log_return_with_cost': value_short_with_cost,
        'value_long_short_log_return_with_cost': value_long_with_cost + value_short_with_cost,
        'value_long_log_return_without_cost': vw_long,
        'value_short_log_return_without_cost': value_short_without_cost,
        'value_long_short_log_return_without_cost': vw_long + value_short_without_cost,
    }

# Compute cumulative returns for each date with daily rebalancing.
# One list per output column; every loop iteration below appends one entry to
# each list. Column key: E/V = equal-/value-weighted, L/S/LS = long/short/
# long-short, 21 = the lag-21 model.
cumulative_log_returns_by_date_enet21  = {
    'date': [],
    'cum_EL_return_21_with_cost': [],
    'cum_ES_return_21_with_cost': [],
    'cum_ELS_return_21_with_cost': [],
    'cum_VL_return_21_with_cost': [],
    'cum_VS_return_21_with_cost': [],
    'cum_VLS_return_21_with_cost': [],
    'cum_EL_return_21_without_cost': [],
    'cum_ES_return_21_without_cost': [],
    'cum_ELS_return_21_without_cost': [],
    'cum_VL_return_21_without_cost': [],
    'cum_VS_return_21_without_cost': [],
    'cum_VLS_return_21_without_cost': []
}

# Initialize cumulative returns for lag 21 (running sums of the daily log returns)
cum_EL_return_21_with_cost = 0
cum_ES_return_21_with_cost = 0
cum_ELS_return_21_with_cost = 0
cum_VL_return_21_with_cost = 0
cum_VS_return_21_with_cost = 0
cum_VLS_return_21_with_cost = 0

cum_EL_return_21_without_cost = 0
cum_ES_return_21_without_cost = 0
cum_ELS_return_21_without_cost = 0
cum_VL_return_21_without_cost = 0
cum_VS_return_21_without_cost = 0
cum_VLS_return_21_without_cost = 0

# Iterate over each date to compute returns for lag 21 portfolios
# NOTE(review): `unique()` yields dates in order of first appearance, so the
# running totals follow the row order of crsp_test_lagged - confirm the frame
# is sorted by date before this cell runs.
# NOTE(review): a NaN from compute_returns (e.g. a date with too few rows to
# fill a leg) carries into every later running total.
for date in crsp_test_lagged['date'].unique():
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]

    # Compute returns for lag 21
    returns = compute_returns(group)

    # Update cumulative returns with daily values for lag 21
    cum_EL_return_21_with_cost += returns['equal_long_log_return_with_cost']
    cum_ES_return_21_with_cost += returns['equal_short_log_return_with_cost']
    cum_ELS_return_21_with_cost += returns['equal_long_short_log_return_with_cost']
    cum_VL_return_21_with_cost += returns['value_long_log_return_with_cost']
    cum_VS_return_21_with_cost += returns['value_short_log_return_with_cost']
    cum_VLS_return_21_with_cost += returns['value_long_short_log_return_with_cost']

    cum_EL_return_21_without_cost += returns['equal_long_log_return_without_cost']
    cum_ES_return_21_without_cost += returns['equal_short_log_return_without_cost']
    cum_ELS_return_21_without_cost += returns['equal_long_short_log_return_without_cost']
    cum_VL_return_21_without_cost += returns['value_long_log_return_without_cost']
    cum_VS_return_21_without_cost += returns['value_short_log_return_without_cost']
    cum_VLS_return_21_without_cost += returns['value_long_short_log_return_without_cost']

    # Append results for the day for lag 21 portfolios (the first stored value
    # is therefore already the first day's return, not zero)
    cumulative_log_returns_by_date_enet21['date'].append(date)
    cumulative_log_returns_by_date_enet21['cum_EL_return_21_with_cost'].append(cum_EL_return_21_with_cost)
    cumulative_log_returns_by_date_enet21['cum_ES_return_21_with_cost'].append(cum_ES_return_21_with_cost)
    cumulative_log_returns_by_date_enet21['cum_ELS_return_21_with_cost'].append(cum_ELS_return_21_with_cost)
    cumulative_log_returns_by_date_enet21['cum_VL_return_21_with_cost'].append(cum_VL_return_21_with_cost)
    cumulative_log_returns_by_date_enet21['cum_VS_return_21_with_cost'].append(cum_VS_return_21_with_cost)
    cumulative_log_returns_by_date_enet21['cum_VLS_return_21_with_cost'].append(cum_VLS_return_21_with_cost)
    cumulative_log_returns_by_date_enet21['cum_EL_return_21_without_cost'].append(cum_EL_return_21_without_cost)
    cumulative_log_returns_by_date_enet21['cum_ES_return_21_without_cost'].append(cum_ES_return_21_without_cost)
    cumulative_log_returns_by_date_enet21['cum_ELS_return_21_without_cost'].append(cum_ELS_return_21_without_cost)
    cumulative_log_returns_by_date_enet21['cum_VL_return_21_without_cost'].append(cum_VL_return_21_without_cost)
    cumulative_log_returns_by_date_enet21['cum_VS_return_21_without_cost'].append(cum_VS_return_21_without_cost)
    cumulative_log_returns_by_date_enet21['cum_VLS_return_21_without_cost'].append(cum_VLS_return_21_without_cost)

# Convert to DataFrame for lag 21 (one row per date, one column per portfolio)
cumulative_log_returns_enet_lag_21 = pd.DataFrame(cumulative_log_returns_by_date_enet21)

# Display the cumulative returns DataFrame for lag 21
display(cumulative_log_returns_enet_lag_21.head())

#### With/Without Transaction Cost

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period returns into a single total return.

    Computes prod(1 + r) - 1 over the entries of `daily_returns`.
    """
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualise the compounded return of `daily_returns`.

    Parameters
    ----------
    daily_returns : array-like supporting `1 + x` and `len`
        Per-period returns.
    periods : int, optional
        Number of periods in a year (default 252).

    Returns
    -------
    float
        prod(1 + r) ** (periods / n) - 1, where n is the number of
        observations; NaN when there are no observations.
    """
    n_obs = len(daily_returns)
    if n_obs == 0:
        # Nothing to annualise; without this guard the exponent periods / 0
        # raises ZeroDivisionError.
        return np.nan

    total_growth = np.prod(1 + daily_returns)
    return total_growth ** (periods / n_obs) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualised Sharpe ratio of per-period returns.

    The annual `risk_free_rate` is spread evenly over `periods` and subtracted
    from every return; the mean of the result is divided by its `.std()` and
    scaled by sqrt(periods). The `.std()` used is the input's own method.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    excess = daily_returns - per_period_rf
    mean_excess = excess.mean()
    excess_sd = excess.std()
    return np.sqrt(periods) * mean_excess / excess_sd

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Annualised volatility: `np.std` of the returns times sqrt(periods)."""
    per_period_sd = np.std(daily_returns)
    annualisation = np.sqrt(periods)
    return per_period_sd * annualisation

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Largest peak-to-trough loss of the compounded return path.

    Builds the path cumprod(1 + r), tracks its running maximum, and returns
    the most negative relative gap between the two (0 if the path never falls).
    """
    wealth = np.cumprod(1 + daily_returns)
    running_peak = np.maximum.accumulate(wealth)
    gaps = (wealth - running_peak) / running_peak
    return np.min(gaps)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return `daily_returns` with `transaction_cost` subtracted from every entry."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by (1 - transaction_cost_percentage).

    The cost is taken as a fraction of the return itself, so a zero return
    stays zero and a negative return moves toward zero.
    """
    kept_share = 1 - transaction_cost_percentage
    return daily_returns * kept_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_21_with_cost', 'cum_ES_return_21_with_cost', 'cum_ELS_return_21_with_cost',
    'cum_VL_return_21_with_cost', 'cum_VS_return_21_with_cost', 'cum_VLS_return_21_with_cost',
]

# Initialize metrics container (one list per output column)
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_enet21_c = cumulative_log_returns_enet_lag_21[portfolio].values

    # Recover the daily values from the running total. The first stored value
    # is already the first day's return (the total starts from 0), so prepend
    # that 0; a bare np.diff would silently drop the first day.
    daily_returns = np.diff(cumulative_returns_enet21_c, prepend=0.0)

    # NOTE(review): these columns already have the per-leg transaction cost
    # deducted inside compute_returns, so the adjustments below charge the
    # 10 bps a second time - confirm this is intended.
    # NOTE(review): the values are log returns, while the metric helpers
    # compound them as simple returns (prod(1 + r)) - confirm, or convert with
    # np.expm1 first.

    # Apply fixed transaction cost for other metrics
    daily_returns_after_cost_fixed = apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001)

    # Apply percentage-based transaction cost for volatility and standard deviation
    daily_returns_after_cost_percentage = apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001)

    # Calculate cumulative returns after fixed transaction cost
    cum_return_after_cost = cumulative_return(daily_returns_after_cost_fixed)

    # Metrics calculations
    ann_return = annualized_return(daily_returns_after_cost_fixed)
    sharpe = sharpe_ratio(daily_returns_after_cost_fixed)
    vol = calculate_volatility(daily_returns_after_cost_percentage)
    max_draw = maximum_drawdown(daily_returns_after_cost_fixed)

    # Standard Deviation of daily returns after cost
    std_dev = np.std(daily_returns_after_cost_percentage)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_enet21_c = pd.DataFrame(metrics)
display(metrics_enet21_c)


# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_21_without_cost', 'cum_ES_return_21_without_cost', 'cum_ELS_return_21_without_cost',
    'cum_VL_return_21_without_cost', 'cum_VS_return_21_without_cost', 'cum_VLS_return_21_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_enet21_wc = cumulative_log_returns_enet_lag_21[portfolio].values

    # Recover the daily log returns from the running total. The first stored
    # value is already the first day's return (the total starts from 0), so
    # prepend that 0; a bare np.diff would silently drop the first day.
    daily_returns = np.diff(cumulative_returns_enet21_wc, prepend=0.0)

    # NOTE(review): the values are log returns, while the metric helpers
    # compound them as simple returns (prod(1 + r)) - confirm, or convert with
    # np.expm1 first.

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_enet21_wc = pd.DataFrame(metrics_wc)
display(metrics_enet21_wc)

### Lag 252

In [None]:
# Flat transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the transaction cost for one row.

    The cost is a constant 0.001 (10 bps). `row` is accepted so the function
    can be handed to `DataFrame.apply(..., axis=1)`, but it is never inspected.
    """
    flat_cost = 0.001  # 10 bps for both small and large cap stocks
    return flat_cost

# calculate_transaction_cost ignores its argument and returns one constant, so
# assign that scalar directly (pandas broadcasts it down the column) instead of
# calling the function once per row through `apply(axis=1)`.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `elasticnet_model_with_tuning` to predict excess returns
def enet_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the tuned ElasticNet and store its predictions for the lag-252 portfolio.

    Parameters
    ----------
    crsp_train_lagged
        Training data, passed straight through to `elasticnet_model_with_tuning`.
    crsp_test_lagged : pandas.DataFrame
        Test data, passed to `elasticnet_model_with_tuning`. The column
        `enet_252_predicted_excess_returns` is added to (or overwritten on)
        this same object.
    lags : list, optional
        Lags forwarded to `elasticnet_model_with_tuning`. Defaults to `[252]`;
        `None` is used as the sentinel so no list is shared between calls.

    Returns
    -------
    pandas.DataFrame
        `crsp_test_lagged` itself (modified in place).
    """
    if lags is None:
        lags = [252]

    # NOTE(review): the model's return value is not used. The predictions are
    # read back from crsp_test_lagged['predicted_excess_returns'], which relies
    # on elasticnet_model_with_tuning writing that column onto the frame it is
    # given - confirm against its definition.
    elasticnet_model_with_tuning(crsp_train_lagged, crsp_test_lagged, lags)

    # Keep a lag-specific copy so a later run with other lags cannot clobber it
    crsp_test_lagged['enet_252_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Run the ElasticNet model and add the `enet_252_predicted_excess_returns`
# column to crsp_test_lagged for the lag-252 portfolio construction below
crsp_test_lagged = enet_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, prediction_col='enet_252_predicted_excess_returns', fraction=0.1):
    """Compute one day's long, short and long-short portfolio log returns.

    Parameters
    ----------
    group : DataFrame holding one date's cross-section. Must contain
        ``prediction_col``, 'adjusted_ret', 'market_cap_merged' and
        'transaction_cost'.
    prediction_col : column used to rank the stocks. Defaults to the lag-252
        ElasticNet prediction, so existing one-argument calls are unchanged.
    fraction : share of the cross-section placed in each leg (default 10%).
        The leg size is ``int(fraction * len(group))``; when that is 0 the
        legs are empty and every returned value is NaN.

    Returns
    -------
    dict with twelve entries: equal-/value-weighted x long/short/long-short x
    with/without transaction cost. Short-leg returns are sign-flipped so that
    a falling stock contributes a positive return.
    """
    n_select = int(fraction * len(group))
    longs = group.nlargest(n_select, prediction_col)
    shorts = group.nsmallest(n_select, prediction_col)

    # Per-stock log returns and the (average) cost of each leg, computed once.
    long_log_ret = np.log1p(longs['adjusted_ret'])
    short_log_ret = np.log1p(shorts['adjusted_ret'])
    long_cost = longs['transaction_cost'].mean()
    short_cost = shorts['transaction_cost'].mean()

    # Equal-weighted legs.
    equal_long = long_log_ret.mean()
    equal_short = -short_log_ret.mean()

    # Value-weighted legs (weights proportional to market cap).
    long_caps = longs['market_cap_merged']
    short_caps = shorts['market_cap_merged']
    value_long = (long_log_ret * long_caps).sum() / long_caps.sum()
    value_short = -((short_log_ret * short_caps).sum() / short_caps.sum())

    # The cost always reduces the return, for the short leg as well.
    equal_long_net = equal_long - long_cost
    equal_short_net = equal_short - short_cost
    value_long_net = value_long - long_cost
    value_short_net = value_short - short_cost

    return {
        'equal_long_log_return_with_cost': equal_long_net,
        'equal_short_log_return_with_cost': equal_short_net,
        'equal_long_short_log_return_with_cost': equal_long_net + equal_short_net,
        'equal_long_log_return_without_cost': equal_long,
        'equal_short_log_return_without_cost': equal_short,
        'equal_long_short_log_return_without_cost': equal_long + equal_short,
        'value_long_log_return_with_cost': value_long_net,
        'value_short_log_return_with_cost': value_short_net,
        'value_long_short_log_return_with_cost': value_long_net + value_short_net,
        'value_long_log_return_without_cost': value_long,
        'value_short_log_return_without_cost': value_short,
        'value_long_short_log_return_without_cost': value_long + value_short
    }

# Compute cumulative log returns for each date with daily rebalancing (lag 252).

# Output column -> key of the matching daily log return in compute_returns().
# The insertion order fixes the column order of the resulting DataFrame.
enet252_return_keys = {
    'cum_EL_return_252_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_252_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_252_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_252_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_252_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_252_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_252_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_252_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_252_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_252_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_252_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_252_without_cost': 'value_long_short_log_return_without_cost',
}

# One list per output column, filled with the running total after each date.
cumulative_log_returns_by_date_enet252 = {'date': []}
for column in enet252_return_keys:
    cumulative_log_returns_by_date_enet252[column] = []

# Running sums of daily log returns, all starting at 0.
running_totals_enet252 = dict.fromkeys(enet252_return_keys, 0)

# A single groupby pass replaces filtering the whole frame once per date.
# groupby sorts its keys, so the totals are accumulated in chronological
# order even if the frame itself is not sorted by date; rows with a missing
# date are skipped.
for date, group in crsp_test_lagged.groupby('date'):
    returns = compute_returns(group)

    cumulative_log_returns_by_date_enet252['date'].append(date)
    for column, return_key in enet252_return_keys.items():
        running_totals_enet252[column] += returns[return_key]
        cumulative_log_returns_by_date_enet252[column].append(running_totals_enet252[column])

# Keep the final running totals available under the original variable names.
cum_EL_return_252_with_cost = running_totals_enet252['cum_EL_return_252_with_cost']
cum_ES_return_252_with_cost = running_totals_enet252['cum_ES_return_252_with_cost']
cum_ELS_return_252_with_cost = running_totals_enet252['cum_ELS_return_252_with_cost']
cum_VL_return_252_with_cost = running_totals_enet252['cum_VL_return_252_with_cost']
cum_VS_return_252_with_cost = running_totals_enet252['cum_VS_return_252_with_cost']
cum_VLS_return_252_with_cost = running_totals_enet252['cum_VLS_return_252_with_cost']

cum_EL_return_252_without_cost = running_totals_enet252['cum_EL_return_252_without_cost']
cum_ES_return_252_without_cost = running_totals_enet252['cum_ES_return_252_without_cost']
cum_ELS_return_252_without_cost = running_totals_enet252['cum_ELS_return_252_without_cost']
cum_VL_return_252_without_cost = running_totals_enet252['cum_VL_return_252_without_cost']
cum_VS_return_252_without_cost = running_totals_enet252['cum_VS_return_252_without_cost']
cum_VLS_return_252_without_cost = running_totals_enet252['cum_VLS_return_252_without_cost']

# Convert to DataFrame for lag 252
cumulative_log_returns_enet_lag_252 = pd.DataFrame(cumulative_log_returns_by_date_enet252)

# Display the cumulative returns DataFrame for lag 252
display(cumulative_log_returns_enet_lag_252.head())

#### With/Without Transaction Cost

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound a sequence of simple per-period returns into a total return."""
    growth_factors = 1 + daily_returns
    terminal_wealth = np.prod(growth_factors)
    return terminal_wealth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of ``daily_returns``.

    The total compounded return is rescaled geometrically from
    ``len(daily_returns)`` observations to ``periods`` observations per year.
    """
    total_return = np.prod(1 + daily_returns) - 1
    scaling_exponent = periods / len(daily_returns)
    annual_growth = (1 + total_return) ** scaling_exponent
    return annual_growth - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of ``daily_returns``.

    ``risk_free_rate`` is an annual rate; it is spread evenly over
    ``periods`` observations before being subtracted from each return.
    """
    per_period_rf = risk_free_rate / periods
    excess = daily_returns - per_period_rf
    mean_excess = excess.mean()
    dispersion = excess.std()
    return np.sqrt(periods) * mean_excess / dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    return np.std(daily_returns) * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded return path.

    The result is a non-positive fraction (e.g. -0.5 for a 50% drawdown).
    """
    wealth_path = np.cumprod(1 + daily_returns)
    running_peak = np.maximum.accumulate(wealth_path)
    relative_shortfall = (wealth_path - running_peak) / running_peak
    return np.min(relative_shortfall)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct the same absolute ``transaction_cost`` from every return."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by ``1 - transaction_cost_percentage``."""
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_252_with_cost', 'cum_ES_return_252_with_cost', 'cum_ELS_return_252_with_cost',
    'cum_VL_return_252_with_cost', 'cum_VS_return_252_with_cost', 'cum_VLS_return_252_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_enet252_c = cumulative_log_returns_enet_lag_252[portfolio].values

    # Recover the daily values from the running sum. The first cumulative
    # value already IS day one's return, so difference against a leading 0;
    # a plain np.diff would silently drop the first trading day.
    daily_returns = np.diff(cumulative_returns_enet252_c, prepend=0.0)

    # NOTE(review): the *_with_cost columns are built by compute_returns()
    # with the transaction cost already subtracted, so the deduction below
    # charges the cost a second time -- confirm this is intended.
    # NOTE(review): these are daily *log* returns, while the metric helpers
    # compound them as simple returns (np.prod(1 + r)); converting with
    # np.expm1 first may be what is wanted -- confirm.
    daily_returns_after_cost_fixed = apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001)

    # Percentage-scaled series, used only for the two dispersion measures.
    daily_returns_after_cost_percentage = apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001)

    # Level metrics use the fixed-cost series.
    cum_return_after_cost = cumulative_return(daily_returns_after_cost_fixed)
    ann_return = annualized_return(daily_returns_after_cost_fixed)
    sharpe = sharpe_ratio(daily_returns_after_cost_fixed)
    max_draw = maximum_drawdown(daily_returns_after_cost_fixed)

    # Dispersion metrics use the percentage-cost series.
    vol = calculate_volatility(daily_returns_after_cost_percentage)
    std_dev = np.std(daily_returns_after_cost_percentage)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_enet252_c = pd.DataFrame(metrics)
display(metrics_enet252_c)


# Same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_252_without_cost', 'cum_ES_return_252_without_cost', 'cum_ELS_return_252_without_cost',
    'cum_VL_return_252_without_cost', 'cum_VS_return_252_without_cost', 'cum_VLS_return_252_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_enet252_wc = cumulative_log_returns_enet_lag_252[portfolio].values

    # Difference against a leading 0 so the first trading day is kept.
    daily_returns = np.diff(cumulative_returns_enet252_wc, prepend=0.0)

    # Gross-of-cost metrics, all computed on the same daily series.
    cum_return_without_cost = cumulative_return(daily_returns)
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_enet252_wc = pd.DataFrame(metrics_wc)
display(metrics_enet252_wc)

### Lag 512

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat per-trade transaction cost.

    ``row`` is accepted only so the function can be handed to
    ``DataFrame.apply(..., axis=1)``; its contents are never inspected.
    """
    flat_cost = 0.001  # 10 bps, identical for small- and large-cap stocks
    return flat_cost

# The cost is the same constant for every row, so assign it once as a scalar
# (pandas broadcasts it down the column) instead of invoking the function per
# row through a slow `apply(axis=1)`. The row argument is ignored by
# calculate_transaction_cost, hence None.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `enet` to predict excess returns
def enet_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the ElasticNet model for lag 512 and store its test predictions.

    Parameters
    ----------
    crsp_train_lagged : training frame, passed through to
        ``elasticnet_model_with_tuning`` unchanged.
    crsp_test_lagged : test frame; a new column
        ``'enet_512_predicted_excess_returns'`` is written into it in place.
    lags : list of lags handed to the model. Defaults to ``[512]`` (a fresh
        list per call, so no list object is shared between calls).

    Returns
    -------
    The same ``crsp_test_lagged`` object, with the new column added.
    """
    if lags is None:
        lags = [512]

    # The model's return value is not needed here.
    # NOTE(review): this relies on elasticnet_model_with_tuning writing a
    # 'predicted_excess_returns' column into crsp_test_lagged in place. If it
    # does not, the column read below is left over from an earlier model run
    # -- confirm against the model function.
    elasticnet_model_with_tuning(crsp_train_lagged, crsp_test_lagged, lags)

    # Snapshot the predictions under a lag-specific name so a later model run
    # overwriting 'predicted_excess_returns' cannot clobber them.
    crsp_test_lagged['enet_512_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Run the lag-512 ElasticNet wrapper. It returns the same test frame with the
# 'enet_512_predicted_excess_returns' column added; compute_returns() below
# ranks stocks on that column, i.e. it is used as a score, not a binary label.
crsp_test_lagged = enet_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, prediction_col='enet_512_predicted_excess_returns', fraction=0.1):
    """Compute one day's long, short and long-short portfolio log returns.

    Parameters
    ----------
    group : DataFrame holding one date's cross-section. Must contain
        ``prediction_col``, 'adjusted_ret', 'market_cap_merged' and
        'transaction_cost'.
    prediction_col : column used to rank the stocks. Defaults to the lag-512
        ElasticNet prediction, so existing one-argument calls are unchanged.
    fraction : share of the cross-section placed in each leg (default 10%).
        The leg size is ``int(fraction * len(group))``; when that is 0 the
        legs are empty and every returned value is NaN.

    Returns
    -------
    dict with twelve entries: equal-/value-weighted x long/short/long-short x
    with/without transaction cost. Short-leg returns are sign-flipped so that
    a falling stock contributes a positive return.
    """
    n_select = int(fraction * len(group))
    longs = group.nlargest(n_select, prediction_col)
    shorts = group.nsmallest(n_select, prediction_col)

    # Per-stock log returns and the (average) cost of each leg, computed once.
    long_log_ret = np.log1p(longs['adjusted_ret'])
    short_log_ret = np.log1p(shorts['adjusted_ret'])
    long_cost = longs['transaction_cost'].mean()
    short_cost = shorts['transaction_cost'].mean()

    # Equal-weighted legs.
    equal_long = long_log_ret.mean()
    equal_short = -short_log_ret.mean()

    # Value-weighted legs (weights proportional to market cap).
    long_caps = longs['market_cap_merged']
    short_caps = shorts['market_cap_merged']
    value_long = (long_log_ret * long_caps).sum() / long_caps.sum()
    value_short = -((short_log_ret * short_caps).sum() / short_caps.sum())

    # The cost always reduces the return, for the short leg as well.
    equal_long_net = equal_long - long_cost
    equal_short_net = equal_short - short_cost
    value_long_net = value_long - long_cost
    value_short_net = value_short - short_cost

    return {
        'equal_long_log_return_with_cost': equal_long_net,
        'equal_short_log_return_with_cost': equal_short_net,
        'equal_long_short_log_return_with_cost': equal_long_net + equal_short_net,
        'equal_long_log_return_without_cost': equal_long,
        'equal_short_log_return_without_cost': equal_short,
        'equal_long_short_log_return_without_cost': equal_long + equal_short,
        'value_long_log_return_with_cost': value_long_net,
        'value_short_log_return_with_cost': value_short_net,
        'value_long_short_log_return_with_cost': value_long_net + value_short_net,
        'value_long_log_return_without_cost': value_long,
        'value_short_log_return_without_cost': value_short,
        'value_long_short_log_return_without_cost': value_long + value_short
    }

# Compute cumulative log returns for each date with daily rebalancing (lag 512).

# Output column -> key of the matching daily log return in compute_returns().
# The insertion order fixes the column order of the resulting DataFrame.
enet512_return_keys = {
    'cum_EL_return_512_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_512_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_512_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_512_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_512_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_512_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_512_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_512_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_512_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_512_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_512_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_512_without_cost': 'value_long_short_log_return_without_cost',
}

# One list per output column, filled with the running total after each date.
cumulative_log_returns_by_date_enet512 = {'date': []}
for column in enet512_return_keys:
    cumulative_log_returns_by_date_enet512[column] = []

# Running sums of daily log returns, all starting at 0.
running_totals_enet512 = dict.fromkeys(enet512_return_keys, 0)

# A single groupby pass replaces filtering the whole frame once per date.
# groupby sorts its keys, so the totals are accumulated in chronological
# order even if the frame itself is not sorted by date; rows with a missing
# date are skipped.
for date, group in crsp_test_lagged.groupby('date'):
    returns = compute_returns(group)

    cumulative_log_returns_by_date_enet512['date'].append(date)
    for column, return_key in enet512_return_keys.items():
        running_totals_enet512[column] += returns[return_key]
        cumulative_log_returns_by_date_enet512[column].append(running_totals_enet512[column])

# Keep the final running totals available under the original variable names.
cum_EL_return_512_with_cost = running_totals_enet512['cum_EL_return_512_with_cost']
cum_ES_return_512_with_cost = running_totals_enet512['cum_ES_return_512_with_cost']
cum_ELS_return_512_with_cost = running_totals_enet512['cum_ELS_return_512_with_cost']
cum_VL_return_512_with_cost = running_totals_enet512['cum_VL_return_512_with_cost']
cum_VS_return_512_with_cost = running_totals_enet512['cum_VS_return_512_with_cost']
cum_VLS_return_512_with_cost = running_totals_enet512['cum_VLS_return_512_with_cost']

cum_EL_return_512_without_cost = running_totals_enet512['cum_EL_return_512_without_cost']
cum_ES_return_512_without_cost = running_totals_enet512['cum_ES_return_512_without_cost']
cum_ELS_return_512_without_cost = running_totals_enet512['cum_ELS_return_512_without_cost']
cum_VL_return_512_without_cost = running_totals_enet512['cum_VL_return_512_without_cost']
cum_VS_return_512_without_cost = running_totals_enet512['cum_VS_return_512_without_cost']
cum_VLS_return_512_without_cost = running_totals_enet512['cum_VLS_return_512_without_cost']

# Convert to DataFrame for lag 512
cumulative_log_returns_enet_lag_512 = pd.DataFrame(cumulative_log_returns_by_date_enet512)

# Display the cumulative returns DataFrame for lag 512
display(cumulative_log_returns_enet_lag_512.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound a sequence of simple per-period returns into a total return."""
    growth_factors = 1 + daily_returns
    terminal_wealth = np.prod(growth_factors)
    return terminal_wealth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of ``daily_returns``.

    The total compounded return is rescaled geometrically from
    ``len(daily_returns)`` observations to ``periods`` observations per year.
    """
    total_return = np.prod(1 + daily_returns) - 1
    scaling_exponent = periods / len(daily_returns)
    annual_growth = (1 + total_return) ** scaling_exponent
    return annual_growth - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of ``daily_returns``.

    ``risk_free_rate`` is an annual rate; it is spread evenly over
    ``periods`` observations before being subtracted from each return.
    """
    per_period_rf = risk_free_rate / periods
    excess = daily_returns - per_period_rf
    mean_excess = excess.mean()
    dispersion = excess.std()
    return np.sqrt(periods) * mean_excess / dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    return np.std(daily_returns) * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded return path.

    The result is a non-positive fraction (e.g. -0.5 for a 50% drawdown).
    """
    wealth_path = np.cumprod(1 + daily_returns)
    running_peak = np.maximum.accumulate(wealth_path)
    relative_shortfall = (wealth_path - running_peak) / running_peak
    return np.min(relative_shortfall)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct the same absolute ``transaction_cost`` from every return."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by ``1 - transaction_cost_percentage``."""
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_512_with_cost', 'cum_ES_return_512_with_cost', 'cum_ELS_return_512_with_cost',
    'cum_VL_return_512_with_cost', 'cum_VS_return_512_with_cost', 'cum_VLS_return_512_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_enet512_c = cumulative_log_returns_enet_lag_512[portfolio].values

    # Recover the daily values from the running sum. The first cumulative
    # value already IS day one's return, so difference against a leading 0;
    # a plain np.diff would silently drop the first trading day.
    daily_returns = np.diff(cumulative_returns_enet512_c, prepend=0.0)

    # NOTE(review): the *_with_cost columns are built by compute_returns()
    # with the transaction cost already subtracted, so the deduction below
    # charges the cost a second time -- confirm this is intended.
    # NOTE(review): these are daily *log* returns, while the metric helpers
    # compound them as simple returns (np.prod(1 + r)); converting with
    # np.expm1 first may be what is wanted -- confirm.
    daily_returns_after_cost_fixed = apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001)

    # Percentage-scaled series, used only for the two dispersion measures.
    daily_returns_after_cost_percentage = apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001)

    # Level metrics use the fixed-cost series.
    cum_return_after_cost = cumulative_return(daily_returns_after_cost_fixed)
    ann_return = annualized_return(daily_returns_after_cost_fixed)
    sharpe = sharpe_ratio(daily_returns_after_cost_fixed)
    max_draw = maximum_drawdown(daily_returns_after_cost_fixed)

    # Dispersion metrics use the percentage-cost series.
    vol = calculate_volatility(daily_returns_after_cost_percentage)
    std_dev = np.std(daily_returns_after_cost_percentage)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_enet512_c = pd.DataFrame(metrics)
display(metrics_enet512_c)


# Now, the same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_512_without_cost', 'cum_ES_return_512_without_cost', 'cum_ELS_return_512_without_cost',
    'cum_VL_return_512_without_cost', 'cum_VS_return_512_without_cost', 'cum_VLS_return_512_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_enet512_wc = cumulative_log_returns_enet_lag_512[portfolio].values

    # Difference against a leading 0 so the first trading day is kept.
    daily_returns = np.diff(cumulative_returns_enet512_wc, prepend=0.0)

    # Gross-of-cost metrics, all computed on the same daily series.
    cum_return_without_cost = cumulative_return(daily_returns)
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_enet512_wc = pd.DataFrame(metrics_wc)
display(metrics_enet512_wc)

## PCR

In [None]:
# lag 5
# Run rolling_forecast_model with model_type='pcr' and lag=[5], then print a
# heading and show the returned result.
pcr_lag5 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='pcr', lag=[5])
print("PCR Results (Lag 5):")
display(pcr_lag5)

In [None]:
# lag 21
# Run rolling_forecast_model with model_type='pcr' and lag=[21], then print a
# heading and show the returned result. The heading names the lag actually
# used (it previously read "Lag 5", copied from the cell above).
pcr_lag21 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='pcr', lag=[21])
print("PCR Results (Lag 21):")
display(pcr_lag21)

In [None]:
# lag 252
# Run rolling_forecast_model with model_type='pcr' and lag=[252], then print a
# heading and show the returned result.
pcr_lag252 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='pcr', lag=[252])
print("PCR Results (Lag 252):")
display(pcr_lag252)

In [None]:
# lag 512
# Run rolling_forecast_model with model_type='pcr' and lag=[512], then print a
# heading and show the returned result. The heading names the lag actually
# used (it previously read "Lag 5", copied from an earlier cell).
pcr_lag512 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='pcr', lag=[512])
print("PCR Results (Lag 512):")
display(pcr_lag512)

## PCR Portfolio

### Lag 5

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat per-trade transaction cost.

    ``row`` is accepted only so the function can be handed to
    ``DataFrame.apply(..., axis=1)``; its contents are never inspected.
    """
    flat_cost = 0.001  # 10 bps, identical for small- and large-cap stocks
    return flat_cost

# The cost is the same constant for every row, so assign it once as a scalar
# (pandas broadcasts it down the column) instead of invoking the function per
# row through a slow `apply(axis=1)`. The row argument is ignored by
# calculate_transaction_cost, hence None.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `pcr` to predict excess returns
def pcr_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the PCR model for lag 5 and store its test predictions.

    Parameters
    ----------
    crsp_train_lagged : training frame, passed through to ``pcr_model``
        unchanged.
    crsp_test_lagged : test frame; a new column
        ``'pcr_5_predicted_excess_returns'`` is written into it in place.
    lags : list of lags handed to the model. Defaults to ``[5]`` (a fresh
        list per call, so no list object is shared between calls).

    Returns
    -------
    The same ``crsp_test_lagged`` object, with the new column added.
    """
    if lags is None:
        lags = [5]

    # The model's return value is not needed here.
    # NOTE(review): this relies on pcr_model writing a
    # 'predicted_excess_returns' column into crsp_test_lagged in place. If it
    # does not, the column read below is left over from an earlier model run
    # -- confirm against the model function.
    pcr_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Snapshot the predictions under a model-specific name so a later model
    # run overwriting 'predicted_excess_returns' cannot clobber them.
    crsp_test_lagged['pcr_5_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Run the lag-5 PCR wrapper. It returns the same test frame with the
# 'pcr_5_predicted_excess_returns' column added; compute_returns() below
# ranks stocks on that column, i.e. it is used as a score, not a binary label.
crsp_test_lagged = pcr_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Step 5: Portfolio Construction (Top 10% Long, Bottom 10% Short)
def _leg_log_returns(selection, sign):
    """Return (equal, equal_net, value, value_net) log returns for one leg.

    `sign` is +1 for a long leg and -1 for a short leg. The "net" figures
    subtract the mean of the 'transaction_cost' column after the sign is
    applied, so the cost always reduces the leg's return.
    """
    log_ret = np.log1p(selection['adjusted_ret'])
    caps = selection['market_cap_merged']
    cost = selection['transaction_cost'].mean()

    equal_weighted = sign * log_ret.mean()
    value_weighted = sign * ((log_ret * caps).sum() / caps.sum())
    return equal_weighted, equal_weighted - cost, value_weighted, value_weighted - cost


def compute_returns(group, prediction_col='pcr_5_predicted_excess_returns', quantile=0.1):
    """Compute one period's long, short and long-short portfolio log returns.

    Rows are ranked by `prediction_col`; the top `quantile` fraction forms the
    long leg and the bottom `quantile` fraction the short leg. Each leg is
    evaluated equal-weighted and weighted by 'market_cap_merged', with and
    without the mean 'transaction_cost' deducted.

    Args:
        group: DataFrame for a single rebalancing date. Must contain
            `prediction_col`, 'adjusted_ret', 'market_cap_merged' and
            'transaction_cost'.
        prediction_col: Column holding the predicted excess returns used for
            ranking.
        quantile: Fraction of rows placed in each leg.

    Returns:
        dict with the twelve keys
        ``{equal,value}_{long,short,long_short}_log_return_{with,without}_cost``.
    """
    # int() truncates, so a group with fewer than 1/quantile rows would select
    # nothing and yield NaN returns that poison every later cumulative sum.
    # Always keep at least one name per leg.
    n_select = max(1, int(quantile * len(group)))

    longs = group.nlargest(n_select, prediction_col)
    shorts = group.nsmallest(n_select, prediction_col)

    eq_long, eq_long_net, vw_long, vw_long_net = _leg_log_returns(longs, 1)
    eq_short, eq_short_net, vw_short, vw_short_net = _leg_log_returns(shorts, -1)

    # The short leg is already sign-flipped, so long-short is a plain sum.
    return {
        'equal_long_log_return_with_cost': eq_long_net,
        'equal_short_log_return_with_cost': eq_short_net,
        'equal_long_short_log_return_with_cost': eq_long_net + eq_short_net,
        'equal_long_log_return_without_cost': eq_long,
        'equal_short_log_return_without_cost': eq_short,
        'equal_long_short_log_return_without_cost': eq_long + eq_short,
        'value_long_log_return_with_cost': vw_long_net,
        'value_short_log_return_with_cost': vw_short_net,
        'value_long_short_log_return_with_cost': vw_long_net + vw_short_net,
        'value_long_log_return_without_cost': vw_long,
        'value_short_log_return_without_cost': vw_short,
        'value_long_short_log_return_without_cost': vw_long + vw_short
    }

# Compute cumulative returns for each date with daily rebalancing.
# Each output column is paired with the compute_returns key that feeds it;
# the order here fixes the column order of the resulting DataFrame.
pcr5_column_sources = {
    'cum_EL_return_5_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_5_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_5_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_5_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_5_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_5_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_5_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_5_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_5_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_5_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_5_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_5_without_cost': 'value_long_short_log_return_without_cost',
}

# One history list per output column, plus the date axis
cumulative_log_returns_by_date_pcr5 = {'date': []}
for column in pcr5_column_sources:
    cumulative_log_returns_by_date_pcr5[column] = []

# Running totals for lag 5, all starting at zero
pcr5_running_totals = dict.fromkeys(pcr5_column_sources, 0)

# Walk the dates in order of first appearance and accumulate the daily values
for date in crsp_test_lagged['date'].unique():
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]

    # Daily portfolio returns for lag 5
    returns = compute_returns(group)

    cumulative_log_returns_by_date_pcr5['date'].append(date)
    for column, source_key in pcr5_column_sources.items():
        pcr5_running_totals[column] += returns[source_key]
        cumulative_log_returns_by_date_pcr5[column].append(pcr5_running_totals[column])

# Expose the final totals under their individual module-level names
(
    cum_EL_return_5_with_cost,
    cum_ES_return_5_with_cost,
    cum_ELS_return_5_with_cost,
    cum_VL_return_5_with_cost,
    cum_VS_return_5_with_cost,
    cum_VLS_return_5_with_cost,
    cum_EL_return_5_without_cost,
    cum_ES_return_5_without_cost,
    cum_ELS_return_5_without_cost,
    cum_VL_return_5_without_cost,
    cum_VS_return_5_without_cost,
    cum_VLS_return_5_without_cost,
) = pcr5_running_totals.values()

# Convert to DataFrame for lag 5
cumulative_log_returns_pcr_lag_5 = pd.DataFrame(cumulative_log_returns_by_date_pcr5)

# Display the cumulative returns DataFrame for lag 5
display(cumulative_log_returns_pcr_lag_5.head())

#### With/Without Transaction Cost

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound a series of periodic returns into one total return.

    Each return r contributes a growth factor (1 + r); the product of the
    factors minus 1 is the total return over the whole series.
    """
    growth_factors = 1 + daily_returns
    return np.prod(growth_factors) - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Scale the compounded return of the series to a `periods`-long year.

    The total compounded return is raised to ``periods / len(daily_returns)``
    so that a series of exactly `periods` observations is returned unchanged.
    """
    total_return = np.prod(1 + daily_returns) - 1
    exponent = periods / len(daily_returns)
    return (1 + total_return) ** exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized Sharpe ratio of a series of periodic returns.

    The annual `risk_free_rate` is spread evenly over `periods` observations,
    subtracted from every return, and the mean/std ratio of the result is
    scaled by sqrt(periods).
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    excess = daily_returns - per_period_rf
    annualization = np.sqrt(periods)
    return annualization * excess.mean() / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Annualize the standard deviation of the returns by sqrt(periods)."""
    per_period_std = np.std(daily_returns)
    return per_period_std * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough decline of the compounded path.

    The wealth curve is anchored at 1.0 (the capital before the first return)
    so that a loss on the very first day is measured against the starting
    capital instead of being treated as the first peak.

    Args:
        daily_returns: 1-D array-like of simple periodic returns.

    Returns:
        The most negative value of ``(wealth - running_peak) / running_peak``.
        ``0.0`` when the path never falls below an earlier peak, including
        for empty input.
    """
    growth = np.cumprod(1 + np.asarray(daily_returns, dtype=float))
    # Prepend the starting capital so the initial level counts as a peak
    wealth = np.concatenate(([1.0], growth))
    peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - peak) / peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct the same absolute `transaction_cost` from every return."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by ``1 - transaction_cost_percentage``."""
    retained_fraction = 1 - transaction_cost_percentage
    return daily_returns * retained_fraction

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_5_with_cost', 'cum_ES_return_5_with_cost', 'cum_ELS_return_5_with_cost',
    'cum_VL_return_5_with_cost', 'cum_VS_return_5_with_cost', 'cum_VLS_return_5_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_pcr5_c = cumulative_log_returns_pcr_lag_5[portfolio].values

    # The running totals start from 0, so the first cumulative value IS the
    # first day's return. Prepending 0 recovers every daily return; a plain
    # np.diff would silently drop the first day.
    daily_returns = np.diff(cumulative_returns_pcr5_c, prepend=0.0)

    # The *_with_cost series are already net of the 10 bps charge deducted in
    # compute_returns, so no further cost is applied here (subtracting it
    # again would charge it twice). All metrics use the same series so that
    # return, Sharpe and volatility figures are mutually consistent.
    # NOTE(review): these are differences of cumulative *log* returns, while
    # the helpers compound them as simple returns (1 + r). Confirm whether a
    # conversion such as np.expm1 is intended.
    cum_return_after_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard deviation of the (net-of-cost) daily returns
    std_dev = np.std(daily_returns)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_pcr5_c = pd.DataFrame(metrics)
display(metrics_pcr5_c)


# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_5_without_cost', 'cum_ES_return_5_without_cost', 'cum_ELS_return_5_without_cost',
    'cum_VL_return_5_without_cost', 'cum_VS_return_5_without_cost', 'cum_VLS_return_5_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_pcr5_wc = cumulative_log_returns_pcr_lag_5[portfolio].values

    # The running totals start from 0, so the first cumulative value IS the
    # first day's return. Prepending 0 recovers every daily return; a plain
    # np.diff would silently drop the first day.
    # NOTE(review): these are differences of cumulative *log* returns, while
    # the helpers compound them as simple returns (1 + r). Confirm whether a
    # conversion such as np.expm1 is intended.
    daily_returns = np.diff(cumulative_returns_pcr5_wc, prepend=0.0)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_pcr5_wc = pd.DataFrame(metrics_wc)
display(metrics_pcr5_wc)

### Lag 21

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost rate.

    `row` is accepted so the function can be used with a row-wise
    DataFrame.apply, but it is not inspected: the same rate is returned
    for every input.
    """
    flat_cost_rate = 0.001  # 10 bps for both small and large cap stocks
    return flat_cost_rate

# calculate_transaction_cost ignores its argument and returns one constant, so
# broadcast that scalar to the whole column instead of calling the function
# once per row through DataFrame.apply(axis=1).
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `pcr_model` to predict excess returns
def pcr_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the PCR model and store its predictions in a lag-21 column.

    Calls ``pcr_model(crsp_train_lagged, crsp_test_lagged, lags)`` and then
    copies ``crsp_test_lagged['predicted_excess_returns']`` into the new
    column ``'pcr_21_predicted_excess_returns'``.

    Args:
        crsp_train_lagged: Training frame passed through to ``pcr_model``.
        crsp_test_lagged: Test frame; modified in place and returned.
        lags: Lag list passed to ``pcr_model``. Defaults to ``[21]``.

    Returns:
        The same ``crsp_test_lagged`` object with the extra column added.

    Raises:
        KeyError: If ``'predicted_excess_returns'`` is not a column of
            ``crsp_test_lagged`` after the model call.
    """
    # Build the default per call rather than sharing one mutable list object
    # across every invocation.
    if lags is None:
        lags = [21]

    # The return value is not used here; only the test frame is read afterwards.
    # NOTE(review): this assumes pcr_model writes 'predicted_excess_returns'
    # into crsp_test_lagged in place. If it does not, the column read below is
    # whatever an earlier model left behind. Confirm against pcr_model.
    pcr_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Snapshot the predictions under a lag-specific name so later model runs
    # that reuse 'predicted_excess_returns' do not overwrite them.
    crsp_test_lagged['pcr_21_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Get predicted excess returns using PCR model
crsp_test_lagged = pcr_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def _leg_log_returns(selection, sign):
    """Return (equal, equal_net, value, value_net) log returns for one leg.

    `sign` is +1 for a long leg and -1 for a short leg. The "net" figures
    subtract the mean of the 'transaction_cost' column after the sign is
    applied, so the cost always reduces the leg's return.
    """
    log_ret = np.log1p(selection['adjusted_ret'])
    caps = selection['market_cap_merged']
    cost = selection['transaction_cost'].mean()

    equal_weighted = sign * log_ret.mean()
    value_weighted = sign * ((log_ret * caps).sum() / caps.sum())
    return equal_weighted, equal_weighted - cost, value_weighted, value_weighted - cost


def compute_returns(group, prediction_col='pcr_21_predicted_excess_returns', quantile=0.1):
    """Compute one period's long, short and long-short portfolio log returns.

    Rows are ranked by `prediction_col`; the top `quantile` fraction forms the
    long leg and the bottom `quantile` fraction the short leg. Each leg is
    evaluated equal-weighted and weighted by 'market_cap_merged', with and
    without the mean 'transaction_cost' deducted.

    Args:
        group: DataFrame for a single rebalancing date. Must contain
            `prediction_col`, 'adjusted_ret', 'market_cap_merged' and
            'transaction_cost'.
        prediction_col: Column holding the predicted excess returns used for
            ranking.
        quantile: Fraction of rows placed in each leg.

    Returns:
        dict with the twelve keys
        ``{equal,value}_{long,short,long_short}_log_return_{with,without}_cost``.
    """
    # int() truncates, so a group with fewer than 1/quantile rows would select
    # nothing and yield NaN returns that poison every later cumulative sum.
    # Always keep at least one name per leg.
    n_select = max(1, int(quantile * len(group)))

    longs = group.nlargest(n_select, prediction_col)
    shorts = group.nsmallest(n_select, prediction_col)

    eq_long, eq_long_net, vw_long, vw_long_net = _leg_log_returns(longs, 1)
    eq_short, eq_short_net, vw_short, vw_short_net = _leg_log_returns(shorts, -1)

    # The short leg is already sign-flipped, so long-short is a plain sum.
    return {
        'equal_long_log_return_with_cost': eq_long_net,
        'equal_short_log_return_with_cost': eq_short_net,
        'equal_long_short_log_return_with_cost': eq_long_net + eq_short_net,
        'equal_long_log_return_without_cost': eq_long,
        'equal_short_log_return_without_cost': eq_short,
        'equal_long_short_log_return_without_cost': eq_long + eq_short,
        'value_long_log_return_with_cost': vw_long_net,
        'value_short_log_return_with_cost': vw_short_net,
        'value_long_short_log_return_with_cost': vw_long_net + vw_short_net,
        'value_long_log_return_without_cost': vw_long,
        'value_short_log_return_without_cost': vw_short,
        'value_long_short_log_return_without_cost': vw_long + vw_short
    }

# Compute cumulative returns for each date with daily rebalancing.
# Each output column is paired with the compute_returns key that feeds it;
# the order here fixes the column order of the resulting DataFrame.
pcr21_column_sources = {
    'cum_EL_return_21_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_21_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_21_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_21_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_21_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_21_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_21_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_21_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_21_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_21_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_21_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_21_without_cost': 'value_long_short_log_return_without_cost',
}

# One history list per output column, plus the date axis
cumulative_log_returns_by_date_pcr21 = {'date': []}
for column in pcr21_column_sources:
    cumulative_log_returns_by_date_pcr21[column] = []

# Running totals for lag 21, all starting at zero
pcr21_running_totals = dict.fromkeys(pcr21_column_sources, 0)

# Walk the dates in order of first appearance and accumulate the daily values
for date in crsp_test_lagged['date'].unique():
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]

    # Daily portfolio returns for lag 21
    returns = compute_returns(group)

    cumulative_log_returns_by_date_pcr21['date'].append(date)
    for column, source_key in pcr21_column_sources.items():
        pcr21_running_totals[column] += returns[source_key]
        cumulative_log_returns_by_date_pcr21[column].append(pcr21_running_totals[column])

# Expose the final totals under their individual module-level names
(
    cum_EL_return_21_with_cost,
    cum_ES_return_21_with_cost,
    cum_ELS_return_21_with_cost,
    cum_VL_return_21_with_cost,
    cum_VS_return_21_with_cost,
    cum_VLS_return_21_with_cost,
    cum_EL_return_21_without_cost,
    cum_ES_return_21_without_cost,
    cum_ELS_return_21_without_cost,
    cum_VL_return_21_without_cost,
    cum_VS_return_21_without_cost,
    cum_VLS_return_21_without_cost,
) = pcr21_running_totals.values()

# Convert to DataFrame for lag 21
cumulative_log_returns_pcr_lag_21 = pd.DataFrame(cumulative_log_returns_by_date_pcr21)

# Display the cumulative returns DataFrame for lag 21
display(cumulative_log_returns_pcr_lag_21.head())

#### With/Without Transaction Cost

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound a series of periodic returns into one total return.

    Each return r contributes a growth factor (1 + r); the product of the
    factors minus 1 is the total return over the whole series.
    """
    growth_factors = 1 + daily_returns
    return np.prod(growth_factors) - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Scale the compounded return of the series to a `periods`-long year.

    The total compounded return is raised to ``periods / len(daily_returns)``
    so that a series of exactly `periods` observations is returned unchanged.
    """
    total_return = np.prod(1 + daily_returns) - 1
    exponent = periods / len(daily_returns)
    return (1 + total_return) ** exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized Sharpe ratio of a series of periodic returns.

    The annual `risk_free_rate` is spread evenly over `periods` observations,
    subtracted from every return, and the mean/std ratio of the result is
    scaled by sqrt(periods).
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    excess = daily_returns - per_period_rf
    annualization = np.sqrt(periods)
    return annualization * excess.mean() / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Annualize the standard deviation of the returns by sqrt(periods)."""
    per_period_std = np.std(daily_returns)
    return per_period_std * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough decline of the compounded path.

    The wealth curve is anchored at 1.0 (the capital before the first return)
    so that a loss on the very first day is measured against the starting
    capital instead of being treated as the first peak.

    Args:
        daily_returns: 1-D array-like of simple periodic returns.

    Returns:
        The most negative value of ``(wealth - running_peak) / running_peak``.
        ``0.0`` when the path never falls below an earlier peak, including
        for empty input.
    """
    growth = np.cumprod(1 + np.asarray(daily_returns, dtype=float))
    # Prepend the starting capital so the initial level counts as a peak
    wealth = np.concatenate(([1.0], growth))
    peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - peak) / peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns (for other metrics)
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct the same absolute `transaction_cost` from every return."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns (for volatility and standard deviation)
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by ``1 - transaction_cost_percentage``."""
    retained_fraction = 1 - transaction_cost_percentage
    return daily_returns * retained_fraction

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_21_with_cost', 'cum_ES_return_21_with_cost', 'cum_ELS_return_21_with_cost',
    'cum_VL_return_21_with_cost', 'cum_VS_return_21_with_cost', 'cum_VLS_return_21_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_pcr21_c = cumulative_log_returns_pcr_lag_21[portfolio].values

    # The running totals start from 0, so the first cumulative value IS the
    # first day's return. Prepending 0 recovers every daily return; a plain
    # np.diff would silently drop the first day.
    daily_returns = np.diff(cumulative_returns_pcr21_c, prepend=0.0)

    # The *_with_cost series are already net of the 10 bps charge deducted in
    # compute_returns, so no further cost is applied here (subtracting it
    # again would charge it twice). All metrics use the same series so that
    # return, Sharpe and volatility figures are mutually consistent.
    # NOTE(review): these are differences of cumulative *log* returns, while
    # the helpers compound them as simple returns (1 + r). Confirm whether a
    # conversion such as np.expm1 is intended.
    cum_return_after_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard deviation of the (net-of-cost) daily returns
    std_dev = np.std(daily_returns)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_pcr21_c = pd.DataFrame(metrics)
display(metrics_pcr21_c)


# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_21_without_cost', 'cum_ES_return_21_without_cost', 'cum_ELS_return_21_without_cost',
    'cum_VL_return_21_without_cost', 'cum_VS_return_21_without_cost', 'cum_VLS_return_21_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_pcr21_wc = cumulative_log_returns_pcr_lag_21[portfolio].values

    # The running totals start from 0, so the first cumulative value IS the
    # first day's return. Prepending 0 recovers every daily return; a plain
    # np.diff would silently drop the first day.
    # NOTE(review): these are differences of cumulative *log* returns, while
    # the helpers compound them as simple returns (1 + r). Confirm whether a
    # conversion such as np.expm1 is intended.
    daily_returns = np.diff(cumulative_returns_pcr21_wc, prepend=0.0)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_pcr21_wc = pd.DataFrame(metrics_wc)
display(metrics_pcr21_wc)

### Lag 252

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost rate.

    `row` is accepted so the function can be used with a row-wise
    DataFrame.apply, but it is not inspected: the same rate is returned
    for every input.
    """
    flat_cost_rate = 0.001  # 10 bps for both small and large cap stocks
    return flat_cost_rate

# calculate_transaction_cost ignores its argument and returns one constant, so
# broadcast that scalar to the whole column instead of calling the function
# once per row through DataFrame.apply(axis=1).
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `pcr_model` to predict excess returns
def pcr_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the PCR model and store its predictions in a lag-252 column.

    Calls ``pcr_model(crsp_train_lagged, crsp_test_lagged, lags)`` and then
    copies ``crsp_test_lagged['predicted_excess_returns']`` into the new
    column ``'pcr_252_predicted_excess_returns'``.

    Args:
        crsp_train_lagged: Training frame passed through to ``pcr_model``.
        crsp_test_lagged: Test frame; modified in place and returned.
        lags: Lag list passed to ``pcr_model``. Defaults to ``[252]``.

    Returns:
        The same ``crsp_test_lagged`` object with the extra column added.

    Raises:
        KeyError: If ``'predicted_excess_returns'`` is not a column of
            ``crsp_test_lagged`` after the model call.
    """
    # Build the default per call rather than sharing one mutable list object
    # across every invocation.
    if lags is None:
        lags = [252]

    # The return value is not used here; only the test frame is read afterwards.
    # NOTE(review): this assumes pcr_model writes 'predicted_excess_returns'
    # into crsp_test_lagged in place. If it does not, the column read below is
    # whatever an earlier model left behind. Confirm against pcr_model.
    pcr_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Snapshot the predictions under a lag-specific name so later model runs
    # that reuse 'predicted_excess_returns' do not overwrite them.
    crsp_test_lagged['pcr_252_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Get predicted excess returns using the PCR model (used below to rank stocks)
crsp_test_lagged = pcr_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, prediction_col='pcr_252_predicted_excess_returns', fraction=0.1):
    """Compute one cross-section's long, short and long-short log returns.

    Stocks are ranked on ``prediction_col``; the top ``fraction`` form the
    long leg and the bottom ``fraction`` form the short leg. Each leg is
    evaluated equal-weighted and value-weighted (by ``market_cap_merged``),
    with and without the mean ``transaction_cost`` of the leg deducted.

    Args:
        group: DataFrame with columns ``prediction_col``, ``adjusted_ret``,
            ``market_cap_merged`` and ``transaction_cost``.
        prediction_col: Column holding the predicted excess returns.
        fraction: Share of the rows placed in each leg.

    Returns:
        dict with the twelve ``*_log_return_with_cost`` /
        ``*_log_return_without_cost`` entries. Short-leg values are the
        negated log return of the shorted stocks; long-short values are the
        sum of the long and short entries.
    """
    # Hold at least one stock per leg: int(fraction * len) is 0 for small
    # groups, which would yield NaN and poison every later cumulative sum.
    leg_size = max(1, int(fraction * len(group)))
    long_leg = group.nlargest(leg_size, prediction_col)
    short_leg = group.nsmallest(leg_size, prediction_col)

    def _leg_stats(leg):
        # Equal-weighted log return, cap-weighted log return, mean cost
        log_ret = np.log1p(leg['adjusted_ret'])
        caps = leg['market_cap_merged']
        return log_ret.mean(), (log_ret * caps).sum() / caps.sum(), leg['transaction_cost'].mean()

    long_equal, long_value, long_cost = _leg_stats(long_leg)
    short_equal, short_value, short_cost = _leg_stats(short_leg)

    # A short position earns the negative of the stocks' return; cost is
    # always a deduction.
    equal_long_with = long_equal - long_cost
    equal_short_with = -short_equal - short_cost
    equal_long_without = long_equal
    equal_short_without = -short_equal

    value_long_with = long_value - long_cost
    value_short_with = -(short_value + short_cost)
    value_long_without = long_value
    value_short_without = -short_value

    return {
        'equal_long_log_return_with_cost': equal_long_with,
        'equal_short_log_return_with_cost': equal_short_with,
        'equal_long_short_log_return_with_cost': equal_long_with + equal_short_with,
        'equal_long_log_return_without_cost': equal_long_without,
        'equal_short_log_return_without_cost': equal_short_without,
        'equal_long_short_log_return_without_cost': equal_long_without + equal_short_without,
        'value_long_log_return_with_cost': value_long_with,
        'value_short_log_return_with_cost': value_short_with,
        'value_long_short_log_return_with_cost': value_long_with + value_short_with,
        'value_long_log_return_without_cost': value_long_without,
        'value_short_log_return_without_cost': value_short_without,
        'value_long_short_log_return_without_cost': value_long_without + value_short_without
    }

# Running (cumulative) log returns per date with daily rebalancing, lag 252.
# Each key returned by compute_returns feeds one cumulative column; the
# mapping is listed in the column order of the final DataFrame.
_pcr252_column_for_key = {
    'equal_long_log_return_with_cost': 'cum_EL_return_252_with_cost',
    'equal_short_log_return_with_cost': 'cum_ES_return_252_with_cost',
    'equal_long_short_log_return_with_cost': 'cum_ELS_return_252_with_cost',
    'value_long_log_return_with_cost': 'cum_VL_return_252_with_cost',
    'value_short_log_return_with_cost': 'cum_VS_return_252_with_cost',
    'value_long_short_log_return_with_cost': 'cum_VLS_return_252_with_cost',
    'equal_long_log_return_without_cost': 'cum_EL_return_252_without_cost',
    'equal_short_log_return_without_cost': 'cum_ES_return_252_without_cost',
    'equal_long_short_log_return_without_cost': 'cum_ELS_return_252_without_cost',
    'value_long_log_return_without_cost': 'cum_VL_return_252_without_cost',
    'value_short_log_return_without_cost': 'cum_VS_return_252_without_cost',
    'value_long_short_log_return_without_cost': 'cum_VLS_return_252_without_cost',
}

# One list per output column, filled one entry per date
cumulative_log_returns_by_date_pcr252 = {'date': []}
for _column in _pcr252_column_for_key.values():
    cumulative_log_returns_by_date_pcr252[_column] = []

# Running totals, all starting from 0
_running_pcr252 = dict.fromkeys(_pcr252_column_for_key.values(), 0)

# Walk the dates in order of first appearance and add each day's returns
for date in crsp_test_lagged['date'].unique():
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]
    returns = compute_returns(group)

    cumulative_log_returns_by_date_pcr252['date'].append(date)
    for _key, _column in _pcr252_column_for_key.items():
        _running_pcr252[_column] += returns[_key]
        cumulative_log_returns_by_date_pcr252[_column].append(_running_pcr252[_column])

# Expose the final totals under their individual names
(
    cum_EL_return_252_with_cost,
    cum_ES_return_252_with_cost,
    cum_ELS_return_252_with_cost,
    cum_VL_return_252_with_cost,
    cum_VS_return_252_with_cost,
    cum_VLS_return_252_with_cost,
    cum_EL_return_252_without_cost,
    cum_ES_return_252_without_cost,
    cum_ELS_return_252_without_cost,
    cum_VL_return_252_without_cost,
    cum_VS_return_252_without_cost,
    cum_VLS_return_252_without_cost,
) = (_running_pcr252[_column] for _column in _pcr252_column_for_key.values())

# Tabulate and preview the lag-252 cumulative returns
cumulative_log_returns_pcr_lag_252 = pd.DataFrame(cumulative_log_returns_by_date_pcr252)
display(cumulative_log_returns_pcr_lag_252.head())

#### With/Without Transaction Cost

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period simple returns into one total return.

    Args:
        daily_returns: Array-like supporting ``1 + daily_returns``.

    Returns:
        Product of ``(1 + r)`` over all periods, minus 1.
    """
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of a per-period return series.

    Args:
        daily_returns: Array-like of simple returns supporting
            ``1 + daily_returns`` and ``len``.
        periods: Number of periods per year.

    Returns:
        ``(1 + total_return) ** (periods / n) - 1`` where ``n`` is the number
        of observations.

    Raises:
        ZeroDivisionError: If ``daily_returns`` is empty.
    """
    total_return = np.prod(1 + daily_returns) - 1
    years_exponent = periods / len(daily_returns)
    return (1 + total_return) ** years_exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of a per-period return series.

    Args:
        daily_returns: Array or Series of per-period returns.
        risk_free_rate: Annual risk-free rate; divided evenly over ``periods``.
        periods: Number of periods per year.

    Returns:
        ``sqrt(periods) * mean(excess) / std(excess)``. The standard deviation
        is the input object's own ``.std()``, so its default ``ddof`` applies
        (0 for NumPy arrays, 1 for pandas Series).
    """
    per_period_rf = risk_free_rate / periods
    excess = daily_returns - per_period_rf
    annualizer = np.sqrt(periods)
    return annualizer * excess.mean() / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    return np.std(daily_returns) * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough decline of the compounded wealth.

    The wealth curve starts at 1.0 before the first return, so a loss that
    begins on the very first day is measured against the initial capital
    instead of being ignored.

    Args:
        daily_returns: Array-like of per-period simple returns.

    Returns:
        The minimum of ``(wealth - running_peak) / running_peak``: a
        non-positive number, 0.0 when the curve never falls below a previous
        peak (including for empty input).
    """
    returns = np.asarray(daily_returns, dtype=float)
    # Prepend the starting wealth so the first peak is the initial capital
    wealth = np.concatenate(([1.0], np.cumprod(1 + returns)))
    running_peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct a constant cost from every per-period return.

    Args:
        daily_returns: Array-like of per-period returns.
        transaction_cost: Amount subtracted from each return.

    Returns:
        ``daily_returns - transaction_cost``.
    """
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every per-period return down by a proportional cost.

    Args:
        daily_returns: Array-like of per-period returns.
        transaction_cost_percentage: Share of each return removed as cost.

    Returns:
        ``daily_returns * (1 - transaction_cost_percentage)``.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_252_with_cost', 'cum_ES_return_252_with_cost', 'cum_ELS_return_252_with_cost',
    'cum_VL_return_252_with_cost', 'cum_VS_return_252_with_cost', 'cum_VLS_return_252_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost.
# NOTE(review): other lag sections of this notebook repeat the original
# pattern; apply the same corrections there to keep the tables comparable.
for portfolio in portfolios_with_cost:
    cumulative_returns_pcr252_c = cumulative_log_returns_pcr_lag_252[portfolio].values

    # The running totals start from 0, so prepending 0 recovers every daily
    # log return, including the first day's (a plain np.diff drops it).
    daily_log_returns = np.diff(cumulative_returns_pcr252_c, prepend=0.0)

    # The metric helpers compound with prod(1 + r), i.e. they expect simple
    # returns, so convert the log returns first.
    daily_returns = np.expm1(daily_log_returns)

    # The *_with_cost series already had the transaction cost deducted when
    # the daily portfolio returns were computed; subtracting it again here
    # would count the cost twice, so the returns are used as they are.
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(annualized_return(daily_returns))
    metrics['Sharpe Ratio'].append(sharpe_ratio(daily_returns))
    metrics['Volatility'].append(calculate_volatility(daily_returns))
    metrics['Standard Deviation'].append(np.std(daily_returns))
    metrics['Max Drawdown'].append(maximum_drawdown(daily_returns))
    metrics['Cumulative Return'].append(cumulative_return(daily_returns))

# Convert the results into a DataFrame for analysis
metrics_pcr252_c = pd.DataFrame(metrics)
display(metrics_pcr252_c)


# Same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_252_without_cost', 'cum_ES_return_252_without_cost', 'cum_ELS_return_252_without_cost',
    'cum_VL_return_252_without_cost', 'cum_VS_return_252_without_cost', 'cum_VLS_return_252_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost.
# NOTE(review): other lag sections of this notebook repeat the original
# pattern; apply the same corrections there to keep the tables comparable.
for portfolio in portfolios_without_cost:
    cumulative_returns_pcr252_wc = cumulative_log_returns_pcr_lag_252[portfolio].values

    # The running totals start from 0, so prepending 0 recovers every daily
    # log return, including the first day's (a plain np.diff drops it).
    daily_log_returns = np.diff(cumulative_returns_pcr252_wc, prepend=0.0)

    # The metric helpers compound with prod(1 + r), i.e. they expect simple
    # returns, so convert the log returns first.
    daily_returns = np.expm1(daily_log_returns)

    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(annualized_return(daily_returns))
    metrics_wc['Sharpe Ratio'].append(sharpe_ratio(daily_returns))
    metrics_wc['Volatility'].append(calculate_volatility(daily_returns))
    metrics_wc['Standard Deviation'].append(np.std(daily_returns))
    metrics_wc['Max Drawdown'].append(maximum_drawdown(daily_returns))
    metrics_wc['Cumulative Return'].append(cumulative_return(daily_returns))

# Convert the results into a DataFrame for analysis
metrics_pcr252_wc = pd.DataFrame(metrics_wc)
display(metrics_pcr252_wc)

### Lag 512

In [None]:
# Flat transaction cost of 10 bps, written as a decimal (0.001)
def calculate_transaction_cost(row):
    """Return the transaction cost charged for one row of the test frame.

    The cost is the same for every stock, so ``row`` is accepted only to fit
    the ``DataFrame.apply(..., axis=1)`` calling convention and is never read.
    """
    flat_cost = 0.001
    return flat_cost

# The cost does not depend on the row, so broadcast the scalar to the whole
# column instead of running a Python-level row-wise apply over the frame.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `pcr_model` to predict excess returns
def pcr_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the PCR model for the 512 lag and store its predictions.

    Args:
        crsp_train_lagged: Training frame, passed through to ``pcr_model``.
        crsp_test_lagged: Test frame, passed through to ``pcr_model``. It
            gains a ``pcr_512_predicted_excess_returns`` column.
        lags: Lags forwarded to ``pcr_model``. Defaults to ``[512]``; a fresh
            list is built per call so no default is shared between calls.

    Returns:
        The same ``crsp_test_lagged`` object with the new column added.
    """
    if lags is None:
        lags = [512]

    # NOTE(review): this relies on pcr_model writing 'predicted_excess_returns'
    # into crsp_test_lagged in place; its return value is not needed here.
    # Confirm against pcr_model.
    pcr_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Keep a lag-specific copy so later model runs cannot overwrite it
    crsp_test_lagged['pcr_512_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Get predicted excess returns using the PCR model
crsp_test_lagged = pcr_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, prediction_col='pcr_512_predicted_excess_returns', fraction=0.1):
    """Compute one cross-section's long, short and long-short log returns.

    Stocks are ranked on ``prediction_col``; the top ``fraction`` form the
    long leg and the bottom ``fraction`` form the short leg. Each leg is
    evaluated equal-weighted and value-weighted (by ``market_cap_merged``),
    with and without the mean ``transaction_cost`` of the leg deducted.

    Args:
        group: DataFrame with columns ``prediction_col``, ``adjusted_ret``,
            ``market_cap_merged`` and ``transaction_cost``.
        prediction_col: Column holding the predicted excess returns.
        fraction: Share of the rows placed in each leg.

    Returns:
        dict with the twelve ``*_log_return_with_cost`` /
        ``*_log_return_without_cost`` entries. Short-leg values are the
        negated log return of the shorted stocks; long-short values are the
        sum of the long and short entries.
    """
    # Hold at least one stock per leg: int(fraction * len) is 0 for small
    # groups, which would yield NaN and poison every later cumulative sum.
    leg_size = max(1, int(fraction * len(group)))
    long_leg = group.nlargest(leg_size, prediction_col)
    short_leg = group.nsmallest(leg_size, prediction_col)

    def _leg_stats(leg):
        # Equal-weighted log return, cap-weighted log return, mean cost
        log_ret = np.log1p(leg['adjusted_ret'])
        caps = leg['market_cap_merged']
        return log_ret.mean(), (log_ret * caps).sum() / caps.sum(), leg['transaction_cost'].mean()

    long_equal, long_value, long_cost = _leg_stats(long_leg)
    short_equal, short_value, short_cost = _leg_stats(short_leg)

    # A short position earns the negative of the stocks' return; cost is
    # always a deduction.
    equal_long_with = long_equal - long_cost
    equal_short_with = -short_equal - short_cost
    equal_long_without = long_equal
    equal_short_without = -short_equal

    value_long_with = long_value - long_cost
    value_short_with = -(short_value + short_cost)
    value_long_without = long_value
    value_short_without = -short_value

    return {
        'equal_long_log_return_with_cost': equal_long_with,
        'equal_short_log_return_with_cost': equal_short_with,
        'equal_long_short_log_return_with_cost': equal_long_with + equal_short_with,
        'equal_long_log_return_without_cost': equal_long_without,
        'equal_short_log_return_without_cost': equal_short_without,
        'equal_long_short_log_return_without_cost': equal_long_without + equal_short_without,
        'value_long_log_return_with_cost': value_long_with,
        'value_short_log_return_with_cost': value_short_with,
        'value_long_short_log_return_with_cost': value_long_with + value_short_with,
        'value_long_log_return_without_cost': value_long_without,
        'value_short_log_return_without_cost': value_short_without,
        'value_long_short_log_return_without_cost': value_long_without + value_short_without
    }

# Running (cumulative) log returns per date with daily rebalancing, lag 512.
# Each key returned by compute_returns feeds one cumulative column; the
# mapping is listed in the column order of the final DataFrame.
_pcr512_column_for_key = {
    'equal_long_log_return_with_cost': 'cum_EL_return_512_with_cost',
    'equal_short_log_return_with_cost': 'cum_ES_return_512_with_cost',
    'equal_long_short_log_return_with_cost': 'cum_ELS_return_512_with_cost',
    'value_long_log_return_with_cost': 'cum_VL_return_512_with_cost',
    'value_short_log_return_with_cost': 'cum_VS_return_512_with_cost',
    'value_long_short_log_return_with_cost': 'cum_VLS_return_512_with_cost',
    'equal_long_log_return_without_cost': 'cum_EL_return_512_without_cost',
    'equal_short_log_return_without_cost': 'cum_ES_return_512_without_cost',
    'equal_long_short_log_return_without_cost': 'cum_ELS_return_512_without_cost',
    'value_long_log_return_without_cost': 'cum_VL_return_512_without_cost',
    'value_short_log_return_without_cost': 'cum_VS_return_512_without_cost',
    'value_long_short_log_return_without_cost': 'cum_VLS_return_512_without_cost',
}

# One list per output column, filled one entry per date
cumulative_log_returns_by_date_pcr512 = {'date': []}
for _column in _pcr512_column_for_key.values():
    cumulative_log_returns_by_date_pcr512[_column] = []

# Running totals, all starting from 0
_running_pcr512 = dict.fromkeys(_pcr512_column_for_key.values(), 0)

# Walk the dates in order of first appearance and add each day's lag-512 returns
for date in crsp_test_lagged['date'].unique():
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]
    returns = compute_returns(group)

    cumulative_log_returns_by_date_pcr512['date'].append(date)
    for _key, _column in _pcr512_column_for_key.items():
        _running_pcr512[_column] += returns[_key]
        cumulative_log_returns_by_date_pcr512[_column].append(_running_pcr512[_column])

# Expose the final totals under their individual names
(
    cum_EL_return_512_with_cost,
    cum_ES_return_512_with_cost,
    cum_ELS_return_512_with_cost,
    cum_VL_return_512_with_cost,
    cum_VS_return_512_with_cost,
    cum_VLS_return_512_with_cost,
    cum_EL_return_512_without_cost,
    cum_ES_return_512_without_cost,
    cum_ELS_return_512_without_cost,
    cum_VL_return_512_without_cost,
    cum_VS_return_512_without_cost,
    cum_VLS_return_512_without_cost,
) = (_running_pcr512[_column] for _column in _pcr512_column_for_key.values())

# Tabulate and preview the lag-512 cumulative returns
cumulative_log_returns_pcr_lag_512 = pd.DataFrame(cumulative_log_returns_by_date_pcr512)
display(cumulative_log_returns_pcr_lag_512.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period simple returns into one total return.

    Args:
        daily_returns: Array-like supporting ``1 + daily_returns``.

    Returns:
        Product of ``(1 + r)`` over all periods, minus 1.
    """
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of a per-period return series.

    Args:
        daily_returns: Array-like of simple returns supporting
            ``1 + daily_returns`` and ``len``.
        periods: Number of periods per year.

    Returns:
        ``(1 + total_return) ** (periods / n) - 1`` where ``n`` is the number
        of observations.

    Raises:
        ZeroDivisionError: If ``daily_returns`` is empty.
    """
    total_return = np.prod(1 + daily_returns) - 1
    years_exponent = periods / len(daily_returns)
    return (1 + total_return) ** years_exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of a per-period return series.

    Args:
        daily_returns: Array or Series of per-period returns.
        risk_free_rate: Annual risk-free rate; divided evenly over ``periods``.
        periods: Number of periods per year.

    Returns:
        ``sqrt(periods) * mean(excess) / std(excess)``. The standard deviation
        is the input object's own ``.std()``, so its default ``ddof`` applies
        (0 for NumPy arrays, 1 for pandas Series).
    """
    per_period_rf = risk_free_rate / periods
    excess = daily_returns - per_period_rf
    annualizer = np.sqrt(periods)
    return annualizer * excess.mean() / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    return np.std(daily_returns) * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough decline of the compounded wealth.

    The wealth curve starts at 1.0 before the first return, so a loss that
    begins on the very first day is measured against the initial capital
    instead of being ignored.

    Args:
        daily_returns: Array-like of per-period simple returns.

    Returns:
        The minimum of ``(wealth - running_peak) / running_peak``: a
        non-positive number, 0.0 when the curve never falls below a previous
        peak (including for empty input).
    """
    returns = np.asarray(daily_returns, dtype=float)
    # Prepend the starting wealth so the first peak is the initial capital
    wealth = np.concatenate(([1.0], np.cumprod(1 + returns)))
    running_peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct a constant cost from every per-period return.

    Args:
        daily_returns: Array-like of per-period returns.
        transaction_cost: Amount subtracted from each return.

    Returns:
        ``daily_returns - transaction_cost``.
    """
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every per-period return down by a proportional cost.

    Args:
        daily_returns: Array-like of per-period returns.
        transaction_cost_percentage: Share of each return removed as cost.

    Returns:
        ``daily_returns * (1 - transaction_cost_percentage)``.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_512_with_cost', 'cum_ES_return_512_with_cost', 'cum_ELS_return_512_with_cost',
    'cum_VL_return_512_with_cost', 'cum_VS_return_512_with_cost', 'cum_VLS_return_512_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost.
# NOTE(review): other lag sections of this notebook repeat the original
# pattern; apply the same corrections there to keep the tables comparable.
for portfolio in portfolios_with_cost:
    cumulative_returns_pcr512_c = cumulative_log_returns_pcr_lag_512[portfolio].values

    # The running totals start from 0, so prepending 0 recovers every daily
    # log return, including the first day's (a plain np.diff drops it).
    daily_log_returns = np.diff(cumulative_returns_pcr512_c, prepend=0.0)

    # The metric helpers compound with prod(1 + r), i.e. they expect simple
    # returns, so convert the log returns first.
    daily_returns = np.expm1(daily_log_returns)

    # The *_with_cost series already had the transaction cost deducted when
    # the daily portfolio returns were computed; subtracting it again here
    # would count the cost twice, so the returns are used as they are.
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(annualized_return(daily_returns))
    metrics['Sharpe Ratio'].append(sharpe_ratio(daily_returns))
    metrics['Volatility'].append(calculate_volatility(daily_returns))
    metrics['Standard Deviation'].append(np.std(daily_returns))
    metrics['Max Drawdown'].append(maximum_drawdown(daily_returns))
    metrics['Cumulative Return'].append(cumulative_return(daily_returns))

# Convert the results into a DataFrame for analysis
metrics_pcr512_c = pd.DataFrame(metrics)
display(metrics_pcr512_c)


# Same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_512_without_cost', 'cum_ES_return_512_without_cost', 'cum_ELS_return_512_without_cost',
    'cum_VL_return_512_without_cost', 'cum_VS_return_512_without_cost', 'cum_VLS_return_512_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost.
# NOTE(review): other lag sections of this notebook repeat the original
# pattern; apply the same corrections there to keep the tables comparable.
for portfolio in portfolios_without_cost:
    cumulative_returns_pcr512_wc = cumulative_log_returns_pcr_lag_512[portfolio].values

    # The running totals start from 0, so prepending 0 recovers every daily
    # log return, including the first day's (a plain np.diff drops it).
    daily_log_returns = np.diff(cumulative_returns_pcr512_wc, prepend=0.0)

    # The metric helpers compound with prod(1 + r), i.e. they expect simple
    # returns, so convert the log returns first.
    daily_returns = np.expm1(daily_log_returns)

    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(annualized_return(daily_returns))
    metrics_wc['Sharpe Ratio'].append(sharpe_ratio(daily_returns))
    metrics_wc['Volatility'].append(calculate_volatility(daily_returns))
    metrics_wc['Standard Deviation'].append(np.std(daily_returns))
    metrics_wc['Max Drawdown'].append(maximum_drawdown(daily_returns))
    metrics_wc['Cumulative Return'].append(cumulative_return(daily_returns))

# Convert the results into a DataFrame for analysis
metrics_pcr512_wc = pd.DataFrame(metrics_wc)
display(metrics_pcr512_wc)

## GLM

In [None]:
# GLM via rolling_forecast_model, lag 5
glm_lag5 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='glm',
    lag=[5],
)
print("GLM Results (Lag 5):")
display(glm_lag5)

In [None]:
# GLM via rolling_forecast_model, lag 21
glm_lag21 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='glm',
    lag=[21],
)
print("GLM Results (Lag 21):")
display(glm_lag21)

In [None]:
# GLM via rolling_forecast_model, lag 252
glm_lag252 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='glm',
    lag=[252],
)
print("GLM Results (Lag 252):")
display(glm_lag252)

In [None]:
# GLM via rolling_forecast_model, lag 512
glm_lag512 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='glm',
    lag=[512],
)
print("GLM Results (Lag 512):")
display(glm_lag512)

### OOS R2

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, accuracy_score, mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd

# Utility: Adjusted R²
def adjusted_r2_score(y_true, y_pred, n, p):
    """Return R² adjusted for the number of predictors.

    Args:
        y_true: Observed targets, passed to ``r2_score``.
        y_pred: Predicted values, passed to ``r2_score``.
        n: Number of observations.
        p: Number of predictors.

    Returns:
        ``1 - (1 - R²) * (n - 1) / (n - p - 1)``.

    Raises:
        ZeroDivisionError: If ``n - p - 1`` is 0 for integer inputs.
    """
    unexplained_share = 1 - r2_score(y_true, y_pred)
    penalised = unexplained_share * (n - 1) / (n - p - 1)
    return 1 - penalised

# Utility: Directional Accuracy
def directional_accuracy(y_true, y_pred_binary):
    """Return the share of positions where prediction and target agree.

    Args:
        y_true: Array or Series of observed labels.
        y_pred_binary: Array of predicted labels, compared with ``==``.

    Returns:
        Mean of the element-wise equality mask.
    """
    matches = y_true == y_pred_binary
    return np.mean(matches)

# Logistic Regression (GLM) with tuning — Global Model
def glm_model_with_tuning(train_df, test_df, lag=5):
    """Tune a logistic regression on one lag feature and score it on test data.

    Rows with a missing ``lag_{lag}`` or ``directional_target`` are dropped
    from both frames. The solver and ``max_iter`` are chosen by a 5-fold grid
    search scored on accuracy.

    Args:
        train_df: Training frame with ``lag_{lag}`` and ``directional_target``.
        test_df: Test frame with the same two columns.
        lag: Lag whose ``lag_{lag}`` column is the single feature.

    Returns:
        One-row DataFrame with the model name, the lag, the directional
        accuracies and the error metrics. The error metrics and adjusted R²
        use the predicted probabilities; the accuracies and MASE use the
        0.5-thresholded class.
    """
    lag_col = f'lag_{lag}'
    required_cols = [lag_col, 'directional_target']
    train_clean = train_df.dropna(subset=required_cols)
    test_clean = test_df.dropna(subset=required_cols)

    X_train, y_train = train_clean[[lag_col]], train_clean['directional_target']
    X_test, y_test = test_clean[[lag_col]], test_clean['directional_target']

    # Hyperparameter tuning
    search = GridSearchCV(
        LogisticRegression(),
        {
            'solver': ['lbfgs', 'liblinear', 'saga'],
            'max_iter': [100, 200, 500]
        },
        cv=5,
        scoring='accuracy',
    )
    search.fit(X_train, y_train)
    tuned_model = search.best_estimator_

    # Second predict_proba column (class 1 when the targets are 0/1),
    # thresholded at 0.5
    prob_up = tuned_model.predict_proba(X_test)[:, 1]
    predicted_class = (prob_up > 0.5).astype(int)

    # Masks for the accuracy within each realised direction
    is_up = y_test == 1
    is_down = y_test == 0
    mse = mean_squared_error(y_test, prob_up)

    summary = {
        'Model': 'GLM',
        'Lag': lag,
        'Directional Accuracy': accuracy_score(y_test, predicted_class),
        'Up Directional Accuracy': directional_accuracy(y_test[is_up], predicted_class[is_up]),
        'Down Directional Accuracy': directional_accuracy(y_test[is_down], predicted_class[is_down]),
        'Adjusted R-squared': adjusted_r2_score(y_test, prob_up, len(y_test), X_train.shape[1]),
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'MAE': mean_absolute_error(y_test, prob_up),
        'MASE': mase(y_test, predicted_class)
    }
    return pd.DataFrame([summary])

# Main function to calculate OOS R² for GLM with market cap segmentation
def calculate_oos_r2_market_cap_glm(train_df, test_df, valid_lags):
    """Tabulate the GLM adjusted R² per lag for three training segments.

    For every lag, ``glm_model_with_tuning`` is fitted on all of ``train_df``,
    on the top-25% market-cap rows and on the bottom-25% market-cap rows, and
    each fit is scored on ``test_df``.

    Args:
        train_df: Training frame used for the "All Stocks" column.
        test_df: Test frame used to score every fit.
        valid_lags: Iterable of lags to evaluate.

    Returns:
        DataFrame with columns ``Lag``, ``All Stocks R²``, ``Top 25% R²`` and
        ``Bottom 25% R²``, one row per lag (empty when no lags are given).
    """
    # NOTE(review): the segmentation reads the notebook-level `merged_df`
    # rather than `train_df`, and the test set is not segmented. Confirm this
    # is intended.
    market_cap = merged_df['market_cap']
    top_25_stocks = merged_df[market_cap >= market_cap.quantile(0.75)]
    bottom_25_stocks = merged_df[market_cap <= market_cap.quantile(0.25)]

    valid_lags = list(valid_lags)
    training_segments = {
        'All Stocks R²': train_df,
        'Top 25% R²': top_25_stocks,
        'Bottom 25% R²': bottom_25_stocks,
    }

    # Collect only the one value needed per fit, instead of growing three
    # DataFrames with repeated pd.concat calls.
    r2_columns = {label: [] for label in training_segments}
    for lag in valid_lags:
        for label, segment_train in training_segments.items():
            fit_summary = glm_model_with_tuning(segment_train, test_df, lag)
            r2_columns[label].append(fit_summary['Adjusted R-squared'].iloc[0])

    return pd.DataFrame({'Lag': valid_lags, **r2_columns})

# Lags to evaluate
valid_lags = [5, 21, 252, 512]

# OOS R² table for GLM: all stocks, top 25%, and bottom 25%
glm_oos_r2_results = calculate_oos_r2_market_cap_glm(
    crsp_train_lagged,
    crsp_test_lagged,
    valid_lags,
)

# Show the table
display(glm_oos_r2_results)


## GLM Portfolio

### Lag 5

In [None]:
# Flat transaction cost of 10 bps, written as a decimal (0.001)
def calculate_transaction_cost(row):
    """Return the transaction cost charged for one row of the test frame.

    The cost is the same for every stock, so ``row`` is accepted only to fit
    the ``DataFrame.apply(..., axis=1)`` calling convention and is never read.
    """
    flat_cost = 0.001
    return flat_cost

# Every row gets the same flat cost, so broadcast the scalar once instead of
# calling the Python function row by row through DataFrame.apply(axis=1).
# calculate_transaction_cost ignores its argument, hence the None placeholder.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `glm_model` to predict excess returns
def glm_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Store the lag-5 GLM predictions in their own column of the test frame.

    ``glm_model`` is called for the given lags and the test frame's
    'predicted_excess_returns' column is then copied into
    'glm_5_predicted_excess_returns'.
    NOTE(review): this relies on ``glm_model`` writing
    'predicted_excess_returns' into ``crsp_test_lagged`` as a side effect -
    confirm against its definition.

    Parameters
    ----------
    crsp_train_lagged : training frame forwarded to ``glm_model``.
    crsp_test_lagged : test frame; modified in place and also returned.
    lags : list of lags forwarded to ``glm_model``; defaults to ``[5]``.

    Returns
    -------
    The same ``crsp_test_lagged`` object with the extra prediction column.
    """
    if lags is None:
        # Built per call so no list object is shared between invocations.
        lags = [5]

    # The returned summary is not needed here; the predictions are read back
    # from the test frame below.
    glm_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Snapshot under a lag-specific name so a later model run that rewrites
    # 'predicted_excess_returns' does not clobber these values.
    crsp_test_lagged['glm_5_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Get predicted excess returns using the GLM model
crsp_test_lagged = glm_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def _leg_log_returns(leg):
    """Return (equal-weighted, value-weighted, mean cost) for one portfolio leg."""
    log_returns = np.log1p(leg['adjusted_ret'])
    market_caps = leg['market_cap_merged']
    equal_weighted = log_returns.mean()
    value_weighted = (log_returns * market_caps).sum() / market_caps.sum()
    return equal_weighted, value_weighted, leg['transaction_cost'].mean()


def compute_returns(group, prediction_col='glm_5_predicted_excess_returns', fraction=0.1):
    """Compute one day's long, short and long-short portfolio log returns.

    The long leg holds the ``fraction`` of rows with the highest values in
    ``prediction_col``, the short leg the ``fraction`` with the lowest.
    Each leg is evaluated equal-weighted and weighted by
    'market_cap_merged', with and without the mean 'transaction_cost' of
    the leg deducted. A long-short figure is the sum of its two legs, so
    its "with cost" variant carries the cost of both legs.

    NOTE(review): a leg's return is the (weighted) average of the members'
    log returns, which is not identical to the log of the portfolio's
    simple return - confirm this approximation is intended.

    Parameters
    ----------
    group : DataFrame for a single date with columns ``prediction_col``,
        'adjusted_ret', 'market_cap_merged' and 'transaction_cost'.
    prediction_col : column used to rank the stocks.
    fraction : share of the rows placed in each leg; the leg size is
        ``int(fraction * len(group))``.

    Returns
    -------
    dict
        Twelve floats keyed '<equal|value>_<long|short|long_short>_log_return_<with|without>_cost'.
        All values are NaN when the leg size rounds down to zero.
    """
    bucket_size = int(fraction * len(group))
    long_leg = group.nlargest(bucket_size, prediction_col)
    short_leg = group.nsmallest(bucket_size, prediction_col)

    long_equal, long_value, long_cost = _leg_log_returns(long_leg)
    short_equal, short_value, short_cost = _leg_log_returns(short_leg)

    # A short position earns the negative of the leg's return; the cost is
    # still a deduction.
    equal_long = {'with': long_equal - long_cost, 'without': long_equal}
    equal_short = {'with': -short_equal - short_cost, 'without': -short_equal}
    value_long = {'with': long_value - long_cost, 'without': long_value}
    value_short = {'with': -short_value - short_cost, 'without': -short_value}

    return {
        'equal_long_log_return_with_cost': equal_long['with'],
        'equal_short_log_return_with_cost': equal_short['with'],
        'equal_long_short_log_return_with_cost': equal_long['with'] + equal_short['with'],
        'equal_long_log_return_without_cost': equal_long['without'],
        'equal_short_log_return_without_cost': equal_short['without'],
        'equal_long_short_log_return_without_cost': equal_long['without'] + equal_short['without'],
        'value_long_log_return_with_cost': value_long['with'],
        'value_short_log_return_with_cost': value_short['with'],
        'value_long_short_log_return_with_cost': value_long['with'] + value_short['with'],
        'value_long_log_return_without_cost': value_long['without'],
        'value_short_log_return_without_cost': value_short['without'],
        'value_long_short_log_return_without_cost': value_long['without'] + value_short['without'],
    }

# Compute cumulative returns for each date with daily rebalancing
# Output column -> key of the per-day dict returned by compute_returns.
glm5_column_to_daily_key = {
    'cum_EL_return_5_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_5_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_5_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_5_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_5_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_5_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_5_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_5_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_5_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_5_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_5_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_5_without_cost': 'value_long_short_log_return_without_cost',
}

# One list per output column, 'date' first, then the portfolios in the order above.
cumulative_log_returns_by_date_glm5 = {'date': []}
cumulative_log_returns_by_date_glm5.update((column, []) for column in glm5_column_to_daily_key)

# Running sums of the daily log returns for lag 5, all starting at 0.
running_totals_glm5 = dict.fromkeys(glm5_column_to_daily_key, 0)

# Iterate over each date to compute returns for lag 5 portfolios.
# A single groupby pass replaces re-scanning the whole frame with a boolean
# mask for every date; sort=False keeps the dates in order of first
# appearance, exactly as iterating over .unique() did.
# NOTE: this must run while compute_returns is the lag-5 version, because the
# name is redefined by the cells for the other lags.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    returns = compute_returns(group)

    cumulative_log_returns_by_date_glm5['date'].append(date)
    for column, daily_key in glm5_column_to_daily_key.items():
        running_totals_glm5[column] += returns[daily_key]
        cumulative_log_returns_by_date_glm5[column].append(running_totals_glm5[column])

# Keep the final totals available under the original scalar names.
(
    cum_EL_return_5_with_cost, cum_ES_return_5_with_cost, cum_ELS_return_5_with_cost,
    cum_VL_return_5_with_cost, cum_VS_return_5_with_cost, cum_VLS_return_5_with_cost,
    cum_EL_return_5_without_cost, cum_ES_return_5_without_cost, cum_ELS_return_5_without_cost,
    cum_VL_return_5_without_cost, cum_VS_return_5_without_cost, cum_VLS_return_5_without_cost,
) = running_totals_glm5.values()

# Convert to DataFrame for lag 5
cumulative_log_returns_glm_lag_5 = pd.DataFrame(cumulative_log_returns_by_date_glm5)

# Display the cumulative returns DataFrame for lag 5
display(cumulative_log_returns_glm_lag_5.head())

# Saving the DataFrame as a CSV file
cumulative_log_returns_glm_lag_5.to_csv("cumulative_log_returns_glm_lag_5.csv", index=False)

#### With/Without Transaction Cost

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period simple returns into one total return.

    Evaluates ``prod(1 + r) - 1``. ``daily_returns`` must support
    elementwise addition with a scalar (e.g. a NumPy array).
    """
    growth_factors = daily_returns + 1
    compounded = np.prod(growth_factors)
    return compounded - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded growth of per-period simple returns.

    Parameters
    ----------
    daily_returns : array of simple returns, one per period.
    periods : number of periods that make up one year.

    Returns
    -------
    float
        ``prod(1 + r) ** (periods / n) - 1``, or NaN when there are no
        observations (the annualization exponent is undefined for n = 0).
    """
    n_obs = len(daily_returns)
    if n_obs == 0:
        # Guard the division by the observation count below.
        return np.nan

    total_growth = np.prod(1 + daily_returns)
    return total_growth ** (periods / n_obs) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of per-period returns.

    The annual ``risk_free_rate`` is spread evenly over ``periods`` and
    subtracted from every observation; the mean excess return is scaled by
    ``sqrt(periods)`` and divided by the excess returns' ``.std()``.
    The dispersion uses the input's own ``.std()`` method, so the ddof
    convention follows the input type (NumPy arrays default to ddof=0,
    pandas Series to ddof=1).
    """
    per_period_rf = risk_free_rate / periods
    excess = daily_returns - per_period_rf
    annualised_mean = np.sqrt(periods) * excess.mean()
    return annualised_mean / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Return the annualized volatility: ``np.std`` of the returns times ``sqrt(periods)``."""
    annualisation_factor = np.sqrt(periods)
    per_period_std = np.std(daily_returns)
    return per_period_std * annualisation_factor

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the deepest peak-to-trough loss of the compounded return path.

    The returns are compounded with ``cumprod(1 + r)``; every point is
    compared with the running maximum up to that point. The result is the
    most negative relative gap, so it is <= 0 (0 when the path never falls).
    """
    wealth_path = np.cumprod(daily_returns + 1)
    running_peak = np.maximum.accumulate(wealth_path)
    relative_gap = (wealth_path - running_peak) / running_peak
    return np.min(relative_gap)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return the returns with the same ``transaction_cost`` deducted from every period."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by ``1 - transaction_cost_percentage``.

    NOTE(review): this shrinks the return itself (gains and losses alike)
    rather than charging a fee on traded value - confirm this is the
    intended cost model.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_5_with_cost', 'cum_ES_return_5_with_cost', 'cum_ELS_return_5_with_cost',
    'cum_VL_return_5_with_cost', 'cum_VS_return_5_with_cost', 'cum_VLS_return_5_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_glm5_c = cumulative_log_returns_glm_lag_5[portfolio].values

    # Recover the per-day log returns. The cumulative series is a running sum
    # whose first entry already is day one's return, so prepend 0; a bare
    # np.diff would silently drop the first trading day.
    daily_log_returns = np.diff(cumulative_returns_glm5_c, prepend=0.0)

    # The metric helpers compound with prod(1 + r), i.e. they expect simple
    # returns, so convert the log returns first.
    daily_returns = np.expm1(daily_log_returns)

    # Apply fixed transaction cost for other metrics
    # NOTE(review): the '*_with_cost' series were built by compute_returns,
    # which already subtracts the mean 'transaction_cost' of each leg, so the
    # deductions below charge cost a second time - confirm this is intended.
    daily_returns_after_cost_fixed = apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001)

    # Apply percentage-based transaction cost for volatility and standard deviation
    daily_returns_after_cost_percentage = apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001)

    # Calculate cumulative returns after fixed transaction cost
    cum_return_after_cost = cumulative_return(daily_returns_after_cost_fixed)

    # Metrics calculations
    ann_return = annualized_return(daily_returns_after_cost_fixed)
    sharpe = sharpe_ratio(daily_returns_after_cost_fixed)
    vol = calculate_volatility(daily_returns_after_cost_percentage)
    max_draw = maximum_drawdown(daily_returns_after_cost_fixed)

    # Standard Deviation of daily returns after cost (using percentage-based cost for standard deviation)
    std_dev = np.std(daily_returns_after_cost_percentage)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_glm5_c = pd.DataFrame(metrics)
display(metrics_glm5_c)


# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_5_without_cost', 'cum_ES_return_5_without_cost', 'cum_ELS_return_5_without_cost',
    'cum_VL_return_5_without_cost', 'cum_VS_return_5_without_cost', 'cum_VLS_return_5_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_glm5_wc = cumulative_log_returns_glm_lag_5[portfolio].values

    # Recover the per-day log returns. The cumulative series is a running sum
    # whose first entry already is day one's return, so prepend 0; a bare
    # np.diff would silently drop the first trading day.
    daily_log_returns = np.diff(cumulative_returns_glm5_wc, prepend=0.0)

    # The metric helpers compound with prod(1 + r), i.e. they expect simple
    # returns, so convert the log returns first.
    daily_returns = np.expm1(daily_log_returns)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_glm5_wc = pd.DataFrame(metrics_wc)
display(metrics_glm5_wc)


### Lag 21

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat per-trade cost used for every stock.

    ``row`` is accepted so the function can be used with
    ``DataFrame.apply(axis=1)``, but it is not inspected: small and large
    caps are charged the same 10 basis points.
    """
    cost_in_basis_points = 10
    return cost_in_basis_points / 10000

# Every row gets the same flat cost, so broadcast the scalar once instead of
# calling the Python function row by row through DataFrame.apply(axis=1).
# calculate_transaction_cost ignores its argument, hence the None placeholder.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `glm_model` to predict excess returns
def glm_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Store the lag-21 GLM predictions in their own column of the test frame.

    ``glm_model`` is called for the given lags and the test frame's
    'predicted_excess_returns' column is then copied into
    'glm_21_predicted_excess_returns'.
    NOTE(review): this relies on ``glm_model`` writing
    'predicted_excess_returns' into ``crsp_test_lagged`` as a side effect -
    confirm against its definition.

    Parameters
    ----------
    crsp_train_lagged : training frame forwarded to ``glm_model``.
    crsp_test_lagged : test frame; modified in place and also returned.
    lags : list of lags forwarded to ``glm_model``; defaults to ``[21]``.

    Returns
    -------
    The same ``crsp_test_lagged`` object with the extra prediction column.
    """
    if lags is None:
        # Built per call so no list object is shared between invocations.
        lags = [21]

    # The returned summary is not needed here; the predictions are read back
    # from the test frame below.
    glm_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Snapshot under a lag-specific name so a later model run that rewrites
    # 'predicted_excess_returns' does not clobber these values.
    crsp_test_lagged['glm_21_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Get predicted excess returns using GLM model
crsp_test_lagged = glm_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def _leg_log_returns(leg):
    """Return (equal-weighted, value-weighted, mean cost) for one portfolio leg."""
    log_returns = np.log1p(leg['adjusted_ret'])
    market_caps = leg['market_cap_merged']
    equal_weighted = log_returns.mean()
    value_weighted = (log_returns * market_caps).sum() / market_caps.sum()
    return equal_weighted, value_weighted, leg['transaction_cost'].mean()


def compute_returns(group, prediction_col='glm_21_predicted_excess_returns', fraction=0.1):
    """Compute one day's long, short and long-short portfolio log returns.

    The long leg holds the ``fraction`` of rows with the highest values in
    ``prediction_col``, the short leg the ``fraction`` with the lowest.
    Each leg is evaluated equal-weighted and weighted by
    'market_cap_merged', with and without the mean 'transaction_cost' of
    the leg deducted. A long-short figure is the sum of its two legs, so
    its "with cost" variant carries the cost of both legs.

    NOTE(review): a leg's return is the (weighted) average of the members'
    log returns, which is not identical to the log of the portfolio's
    simple return - confirm this approximation is intended.

    Parameters
    ----------
    group : DataFrame for a single date with columns ``prediction_col``,
        'adjusted_ret', 'market_cap_merged' and 'transaction_cost'.
    prediction_col : column used to rank the stocks.
    fraction : share of the rows placed in each leg; the leg size is
        ``int(fraction * len(group))``.

    Returns
    -------
    dict
        Twelve floats keyed '<equal|value>_<long|short|long_short>_log_return_<with|without>_cost'.
        All values are NaN when the leg size rounds down to zero.
    """
    bucket_size = int(fraction * len(group))
    long_leg = group.nlargest(bucket_size, prediction_col)
    short_leg = group.nsmallest(bucket_size, prediction_col)

    long_equal, long_value, long_cost = _leg_log_returns(long_leg)
    short_equal, short_value, short_cost = _leg_log_returns(short_leg)

    # A short position earns the negative of the leg's return; the cost is
    # still a deduction.
    equal_long = {'with': long_equal - long_cost, 'without': long_equal}
    equal_short = {'with': -short_equal - short_cost, 'without': -short_equal}
    value_long = {'with': long_value - long_cost, 'without': long_value}
    value_short = {'with': -short_value - short_cost, 'without': -short_value}

    return {
        'equal_long_log_return_with_cost': equal_long['with'],
        'equal_short_log_return_with_cost': equal_short['with'],
        'equal_long_short_log_return_with_cost': equal_long['with'] + equal_short['with'],
        'equal_long_log_return_without_cost': equal_long['without'],
        'equal_short_log_return_without_cost': equal_short['without'],
        'equal_long_short_log_return_without_cost': equal_long['without'] + equal_short['without'],
        'value_long_log_return_with_cost': value_long['with'],
        'value_short_log_return_with_cost': value_short['with'],
        'value_long_short_log_return_with_cost': value_long['with'] + value_short['with'],
        'value_long_log_return_without_cost': value_long['without'],
        'value_short_log_return_without_cost': value_short['without'],
        'value_long_short_log_return_without_cost': value_long['without'] + value_short['without'],
    }

# Compute cumulative returns for each date with daily rebalancing
# Output column -> key of the per-day dict returned by compute_returns.
glm21_column_to_daily_key = {
    'cum_EL_return_21_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_21_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_21_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_21_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_21_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_21_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_21_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_21_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_21_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_21_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_21_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_21_without_cost': 'value_long_short_log_return_without_cost',
}

# One list per output column, 'date' first, then the portfolios in the order above.
cumulative_log_returns_by_date_glm21 = {'date': []}
cumulative_log_returns_by_date_glm21.update((column, []) for column in glm21_column_to_daily_key)

# Running sums of the daily log returns for lag 21, all starting at 0.
running_totals_glm21 = dict.fromkeys(glm21_column_to_daily_key, 0)

# Iterate over each date to compute returns for lag 21 portfolios.
# A single groupby pass replaces re-scanning the whole frame with a boolean
# mask for every date; sort=False keeps the dates in order of first
# appearance, exactly as iterating over .unique() did.
# NOTE: this must run while compute_returns is the lag-21 version, because the
# name is redefined by the cells for the other lags.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    returns = compute_returns(group)

    cumulative_log_returns_by_date_glm21['date'].append(date)
    for column, daily_key in glm21_column_to_daily_key.items():
        running_totals_glm21[column] += returns[daily_key]
        cumulative_log_returns_by_date_glm21[column].append(running_totals_glm21[column])

# Keep the final totals available under the original scalar names.
(
    cum_EL_return_21_with_cost, cum_ES_return_21_with_cost, cum_ELS_return_21_with_cost,
    cum_VL_return_21_with_cost, cum_VS_return_21_with_cost, cum_VLS_return_21_with_cost,
    cum_EL_return_21_without_cost, cum_ES_return_21_without_cost, cum_ELS_return_21_without_cost,
    cum_VL_return_21_without_cost, cum_VS_return_21_without_cost, cum_VLS_return_21_without_cost,
) = running_totals_glm21.values()

# Convert to DataFrame for lag 21
cumulative_log_returns_glm_lag_21 = pd.DataFrame(cumulative_log_returns_by_date_glm21)

# Display the cumulative returns DataFrame for lag 21
display(cumulative_log_returns_glm_lag_21.head())

# Saving the DataFrame as a CSV file
cumulative_log_returns_glm_lag_21.to_csv("cumulative_log_returns_glm_lag_21.csv", index=False)

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period simple returns into one total return.

    Evaluates ``prod(1 + r) - 1``. ``daily_returns`` must support
    elementwise addition with a scalar (e.g. a NumPy array).
    """
    growth_factors = daily_returns + 1
    compounded = np.prod(growth_factors)
    return compounded - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded growth of per-period simple returns.

    Parameters
    ----------
    daily_returns : array of simple returns, one per period.
    periods : number of periods that make up one year.

    Returns
    -------
    float
        ``prod(1 + r) ** (periods / n) - 1``, or NaN when there are no
        observations (the annualization exponent is undefined for n = 0).
    """
    n_obs = len(daily_returns)
    if n_obs == 0:
        # Guard the division by the observation count below.
        return np.nan

    total_growth = np.prod(1 + daily_returns)
    return total_growth ** (periods / n_obs) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of per-period returns.

    The annual ``risk_free_rate`` is spread evenly over ``periods`` and
    subtracted from every observation; the mean excess return is scaled by
    ``sqrt(periods)`` and divided by the excess returns' ``.std()``.
    The dispersion uses the input's own ``.std()`` method, so the ddof
    convention follows the input type (NumPy arrays default to ddof=0,
    pandas Series to ddof=1).
    """
    per_period_rf = risk_free_rate / periods
    excess = daily_returns - per_period_rf
    annualised_mean = np.sqrt(periods) * excess.mean()
    return annualised_mean / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Return the annualized volatility: ``np.std`` of the returns times ``sqrt(periods)``."""
    annualisation_factor = np.sqrt(periods)
    per_period_std = np.std(daily_returns)
    return per_period_std * annualisation_factor

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the deepest peak-to-trough loss of the compounded return path.

    The returns are compounded with ``cumprod(1 + r)``; every point is
    compared with the running maximum up to that point. The result is the
    most negative relative gap, so it is <= 0 (0 when the path never falls).
    """
    wealth_path = np.cumprod(daily_returns + 1)
    running_peak = np.maximum.accumulate(wealth_path)
    relative_gap = (wealth_path - running_peak) / running_peak
    return np.min(relative_gap)

# Apply fixed transaction cost to the daily returns (for other metrics)
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return the returns with the same ``transaction_cost`` deducted from every period."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by ``1 - transaction_cost_percentage``.

    NOTE(review): this shrinks the return itself (gains and losses alike)
    rather than charging a fee on traded value - confirm this is the
    intended cost model.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_21_with_cost', 'cum_ES_return_21_with_cost', 'cum_ELS_return_21_with_cost',
    'cum_VL_return_21_with_cost', 'cum_VS_return_21_with_cost', 'cum_VLS_return_21_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_glm21_c = cumulative_log_returns_glm_lag_21[portfolio].values

    # Recover the per-day log returns. The cumulative series is a running sum
    # whose first entry already is day one's return, so prepend 0; a bare
    # np.diff would silently drop the first trading day.
    daily_log_returns = np.diff(cumulative_returns_glm21_c, prepend=0.0)

    # The metric helpers compound with prod(1 + r), i.e. they expect simple
    # returns, so convert the log returns first.
    daily_returns = np.expm1(daily_log_returns)

    # Apply fixed transaction cost for other metrics
    # NOTE(review): the '*_with_cost' series were built by compute_returns,
    # which already subtracts the mean 'transaction_cost' of each leg, so the
    # deductions below charge cost a second time - confirm this is intended.
    daily_returns_after_cost_fixed = apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001)

    # Apply percentage-based transaction cost for volatility and standard deviation
    daily_returns_after_cost_percentage = apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001)

    # Calculate cumulative returns after fixed transaction cost
    cum_return_after_cost = cumulative_return(daily_returns_after_cost_fixed)

    # Metrics calculations
    ann_return = annualized_return(daily_returns_after_cost_fixed)
    sharpe = sharpe_ratio(daily_returns_after_cost_fixed)
    vol = calculate_volatility(daily_returns_after_cost_percentage)
    max_draw = maximum_drawdown(daily_returns_after_cost_fixed)

    # Standard Deviation of daily returns after cost (using percentage-based cost for standard deviation)
    std_dev = np.std(daily_returns_after_cost_percentage)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_glm21_c = pd.DataFrame(metrics)
display(metrics_glm21_c)

# the same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_21_without_cost', 'cum_ES_return_21_without_cost', 'cum_ELS_return_21_without_cost',
    'cum_VL_return_21_without_cost', 'cum_VS_return_21_without_cost', 'cum_VLS_return_21_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_glm21_wc = cumulative_log_returns_glm_lag_21[portfolio].values

    # Recover the per-day log returns. The cumulative series is a running sum
    # whose first entry already is day one's return, so prepend 0; a bare
    # np.diff would silently drop the first trading day.
    daily_log_returns = np.diff(cumulative_returns_glm21_wc, prepend=0.0)

    # The metric helpers compound with prod(1 + r), i.e. they expect simple
    # returns, so convert the log returns first.
    daily_returns = np.expm1(daily_log_returns)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_glm21_wc = pd.DataFrame(metrics_wc)
display(metrics_glm21_wc)

### Lag 252

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat per-trade cost used for every stock.

    ``row`` is accepted so the function can be used with
    ``DataFrame.apply(axis=1)``, but it is not inspected: small and large
    caps are charged the same 10 basis points.
    """
    cost_in_basis_points = 10
    return cost_in_basis_points / 10000

# Every row gets the same flat cost, so broadcast the scalar once instead of
# calling the Python function row by row through DataFrame.apply(axis=1).
# calculate_transaction_cost ignores its argument, hence the None placeholder.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `glm_model` to predict excess returns
def glm_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Store the lag-252 GLM predictions in their own column of the test frame.

    ``glm_model`` is called for the given lags and the test frame's
    'predicted_excess_returns' column is then copied into
    'glm_252_predicted_excess_returns'.
    NOTE(review): this relies on ``glm_model`` writing
    'predicted_excess_returns' into ``crsp_test_lagged`` as a side effect -
    confirm against its definition.

    Parameters
    ----------
    crsp_train_lagged : training frame forwarded to ``glm_model``.
    crsp_test_lagged : test frame; modified in place and also returned.
    lags : list of lags forwarded to ``glm_model``; defaults to ``[252]``.

    Returns
    -------
    The same ``crsp_test_lagged`` object with the extra prediction column.
    """
    if lags is None:
        # Built per call so no list object is shared between invocations.
        lags = [252]

    # The returned summary is not needed here; the predictions are read back
    # from the test frame below.
    glm_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Snapshot under a lag-specific name so a later model run that rewrites
    # 'predicted_excess_returns' does not clobber these values.
    crsp_test_lagged['glm_252_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Get predicted excess returns using GLM model
crsp_test_lagged = glm_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group):
    """Compute one period's long, short and long-short log returns.

    ``group`` is one cross-section of stocks (the caller passes the rows of a
    single date). It must provide the columns
    ``glm_252_predicted_excess_returns``, ``adjusted_ret``,
    ``transaction_cost`` and ``market_cap_merged``.

    The top 10% of rows by predicted return form the long leg and the bottom
    10% the short leg (``int(0.1 * len(group))`` rows each). For each leg an
    equal-weighted and a market-cap-weighted average of ``log1p(adjusted_ret)``
    is computed, with the short leg sign-flipped. The "with cost" variants
    subtract the leg's mean ``transaction_cost``; long-short is the sum of the
    two legs.

    Returns a dict with twelve entries keyed
    ``{equal,value}_{long,short,long_short}_log_return_{with,without}_cost``.
    """
    signal_col = 'glm_252_predicted_excess_returns'
    leg_size = int(0.1 * len(group))

    long_leg = group.nlargest(leg_size, signal_col)
    short_leg = group.nsmallest(leg_size, signal_col)

    # Per-stock log returns and average cost of each leg, computed once.
    long_log = np.log1p(long_leg['adjusted_ret'])
    short_log = np.log1p(short_leg['adjusted_ret'])
    long_cost = long_leg['transaction_cost'].mean()
    short_cost = short_leg['transaction_cost'].mean()

    # Equal-weighted legs.
    eq_long_gross = long_log.mean()
    eq_long_net = eq_long_gross - long_cost
    eq_short_gross = -short_log.mean()
    eq_short_net = eq_short_gross - short_cost

    # Value-weighted legs (weights proportional to market cap within the leg).
    long_caps = long_leg['market_cap_merged']
    short_caps = short_leg['market_cap_merged']
    val_long_gross = (long_log * long_caps).sum() / long_caps.sum()
    val_long_net = val_long_gross - long_cost
    short_weighted = (short_log * short_caps).sum() / short_caps.sum()
    val_short_gross = -short_weighted
    val_short_net = -(short_weighted + short_cost)

    return {
        'equal_long_log_return_with_cost': eq_long_net,
        'equal_short_log_return_with_cost': eq_short_net,
        'equal_long_short_log_return_with_cost': eq_long_net + eq_short_net,
        'equal_long_log_return_without_cost': eq_long_gross,
        'equal_short_log_return_without_cost': eq_short_gross,
        'equal_long_short_log_return_without_cost': eq_long_gross + eq_short_gross,
        'value_long_log_return_with_cost': val_long_net,
        'value_short_log_return_with_cost': val_short_net,
        'value_long_short_log_return_with_cost': val_long_net + val_short_net,
        'value_long_log_return_without_cost': val_long_gross,
        'value_short_log_return_without_cost': val_short_gross,
        'value_long_short_log_return_without_cost': val_long_gross + val_short_gross
    }

# Compute cumulative returns for each date with daily rebalancing.
# Every list below receives one running total per date, in iteration order.
cumulative_log_returns_by_date_glm252 = {
    'date': [],
    'cum_EL_return_252_with_cost': [],
    'cum_ES_return_252_with_cost': [],
    'cum_ELS_return_252_with_cost': [],
    'cum_VL_return_252_with_cost': [],
    'cum_VS_return_252_with_cost': [],
    'cum_VLS_return_252_with_cost': [],
    'cum_EL_return_252_without_cost': [],
    'cum_ES_return_252_without_cost': [],
    'cum_ELS_return_252_without_cost': [],
    'cum_VL_return_252_without_cost': [],
    'cum_VS_return_252_without_cost': [],
    'cum_VLS_return_252_without_cost': []
}

# Initialize cumulative returns for lag 252
# (E/V = equal-/value-weighted, L/S/LS = long / short / long-short)
cum_EL_return_252_with_cost = 0
cum_ES_return_252_with_cost = 0
cum_ELS_return_252_with_cost = 0
cum_VL_return_252_with_cost = 0
cum_VS_return_252_with_cost = 0
cum_VLS_return_252_with_cost = 0

cum_EL_return_252_without_cost = 0
cum_ES_return_252_without_cost = 0
cum_ELS_return_252_without_cost = 0
cum_VL_return_252_without_cost = 0
cum_VS_return_252_without_cost = 0
cum_VLS_return_252_without_cost = 0

# Iterate over each date to compute returns for lag 252 portfolios.
# groupby(sort=False) yields the dates in order of first appearance (the same
# order Series.unique() gives) but splits the frame in a single pass instead
# of re-filtering the whole DataFrame with a boolean mask for every date.
# Rows whose date is missing are skipped by groupby.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 252
    returns = compute_returns(group)

    # Update cumulative returns with daily values for lag 252
    # (log returns are additive, so the running sum is the cumulative log return)
    cum_EL_return_252_with_cost += returns['equal_long_log_return_with_cost']
    cum_ES_return_252_with_cost += returns['equal_short_log_return_with_cost']
    cum_ELS_return_252_with_cost += returns['equal_long_short_log_return_with_cost']
    cum_VL_return_252_with_cost += returns['value_long_log_return_with_cost']
    cum_VS_return_252_with_cost += returns['value_short_log_return_with_cost']
    cum_VLS_return_252_with_cost += returns['value_long_short_log_return_with_cost']

    cum_EL_return_252_without_cost += returns['equal_long_log_return_without_cost']
    cum_ES_return_252_without_cost += returns['equal_short_log_return_without_cost']
    cum_ELS_return_252_without_cost += returns['equal_long_short_log_return_without_cost']
    cum_VL_return_252_without_cost += returns['value_long_log_return_without_cost']
    cum_VS_return_252_without_cost += returns['value_short_log_return_without_cost']
    cum_VLS_return_252_without_cost += returns['value_long_short_log_return_without_cost']

    # Append results for the day for lag 252 portfolios
    cumulative_log_returns_by_date_glm252['date'].append(date)
    cumulative_log_returns_by_date_glm252['cum_EL_return_252_with_cost'].append(cum_EL_return_252_with_cost)
    cumulative_log_returns_by_date_glm252['cum_ES_return_252_with_cost'].append(cum_ES_return_252_with_cost)
    cumulative_log_returns_by_date_glm252['cum_ELS_return_252_with_cost'].append(cum_ELS_return_252_with_cost)
    cumulative_log_returns_by_date_glm252['cum_VL_return_252_with_cost'].append(cum_VL_return_252_with_cost)
    cumulative_log_returns_by_date_glm252['cum_VS_return_252_with_cost'].append(cum_VS_return_252_with_cost)
    cumulative_log_returns_by_date_glm252['cum_VLS_return_252_with_cost'].append(cum_VLS_return_252_with_cost)
    cumulative_log_returns_by_date_glm252['cum_EL_return_252_without_cost'].append(cum_EL_return_252_without_cost)
    cumulative_log_returns_by_date_glm252['cum_ES_return_252_without_cost'].append(cum_ES_return_252_without_cost)
    cumulative_log_returns_by_date_glm252['cum_ELS_return_252_without_cost'].append(cum_ELS_return_252_without_cost)
    cumulative_log_returns_by_date_glm252['cum_VL_return_252_without_cost'].append(cum_VL_return_252_without_cost)
    cumulative_log_returns_by_date_glm252['cum_VS_return_252_without_cost'].append(cum_VS_return_252_without_cost)
    cumulative_log_returns_by_date_glm252['cum_VLS_return_252_without_cost'].append(cum_VLS_return_252_without_cost)

# Convert to DataFrame for lag 252
cumulative_log_returns_glm_lag_252 = pd.DataFrame(cumulative_log_returns_by_date_glm252)

# Display the cumulative returns DataFrame for lag 252
display(cumulative_log_returns_glm_lag_252.head())

# Saving the DataFrame as a CSV file
cumulative_log_returns_glm_lag_252.to_csv("cumulative_log_returns_glm_lag_252.csv", index=False)

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period returns into a single total return.

    Evaluates ``prod(1 + r) - 1`` over ``daily_returns``, which must support
    ``1 + daily_returns`` arithmetic (e.g. a NumPy array).
    """
    growth_factor = np.prod(1 + daily_returns)
    return growth_factor - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of ``daily_returns``.

    The total compounded growth ``prod(1 + r)`` is raised to the power
    ``periods / len(daily_returns)`` and 1 is subtracted. ``periods`` is the
    number of return observations per year (252 by default).
    """
    total_return = np.prod(1 + daily_returns) - 1
    annualizing_exponent = periods / len(daily_returns)
    return (1 + total_return) ** annualizing_exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of ``daily_returns``.

    ``risk_free_rate`` is an annual rate; it is divided by ``periods`` to get
    the per-period rate subtracted from every return. The ratio of the mean to
    the standard deviation of those excess returns (both taken with the
    input's own ``.mean()`` / ``.std()`` methods) is scaled by
    ``sqrt(periods)``.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    excess = daily_returns - per_period_rf
    mean_excess = excess.mean()
    excess_dispersion = excess.std()
    return np.sqrt(periods) * mean_excess / excess_dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Return the annualized volatility of ``daily_returns``.

    This is ``np.std(daily_returns)`` scaled by ``sqrt(periods)``, where
    ``periods`` is the number of observations per year.
    """
    per_period_sigma = np.std(daily_returns)
    annualizing_scale = np.sqrt(periods)
    return per_period_sigma * annualizing_scale

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough decline of the compounded equity curve.

    Parameters
    ----------
    daily_returns : array-like of float
        Per-period simple returns.

    Returns
    -------
    float
        The most negative value of ``(equity - running_peak) / running_peak``
        (``0.0`` when the curve never falls below a previous peak, including
        for empty input).

    The equity curve starts at 1.0 *before* the first return, so a loss
    suffered from the very first period counts as a drawdown from the initial
    capital. Without that starting point a series such as ``[-0.5, 0.0]``
    would report no drawdown at all.
    """
    returns = np.asarray(daily_returns, dtype=float)
    # Prepend the initial capital so it is a candidate peak.
    equity = np.concatenate(([1.0], np.cumprod(1 + returns)))
    running_peak = np.maximum.accumulate(equity)
    drawdown = (equity - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return ``daily_returns`` with a constant ``transaction_cost`` subtracted from each value."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by ``1 - transaction_cost_percentage``.

    The scaling is applied to positive and negative returns alike, so the
    magnitude of losses shrinks by the same factor as the magnitude of gains.
    """
    retained_fraction = 1 - transaction_cost_percentage
    return daily_returns * retained_fraction

# Prepare portfolio names (with transaction cost)
# These are the columns of `cumulative_log_returns_glm_lag_252` whose daily
# increments were already reduced by the per-leg transaction cost inside
# compute_returns.
portfolios_with_cost = [
    'cum_EL_return_252_with_cost', 'cum_ES_return_252_with_cost', 'cum_ELS_return_252_with_cost',
    'cum_VL_return_252_with_cost', 'cum_VS_return_252_with_cost', 'cum_VLS_return_252_with_cost',
]

# Initialize metrics container (one list per output column, filled in portfolio order)
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_glm252_c = cumulative_log_returns_glm_lag_252[portfolio].values

    # Calculate daily returns from cumulative returns by differencing the running sum.
    # NOTE(review): np.diff yields n-1 values, so the first date's return is not included.
    # NOTE(review): the differences are log returns, yet the helpers below compound
    # them as simple returns via (1 + r) -- confirm this approximation is intended.
    daily_returns = np.diff(cumulative_returns_glm252_c)

    # Apply fixed transaction cost for other metrics (e.g., cumulative returns, Sharpe ratio)
    # NOTE(review): the "_with_cost" columns are already net of the cost deducted in
    # compute_returns, so this subtracts a further 10 bps per day -- looks like double
    # counting; confirm before relying on these figures.
    daily_returns_after_cost_fixed = apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001)

    # Apply percentage-based transaction cost for volatility and standard deviation
    # NOTE(review): risk metrics therefore use a different return series than the
    # Sharpe ratio and drawdown above.
    daily_returns_after_cost_percentage = apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001)

    # Calculate cumulative returns after fixed transaction cost
    cum_return_after_cost = cumulative_return(daily_returns_after_cost_fixed)

    # Metrics calculations
    ann_return = annualized_return(daily_returns_after_cost_fixed)
    sharpe = sharpe_ratio(daily_returns_after_cost_fixed)
    vol = calculate_volatility(daily_returns_after_cost_percentage)  # Using percentage-based cost for volatility
    max_draw = maximum_drawdown(daily_returns_after_cost_fixed)

    # Standard Deviation of daily returns after cost (using percentage-based cost for standard deviation)
    # This is the un-annualized counterpart of `vol`.
    std_dev = np.std(daily_returns_after_cost_percentage)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis (one row per portfolio)
metrics_glm252_c = pd.DataFrame(metrics)
display(metrics_glm252_c)


# Now, the same calculations for portfolios without transaction cost
# (columns of `cumulative_log_returns_glm_lag_252` accumulated from gross returns)
portfolios_without_cost = [
    'cum_EL_return_252_without_cost', 'cum_ES_return_252_without_cost', 'cum_ELS_return_252_without_cost',
    'cum_VL_return_252_without_cost', 'cum_VS_return_252_without_cost', 'cum_VLS_return_252_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_glm252_wc = cumulative_log_returns_glm_lag_252[portfolio].values

    # Calculate daily returns from cumulative returns
    # NOTE(review): np.diff yields n-1 values (the first date's return is dropped), and
    # the result is a log return that the helpers below treat as a simple return.
    daily_returns = np.diff(cumulative_returns_glm252_wc)  # Compute daily returns from cumulative log returns

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations (all on the same gross return series)
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost (un-annualized counterpart of `vol`)
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis (one row per portfolio)
metrics_glm252_wc = pd.DataFrame(metrics_wc)
display(metrics_glm252_wc)

### Lag 512

In [None]:
# Flat transaction cost used by the backtest (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the transaction cost charged for one stock-day row.

    The value is a constant and does not depend on ``row``; the parameter
    only exists so the function has the one-argument shape expected by
    ``DataFrame.apply(..., axis=1)``.
    """
    flat_cost = 0.001  # 10 bps for both small and large cap stocks
    return flat_cost

# The cost is the same constant for every row, so broadcast the scalar to the
# whole column instead of calling the Python function once per row through
# DataFrame.apply(axis=1), which is very slow on a large panel.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `glm_model` to predict excess returns
def glm_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the GLM on the 512-day lag and keep its predictions in a dedicated column.

    Parameters
    ----------
    crsp_train_lagged : DataFrame
        Training data forwarded to ``glm_model``.
    crsp_test_lagged : DataFrame
        Test data forwarded to ``glm_model``. It is modified in place (a
        ``glm_512_predicted_excess_returns`` column is added) and returned.
    lags : list of int, optional
        Lags forwarded to ``glm_model``. Defaults to ``[512]``; a fresh list
        is created on every call so no default object is shared between calls.

    Returns
    -------
    DataFrame
        ``crsp_test_lagged`` with the extra prediction column.
    """
    if lags is None:
        lags = [512]

    # The return value of glm_model is not needed here: the predictions are
    # read back from the `predicted_excess_returns` column of the test frame
    # (presumably written by glm_model -- verify against its definition).
    glm_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the predictions under a lag-specific name so that later model runs
    # that reuse `predicted_excess_returns` do not clobber them.
    crsp_test_lagged['glm_512_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Get predicted excess returns using GLM model (512-day lag).
# The returned frame is the test set itself, now carrying the additional
# `glm_512_predicted_excess_returns` column used for portfolio sorting below.
# NOTE(review): an earlier comment stated the predictions "should generate binary
# outcomes" -- confirm against glm_model.
crsp_test_lagged = glm_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group):
    """Compute one period's long, short and long-short log returns.

    ``group`` is one cross-section of stocks (the caller passes the rows of a
    single date). It must provide the columns
    ``glm_512_predicted_excess_returns``, ``adjusted_ret``,
    ``transaction_cost`` and ``market_cap_merged``.

    The top 10% of rows by predicted return form the long leg and the bottom
    10% the short leg (``int(0.1 * len(group))`` rows each). For each leg an
    equal-weighted and a market-cap-weighted average of ``log1p(adjusted_ret)``
    is computed, with the short leg sign-flipped. The "with cost" variants
    subtract the leg's mean ``transaction_cost``; long-short is the sum of the
    two legs.

    Returns a dict with twelve entries keyed
    ``{equal,value}_{long,short,long_short}_log_return_{with,without}_cost``.
    """
    signal_col = 'glm_512_predicted_excess_returns'
    leg_size = int(0.1 * len(group))

    long_leg = group.nlargest(leg_size, signal_col)
    short_leg = group.nsmallest(leg_size, signal_col)

    # Per-stock log returns and average cost of each leg, computed once.
    long_log = np.log1p(long_leg['adjusted_ret'])
    short_log = np.log1p(short_leg['adjusted_ret'])
    long_cost = long_leg['transaction_cost'].mean()
    short_cost = short_leg['transaction_cost'].mean()

    # Equal-weighted legs.
    eq_long_gross = long_log.mean()
    eq_long_net = eq_long_gross - long_cost
    eq_short_gross = -short_log.mean()
    eq_short_net = eq_short_gross - short_cost

    # Value-weighted legs (weights proportional to market cap within the leg).
    long_caps = long_leg['market_cap_merged']
    short_caps = short_leg['market_cap_merged']
    val_long_gross = (long_log * long_caps).sum() / long_caps.sum()
    val_long_net = val_long_gross - long_cost
    short_weighted = (short_log * short_caps).sum() / short_caps.sum()
    val_short_gross = -short_weighted
    val_short_net = -(short_weighted + short_cost)

    return {
        'equal_long_log_return_with_cost': eq_long_net,
        'equal_short_log_return_with_cost': eq_short_net,
        'equal_long_short_log_return_with_cost': eq_long_net + eq_short_net,
        'equal_long_log_return_without_cost': eq_long_gross,
        'equal_short_log_return_without_cost': eq_short_gross,
        'equal_long_short_log_return_without_cost': eq_long_gross + eq_short_gross,
        'value_long_log_return_with_cost': val_long_net,
        'value_short_log_return_with_cost': val_short_net,
        'value_long_short_log_return_with_cost': val_long_net + val_short_net,
        'value_long_log_return_without_cost': val_long_gross,
        'value_short_log_return_without_cost': val_short_gross,
        'value_long_short_log_return_without_cost': val_long_gross + val_short_gross
    }

# Compute cumulative returns for each date with daily rebalancing.
# Every list below receives one running total per date, in iteration order.
cumulative_log_returns_by_date_glm512 = {
    'date': [],
    'cum_EL_return_512_with_cost': [],
    'cum_ES_return_512_with_cost': [],
    'cum_ELS_return_512_with_cost': [],
    'cum_VL_return_512_with_cost': [],
    'cum_VS_return_512_with_cost': [],
    'cum_VLS_return_512_with_cost': [],
    'cum_EL_return_512_without_cost': [],
    'cum_ES_return_512_without_cost': [],
    'cum_ELS_return_512_without_cost': [],
    'cum_VL_return_512_without_cost': [],
    'cum_VS_return_512_without_cost': [],
    'cum_VLS_return_512_without_cost': []
}

# Initialize cumulative returns for lag 512
# (E/V = equal-/value-weighted, L/S/LS = long / short / long-short)
cum_EL_return_512_with_cost = 0
cum_ES_return_512_with_cost = 0
cum_ELS_return_512_with_cost = 0
cum_VL_return_512_with_cost = 0
cum_VS_return_512_with_cost = 0
cum_VLS_return_512_with_cost = 0

cum_EL_return_512_without_cost = 0
cum_ES_return_512_without_cost = 0
cum_ELS_return_512_without_cost = 0
cum_VL_return_512_without_cost = 0
cum_VS_return_512_without_cost = 0
cum_VLS_return_512_without_cost = 0

# Iterate over each date to compute returns for lag 512 portfolios.
# groupby(sort=False) yields the dates in order of first appearance (the same
# order Series.unique() gives) but splits the frame in a single pass instead
# of re-filtering the whole DataFrame with a boolean mask for every date.
# Rows whose date is missing are skipped by groupby.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 512
    returns = compute_returns(group)

    # Update cumulative returns with daily values for lag 512
    # (log returns are additive, so the running sum is the cumulative log return)
    cum_EL_return_512_with_cost += returns['equal_long_log_return_with_cost']
    cum_ES_return_512_with_cost += returns['equal_short_log_return_with_cost']
    cum_ELS_return_512_with_cost += returns['equal_long_short_log_return_with_cost']
    cum_VL_return_512_with_cost += returns['value_long_log_return_with_cost']
    cum_VS_return_512_with_cost += returns['value_short_log_return_with_cost']
    cum_VLS_return_512_with_cost += returns['value_long_short_log_return_with_cost']

    cum_EL_return_512_without_cost += returns['equal_long_log_return_without_cost']
    cum_ES_return_512_without_cost += returns['equal_short_log_return_without_cost']
    cum_ELS_return_512_without_cost += returns['equal_long_short_log_return_without_cost']
    cum_VL_return_512_without_cost += returns['value_long_log_return_without_cost']
    cum_VS_return_512_without_cost += returns['value_short_log_return_without_cost']
    cum_VLS_return_512_without_cost += returns['value_long_short_log_return_without_cost']

    # Append results for the day for lag 512 portfolios
    cumulative_log_returns_by_date_glm512['date'].append(date)
    cumulative_log_returns_by_date_glm512['cum_EL_return_512_with_cost'].append(cum_EL_return_512_with_cost)
    cumulative_log_returns_by_date_glm512['cum_ES_return_512_with_cost'].append(cum_ES_return_512_with_cost)
    cumulative_log_returns_by_date_glm512['cum_ELS_return_512_with_cost'].append(cum_ELS_return_512_with_cost)
    cumulative_log_returns_by_date_glm512['cum_VL_return_512_with_cost'].append(cum_VL_return_512_with_cost)
    cumulative_log_returns_by_date_glm512['cum_VS_return_512_with_cost'].append(cum_VS_return_512_with_cost)
    cumulative_log_returns_by_date_glm512['cum_VLS_return_512_with_cost'].append(cum_VLS_return_512_with_cost)
    cumulative_log_returns_by_date_glm512['cum_EL_return_512_without_cost'].append(cum_EL_return_512_without_cost)
    cumulative_log_returns_by_date_glm512['cum_ES_return_512_without_cost'].append(cum_ES_return_512_without_cost)
    cumulative_log_returns_by_date_glm512['cum_ELS_return_512_without_cost'].append(cum_ELS_return_512_without_cost)
    cumulative_log_returns_by_date_glm512['cum_VL_return_512_without_cost'].append(cum_VL_return_512_without_cost)
    cumulative_log_returns_by_date_glm512['cum_VS_return_512_without_cost'].append(cum_VS_return_512_without_cost)
    cumulative_log_returns_by_date_glm512['cum_VLS_return_512_without_cost'].append(cum_VLS_return_512_without_cost)

# Convert to DataFrame for lag 512
cumulative_log_returns_glm_lag_512 = pd.DataFrame(cumulative_log_returns_by_date_glm512)

# Display the cumulative returns DataFrame for lag 512
display(cumulative_log_returns_glm_lag_512.head())

# Saving the DataFrame as a CSV file
cumulative_log_returns_glm_lag_512.to_csv("cumulative_log_returns_glm_lag_512.csv", index=False)

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period returns into a single total return.

    Evaluates ``prod(1 + r) - 1`` over ``daily_returns``, which must support
    ``1 + daily_returns`` arithmetic (e.g. a NumPy array).
    """
    growth_factor = np.prod(1 + daily_returns)
    return growth_factor - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of ``daily_returns``.

    The total compounded growth ``prod(1 + r)`` is raised to the power
    ``periods / len(daily_returns)`` and 1 is subtracted. ``periods`` is the
    number of return observations per year (252 by default).
    """
    total_return = np.prod(1 + daily_returns) - 1
    annualizing_exponent = periods / len(daily_returns)
    return (1 + total_return) ** annualizing_exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of ``daily_returns``.

    ``risk_free_rate`` is an annual rate; it is divided by ``periods`` to get
    the per-period rate subtracted from every return. The ratio of the mean to
    the standard deviation of those excess returns (both taken with the
    input's own ``.mean()`` / ``.std()`` methods) is scaled by
    ``sqrt(periods)``.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    excess = daily_returns - per_period_rf
    mean_excess = excess.mean()
    excess_dispersion = excess.std()
    return np.sqrt(periods) * mean_excess / excess_dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Return the annualized volatility of ``daily_returns``.

    This is ``np.std(daily_returns)`` scaled by ``sqrt(periods)``, where
    ``periods`` is the number of observations per year.
    """
    per_period_sigma = np.std(daily_returns)
    annualizing_scale = np.sqrt(periods)
    return per_period_sigma * annualizing_scale

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough decline of the compounded equity curve.

    Parameters
    ----------
    daily_returns : array-like of float
        Per-period simple returns.

    Returns
    -------
    float
        The most negative value of ``(equity - running_peak) / running_peak``
        (``0.0`` when the curve never falls below a previous peak, including
        for empty input).

    The equity curve starts at 1.0 *before* the first return, so a loss
    suffered from the very first period counts as a drawdown from the initial
    capital. Without that starting point a series such as ``[-0.5, 0.0]``
    would report no drawdown at all.
    """
    returns = np.asarray(daily_returns, dtype=float)
    # Prepend the initial capital so it is a candidate peak.
    equity = np.concatenate(([1.0], np.cumprod(1 + returns)))
    running_peak = np.maximum.accumulate(equity)
    drawdown = (equity - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return ``daily_returns`` with a constant ``transaction_cost`` subtracted from each value."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by ``1 - transaction_cost_percentage``.

    The scaling is applied to positive and negative returns alike, so the
    magnitude of losses shrinks by the same factor as the magnitude of gains.
    """
    retained_fraction = 1 - transaction_cost_percentage
    return daily_returns * retained_fraction

# Prepare portfolio names (with transaction cost)
# These are the columns of `cumulative_log_returns_glm_lag_512` whose daily
# increments were already reduced by the per-leg transaction cost inside
# compute_returns.
portfolios_with_cost = [
    'cum_EL_return_512_with_cost', 'cum_ES_return_512_with_cost', 'cum_ELS_return_512_with_cost',
    'cum_VL_return_512_with_cost', 'cum_VS_return_512_with_cost', 'cum_VLS_return_512_with_cost',
]

# Initialize metrics container (one list per output column, filled in portfolio order)
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_glm512_c = cumulative_log_returns_glm_lag_512[portfolio].values

    # Calculate daily returns from cumulative returns by differencing the running sum.
    # NOTE(review): np.diff yields n-1 values, so the first date's return is not included.
    # NOTE(review): the differences are log returns, yet the helpers below compound
    # them as simple returns via (1 + r) -- confirm this approximation is intended.
    daily_returns = np.diff(cumulative_returns_glm512_c)

    # Apply fixed transaction cost for other metrics
    # NOTE(review): the "_with_cost" columns are already net of the cost deducted in
    # compute_returns, so this subtracts a further 10 bps per day -- looks like double
    # counting; confirm before relying on these figures.
    daily_returns_after_cost_fixed = apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001)

    # Apply percentage-based transaction cost for volatility and standard deviation
    # NOTE(review): risk metrics therefore use a different return series than the
    # Sharpe ratio and drawdown above.
    daily_returns_after_cost_percentage = apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001)

    # Calculate cumulative returns after fixed transaction cost
    cum_return_after_cost = cumulative_return(daily_returns_after_cost_fixed)

    # Metrics calculations
    ann_return = annualized_return(daily_returns_after_cost_fixed)
    sharpe = sharpe_ratio(daily_returns_after_cost_fixed)
    vol = calculate_volatility(daily_returns_after_cost_percentage)
    max_draw = maximum_drawdown(daily_returns_after_cost_fixed)

    # Standard Deviation of daily returns after cost (using percentage-based cost for standard deviation)
    # This is the un-annualized counterpart of `vol`.
    std_dev = np.std(daily_returns_after_cost_percentage)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis (one row per portfolio)
metrics_glm512_c = pd.DataFrame(metrics)
display(metrics_glm512_c)


# Now, the same calculations for portfolios without transaction cost
# (columns of `cumulative_log_returns_glm_lag_512` accumulated from gross returns)
portfolios_without_cost = [
    'cum_EL_return_512_without_cost', 'cum_ES_return_512_without_cost', 'cum_ELS_return_512_without_cost',
    'cum_VL_return_512_without_cost', 'cum_VS_return_512_without_cost', 'cum_VLS_return_512_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_glm512_wc = cumulative_log_returns_glm_lag_512[portfolio].values

    # Calculate daily returns from cumulative returns
    # NOTE(review): np.diff yields n-1 values (the first date's return is dropped), and
    # the result is a log return that the helpers below treat as a simple return.
    daily_returns = np.diff(cumulative_returns_glm512_wc)  # Compute daily returns from cumulative log returns

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations (all on the same gross return series)
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost (un-annualized counterpart of `vol`)
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis (one row per portfolio)
metrics_glm512_wc = pd.DataFrame(metrics_wc)
display(metrics_glm512_wc)

## RF

In [None]:
# lag 5
# Run `rolling_forecast_model` with model_type='randomforest' on the 5-day lag
# and show the returned result object.
rf_lag5 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='randomforest', lag=[5])
print("Random Forest Results (Lag 5):")
display(rf_lag5)

In [None]:
# lag 21
# Run `rolling_forecast_model` with model_type='randomforest' on the 21-day lag
# and show the returned result object.
rf_lag21 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='randomforest', lag=[21])
print("Random Forest Results (Lag 21):")
display(rf_lag21)

In [None]:
# lag 252
# Run `rolling_forecast_model` with model_type='randomforest' on the 252-day lag
# and show the returned result object.
rf_lag252 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='randomforest', lag=[252])
print("Random Forest Results (Lag 252):")
display(rf_lag252)

In [None]:
# lag 252
# NOTE(review): this cell is an exact repeat of the preceding lag-252 cell; running it
# refits the model and overwrites `rf_lag252`. Consider removing one of the two.
rf_lag252 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='randomforest', lag=[252])
print("Random Forest Results (Lag 252):")
display(rf_lag252)

In [None]:
# lag 512
# Run `rolling_forecast_model` with model_type='randomforest' on the 512-day lag
# and show the returned result object.
rf_lag512 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='randomforest', lag=[512])
print("Random Forest Results (Lag 512):")
display(rf_lag512)

In [None]:
# lag 512
# NOTE(review): this cell is an exact repeat of the preceding lag-512 cell; running it
# refits the model and overwrites `rf_lag512`. Consider removing one of the two.
rf_lag512 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='randomforest', lag=[512])
print("Random Forest Results (Lag 512):")
display(rf_lag512)

### OOS R2

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

# Utility: Adjusted R²
def adjusted_r2_score(y_true, y_pred, n, p):
    """Return R² adjusted with the formula ``1 - (1 - R²) * (n - 1) / (n - p - 1)``.

    ``R²`` is ``r2_score(y_true, y_pred)``. In the standard adjusted-R²
    formula ``n`` is the number of observations and ``p`` the number of
    predictors; ``n - p - 1`` must be non-zero.
    """
    plain_r2 = r2_score(y_true, y_pred)
    unexplained_share = 1 - plain_r2
    return 1 - unexplained_share * (n - 1) / (n - p - 1)

# Utility: Calculate OOS R² for Random Forest
def calculate_oos_r2_rf(model, X_test, y_test):
    """Score ``model`` on held-out data: ``r2_score`` of ``y_test`` against ``model.predict(X_test)``."""
    return r2_score(y_test, model.predict(X_test))

# Tuning Random Forest model
def random_forest_model_with_tuning(X_train, y_train, random_state=42):
    """Tune a ``RandomForestRegressor`` with a randomized search and return the best fit.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``RandomizedSearchCV.fit``.
    random_state : int, optional
        Seed used for both the hyper-parameter sampling and the forest itself.
        Seeding only the search (as before) still left the bootstrap samples
        and feature draws of every forest random, so repeated runs produced
        different models and scores.

    Returns
    -------
    The ``best_estimator_`` of the search (refit on the full training data).
    """
    search_space = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # NOTE(review): cv=5 uses plain K-fold splits; if rows are time-ordered,
    # a time-aware splitter may be more appropriate -- confirm.
    search = RandomizedSearchCV(
        RandomForestRegressor(random_state=random_state),
        param_distributions=search_space,
        n_iter=10,
        cv=5,
        scoring='neg_mean_squared_error',
        random_state=random_state,
        n_jobs=-1
    )
    search.fit(X_train, y_train)
    return search.best_estimator_

# Random Forest model with OOS R² loss and tuning
def random_forest_model_with_tuning_loss(train_df, test_df, lag=5):
    """Fit a tuned Random Forest on one lag feature and report its out-of-sample R².

    The single feature is the column ``lag_{lag}`` and the target is
    ``directional_target``. Rows missing either value are dropped from both
    frames, the feature is passed through a mean imputer fitted on the
    training rows, the model is tuned on the training rows and scored on the
    test rows.

    Returns a one-row DataFrame with columns ``Lag`` and ``OOS R²``.
    """
    target_col = 'directional_target'
    feature_col = f'lag_{lag}'
    required_cols = [feature_col, target_col]

    train_rows = train_df.dropna(subset=required_cols)
    test_rows = test_df.dropna(subset=required_cols)

    # Impute missing values (fit on train only, then applied to test)
    mean_imputer = SimpleImputer(strategy='mean')
    train_matrix = mean_imputer.fit_transform(train_rows[[feature_col]])
    test_matrix = mean_imputer.transform(test_rows[[feature_col]])

    # Tune and train Random Forest, then score it out of sample
    fitted_model = random_forest_model_with_tuning(train_matrix, train_rows[target_col])
    score = calculate_oos_r2_rf(fitted_model, test_matrix, test_rows[target_col])

    return pd.DataFrame([{'Lag': lag, 'OOS R²': score}])

# Main function to calculate OOS R² for Random Forest with market cap segmentation
def calculate_oos_r2_market_cap_rf(train_df, test_df, valid_lags, segment_df=None):
    """Compare Random Forest OOS R² across market-cap segments for several lags.

    For every lag three models are fitted and scored on ``test_df``: one
    trained on ``train_df``, one on the top market-cap quartile rows and one
    on the bottom market-cap quartile rows of ``segment_df``.

    Args:
        train_df: Training frame for the "All Stocks" model.
        test_df: Test frame every model is scored on.
        valid_lags: Iterable of lags; each needs a ``lag_<lag>`` column.
        segment_df: Frame with a ``market_cap`` column (plus the lag and
            ``directional_target`` columns) that defines the quartile cut-offs
            and supplies the quartile training rows. Defaults to the
            notebook-level ``merged_df``, which is what this function used
            before the parameter existed.

    Returns:
        DataFrame with columns ``Lag``, ``All Stocks R²``, ``Top 25% R²`` and
        ``Bottom 25% R²``, one row per lag.
    """
    if segment_df is None:
        # NOTE(review): merged_df is defined outside this cell. If it also
        # covers the test period, the quartile models are trained on
        # test-period rows; pass a training-only frame to avoid that — confirm.
        segment_df = merged_df

    # Quartile cut-offs are taken over all rows of the segmentation frame.
    upper_cut = segment_df['market_cap'].quantile(0.75)
    lower_cut = segment_df['market_cap'].quantile(0.25)

    training_sets = {
        'All Stocks R²': train_df,
        'Top 25% R²': segment_df[segment_df['market_cap'] >= upper_cut],
        'Bottom 25% R²': segment_df[segment_df['market_cap'] <= lower_cut],
    }

    lags = list(valid_lags)
    # Plain lists instead of re-concatenating a DataFrame on every iteration.
    scores = {label: [] for label in training_sets}
    for lag in lags:
        for label, frame in training_sets.items():
            fitted = random_forest_model_with_tuning_loss(frame, test_df, lag)
            scores[label].append(fitted['OOS R²'].iloc[0])

    return pd.DataFrame({'Lag': lags, **scores})

# Example: Define lags
valid_lags = [5, 21, 252, 512]

# Calculate OOS R² for Random Forest with all stocks, top 25%, and bottom 25%
rf_oos_r2_results = calculate_oos_r2_market_cap_rf(crsp_train_lagged, crsp_test_lagged, valid_lags)

# Display the results in the required format
display(rf_oos_r2_results)


## RF Portfolio

### Lag 5

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost of 10 bps; ``row`` is not inspected."""
    flat_cost = 0.001  # 10 bps for both small and large cap stocks
    return flat_cost

crsp_test_lagged['transaction_cost'] = crsp_test_lagged.apply(calculate_transaction_cost, axis=1)

# Use the previously defined `RF` to predict excess returns
def rf_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Attach lag-5 Random Forest predictions to the test frame.

    Args:
        crsp_train_lagged: Training frame passed through to ``random_forest_model``.
        crsp_test_lagged: Test frame; modified in place and also returned.
        lags: Lags handed to ``random_forest_model``. Defaults to ``[5]``.

    Returns:
        ``crsp_test_lagged`` with a new ``rf_5_predicted_excess_returns`` column.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if lags is None:
        lags = [5]

    # Called for its side effect only: the next statement relies on it having
    # written `predicted_excess_returns` into crsp_test_lagged (the model is
    # defined in an earlier cell — confirm it mutates the frame it is given).
    random_forest_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Snapshot the predictions under a lag-specific name so later model runs
    # that overwrite `predicted_excess_returns` do not clobber them.
    crsp_test_lagged['rf_5_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Get predicted excess returns using RF model
crsp_test_lagged = rf_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def _leg_log_returns(leg):
    """Return (equal-weighted, value-weighted, mean cost) figures for one leg."""
    log_ret = np.log1p(leg['adjusted_ret'])
    caps = leg['market_cap_merged']
    equal_weighted = log_ret.mean()
    value_weighted = (log_ret * caps).sum() / caps.sum()
    return equal_weighted, value_weighted, leg['transaction_cost'].mean()


def compute_returns(group, signal_col='rf_5_predicted_excess_returns', fraction=0.1):
    """Compute one day's long, short and long-short portfolio log returns.

    The long leg holds the ``fraction`` of rows with the highest signal and
    the short leg the ``fraction`` with the lowest. Each leg is reported
    equal-weighted and value-weighted (by ``market_cap_merged``), with and
    without the mean ``transaction_cost`` of the leg deducted.

    Args:
        group: Rows for a single date. Needs ``signal_col``, ``adjusted_ret``,
            ``market_cap_merged`` and ``transaction_cost`` columns.
        signal_col: Column used to rank the rows.
        fraction: Share of rows placed in each leg; the count is truncated
            with ``int``, so groups with fewer than ``1 / fraction`` rows give
            empty legs and NaN results.

    Returns:
        Dict with 12 entries keyed
        ``{equal,value}_{long,short,long_short}_log_return_{with,without}_cost``.
    """
    leg_size = int(fraction * len(group))
    long_leg = group.nlargest(leg_size, signal_col)
    short_leg = group.nsmallest(leg_size, signal_col)

    eq_long, vw_long, cost_long = _leg_log_returns(long_leg)
    eq_short, vw_short, cost_short = _leg_log_returns(short_leg)

    # Short legs earn the negative of the underlying return; costs always
    # reduce the result regardless of direction.
    equal_long_with = eq_long - cost_long
    equal_short_with = -eq_short - cost_short
    equal_short_without = -eq_short
    value_long_with = vw_long - cost_long
    value_short_with = -(vw_short + cost_short)
    value_short_without = -vw_short

    return {
        'equal_long_log_return_with_cost': equal_long_with,
        'equal_short_log_return_with_cost': equal_short_with,
        'equal_long_short_log_return_with_cost': equal_long_with + equal_short_with,
        'equal_long_log_return_without_cost': eq_long,
        'equal_short_log_return_without_cost': equal_short_without,
        'equal_long_short_log_return_without_cost': eq_long + equal_short_without,
        'value_long_log_return_with_cost': value_long_with,
        'value_short_log_return_with_cost': value_short_with,
        'value_long_short_log_return_with_cost': value_long_with + value_short_with,
        'value_long_log_return_without_cost': vw_long,
        'value_short_log_return_without_cost': value_short_without,
        'value_long_short_log_return_without_cost': vw_long + value_short_without,
    }

# Compute cumulative returns for each date with daily rebalancing
# Cumulative column -> key of the daily figure returned by compute_returns.
# Insertion order fixes the column order of the resulting DataFrame.
rf5_daily_keys = {
    'cum_EL_return_5_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_5_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_5_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_5_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_5_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_5_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_5_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_5_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_5_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_5_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_5_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_5_without_cost': 'value_long_short_log_return_without_cost',
}

cumulative_log_returns_by_date_rf5 = {'date': []}
cumulative_log_returns_by_date_rf5.update({column: [] for column in rf5_daily_keys})

# Running sums of daily log returns, one per portfolio
rf5_running_totals = dict.fromkeys(rf5_daily_keys, 0)

# One pass over the frame instead of re-filtering it for every date.
# sort=False keeps dates in order of first appearance, as before; rows with a
# missing date are skipped rather than yielding an empty group whose NaN
# returns would contaminate every later running total.
# NOTE(review): assumes crsp_test_lagged is already in chronological order — confirm.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 5
    returns = compute_returns(group)

    cumulative_log_returns_by_date_rf5['date'].append(date)
    for column, daily_key in rf5_daily_keys.items():
        rf5_running_totals[column] += returns[daily_key]
        cumulative_log_returns_by_date_rf5[column].append(rf5_running_totals[column])

# Final totals, kept under their original names for any cell that reads them
(
    cum_EL_return_5_with_cost, cum_ES_return_5_with_cost, cum_ELS_return_5_with_cost,
    cum_VL_return_5_with_cost, cum_VS_return_5_with_cost, cum_VLS_return_5_with_cost,
    cum_EL_return_5_without_cost, cum_ES_return_5_without_cost, cum_ELS_return_5_without_cost,
    cum_VL_return_5_without_cost, cum_VS_return_5_without_cost, cum_VLS_return_5_without_cost,
) = rf5_running_totals.values()

# Convert to DataFrame for lag 5
cumulative_log_returns_rf_lag_5 = pd.DataFrame(cumulative_log_returns_by_date_rf5)

# Display the cumulative returns DataFrame for lag 5
display(cumulative_log_returns_rf_lag_5.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period returns ``r`` as ``prod(1 + r) - 1``."""
    growth_factors = 1 + daily_returns
    return np.prod(growth_factors) - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of a per-period return series.

    Args:
        daily_returns: Per-period returns; must be non-empty.
        periods: Number of periods in a year.

    Returns:
        ``(1 + total) ** (periods / len(daily_returns)) - 1`` where ``total``
        is the compounded return ``prod(1 + r) - 1``.
    """
    total_return = np.prod(1 + daily_returns) - 1
    annualising_exponent = periods / len(daily_returns)
    return (1 + total_return) ** annualising_exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of a per-period return series.

    Args:
        daily_returns: Per-period returns; the object's own ``mean`` and
            ``std`` methods are used.
        risk_free_rate: Annual risk-free rate, spread evenly over ``periods``.
        periods: Number of periods in a year.

    Returns:
        ``sqrt(periods) * mean(excess) / std(excess)``.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    excess = daily_returns - per_period_rf
    mean_excess = excess.mean()
    dispersion = excess.std()
    return np.sqrt(periods) * mean_excess / dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Return ``np.std`` of the per-period returns scaled by ``sqrt(periods)``."""
    per_period_std = np.std(daily_returns)
    return per_period_std * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded wealth path.

    Wealth starts at 1.0 and compounds as ``cumprod(1 + r)``. The running peak
    includes that starting capital, so a loss on the very first period counts
    as a drawdown instead of silently becoming the first "peak".

    Args:
        daily_returns: Non-empty array of simple per-period returns.

    Returns:
        The most negative value of ``(wealth - peak) / peak`` (0.0 if wealth
        never falls below a previous peak).
    """
    wealth = np.cumprod(1 + daily_returns)
    # Floor the running maximum at the initial capital of 1.0.
    running_peak = np.maximum(np.maximum.accumulate(wealth), 1.0)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return the returns with a constant ``transaction_cost`` deducted from each."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale each return by ``1 - transaction_cost_percentage``."""
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_5_with_cost', 'cum_ES_return_5_with_cost', 'cum_ELS_return_5_with_cost',
    'cum_VL_return_5_with_cost', 'cum_VS_return_5_with_cost', 'cum_VLS_return_5_with_cost',
]

# Same portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_5_without_cost', 'cum_ES_return_5_without_cost', 'cum_ELS_return_5_without_cost',
    'cum_VL_return_5_without_cost', 'cum_VS_return_5_without_cost', 'cum_VLS_return_5_without_cost',
]


def _portfolio_metrics(cumulative_frame, columns):
    """Build the performance table (dict of lists) for cumulative log-return columns."""
    summary = {
        'Portfolio': [],
        'Annualized Return': [],
        'Sharpe Ratio': [],
        'Volatility': [],
        'Standard Deviation': [],
        'Max Drawdown': [],
        'Cumulative Return': []
    }

    for column in columns:
        cumulative_log = cumulative_frame[column].values

        # The first cumulative value IS the first day's return; prepend=0.0
        # keeps it, whereas a plain diff would silently drop that day.
        daily_log = np.diff(cumulative_log, prepend=0.0)

        # The metric helpers compound with (1 + r), i.e. they expect simple
        # returns, so convert the log returns first.
        daily_simple = np.expm1(daily_log)

        summary['Portfolio'].append(column)
        summary['Annualized Return'].append(annualized_return(daily_simple))
        summary['Sharpe Ratio'].append(sharpe_ratio(daily_simple))
        summary['Volatility'].append(calculate_volatility(daily_simple))
        summary['Standard Deviation'].append(np.std(daily_simple))
        summary['Max Drawdown'].append(maximum_drawdown(daily_simple))
        summary['Cumulative Return'].append(cumulative_return(daily_simple))

    return summary


# Metrics for portfolios with transaction cost. The *_with_cost series already
# have the 10 bps cost deducted when the cumulative series were built, so no
# further cost is applied here (doing so would charge it twice).
metrics = _portfolio_metrics(cumulative_log_returns_rf_lag_5, portfolios_with_cost)
metrics_rf5_c = pd.DataFrame(metrics)
display(metrics_rf5_c)

# Metrics for portfolios without transaction cost
metrics_wc = _portfolio_metrics(cumulative_log_returns_rf_lag_5, portfolios_without_cost)
metrics_rf5_wc = pd.DataFrame(metrics_wc)
display(metrics_rf5_wc)

### Lag 21

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost of 10 bps; ``row`` is not inspected."""
    flat_cost = 0.001  # 10 bps for both small and large cap stocks
    return flat_cost

crsp_test_lagged['transaction_cost'] = crsp_test_lagged.apply(calculate_transaction_cost, axis=1)

# Use the previously defined `Random Forest` to predict excess returns
def rf_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Attach lag-21 Random Forest predictions to the test frame.

    Args:
        crsp_train_lagged: Training frame passed through to ``random_forest_model``.
        crsp_test_lagged: Test frame; modified in place and also returned.
        lags: Lags handed to ``random_forest_model``. Defaults to ``[21]``.

    Returns:
        ``crsp_test_lagged`` with a new ``rf_21_predicted_excess_returns`` column.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if lags is None:
        lags = [21]

    # Called for its side effect only: the next statement relies on it having
    # written `predicted_excess_returns` into crsp_test_lagged (the model is
    # defined in an earlier cell — confirm it mutates the frame it is given).
    random_forest_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Snapshot the predictions under a lag-specific name so later model runs
    # that overwrite `predicted_excess_returns` do not clobber them.
    crsp_test_lagged['rf_21_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Get predicted excess returns using RF model
crsp_test_lagged = rf_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def _leg_log_returns(leg):
    """Return (equal-weighted, value-weighted, mean cost) figures for one leg."""
    log_ret = np.log1p(leg['adjusted_ret'])
    caps = leg['market_cap_merged']
    equal_weighted = log_ret.mean()
    value_weighted = (log_ret * caps).sum() / caps.sum()
    return equal_weighted, value_weighted, leg['transaction_cost'].mean()


def compute_returns(group, signal_col='rf_21_predicted_excess_returns', fraction=0.1):
    """Compute one day's long, short and long-short portfolio log returns.

    The long leg holds the ``fraction`` of rows with the highest signal and
    the short leg the ``fraction`` with the lowest. Each leg is reported
    equal-weighted and value-weighted (by ``market_cap_merged``), with and
    without the mean ``transaction_cost`` of the leg deducted.

    Args:
        group: Rows for a single date. Needs ``signal_col``, ``adjusted_ret``,
            ``market_cap_merged`` and ``transaction_cost`` columns.
        signal_col: Column used to rank the rows.
        fraction: Share of rows placed in each leg; the count is truncated
            with ``int``, so groups with fewer than ``1 / fraction`` rows give
            empty legs and NaN results.

    Returns:
        Dict with 12 entries keyed
        ``{equal,value}_{long,short,long_short}_log_return_{with,without}_cost``.
    """
    leg_size = int(fraction * len(group))
    long_leg = group.nlargest(leg_size, signal_col)
    short_leg = group.nsmallest(leg_size, signal_col)

    eq_long, vw_long, cost_long = _leg_log_returns(long_leg)
    eq_short, vw_short, cost_short = _leg_log_returns(short_leg)

    # Short legs earn the negative of the underlying return; costs always
    # reduce the result regardless of direction.
    equal_long_with = eq_long - cost_long
    equal_short_with = -eq_short - cost_short
    equal_short_without = -eq_short
    value_long_with = vw_long - cost_long
    value_short_with = -(vw_short + cost_short)
    value_short_without = -vw_short

    return {
        'equal_long_log_return_with_cost': equal_long_with,
        'equal_short_log_return_with_cost': equal_short_with,
        'equal_long_short_log_return_with_cost': equal_long_with + equal_short_with,
        'equal_long_log_return_without_cost': eq_long,
        'equal_short_log_return_without_cost': equal_short_without,
        'equal_long_short_log_return_without_cost': eq_long + equal_short_without,
        'value_long_log_return_with_cost': value_long_with,
        'value_short_log_return_with_cost': value_short_with,
        'value_long_short_log_return_with_cost': value_long_with + value_short_with,
        'value_long_log_return_without_cost': vw_long,
        'value_short_log_return_without_cost': value_short_without,
        'value_long_short_log_return_without_cost': vw_long + value_short_without,
    }

# Compute cumulative returns for each date with daily rebalancing
# Cumulative column -> key of the daily figure returned by compute_returns.
# Insertion order fixes the column order of the resulting DataFrame.
rf21_daily_keys = {
    'cum_EL_return_21_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_21_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_21_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_21_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_21_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_21_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_21_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_21_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_21_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_21_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_21_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_21_without_cost': 'value_long_short_log_return_without_cost',
}

cumulative_log_returns_by_date_rf21 = {'date': []}
cumulative_log_returns_by_date_rf21.update({column: [] for column in rf21_daily_keys})

# Running sums of daily log returns, one per portfolio
rf21_running_totals = dict.fromkeys(rf21_daily_keys, 0)

# One pass over the frame instead of re-filtering it for every date.
# sort=False keeps dates in order of first appearance, as before; rows with a
# missing date are skipped rather than yielding an empty group whose NaN
# returns would contaminate every later running total.
# NOTE(review): assumes crsp_test_lagged is already in chronological order — confirm.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 21
    returns = compute_returns(group)

    cumulative_log_returns_by_date_rf21['date'].append(date)
    for column, daily_key in rf21_daily_keys.items():
        rf21_running_totals[column] += returns[daily_key]
        cumulative_log_returns_by_date_rf21[column].append(rf21_running_totals[column])

# Final totals, kept under their original names for any cell that reads them
(
    cum_EL_return_21_with_cost, cum_ES_return_21_with_cost, cum_ELS_return_21_with_cost,
    cum_VL_return_21_with_cost, cum_VS_return_21_with_cost, cum_VLS_return_21_with_cost,
    cum_EL_return_21_without_cost, cum_ES_return_21_without_cost, cum_ELS_return_21_without_cost,
    cum_VL_return_21_without_cost, cum_VS_return_21_without_cost, cum_VLS_return_21_without_cost,
) = rf21_running_totals.values()

# Convert to DataFrame for lag 21
cumulative_log_returns_rf_lag_21 = pd.DataFrame(cumulative_log_returns_by_date_rf21)

# Display the cumulative returns DataFrame for lag 21
display(cumulative_log_returns_rf_lag_21.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period returns ``r`` as ``prod(1 + r) - 1``."""
    growth_factors = 1 + daily_returns
    return np.prod(growth_factors) - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of a per-period return series.

    Args:
        daily_returns: Per-period returns; must be non-empty.
        periods: Number of periods in a year.

    Returns:
        ``(1 + total) ** (periods / len(daily_returns)) - 1`` where ``total``
        is the compounded return ``prod(1 + r) - 1``.
    """
    total_return = np.prod(1 + daily_returns) - 1
    annualising_exponent = periods / len(daily_returns)
    return (1 + total_return) ** annualising_exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of a per-period return series.

    Args:
        daily_returns: Per-period returns; the object's own ``mean`` and
            ``std`` methods are used.
        risk_free_rate: Annual risk-free rate, spread evenly over ``periods``.
        periods: Number of periods in a year.

    Returns:
        ``sqrt(periods) * mean(excess) / std(excess)``.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    excess = daily_returns - per_period_rf
    mean_excess = excess.mean()
    dispersion = excess.std()
    return np.sqrt(periods) * mean_excess / dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Return ``np.std`` of the per-period returns scaled by ``sqrt(periods)``."""
    per_period_std = np.std(daily_returns)
    return per_period_std * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded wealth path.

    Wealth starts at 1.0 and compounds as ``cumprod(1 + r)``. The running peak
    includes that starting capital, so a loss on the very first period counts
    as a drawdown instead of silently becoming the first "peak".

    Args:
        daily_returns: Non-empty array of simple per-period returns.

    Returns:
        The most negative value of ``(wealth - peak) / peak`` (0.0 if wealth
        never falls below a previous peak).
    """
    wealth = np.cumprod(1 + daily_returns)
    # Floor the running maximum at the initial capital of 1.0.
    running_peak = np.maximum(np.maximum.accumulate(wealth), 1.0)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return the returns with a constant ``transaction_cost`` deducted from each."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale each return by ``1 - transaction_cost_percentage``."""
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_21_with_cost', 'cum_ES_return_21_with_cost', 'cum_ELS_return_21_with_cost',
    'cum_VL_return_21_with_cost', 'cum_VS_return_21_with_cost', 'cum_VLS_return_21_with_cost',
]

# Same portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_21_without_cost', 'cum_ES_return_21_without_cost', 'cum_ELS_return_21_without_cost',
    'cum_VL_return_21_without_cost', 'cum_VS_return_21_without_cost', 'cum_VLS_return_21_without_cost',
]


def _portfolio_metrics(cumulative_frame, columns):
    """Build the performance table (dict of lists) for cumulative log-return columns."""
    summary = {
        'Portfolio': [],
        'Annualized Return': [],
        'Sharpe Ratio': [],
        'Volatility': [],
        'Standard Deviation': [],
        'Max Drawdown': [],
        'Cumulative Return': []
    }

    for column in columns:
        cumulative_log = cumulative_frame[column].values

        # The first cumulative value IS the first day's return; prepend=0.0
        # keeps it, whereas a plain diff would silently drop that day.
        daily_log = np.diff(cumulative_log, prepend=0.0)

        # The metric helpers compound with (1 + r), i.e. they expect simple
        # returns, so convert the log returns first.
        daily_simple = np.expm1(daily_log)

        summary['Portfolio'].append(column)
        summary['Annualized Return'].append(annualized_return(daily_simple))
        summary['Sharpe Ratio'].append(sharpe_ratio(daily_simple))
        summary['Volatility'].append(calculate_volatility(daily_simple))
        summary['Standard Deviation'].append(np.std(daily_simple))
        summary['Max Drawdown'].append(maximum_drawdown(daily_simple))
        summary['Cumulative Return'].append(cumulative_return(daily_simple))

    return summary


# Metrics for portfolios with transaction cost. The *_with_cost series already
# have the 10 bps cost deducted when the cumulative series were built, so no
# further cost is applied here (doing so would charge it twice).
metrics = _portfolio_metrics(cumulative_log_returns_rf_lag_21, portfolios_with_cost)
metrics_rf21_c = pd.DataFrame(metrics)
display(metrics_rf21_c)

# Metrics for portfolios without transaction cost
metrics_wc = _portfolio_metrics(cumulative_log_returns_rf_lag_21, portfolios_without_cost)
metrics_rf21_wc = pd.DataFrame(metrics_wc)
display(metrics_rf21_wc)

### Lag 252

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost of 10 bps; ``row`` is not inspected."""
    flat_cost = 0.001  # 10 bps for both small and large cap stocks
    return flat_cost

crsp_test_lagged['transaction_cost'] = crsp_test_lagged.apply(calculate_transaction_cost, axis=1)

# Use the previously defined `rf` to predict excess returns
def rf_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Attach lag-252 Random Forest predictions to the test frame.

    Args:
        crsp_train_lagged: Training frame passed through to ``random_forest_model``.
        crsp_test_lagged: Test frame; modified in place and also returned.
        lags: Lags handed to ``random_forest_model``. Defaults to ``[252]``.

    Returns:
        ``crsp_test_lagged`` with a new ``rf_252_predicted_excess_returns`` column.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if lags is None:
        lags = [252]

    # Called for its side effect only: the next statement relies on it having
    # written `predicted_excess_returns` into crsp_test_lagged (the model is
    # defined in an earlier cell — confirm it mutates the frame it is given).
    random_forest_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Snapshot the predictions under a lag-specific name so later model runs
    # that overwrite `predicted_excess_returns` do not clobber them.
    crsp_test_lagged['rf_252_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Get predicted excess returns using RF model (this should generate binary outcomes)
crsp_test_lagged = rf_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, signal_col='rf_252_predicted_excess_returns', fraction=0.1):
    """Compute one cross-section's long, short and long-short log returns.

    The rows with the highest ``signal_col`` values form the long leg and the
    rows with the lowest values form the short leg. Each leg is evaluated
    equal-weighted and value-weighted (by ``market_cap_merged``), with and
    without the leg's mean ``transaction_cost``.

    Args:
        group: DataFrame for a single date with columns ``signal_col``,
            ``adjusted_ret``, ``market_cap_merged`` and ``transaction_cost``.
        signal_col: Column used to rank stocks. Defaults to the lag-252 RF signal.
        fraction: Share of the rows placed in each leg (default 0.1 = 10%).

    Returns:
        dict with 12 entries keyed
        ``{equal|value}_{long|short|long_short}_log_return_{with|without}_cost``.
        Values are NaN when a leg is empty (fewer than ``1 / fraction`` rows).
    """
    n_select = int(fraction * len(group))
    long_leg = group.nlargest(n_select, signal_col)
    short_leg = group.nsmallest(n_select, signal_col)

    def _leg_returns(leg, sign):
        # One leg's (equal-weighted, value-weighted, mean cost); sign is -1 for shorts.
        log_ret = np.log1p(leg['adjusted_ret'])
        caps = leg['market_cap_merged']
        equal_weighted = sign * log_ret.mean()
        value_weighted = sign * ((log_ret * caps).sum() / caps.sum())
        return equal_weighted, value_weighted, leg['transaction_cost'].mean()

    equal_long, value_long, long_cost = _leg_returns(long_leg, 1.0)
    equal_short, value_short, short_cost = _leg_returns(short_leg, -1.0)

    # Each leg pays its own cost, so the long-short portfolio pays it twice.
    equal_long_net = equal_long - long_cost
    equal_short_net = equal_short - short_cost
    value_long_net = value_long - long_cost
    value_short_net = value_short - short_cost

    return {
        'equal_long_log_return_with_cost': equal_long_net,
        'equal_short_log_return_with_cost': equal_short_net,
        'equal_long_short_log_return_with_cost': equal_long_net + equal_short_net,
        'equal_long_log_return_without_cost': equal_long,
        'equal_short_log_return_without_cost': equal_short,
        'equal_long_short_log_return_without_cost': equal_long + equal_short,
        'value_long_log_return_with_cost': value_long_net,
        'value_short_log_return_with_cost': value_short_net,
        'value_long_short_log_return_with_cost': value_long_net + value_short_net,
        'value_long_log_return_without_cost': value_long,
        'value_short_log_return_without_cost': value_short,
        'value_long_short_log_return_without_cost': value_long + value_short
    }

# Compute cumulative returns for each date with daily rebalancing
# Maps every output column to the key it accumulates from compute_returns().
rf252_column_sources = {
    f'cum_{tag}_return_252_{cost}': f'{leg}_log_return_{cost}'
    for cost in ('with_cost', 'without_cost')
    for tag, leg in (
        ('EL', 'equal_long'), ('ES', 'equal_short'), ('ELS', 'equal_long_short'),
        ('VL', 'value_long'), ('VS', 'value_short'), ('VLS', 'value_long_short'),
    )
}

cumulative_log_returns_by_date_rf252 = {'date': []}
for column in rf252_column_sources:
    cumulative_log_returns_by_date_rf252[column] = []

# Initialize cumulative returns for lag 252
rf252_running_totals = dict.fromkeys(rf252_column_sources, 0)

# One pass over the panel. groupby sorts the dates, so the running sums are
# accumulated chronologically whatever the row order of crsp_test_lagged is,
# and no per-date boolean mask over the whole frame is needed.
for date, group in crsp_test_lagged.groupby('date'):
    # Compute returns for lag 252
    returns = compute_returns(group)

    cumulative_log_returns_by_date_rf252['date'].append(date)
    for column, source_key in rf252_column_sources.items():
        # Log returns are additive, so the cumulative value is a running sum
        rf252_running_totals[column] += returns[source_key]
        cumulative_log_returns_by_date_rf252[column].append(rf252_running_totals[column])

# Keep the final totals under their original module-level names
cum_EL_return_252_with_cost = rf252_running_totals['cum_EL_return_252_with_cost']
cum_ES_return_252_with_cost = rf252_running_totals['cum_ES_return_252_with_cost']
cum_ELS_return_252_with_cost = rf252_running_totals['cum_ELS_return_252_with_cost']
cum_VL_return_252_with_cost = rf252_running_totals['cum_VL_return_252_with_cost']
cum_VS_return_252_with_cost = rf252_running_totals['cum_VS_return_252_with_cost']
cum_VLS_return_252_with_cost = rf252_running_totals['cum_VLS_return_252_with_cost']

cum_EL_return_252_without_cost = rf252_running_totals['cum_EL_return_252_without_cost']
cum_ES_return_252_without_cost = rf252_running_totals['cum_ES_return_252_without_cost']
cum_ELS_return_252_without_cost = rf252_running_totals['cum_ELS_return_252_without_cost']
cum_VL_return_252_without_cost = rf252_running_totals['cum_VL_return_252_without_cost']
cum_VS_return_252_without_cost = rf252_running_totals['cum_VS_return_252_without_cost']
cum_VLS_return_252_without_cost = rf252_running_totals['cum_VLS_return_252_without_cost']

# Convert to DataFrame for lag 252
cumulative_log_returns_rf_lag_252 = pd.DataFrame(cumulative_log_returns_by_date_rf252)

# Display the cumulative returns DataFrame for lag 252
display(cumulative_log_returns_rf_lag_252.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period simple returns into a single total return.

    Args:
        daily_returns: Array-like supporting ``1 + daily_returns``.

    Returns:
        Product of ``(1 + r)`` over all periods, minus 1.
    """
    growth_factors = 1 + daily_returns
    return np.prod(growth_factors) - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded growth of per-period simple returns.

    Args:
        daily_returns: Sized array-like supporting ``1 + daily_returns``.
        periods: Number of return periods in one year (default 252).

    Returns:
        ``prod(1 + r) ** (periods / n) - 1``, or NaN when there are no
        observations (the exponent would otherwise divide by zero).
    """
    n_obs = len(daily_returns)
    if n_obs == 0:
        return np.nan

    # Use the growth factor directly rather than computing prod - 1 and adding 1 back
    total_growth = np.prod(1 + daily_returns)
    return total_growth ** (periods / n_obs) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized Sharpe ratio of per-period returns.

    Args:
        daily_returns: Array exposing ``.mean()`` and ``.std()``.
        risk_free_rate: Annual risk-free rate, spread evenly over ``periods``.
        periods: Number of return periods in one year (default 252).

    Returns:
        ``sqrt(periods) * mean(excess) / std(excess)``.
    """
    per_period_rf = risk_free_rate / periods
    excess = daily_returns - per_period_rf
    mean_excess = excess.mean()
    dispersion = excess.std()
    return np.sqrt(periods) * mean_excess / dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Annualized volatility: per-period standard deviation times sqrt(periods)."""
    per_period_std = np.std(daily_returns)
    annualization_factor = np.sqrt(periods)
    return per_period_std * annualization_factor

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Largest peak-to-trough decline of the compounded wealth path.

    The path starts from an initial wealth of 1.0, which counts as the first
    peak. Without it a loss on the very first day(s) would never register as
    a drawdown.

    Args:
        daily_returns: Array-like of per-period simple returns.

    Returns:
        The most negative drawdown as a fraction of the running peak
        (e.g. -0.5 for a 50% decline); 0.0 if wealth never falls below a
        previous peak or if there are no observations.
    """
    returns = np.asarray(daily_returns, dtype=float)
    # Prepend the starting capital so the first period can be a drawdown
    wealth = np.concatenate(([1.0], np.cumprod(1 + returns)))
    running_peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct the same absolute cost from every per-period return."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every per-period return by ``1 - transaction_cost_percentage``.

    NOTE(review): this shrinks losses as well as gains -- confirm that is the
    intended cost model.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_252_with_cost', 'cum_ES_return_252_with_cost', 'cum_ELS_return_252_with_cost',
    'cum_VL_return_252_with_cost', 'cum_VS_return_252_with_cost', 'cum_VLS_return_252_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_rf252_c = cumulative_log_returns_rf_lag_252[portfolio].values

    # Recover the daily log returns from the running sum. prepend=0.0 keeps
    # the first trading day, which a plain np.diff would drop.
    daily_log_returns = np.diff(cumulative_returns_rf252_c, prepend=0.0)

    # The metric helpers compound with prod(1 + r), which is only valid for
    # simple returns, so convert log -> simple first.
    # No further cost is deducted here: the '*_with_cost' series are already
    # net of the 10 bps charged per leg in compute_returns, so subtracting it
    # again would count the cost twice.
    daily_returns = np.expm1(daily_log_returns)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(annualized_return(daily_returns))
    metrics['Sharpe Ratio'].append(sharpe_ratio(daily_returns))
    metrics['Volatility'].append(calculate_volatility(daily_returns))
    metrics['Standard Deviation'].append(np.std(daily_returns))
    metrics['Max Drawdown'].append(maximum_drawdown(daily_returns))
    metrics['Cumulative Return'].append(cumulative_return(daily_returns))

# Convert the results into a DataFrame for analysis
metrics_rf252_c = pd.DataFrame(metrics)
display(metrics_rf252_c)


# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_252_without_cost', 'cum_ES_return_252_without_cost', 'cum_ELS_return_252_without_cost',
    'cum_VL_return_252_without_cost', 'cum_VS_return_252_without_cost', 'cum_VLS_return_252_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_rf252_wc = cumulative_log_returns_rf_lag_252[portfolio].values

    # Recover the daily log returns from the running sum. prepend=0.0 keeps
    # the first trading day, which a plain np.diff would drop.
    daily_log_returns = np.diff(cumulative_returns_rf252_wc, prepend=0.0)

    # The metric helpers compound with prod(1 + r), which is only valid for
    # simple returns, so convert log -> simple first.
    daily_returns = np.expm1(daily_log_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(annualized_return(daily_returns))
    metrics_wc['Sharpe Ratio'].append(sharpe_ratio(daily_returns))
    metrics_wc['Volatility'].append(calculate_volatility(daily_returns))
    metrics_wc['Standard Deviation'].append(np.std(daily_returns))
    metrics_wc['Max Drawdown'].append(maximum_drawdown(daily_returns))
    metrics_wc['Cumulative Return'].append(cumulative_return(daily_returns))

# Convert the results into a DataFrame for analysis
metrics_rf252_wc = pd.DataFrame(metrics_wc)
display(metrics_rf252_wc)

### Lag 512

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost charged on a position.

    Args:
        row: Ignored. Kept so the function can still be passed to
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        0.001, i.e. 10 basis points, regardless of the stock.
    """
    return 0.001  # 10 bps for both small and large cap stocks

# The cost does not depend on the row, so broadcast the scalar instead of
# running a Python-level apply over every row of the test panel.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `rf` to predict excess returns
def rf_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the random-forest model for lag 512 and store its predictions.

    Args:
        crsp_train_lagged: Training frame passed through to ``random_forest_model``.
        crsp_test_lagged: Test frame; receives the new column
            ``rf_512_predicted_excess_returns`` in place.
        lags: Lags passed to ``random_forest_model``. Defaults to ``[512]``.

    Returns:
        ``crsp_test_lagged`` (the same object) with the extra column.

    Raises:
        KeyError: If ``predicted_excess_returns`` is absent after the model ran.
    """
    if lags is None:  # avoid sharing one mutable default list across calls
        lags = [512]

    # Called for its side effect: the model is expected to write
    # 'predicted_excess_returns' into crsp_test_lagged (assumption taken from
    # the original comment -- TODO confirm against random_forest_model).
    random_forest_model(crsp_train_lagged, crsp_test_lagged, lags)

    # NOTE(review): if the model does not overwrite this column, a value left
    # over from an earlier lag would be copied silently -- verify.
    if 'predicted_excess_returns' not in crsp_test_lagged.columns:
        raise KeyError(
            "'predicted_excess_returns' not found in crsp_test_lagged after running random_forest_model"
        )

    # Copy under a lag-specific name so later model runs do not overwrite it
    crsp_test_lagged['rf_512_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Get predicted excess returns using RF model
crsp_test_lagged = rf_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, signal_col='rf_512_predicted_excess_returns', fraction=0.1):
    """Compute one cross-section's long, short and long-short log returns.

    The rows with the highest ``signal_col`` values form the long leg and the
    rows with the lowest values form the short leg. Each leg is evaluated
    equal-weighted and value-weighted (by ``market_cap_merged``), with and
    without the leg's mean ``transaction_cost``.

    Args:
        group: DataFrame for a single date with columns ``signal_col``,
            ``adjusted_ret``, ``market_cap_merged`` and ``transaction_cost``.
        signal_col: Column used to rank stocks. Defaults to the lag-512 RF signal.
        fraction: Share of the rows placed in each leg (default 0.1 = 10%).

    Returns:
        dict with 12 entries keyed
        ``{equal|value}_{long|short|long_short}_log_return_{with|without}_cost``.
        Values are NaN when a leg is empty (fewer than ``1 / fraction`` rows).
    """
    n_select = int(fraction * len(group))
    long_leg = group.nlargest(n_select, signal_col)
    short_leg = group.nsmallest(n_select, signal_col)

    def _leg_returns(leg, sign):
        # One leg's (equal-weighted, value-weighted, mean cost); sign is -1 for shorts.
        log_ret = np.log1p(leg['adjusted_ret'])
        caps = leg['market_cap_merged']
        equal_weighted = sign * log_ret.mean()
        value_weighted = sign * ((log_ret * caps).sum() / caps.sum())
        return equal_weighted, value_weighted, leg['transaction_cost'].mean()

    equal_long, value_long, long_cost = _leg_returns(long_leg, 1.0)
    equal_short, value_short, short_cost = _leg_returns(short_leg, -1.0)

    # Each leg pays its own cost, so the long-short portfolio pays it twice.
    equal_long_net = equal_long - long_cost
    equal_short_net = equal_short - short_cost
    value_long_net = value_long - long_cost
    value_short_net = value_short - short_cost

    return {
        'equal_long_log_return_with_cost': equal_long_net,
        'equal_short_log_return_with_cost': equal_short_net,
        'equal_long_short_log_return_with_cost': equal_long_net + equal_short_net,
        'equal_long_log_return_without_cost': equal_long,
        'equal_short_log_return_without_cost': equal_short,
        'equal_long_short_log_return_without_cost': equal_long + equal_short,
        'value_long_log_return_with_cost': value_long_net,
        'value_short_log_return_with_cost': value_short_net,
        'value_long_short_log_return_with_cost': value_long_net + value_short_net,
        'value_long_log_return_without_cost': value_long,
        'value_short_log_return_without_cost': value_short,
        'value_long_short_log_return_without_cost': value_long + value_short
    }

# Compute cumulative returns for each date with daily rebalancing
# Maps every output column to the key it accumulates from compute_returns().
rf512_column_sources = {
    f'cum_{tag}_return_512_{cost}': f'{leg}_log_return_{cost}'
    for cost in ('with_cost', 'without_cost')
    for tag, leg in (
        ('EL', 'equal_long'), ('ES', 'equal_short'), ('ELS', 'equal_long_short'),
        ('VL', 'value_long'), ('VS', 'value_short'), ('VLS', 'value_long_short'),
    )
}

cumulative_log_returns_by_date_rf512 = {'date': []}
for column in rf512_column_sources:
    cumulative_log_returns_by_date_rf512[column] = []

# Initialize cumulative returns for lag 512
rf512_running_totals = dict.fromkeys(rf512_column_sources, 0)

# One pass over the panel. groupby sorts the dates, so the running sums are
# accumulated chronologically whatever the row order of crsp_test_lagged is,
# and no per-date boolean mask over the whole frame is needed.
for date, group in crsp_test_lagged.groupby('date'):
    # Compute returns for lag 512
    returns = compute_returns(group)

    cumulative_log_returns_by_date_rf512['date'].append(date)
    for column, source_key in rf512_column_sources.items():
        # Log returns are additive, so the cumulative value is a running sum
        rf512_running_totals[column] += returns[source_key]
        cumulative_log_returns_by_date_rf512[column].append(rf512_running_totals[column])

# Keep the final totals under their original module-level names
cum_EL_return_512_with_cost = rf512_running_totals['cum_EL_return_512_with_cost']
cum_ES_return_512_with_cost = rf512_running_totals['cum_ES_return_512_with_cost']
cum_ELS_return_512_with_cost = rf512_running_totals['cum_ELS_return_512_with_cost']
cum_VL_return_512_with_cost = rf512_running_totals['cum_VL_return_512_with_cost']
cum_VS_return_512_with_cost = rf512_running_totals['cum_VS_return_512_with_cost']
cum_VLS_return_512_with_cost = rf512_running_totals['cum_VLS_return_512_with_cost']

cum_EL_return_512_without_cost = rf512_running_totals['cum_EL_return_512_without_cost']
cum_ES_return_512_without_cost = rf512_running_totals['cum_ES_return_512_without_cost']
cum_ELS_return_512_without_cost = rf512_running_totals['cum_ELS_return_512_without_cost']
cum_VL_return_512_without_cost = rf512_running_totals['cum_VL_return_512_without_cost']
cum_VS_return_512_without_cost = rf512_running_totals['cum_VS_return_512_without_cost']
cum_VLS_return_512_without_cost = rf512_running_totals['cum_VLS_return_512_without_cost']

# Convert to DataFrame for lag 512
cumulative_log_returns_rf_lag_512 = pd.DataFrame(cumulative_log_returns_by_date_rf512)

# Display the cumulative returns DataFrame for lag 512
display(cumulative_log_returns_rf_lag_512.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period simple returns into a single total return.

    Args:
        daily_returns: Array-like supporting ``1 + daily_returns``.

    Returns:
        Product of ``(1 + r)`` over all periods, minus 1.
    """
    growth_factors = 1 + daily_returns
    return np.prod(growth_factors) - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded growth of per-period simple returns.

    Args:
        daily_returns: Sized array-like supporting ``1 + daily_returns``.
        periods: Number of return periods in one year (default 252).

    Returns:
        ``prod(1 + r) ** (periods / n) - 1``, or NaN when there are no
        observations (the exponent would otherwise divide by zero).
    """
    n_obs = len(daily_returns)
    if n_obs == 0:
        return np.nan

    # Use the growth factor directly rather than computing prod - 1 and adding 1 back
    total_growth = np.prod(1 + daily_returns)
    return total_growth ** (periods / n_obs) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized Sharpe ratio of per-period returns.

    Args:
        daily_returns: Array exposing ``.mean()`` and ``.std()``.
        risk_free_rate: Annual risk-free rate, spread evenly over ``periods``.
        periods: Number of return periods in one year (default 252).

    Returns:
        ``sqrt(periods) * mean(excess) / std(excess)``.
    """
    per_period_rf = risk_free_rate / periods
    excess = daily_returns - per_period_rf
    mean_excess = excess.mean()
    dispersion = excess.std()
    return np.sqrt(periods) * mean_excess / dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Annualized volatility: per-period standard deviation times sqrt(periods)."""
    per_period_std = np.std(daily_returns)
    annualization_factor = np.sqrt(periods)
    return per_period_std * annualization_factor

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Largest peak-to-trough decline of the compounded wealth path.

    The path starts from an initial wealth of 1.0, which counts as the first
    peak. Without it a loss on the very first day(s) would never register as
    a drawdown.

    Args:
        daily_returns: Array-like of per-period simple returns.

    Returns:
        The most negative drawdown as a fraction of the running peak
        (e.g. -0.5 for a 50% decline); 0.0 if wealth never falls below a
        previous peak or if there are no observations.
    """
    returns = np.asarray(daily_returns, dtype=float)
    # Prepend the starting capital so the first period can be a drawdown
    wealth = np.concatenate(([1.0], np.cumprod(1 + returns)))
    running_peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct the same absolute cost from every per-period return."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every per-period return by ``1 - transaction_cost_percentage``.

    NOTE(review): this shrinks losses as well as gains -- confirm that is the
    intended cost model.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_512_with_cost', 'cum_ES_return_512_with_cost', 'cum_ELS_return_512_with_cost',
    'cum_VL_return_512_with_cost', 'cum_VS_return_512_with_cost', 'cum_VLS_return_512_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_rf512_c = cumulative_log_returns_rf_lag_512[portfolio].values

    # Recover the daily log returns from the running sum. prepend=0.0 keeps
    # the first trading day, which a plain np.diff would drop.
    daily_log_returns = np.diff(cumulative_returns_rf512_c, prepend=0.0)

    # The metric helpers compound with prod(1 + r), which is only valid for
    # simple returns, so convert log -> simple first.
    # No further cost is deducted here: the '*_with_cost' series are already
    # net of the 10 bps charged per leg in compute_returns, so subtracting it
    # again would count the cost twice.
    daily_returns = np.expm1(daily_log_returns)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(annualized_return(daily_returns))
    metrics['Sharpe Ratio'].append(sharpe_ratio(daily_returns))
    metrics['Volatility'].append(calculate_volatility(daily_returns))
    metrics['Standard Deviation'].append(np.std(daily_returns))
    metrics['Max Drawdown'].append(maximum_drawdown(daily_returns))
    metrics['Cumulative Return'].append(cumulative_return(daily_returns))

# Convert the results into a DataFrame for analysis
metrics_rf512_c = pd.DataFrame(metrics)
display(metrics_rf512_c)


# Now, the same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_512_without_cost', 'cum_ES_return_512_without_cost', 'cum_ELS_return_512_without_cost',
    'cum_VL_return_512_without_cost', 'cum_VS_return_512_without_cost', 'cum_VLS_return_512_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_rf512_wc = cumulative_log_returns_rf_lag_512[portfolio].values

    # Recover the daily log returns from the running sum. prepend=0.0 keeps
    # the first trading day, which a plain np.diff would drop.
    daily_log_returns = np.diff(cumulative_returns_rf512_wc, prepend=0.0)

    # The metric helpers compound with prod(1 + r), which is only valid for
    # simple returns, so convert log -> simple first.
    daily_returns = np.expm1(daily_log_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(annualized_return(daily_returns))
    metrics_wc['Sharpe Ratio'].append(sharpe_ratio(daily_returns))
    metrics_wc['Volatility'].append(calculate_volatility(daily_returns))
    metrics_wc['Standard Deviation'].append(np.std(daily_returns))
    metrics_wc['Max Drawdown'].append(maximum_drawdown(daily_returns))
    metrics_wc['Cumulative Return'].append(cumulative_return(daily_returns))

# Convert the results into a DataFrame for analysis
metrics_rf512_wc = pd.DataFrame(metrics_wc)
display(metrics_rf512_wc)

## GBRT

In [None]:
# GBRT rolling forecast using the 5-day lag feature
gbrt_lag5 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='gbrt',
    lag=[5],
)
print("GBRT Results (Lag 5):")
display(gbrt_lag5)

In [None]:
# GBRT rolling forecast using the 21-day lag feature
gbrt_lag21 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='gbrt',
    lag=[21],
)
print("GBRT Results (Lag 21):")
display(gbrt_lag21)

In [None]:
# GBRT rolling forecast using the 252-day lag feature
gbrt_lag252 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='gbrt',
    lag=[252],
)
print("GBRT Results (Lag 252):")
display(gbrt_lag252)

In [None]:
# GBRT rolling forecast using the 512-day lag feature
gbrt_lag512 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='gbrt',
    lag=[512],
)
print("GBRT Results (Lag 512):")
display(gbrt_lag512)

### OOS R2

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Utility: Adjusted R²
def adjusted_r2_score(y_true, y_pred, n, p):
    """Adjusted R² for ``n`` observations and ``p`` predictors.

    Computes ``1 - (1 - R²) * (n - 1) / (n - p - 1)`` where R² comes from
    ``sklearn.metrics.r2_score``.
    """
    plain_r2 = r2_score(y_true, y_pred)
    unexplained = 1 - plain_r2
    penalised = unexplained * (n - 1) / (n - p - 1)
    return 1 - penalised

# Utility: Calculate OOS R² for GBRT
def calculate_oos_r2_gbrt(model, X_test, y_test):
    """Score a fitted model on held-out data with ``r2_score``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Held-out features passed to ``model.predict``.
        y_test: Held-out targets.

    Returns:
        ``r2_score(y_test, model.predict(X_test))``.
    """
    predictions = model.predict(X_test)
    score = r2_score(y_test, predictions)
    return score

# GBRT model with hyperparameter tuning (with Huber loss)
def gbrt_model_with_tuning(X_train, y_train):
    """Grid-search a Huber-loss GradientBoostingRegressor and return the best fit.

    The grid spans 3 x 3 x 3 = 27 parameter combinations with 5-fold
    cross-validation, i.e. 135 fits plus the final refit.

    Args:
        X_train: Training features.
        y_train: Training targets.

    Returns:
        The ``best_estimator_`` selected on negative mean squared error.
    """
    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'loss': ['huber']  # Use Huber loss for robustness
    }
    # NOTE(review): cv=5 uses unshuffled K-fold splits; if the rows are ordered
    # in time this validates on data older than part of the training fold --
    # consider TimeSeriesSplit after confirming the row order.
    grid_search = GridSearchCV(
        GradientBoostingRegressor(),
        param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1  # candidate fits are independent; run them on all cores
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

# GBRT model with OOS R² loss and tuning
def gbrt_model_with_tuning_loss(train_df, test_df, lag=5):
    """Tune a GBRT on one lag feature and report its out-of-sample R².

    Args:
        train_df: Training frame containing ``lag_{lag}`` and ``directional_target``.
        test_df: Test frame with the same two columns.
        lag: Which ``lag_{lag}`` column to use as the single feature.

    Returns:
        One-row DataFrame with columns ``Lag`` and ``OOS R²``.
    """
    feature = f'lag_{lag}'
    required_columns = [feature, 'directional_target']

    # Rows missing either the feature or the target cannot be used
    clean_train = train_df.dropna(subset=required_columns)
    clean_test = test_df.dropna(subset=required_columns)

    # Tune and train GBRT
    tuned_model = gbrt_model_with_tuning(
        clean_train[[feature]],
        clean_train['directional_target'],
    )

    # Calculate OOS R²
    oos_r2 = calculate_oos_r2_gbrt(
        tuned_model,
        clean_test[[feature]],
        clean_test['directional_target'],
    )

    summary_row = {'Lag': lag, 'OOS R²': oos_r2}
    return pd.DataFrame([summary_row])

# Main function to calculate OOS R² for GBRT with market cap segmentation
def calculate_oos_r2_market_cap_gbrt(train_df, test_df, valid_lags, segment_df=None, market_cap_col='market_cap'):
    """Out-of-sample R² per lag for all, large-cap and small-cap training sets.

    Three models are tuned for every lag: one on ``train_df``, one on the top
    quartile and one on the bottom quartile of ``segment_df`` by market cap.
    All three are scored on the full ``test_df``.

    Args:
        train_df: Training frame for the "All Stocks" model.
        test_df: Test frame every model is evaluated on.
        valid_lags: Iterable of lags to evaluate.
        segment_df: Frame split into market-cap quartiles. Defaults to the
            module-level ``merged_df``, which the original code read implicitly.
        market_cap_col: Market-cap column in ``segment_df``.

    Returns:
        DataFrame with columns ``Lag``, ``All Stocks R²``, ``Top 25% R²`` and
        ``Bottom 25% R²``, one row per lag.
    """
    if segment_df is None:
        # NOTE(review): the quartile models train on merged_df rather than on
        # train_df -- confirm merged_df holds no test-period rows.
        segment_df = merged_df

    valid_lags = list(valid_lags)  # iterated below and reused for the 'Lag' column

    # Calculate the quantiles for the market cap
    caps = segment_df[market_cap_col]
    top_25_percentile = caps.quantile(0.75)
    bottom_25_percentile = caps.quantile(0.25)

    # Filter top 25% and bottom 25% based on market cap
    top_25_stocks = segment_df[caps >= top_25_percentile]
    bottom_25_stocks = segment_df[caps <= bottom_25_percentile]

    training_sets = (
        ('All Stocks R²', train_df),
        ('Top 25% R²', top_25_stocks),
        ('Bottom 25% R²', bottom_25_stocks),
    )

    # Collect plain scores instead of growing DataFrames with pd.concat
    scores = {label: [] for label, _ in training_sets}
    for lag in valid_lags:
        for label, frame in training_sets:
            lag_result = gbrt_model_with_tuning_loss(frame, test_df, lag)
            scores[label].append(lag_result['OOS R²'].iloc[0])

    # Merge the results into a single DataFrame, one row per lag
    results_df = pd.DataFrame({'Lag': valid_lags, **scores})

    return results_df

# Lags (in trading days) for which the GBRT out-of-sample R² is reported
valid_lags = [5, 21, 252, 512]

# OOS R² for GBRT models trained on all stocks, the top 25% and the bottom 25%
gbrt_oos_r2_results = calculate_oos_r2_market_cap_gbrt(
    crsp_train_lagged,
    crsp_test_lagged,
    valid_lags,
)

# Show one row per lag
display(gbrt_oos_r2_results)


## GBRT Portfolio

### Lag 5

In [None]:
# Flat transaction cost of 10 bps (0.001)
def calculate_transaction_cost(row):
    """Return the transaction cost charged for ``row``.

    The rate does not depend on ``row``: the same 10 bps is used for small
    and large cap stocks.
    """
    flat_rate = 0.001
    return flat_rate

# calculate_transaction_cost ignores its row argument and returns one flat
# rate, so broadcast that scalar to the column instead of a row-wise apply
# (same values, no per-row Python call, and it also works on an empty frame).
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `gbrt_model` to predict excess returns
def gbrt_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the GBRT model for lag 5 and store its predictions on the test frame.

    Args:
        crsp_train_lagged: Training frame handed to ``gbrt_model``.
        crsp_test_lagged: Test frame handed to ``gbrt_model``; it is modified
            in place by this function.
        lags: Lags handed to ``gbrt_model``. Defaults to ``[5]``.

    Returns:
        ``crsp_test_lagged`` (the same object) with the additional column
        'gbrt_5_predicted_excess_returns'.
    """
    # A fresh list per call avoids sharing one mutable default across calls.
    if lags is None:
        lags = [5]

    # Called for its side effect: gbrt_model is expected to write the column
    # 'predicted_excess_returns' into crsp_test_lagged (its return value is
    # not needed here).
    gbrt_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the predictions to a lag-specific column so that a later model run
    # overwriting 'predicted_excess_returns' does not lose them.
    crsp_test_lagged['gbrt_5_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Run the lag-5 GBRT model and keep the returned test frame, which now carries
# the 'gbrt_5_predicted_excess_returns' column used for portfolio sorting
crsp_test_lagged = gbrt_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, signal_col='gbrt_5_predicted_excess_returns', fraction=0.1):
    """Compute one day's long, short and long-short portfolio log returns.

    The long leg holds the ``int(fraction * len(group))`` rows with the
    largest ``signal_col`` value, the short leg the same number of rows with
    the smallest value. Each leg is reported equal-weighted and
    value-weighted (by 'market_cap_merged'), with and without the leg's mean
    'transaction_cost' deducted. Short-leg returns are sign-flipped, and the
    long-short return is the sum of the two legs (so it carries both costs).

    NOTE(review): legs average ``log1p('adjusted_ret')`` across stocks, which
    approximates rather than equals the log of the portfolio's simple return.

    Args:
        group: Rows for a single date with columns ``signal_col``,
            'adjusted_ret', 'transaction_cost' and 'market_cap_merged'.
        signal_col: Column used to rank the stocks.
        fraction: Share of the group placed in each leg.

    Returns:
        Dict with the twelve keys '{equal,value}_{long,short,long_short}_
        log_return_{with,without}_cost'. All values are NaN when a leg is
        empty, i.e. when ``int(fraction * len(group))`` is 0.
    """
    def leg_stats(leg):
        # Equal-weighted mean, cap-weighted mean and mean cost of one leg
        log_ret = np.log1p(leg['adjusted_ret'])
        caps = leg['market_cap_merged']
        equal_weighted = log_ret.mean()
        value_weighted = (log_ret * caps).sum() / caps.sum()
        return equal_weighted, value_weighted, leg['transaction_cost'].mean()

    leg_size = int(fraction * len(group))
    long_equal, long_value, long_cost = leg_stats(group.nlargest(leg_size, signal_col))
    short_equal, short_value, short_cost = leg_stats(group.nsmallest(leg_size, signal_col))

    weightings = (
        ('equal', long_equal, short_equal),
        ('value', long_value, short_value),
    )
    cost_variants = (
        ('with_cost', long_cost, short_cost),
        ('without_cost', 0.0, 0.0),
    )

    results = {}
    for weighting, long_ret, short_ret in weightings:
        for suffix, cost_long, cost_short in cost_variants:
            long_net = long_ret - cost_long
            # Shorting earns the negative of the leg's return; the cost
            # still reduces the result.
            short_net = -short_ret - cost_short
            results[f'{weighting}_long_log_return_{suffix}'] = long_net
            results[f'{weighting}_short_log_return_{suffix}'] = short_net
            results[f'{weighting}_long_short_log_return_{suffix}'] = long_net + short_net
    return results

# Cumulative log returns per date for the lag-5 portfolios (daily rebalancing).
# Each output column is paired with the compute_returns key it accumulates.
gbrt5_return_keys = {
    'cum_EL_return_5_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_5_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_5_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_5_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_5_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_5_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_5_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_5_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_5_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_5_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_5_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_5_without_cost': 'value_long_short_log_return_without_cost',
}

# Per-date history: the date plus one list per cumulative series
cumulative_log_returns_by_date_gbrt5 = {'date': []}
cumulative_log_returns_by_date_gbrt5.update({column: [] for column in gbrt5_return_keys})

# Running totals, all starting at zero
running_totals_gbrt5 = dict.fromkeys(gbrt5_return_keys, 0)

# Walk the dates in order of first appearance and add each day's returns
for date in crsp_test_lagged['date'].unique():
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]
    returns = compute_returns(group)

    cumulative_log_returns_by_date_gbrt5['date'].append(date)
    for column, return_key in gbrt5_return_keys.items():
        running_totals_gbrt5[column] += returns[return_key]
        cumulative_log_returns_by_date_gbrt5[column].append(running_totals_gbrt5[column])

# Final totals under their individual names
cum_EL_return_5_with_cost = running_totals_gbrt5['cum_EL_return_5_with_cost']
cum_ES_return_5_with_cost = running_totals_gbrt5['cum_ES_return_5_with_cost']
cum_ELS_return_5_with_cost = running_totals_gbrt5['cum_ELS_return_5_with_cost']
cum_VL_return_5_with_cost = running_totals_gbrt5['cum_VL_return_5_with_cost']
cum_VS_return_5_with_cost = running_totals_gbrt5['cum_VS_return_5_with_cost']
cum_VLS_return_5_with_cost = running_totals_gbrt5['cum_VLS_return_5_with_cost']

cum_EL_return_5_without_cost = running_totals_gbrt5['cum_EL_return_5_without_cost']
cum_ES_return_5_without_cost = running_totals_gbrt5['cum_ES_return_5_without_cost']
cum_ELS_return_5_without_cost = running_totals_gbrt5['cum_ELS_return_5_without_cost']
cum_VL_return_5_without_cost = running_totals_gbrt5['cum_VL_return_5_without_cost']
cum_VS_return_5_without_cost = running_totals_gbrt5['cum_VS_return_5_without_cost']
cum_VLS_return_5_without_cost = running_totals_gbrt5['cum_VLS_return_5_without_cost']

# One row per date, one column per cumulative series
cumulative_log_returns_gbrt_lag_5 = pd.DataFrame(cumulative_log_returns_by_date_gbrt5)

# Show the first rows of the lag-5 table
display(cumulative_log_returns_gbrt_lag_5.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period simple returns into a single total return."""
    growth_factors = 1 + daily_returns
    return np.prod(growth_factors) - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Scale the compounded return of ``daily_returns`` to ``periods`` periods.

    The total growth over ``len(daily_returns)`` observations is raised to
    the power ``periods / len(daily_returns)``.
    """
    total_return = np.prod(1 + daily_returns) - 1
    annualization_exponent = periods / len(daily_returns)
    return (1 + total_return) ** annualization_exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized Sharpe ratio of ``daily_returns``.

    ``risk_free_rate`` is an annual rate; it is spread evenly over
    ``periods`` periods before being subtracted. Mean and standard deviation
    come from the input's own ``.mean()`` / ``.std()`` methods.
    """
    per_period_rf = risk_free_rate / periods
    excess = daily_returns - per_period_rf
    scaled_mean = np.sqrt(periods) * excess.mean()
    return scaled_mean / excess.std()

# Annualized volatility of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Return ``np.std(daily_returns)`` scaled by the square root of ``periods``."""
    per_period_std = np.std(daily_returns)
    return per_period_std * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded return path.

    The wealth path starts at an initial capital of 1.0, so a loss in the
    very first periods counts as a drawdown (a path that starts below 1.0
    would otherwise treat its first value as the peak).

    Args:
        daily_returns: Per-period simple returns (array-like).

    Returns:
        The minimum of ``(wealth - running_peak) / running_peak``; 0.0 when
        the path never falls below a previous peak or the input is empty.
    """
    growth_factors = 1 + np.asarray(daily_returns, dtype=float)
    # Prepend the starting capital so the running peak is never below 1.0
    wealth = np.concatenate(([1.0], np.cumprod(growth_factors)))
    running_peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Deduct a fixed transaction cost from the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return ``daily_returns`` with ``transaction_cost`` subtracted from every value."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Scale the daily returns by a percentage-based transaction cost
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Return ``daily_returns`` multiplied by ``1 - transaction_cost_percentage``.

    NOTE(review): because the return itself is scaled, negative returns move
    towards zero as well - confirm this is the intended cost model.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_5_with_cost', 'cum_ES_return_5_with_cost', 'cum_ELS_return_5_with_cost',
    'cum_VL_return_5_with_cost', 'cum_VS_return_5_with_cost', 'cum_VLS_return_5_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_gbrt5_c = cumulative_log_returns_gbrt_lag_5[portfolio].values

    # Recover every day's log return. The cumulative series starts at the
    # first day's return, so difference against a leading 0 to keep that day.
    daily_log_returns = np.diff(cumulative_returns_gbrt5_c, prepend=0.0)

    # The metric helpers compound simple returns (1 + r), so convert the log
    # returns to simple returns first.
    daily_returns = np.expm1(daily_log_returns)

    # The *_with_cost series were built from returns that already have the
    # transaction cost deducted, so no further cost is subtracted here
    # (doing so would charge the cost twice).
    cum_return_after_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard deviation of the daily (net-of-cost) returns
    std_dev = np.std(daily_returns)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_gbrt5_c = pd.DataFrame(metrics)
display(metrics_gbrt5_c)


# Same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_5_without_cost', 'cum_ES_return_5_without_cost', 'cum_ELS_return_5_without_cost',
    'cum_VL_return_5_without_cost', 'cum_VS_return_5_without_cost', 'cum_VLS_return_5_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_gbrt5_wc = cumulative_log_returns_gbrt_lag_5[portfolio].values

    # Recover every day's log return. The cumulative series starts at the
    # first day's return, so difference against a leading 0 to keep that day.
    daily_log_returns = np.diff(cumulative_returns_gbrt5_wc, prepend=0.0)

    # The metric helpers compound simple returns (1 + r), so convert the log
    # returns to simple returns first.
    daily_returns = np.expm1(daily_log_returns)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_gbrt5_wc = pd.DataFrame(metrics_wc)
display(metrics_gbrt5_wc)

### Lag 21

In [None]:
# Flat transaction cost of 10 bps (0.001)
def calculate_transaction_cost(row):
    """Return the transaction cost charged for ``row``.

    The rate does not depend on ``row``: the same 10 bps is used for small
    and large cap stocks.
    """
    flat_rate = 0.001
    return flat_rate

# calculate_transaction_cost ignores its row argument and returns one flat
# rate, so broadcast that scalar to the column instead of a row-wise apply
# (same values, no per-row Python call, and it also works on an empty frame).
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `gbrt_model` to predict excess returns
def gbrt_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the GBRT model for lag 21 and store its predictions on the test frame.

    Args:
        crsp_train_lagged: Training frame handed to ``gbrt_model``.
        crsp_test_lagged: Test frame handed to ``gbrt_model``; it is modified
            in place by this function.
        lags: Lags handed to ``gbrt_model``. Defaults to ``[21]``.

    Returns:
        ``crsp_test_lagged`` (the same object) with the additional column
        'gbrt_21_predicted_excess_returns'.
    """
    # A fresh list per call avoids sharing one mutable default across calls.
    if lags is None:
        lags = [21]

    # Called for its side effect: gbrt_model is expected to write the column
    # 'predicted_excess_returns' into crsp_test_lagged (its return value is
    # not needed here).
    gbrt_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the predictions to a lag-specific column so that a later model run
    # overwriting 'predicted_excess_returns' does not lose them.
    crsp_test_lagged['gbrt_21_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Run the lag-21 GBRT model and keep the returned test frame, which now carries
# the 'gbrt_21_predicted_excess_returns' column used for portfolio sorting
# NOTE(review): an earlier remark here said the model yields binary outcomes;
# the column is ranked with nlargest/nsmallest, which presumably expects
# continuous predictions - confirm against gbrt_model.
crsp_test_lagged = gbrt_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, signal_col='gbrt_21_predicted_excess_returns', fraction=0.1):
    """Compute one day's long, short and long-short portfolio log returns.

    The long leg holds the ``int(fraction * len(group))`` rows with the
    largest ``signal_col`` value, the short leg the same number of rows with
    the smallest value. Each leg is reported equal-weighted and
    value-weighted (by 'market_cap_merged'), with and without the leg's mean
    'transaction_cost' deducted. Short-leg returns are sign-flipped, and the
    long-short return is the sum of the two legs (so it carries both costs).

    NOTE(review): legs average ``log1p('adjusted_ret')`` across stocks, which
    approximates rather than equals the log of the portfolio's simple return.

    Args:
        group: Rows for a single date with columns ``signal_col``,
            'adjusted_ret', 'transaction_cost' and 'market_cap_merged'.
        signal_col: Column used to rank the stocks.
        fraction: Share of the group placed in each leg.

    Returns:
        Dict with the twelve keys '{equal,value}_{long,short,long_short}_
        log_return_{with,without}_cost'. All values are NaN when a leg is
        empty, i.e. when ``int(fraction * len(group))`` is 0.
    """
    def leg_stats(leg):
        # Equal-weighted mean, cap-weighted mean and mean cost of one leg
        log_ret = np.log1p(leg['adjusted_ret'])
        caps = leg['market_cap_merged']
        equal_weighted = log_ret.mean()
        value_weighted = (log_ret * caps).sum() / caps.sum()
        return equal_weighted, value_weighted, leg['transaction_cost'].mean()

    leg_size = int(fraction * len(group))
    long_equal, long_value, long_cost = leg_stats(group.nlargest(leg_size, signal_col))
    short_equal, short_value, short_cost = leg_stats(group.nsmallest(leg_size, signal_col))

    weightings = (
        ('equal', long_equal, short_equal),
        ('value', long_value, short_value),
    )
    cost_variants = (
        ('with_cost', long_cost, short_cost),
        ('without_cost', 0.0, 0.0),
    )

    results = {}
    for weighting, long_ret, short_ret in weightings:
        for suffix, cost_long, cost_short in cost_variants:
            long_net = long_ret - cost_long
            # Shorting earns the negative of the leg's return; the cost
            # still reduces the result.
            short_net = -short_ret - cost_short
            results[f'{weighting}_long_log_return_{suffix}'] = long_net
            results[f'{weighting}_short_log_return_{suffix}'] = short_net
            results[f'{weighting}_long_short_log_return_{suffix}'] = long_net + short_net
    return results

# Cumulative log returns per date for the lag-21 portfolios (daily rebalancing).
# Each output column is paired with the compute_returns key it accumulates.
gbrt21_return_keys = {
    'cum_EL_return_21_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_21_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_21_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_21_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_21_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_21_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_21_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_21_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_21_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_21_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_21_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_21_without_cost': 'value_long_short_log_return_without_cost',
}

# Per-date history: the date plus one list per cumulative series
cumulative_log_returns_by_date_gbrt21 = {'date': []}
cumulative_log_returns_by_date_gbrt21.update({column: [] for column in gbrt21_return_keys})

# Running totals, all starting at zero
running_totals_gbrt21 = dict.fromkeys(gbrt21_return_keys, 0)

# Walk the dates in order of first appearance and add each day's returns
for date in crsp_test_lagged['date'].unique():
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]
    returns = compute_returns(group)

    cumulative_log_returns_by_date_gbrt21['date'].append(date)
    for column, return_key in gbrt21_return_keys.items():
        running_totals_gbrt21[column] += returns[return_key]
        cumulative_log_returns_by_date_gbrt21[column].append(running_totals_gbrt21[column])

# Final totals under their individual names
cum_EL_return_21_with_cost = running_totals_gbrt21['cum_EL_return_21_with_cost']
cum_ES_return_21_with_cost = running_totals_gbrt21['cum_ES_return_21_with_cost']
cum_ELS_return_21_with_cost = running_totals_gbrt21['cum_ELS_return_21_with_cost']
cum_VL_return_21_with_cost = running_totals_gbrt21['cum_VL_return_21_with_cost']
cum_VS_return_21_with_cost = running_totals_gbrt21['cum_VS_return_21_with_cost']
cum_VLS_return_21_with_cost = running_totals_gbrt21['cum_VLS_return_21_with_cost']

cum_EL_return_21_without_cost = running_totals_gbrt21['cum_EL_return_21_without_cost']
cum_ES_return_21_without_cost = running_totals_gbrt21['cum_ES_return_21_without_cost']
cum_ELS_return_21_without_cost = running_totals_gbrt21['cum_ELS_return_21_without_cost']
cum_VL_return_21_without_cost = running_totals_gbrt21['cum_VL_return_21_without_cost']
cum_VS_return_21_without_cost = running_totals_gbrt21['cum_VS_return_21_without_cost']
cum_VLS_return_21_without_cost = running_totals_gbrt21['cum_VLS_return_21_without_cost']

# One row per date, one column per cumulative series
cumulative_log_returns_gbrt_lag_21 = pd.DataFrame(cumulative_log_returns_by_date_gbrt21)

# Show the first rows of the lag-21 table
display(cumulative_log_returns_gbrt_lag_21.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period simple returns into a single total return."""
    growth_factors = 1 + daily_returns
    return np.prod(growth_factors) - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Scale the compounded return of ``daily_returns`` to ``periods`` periods.

    The total growth over ``len(daily_returns)`` observations is raised to
    the power ``periods / len(daily_returns)``.
    """
    total_return = np.prod(1 + daily_returns) - 1
    annualization_exponent = periods / len(daily_returns)
    return (1 + total_return) ** annualization_exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized Sharpe ratio of ``daily_returns``.

    ``risk_free_rate`` is an annual rate; it is spread evenly over
    ``periods`` periods before being subtracted. Mean and standard deviation
    come from the input's own ``.mean()`` / ``.std()`` methods.
    """
    per_period_rf = risk_free_rate / periods
    excess = daily_returns - per_period_rf
    scaled_mean = np.sqrt(periods) * excess.mean()
    return scaled_mean / excess.std()

# Annualized volatility of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Return ``np.std(daily_returns)`` scaled by the square root of ``periods``."""
    per_period_std = np.std(daily_returns)
    return per_period_std * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded return path.

    The wealth path starts at an initial capital of 1.0, so a loss in the
    very first periods counts as a drawdown (a path that starts below 1.0
    would otherwise treat its first value as the peak).

    Args:
        daily_returns: Per-period simple returns (array-like).

    Returns:
        The minimum of ``(wealth - running_peak) / running_peak``; 0.0 when
        the path never falls below a previous peak or the input is empty.
    """
    growth_factors = 1 + np.asarray(daily_returns, dtype=float)
    # Prepend the starting capital so the running peak is never below 1.0
    wealth = np.concatenate(([1.0], np.cumprod(growth_factors)))
    running_peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Deduct a fixed transaction cost from the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return ``daily_returns`` with ``transaction_cost`` subtracted from every value."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Scale the daily returns by a percentage-based transaction cost
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Return ``daily_returns`` multiplied by ``1 - transaction_cost_percentage``.

    NOTE(review): because the return itself is scaled, negative returns move
    towards zero as well - confirm this is the intended cost model.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_21_with_cost', 'cum_ES_return_21_with_cost', 'cum_ELS_return_21_with_cost',
    'cum_VL_return_21_with_cost', 'cum_VS_return_21_with_cost', 'cum_VLS_return_21_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_gbrt21_c = cumulative_log_returns_gbrt_lag_21[portfolio].values

    # Recover every day's log return. The cumulative series starts at the
    # first day's return, so difference against a leading 0 to keep that day.
    daily_log_returns = np.diff(cumulative_returns_gbrt21_c, prepend=0.0)

    # The metric helpers compound simple returns (1 + r), so convert the log
    # returns to simple returns first.
    daily_returns = np.expm1(daily_log_returns)

    # The *_with_cost series were built from returns that already have the
    # transaction cost deducted, so no further cost is subtracted here
    # (doing so would charge the cost twice).
    cum_return_after_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard deviation of the daily (net-of-cost) returns
    std_dev = np.std(daily_returns)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_gbrt21_c = pd.DataFrame(metrics)
display(metrics_gbrt21_c)

# Same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_21_without_cost', 'cum_ES_return_21_without_cost', 'cum_ELS_return_21_without_cost',
    'cum_VL_return_21_without_cost', 'cum_VS_return_21_without_cost', 'cum_VLS_return_21_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_gbrt21_wc = cumulative_log_returns_gbrt_lag_21[portfolio].values

    # Recover every day's log return. The cumulative series starts at the
    # first day's return, so difference against a leading 0 to keep that day.
    daily_log_returns = np.diff(cumulative_returns_gbrt21_wc, prepend=0.0)

    # The metric helpers compound simple returns (1 + r), so convert the log
    # returns to simple returns first.
    daily_returns = np.expm1(daily_log_returns)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_gbrt21_wc = pd.DataFrame(metrics_wc)
display(metrics_gbrt21_wc)

### Lag 252

In [None]:
# Flat transaction cost of 10 bps (0.001)
def calculate_transaction_cost(row):
    """Return the transaction cost charged for ``row``.

    The rate does not depend on ``row``: the same 10 bps is used for small
    and large cap stocks.
    """
    flat_rate = 0.001
    return flat_rate

# calculate_transaction_cost ignores its row argument and returns one flat
# rate, so broadcast that scalar to the column instead of a row-wise apply
# (same values, no per-row Python call, and it also works on an empty frame).
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `gbrt_model` to predict excess returns
def gbrt_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the GBRT model for lag 252 and store its predictions on the test frame.

    Args:
        crsp_train_lagged: Training frame handed to ``gbrt_model``.
        crsp_test_lagged: Test frame handed to ``gbrt_model``; it is modified
            in place by this function.
        lags: Lags handed to ``gbrt_model``. Defaults to ``[252]``.

    Returns:
        ``crsp_test_lagged`` (the same object) with the additional column
        'gbrt_252_predicted_excess_returns'.
    """
    # A fresh list per call avoids sharing one mutable default across calls.
    if lags is None:
        lags = [252]

    # Called for its side effect: gbrt_model is expected to write the column
    # 'predicted_excess_returns' into crsp_test_lagged (its return value is
    # not needed here).
    gbrt_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the predictions to a lag-specific column so that a later model run
    # overwriting 'predicted_excess_returns' does not lose them.
    crsp_test_lagged['gbrt_252_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Run the lag-252 GBRT model and keep the returned test frame, which now carries
# the 'gbrt_252_predicted_excess_returns' column used for portfolio sorting
# NOTE(review): an earlier remark here said the model yields binary outcomes;
# the column is ranked with nlargest/nsmallest, which presumably expects
# continuous predictions - confirm against gbrt_model.
crsp_test_lagged = gbrt_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

#  Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group):
    """Compute one cross-section's long, short and long-short log returns.

    `group` must provide the columns 'gbrt_252_predicted_excess_returns',
    'adjusted_ret', 'transaction_cost' and 'market_cap_merged'.

    The long leg is the top ``int(0.1 * len(group))`` rows by predicted return,
    the short leg the bottom ones. Each leg is priced equal-weighted and
    weighted by 'market_cap_merged', with and without the mean transaction
    cost of the leg. The short leg's return is the negated log return of its
    members; long-short is the sum of both legs.

    Returns a dict with twelve entries keyed
    '<equal|value>_<long|short|long_short>_log_return_<with|without>_cost'.
    """
    signal = 'gbrt_252_predicted_excess_returns'
    bucket_size = int(0.1 * len(group))
    longs = group.nlargest(bucket_size, signal)
    shorts = group.nsmallest(bucket_size, signal)

    long_log_ret = np.log1p(longs['adjusted_ret'])
    short_log_ret = np.log1p(shorts['adjusted_ret'])
    long_cost = longs['transaction_cost'].mean()
    short_cost = shorts['transaction_cost'].mean()

    # Equal-weighted legs: plain mean of the members' log returns.
    equal_long_gross = long_log_ret.mean()
    equal_long_net = equal_long_gross - long_cost
    equal_short_gross = -short_log_ret.mean()
    equal_short_net = equal_short_gross - short_cost

    # Value-weighted legs: market-cap weighted mean of the log returns.
    long_caps = longs['market_cap_merged']
    short_caps = shorts['market_cap_merged']
    value_long_gross = (long_log_ret * long_caps).sum() / long_caps.sum()
    value_long_net = value_long_gross - long_cost
    short_weighted = (short_log_ret * short_caps).sum() / short_caps.sum()
    value_short_net = -(short_weighted + short_cost)
    value_short_gross = -(short_weighted)

    legs = {
        'equal': (equal_long_net, equal_short_net, equal_long_gross, equal_short_gross),
        'value': (value_long_net, value_short_net, value_long_gross, value_short_gross),
    }

    daily = {}
    for scheme, (long_net, short_net, long_gross, short_gross) in legs.items():
        daily[f'{scheme}_long_log_return_with_cost'] = long_net
        daily[f'{scheme}_short_log_return_with_cost'] = short_net
        daily[f'{scheme}_long_short_log_return_with_cost'] = long_net + short_net
        daily[f'{scheme}_long_log_return_without_cost'] = long_gross
        daily[f'{scheme}_short_log_return_without_cost'] = short_gross
        daily[f'{scheme}_long_short_log_return_without_cost'] = long_gross + short_gross
    return daily

# Compute cumulative returns for each date with daily rebalancing.
# Output column -> key of the compute_returns() result that feeds it.
# Insertion order fixes the column order of the final table.
lag_252_column_sources = {
    'cum_EL_return_252_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_252_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_252_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_252_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_252_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_252_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_252_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_252_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_252_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_252_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_252_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_252_without_cost': 'value_long_short_log_return_without_cost',
}

# Column-oriented accumulator: 'date' plus one list per cumulative series.
cumulative_log_returns_by_date_gbrt252 = {'date': []}
cumulative_log_returns_by_date_gbrt252.update({column: [] for column in lag_252_column_sources})

# Running sums of daily log returns for lag 252, all starting at zero.
running_totals_252 = dict.fromkeys(lag_252_column_sources, 0)

# One pass over the frame: groupby splits the rows per date once instead of
# re-filtering the whole frame for every date. sort=False keeps the dates in
# order of first appearance, as iterating over .unique() did. Rows with a
# missing date are skipped by groupby; previously they produced an empty
# group whose NaN returns turned every later running total into NaN.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 252
    returns = compute_returns(group)

    cumulative_log_returns_by_date_gbrt252['date'].append(date)
    for column, source_key in lag_252_column_sources.items():
        running_totals_252[column] += returns[source_key]
        cumulative_log_returns_by_date_gbrt252[column].append(running_totals_252[column])

# Keep the final totals available under their original module-level names.
(
    cum_EL_return_252_with_cost, cum_ES_return_252_with_cost, cum_ELS_return_252_with_cost,
    cum_VL_return_252_with_cost, cum_VS_return_252_with_cost, cum_VLS_return_252_with_cost,
    cum_EL_return_252_without_cost, cum_ES_return_252_without_cost, cum_ELS_return_252_without_cost,
    cum_VL_return_252_without_cost, cum_VS_return_252_without_cost, cum_VLS_return_252_without_cost,
) = (running_totals_252[column] for column in lag_252_column_sources)

# Convert to DataFrame for lag 252
cumulative_log_returns_gbrt_lag_252 = pd.DataFrame(cumulative_log_returns_by_date_gbrt252)

# Display the cumulative returns DataFrame for lag 252
display(cumulative_log_returns_gbrt_lag_252.head())

# Saving the DataFrame as a CSV file
cumulative_log_returns_gbrt_lag_252.to_csv("cumulative_log_returns_gbrt_lag_252.csv", index=False)

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period returns into one total return.

    Each return r contributes a growth factor (1 + r); the result is the
    product of all factors minus one.
    """
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Scale the compounded return of the sample to a `periods`-long year.

    The total growth over ``len(daily_returns)`` observations is raised to the
    power ``periods / len(daily_returns)``; one is subtracted to get a return.
    """
    compounded = np.prod(1 + daily_returns) - 1
    years_exponent = periods / len(daily_returns)
    annual_growth = (1 + compounded) ** years_exponent
    return annual_growth - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of a per-period return series.

    The annual `risk_free_rate` is spread evenly over `periods` observations
    and subtracted from every return; the mean of the excess returns divided
    by their standard deviation is scaled by ``sqrt(periods)``. The standard
    deviation is whatever the input's own ``.std()`` method computes.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    excess = daily_returns - per_period_rf
    scaled_mean = np.sqrt(periods) * excess.mean()
    return scaled_mean / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Return the per-period standard deviation scaled by ``sqrt(periods)``."""
    per_period_sigma = np.std(daily_returns)
    annualization = np.sqrt(periods)
    return per_period_sigma * annualization

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded equity curve.

    Parameters
    ----------
    daily_returns : sequence of per-period simple returns.

    Returns
    -------
    The most negative value of ``(equity - running_peak) / running_peak`` as a
    fraction (``-0.5`` means a 50% drawdown); ``0.0`` if the curve never falls
    below a previous peak or if the input is empty.
    """
    compounded = np.cumprod(1 + np.asarray(daily_returns, dtype=float))
    # Start the curve at the initial capital of 1.0. Without it the first
    # observation is its own peak, so losses at the very start of the sample
    # are not measured against the starting capital; it also makes the empty
    # input well-defined instead of failing inside np.min.
    equity_curve = np.concatenate(([1.0], compounded))
    running_peak = np.maximum.accumulate(equity_curve)
    drawdown = (equity_curve - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return `daily_returns` with a constant `transaction_cost` taken off each entry."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by ``1 - transaction_cost_percentage``.

    The scaling is applied to the return itself, so negative returns are
    pulled toward zero by the same factor as positive ones.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_252_with_cost', 'cum_ES_return_252_with_cost', 'cum_ELS_return_252_with_cost',
    'cum_VL_return_252_with_cost', 'cum_VS_return_252_with_cost', 'cum_VLS_return_252_with_cost',
]

# Initialize metrics container (one list per output column)
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_gbrt252_c = cumulative_log_returns_gbrt_lag_252[portfolio].values

    # Recover the daily log returns from the running sum. prepend=0 keeps the
    # first trading day, which a plain np.diff silently drops.
    daily_log_returns = np.diff(cumulative_returns_gbrt252_c, prepend=0)

    # The metric helpers compound with (1 + r), i.e. they expect simple
    # returns, so convert the log returns before handing them over.
    daily_returns = np.expm1(daily_log_returns)

    # No further cost adjustment here: the *_with_cost series were already
    # reduced by the transaction cost when the daily portfolio returns were
    # computed, so deducting it again would count the cost twice.
    cum_return_after_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns after cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_gbrt252_c = pd.DataFrame(metrics)
display(metrics_gbrt252_c)


# Same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_252_without_cost', 'cum_ES_return_252_without_cost', 'cum_ELS_return_252_without_cost',
    'cum_VL_return_252_without_cost', 'cum_VS_return_252_without_cost', 'cum_VLS_return_252_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_gbrt252_wc = cumulative_log_returns_gbrt_lag_252[portfolio].values

    # Recover the daily log returns from the running sum. prepend=0 keeps the
    # first trading day, which a plain np.diff silently drops.
    daily_log_returns = np.diff(cumulative_returns_gbrt252_wc, prepend=0)

    # The metric helpers compound with (1 + r), i.e. they expect simple
    # returns, so convert the log returns before handing them over.
    daily_returns = np.expm1(daily_log_returns)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_gbrt252_wc = pd.DataFrame(metrics_wc)
display(metrics_gbrt252_wc)

### Lag 512

In [None]:
# Flat transaction cost of 10 bps (0.001)
def calculate_transaction_cost(row):
    """Return the transaction cost charged for one row.

    The value is a constant 0.001 (10 bps); `row` is accepted only so the
    function has the shape expected by a row-wise callback and is never read.
    """
    flat_cost = 0.001  # same 10 bps for small and large cap stocks
    return flat_cost

# The cost is one constant for every row, so broadcast it as a scalar instead
# of calling the function once per row through DataFrame.apply(axis=1).
# calculate_transaction_cost never reads its argument, hence the None placeholder.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `gbrt_model` to predict excess returns
def gbrt_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the GBRT model for the given lag(s) and keep its predictions.

    Parameters
    ----------
    crsp_train_lagged : frame forwarded to `gbrt_model` as the training set.
    crsp_test_lagged : frame forwarded to `gbrt_model` as the test set; a new
        column is written into it (the frame is modified in place).
    lags : list of lags forwarded to `gbrt_model`; defaults to ``[512]``.

    Returns
    -------
    The same `crsp_test_lagged` object with the extra column
    'gbrt_512_predicted_excess_returns'.
    """
    # Build the default per call: a list literal in the signature would be one
    # shared object reused (and possibly mutated) across calls.
    if lags is None:
        lags = [512]

    # Called for its side effect only; its return value is not needed here.
    # NOTE(review): this relies on gbrt_model writing a
    # 'predicted_excess_returns' column into crsp_test_lagged in place --
    # confirm, otherwise the column read below is stale or missing.
    gbrt_model(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the generic prediction column under a lag-specific name so a later
    # model run that reuses 'predicted_excess_returns' cannot overwrite it.
    crsp_test_lagged['gbrt_512_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Run the lag-512 GBRT wrapper; it hands back crsp_test_lagged with the
# 'gbrt_512_predicted_excess_returns' column added.
# NOTE(review): the previous comment expected "binary outcomes" from this call --
# confirm what gbrt_model actually predicts before relying on that.
crsp_test_lagged = gbrt_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group):
    """Compute one cross-section's long, short and long-short log returns.

    `group` must provide the columns 'gbrt_512_predicted_excess_returns',
    'adjusted_ret', 'transaction_cost' and 'market_cap_merged'.

    The long leg is the top ``int(0.1 * len(group))`` rows by predicted return,
    the short leg the bottom ones. Each leg is priced equal-weighted and
    weighted by 'market_cap_merged', with and without the mean transaction
    cost of the leg. The short leg's return is the negated log return of its
    members; long-short is the sum of both legs.

    Returns a dict with twelve entries keyed
    '<equal|value>_<long|short|long_short>_log_return_<with|without>_cost'.
    """
    signal = 'gbrt_512_predicted_excess_returns'
    bucket_size = int(0.1 * len(group))
    longs = group.nlargest(bucket_size, signal)
    shorts = group.nsmallest(bucket_size, signal)

    long_log_ret = np.log1p(longs['adjusted_ret'])
    short_log_ret = np.log1p(shorts['adjusted_ret'])
    long_cost = longs['transaction_cost'].mean()
    short_cost = shorts['transaction_cost'].mean()

    # Equal-weighted legs: plain mean of the members' log returns.
    equal_long_gross = long_log_ret.mean()
    equal_long_net = equal_long_gross - long_cost
    equal_short_gross = -short_log_ret.mean()
    equal_short_net = equal_short_gross - short_cost

    # Value-weighted legs: market-cap weighted mean of the log returns.
    long_caps = longs['market_cap_merged']
    short_caps = shorts['market_cap_merged']
    value_long_gross = (long_log_ret * long_caps).sum() / long_caps.sum()
    value_long_net = value_long_gross - long_cost
    short_weighted = (short_log_ret * short_caps).sum() / short_caps.sum()
    value_short_net = -(short_weighted + short_cost)
    value_short_gross = -(short_weighted)

    legs = {
        'equal': (equal_long_net, equal_short_net, equal_long_gross, equal_short_gross),
        'value': (value_long_net, value_short_net, value_long_gross, value_short_gross),
    }

    daily = {}
    for scheme, (long_net, short_net, long_gross, short_gross) in legs.items():
        daily[f'{scheme}_long_log_return_with_cost'] = long_net
        daily[f'{scheme}_short_log_return_with_cost'] = short_net
        daily[f'{scheme}_long_short_log_return_with_cost'] = long_net + short_net
        daily[f'{scheme}_long_log_return_without_cost'] = long_gross
        daily[f'{scheme}_short_log_return_without_cost'] = short_gross
        daily[f'{scheme}_long_short_log_return_without_cost'] = long_gross + short_gross
    return daily

# Compute cumulative returns for each date with daily rebalancing.
# Output column -> key of the compute_returns() result that feeds it.
# Insertion order fixes the column order of the final table.
lag_512_column_sources = {
    'cum_EL_return_512_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_512_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_512_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_512_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_512_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_512_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_512_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_512_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_512_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_512_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_512_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_512_without_cost': 'value_long_short_log_return_without_cost',
}

# Column-oriented accumulator: 'date' plus one list per cumulative series.
cumulative_log_returns_by_date_gbrt512 = {'date': []}
cumulative_log_returns_by_date_gbrt512.update({column: [] for column in lag_512_column_sources})

# Running sums of daily log returns for lag 512, all starting at zero.
running_totals_512 = dict.fromkeys(lag_512_column_sources, 0)

# One pass over the frame: groupby splits the rows per date once instead of
# re-filtering the whole frame for every date. sort=False keeps the dates in
# order of first appearance, as iterating over .unique() did. Rows with a
# missing date are skipped by groupby; previously they produced an empty
# group whose NaN returns turned every later running total into NaN.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 512
    returns = compute_returns(group)

    cumulative_log_returns_by_date_gbrt512['date'].append(date)
    for column, source_key in lag_512_column_sources.items():
        running_totals_512[column] += returns[source_key]
        cumulative_log_returns_by_date_gbrt512[column].append(running_totals_512[column])

# Keep the final totals available under their original module-level names.
(
    cum_EL_return_512_with_cost, cum_ES_return_512_with_cost, cum_ELS_return_512_with_cost,
    cum_VL_return_512_with_cost, cum_VS_return_512_with_cost, cum_VLS_return_512_with_cost,
    cum_EL_return_512_without_cost, cum_ES_return_512_without_cost, cum_ELS_return_512_without_cost,
    cum_VL_return_512_without_cost, cum_VS_return_512_without_cost, cum_VLS_return_512_without_cost,
) = (running_totals_512[column] for column in lag_512_column_sources)

# Convert to DataFrame for lag 512
cumulative_log_returns_gbrt_lag_512 = pd.DataFrame(cumulative_log_returns_by_date_gbrt512)

# Display the cumulative returns DataFrame for lag 512
display(cumulative_log_returns_gbrt_lag_512.head())

# Saving the DataFrame as a CSV file
cumulative_log_returns_gbrt_lag_512.to_csv("cumulative_log_returns_gbrt_lag_512.csv", index=False)

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period returns into one total return.

    Each return r contributes a growth factor (1 + r); the result is the
    product of all factors minus one.
    """
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Scale the compounded return of the sample to a `periods`-long year.

    The total growth over ``len(daily_returns)`` observations is raised to the
    power ``periods / len(daily_returns)``; one is subtracted to get a return.
    """
    compounded = np.prod(1 + daily_returns) - 1
    years_exponent = periods / len(daily_returns)
    annual_growth = (1 + compounded) ** years_exponent
    return annual_growth - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of a per-period return series.

    The annual `risk_free_rate` is spread evenly over `periods` observations
    and subtracted from every return; the mean of the excess returns divided
    by their standard deviation is scaled by ``sqrt(periods)``. The standard
    deviation is whatever the input's own ``.std()`` method computes.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    excess = daily_returns - per_period_rf
    scaled_mean = np.sqrt(periods) * excess.mean()
    return scaled_mean / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Return the per-period standard deviation scaled by ``sqrt(periods)``."""
    per_period_sigma = np.std(daily_returns)
    annualization = np.sqrt(periods)
    return per_period_sigma * annualization

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded equity curve.

    Parameters
    ----------
    daily_returns : sequence of per-period simple returns.

    Returns
    -------
    The most negative value of ``(equity - running_peak) / running_peak`` as a
    fraction (``-0.5`` means a 50% drawdown); ``0.0`` if the curve never falls
    below a previous peak or if the input is empty.
    """
    compounded = np.cumprod(1 + np.asarray(daily_returns, dtype=float))
    # Start the curve at the initial capital of 1.0. Without it the first
    # observation is its own peak, so losses at the very start of the sample
    # are not measured against the starting capital; it also makes the empty
    # input well-defined instead of failing inside np.min.
    equity_curve = np.concatenate(([1.0], compounded))
    running_peak = np.maximum.accumulate(equity_curve)
    drawdown = (equity_curve - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return `daily_returns` with a constant `transaction_cost` taken off each entry."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by ``1 - transaction_cost_percentage``.

    The scaling is applied to the return itself, so negative returns are
    pulled toward zero by the same factor as positive ones.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_512_with_cost', 'cum_ES_return_512_with_cost', 'cum_ELS_return_512_with_cost',
    'cum_VL_return_512_with_cost', 'cum_VS_return_512_with_cost', 'cum_VLS_return_512_with_cost',
]

# Initialize metrics container (one list per output column)
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_gbrt512_c = cumulative_log_returns_gbrt_lag_512[portfolio].values

    # Recover the daily log returns from the running sum. prepend=0 keeps the
    # first trading day, which a plain np.diff silently drops.
    daily_log_returns = np.diff(cumulative_returns_gbrt512_c, prepend=0)

    # The metric helpers compound with (1 + r), i.e. they expect simple
    # returns, so convert the log returns before handing them over.
    daily_returns = np.expm1(daily_log_returns)

    # No further cost adjustment here: the *_with_cost series were already
    # reduced by the transaction cost when the daily portfolio returns were
    # computed, so deducting it again would count the cost twice.
    cum_return_after_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns after cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_gbrt512_c = pd.DataFrame(metrics)
display(metrics_gbrt512_c)


# Same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_512_without_cost', 'cum_ES_return_512_without_cost', 'cum_ELS_return_512_without_cost',
    'cum_VL_return_512_without_cost', 'cum_VS_return_512_without_cost', 'cum_VLS_return_512_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_gbrt512_wc = cumulative_log_returns_gbrt_lag_512[portfolio].values

    # Recover the daily log returns from the running sum. prepend=0 keeps the
    # first trading day, which a plain np.diff silently drops.
    daily_log_returns = np.diff(cumulative_returns_gbrt512_wc, prepend=0)

    # The metric helpers compound with (1 + r), i.e. they expect simple
    # returns, so convert the log returns before handing them over.
    daily_returns = np.expm1(daily_log_returns)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_gbrt512_wc = pd.DataFrame(metrics_wc)
display(metrics_gbrt512_wc)

### Neural Network

## NN1

In [None]:
# lag 5
# Calls rolling_forecast_model with model_type='nn1' and lag=[5], keeps the
# result in nn1_lag5 and renders it under a printed heading.
nn1_lag5 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='nn1', lag=[5])
print("NN1 Results (Lag 5):")
display(nn1_lag5)

In [None]:
# lag 21
# Calls rolling_forecast_model with model_type='nn1' and lag=[21], keeps the
# result in nn1_lag21 and renders it under a printed heading.
nn1_lag21 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='nn1', lag=[21])
print("NN1 Results (Lag 21):")
display(nn1_lag21)

In [None]:
# lag 21
# NOTE(review): this cell is a verbatim repeat of the preceding lag-21 cell and
# re-runs the same call, rebinding nn1_lag21 -- it looks redundant; confirm
# before removing.
nn1_lag21 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='nn1', lag=[21])
print("NN1 Results (Lag 21):")
display(nn1_lag21)

In [None]:
# lag 252
# Calls rolling_forecast_model with model_type='nn1' and lag=[252], keeps the
# result in nn1_lag252 and renders it under a printed heading.
nn1_lag252 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='nn1', lag=[252])
print("NN1 Results (Lag 252):")
display(nn1_lag252)

In [None]:
# lag 512
# Calls rolling_forecast_model with model_type='nn1' and lag=[512], keeps the
# result in nn1_lag512 and renders it under a printed heading.
nn1_lag512 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='nn1', lag=[512])
print("NN1 Results (Lag 512):")
display(nn1_lag512)

### OOS R2

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Utility: Adjusted R²
def adjusted_r2_score(y_true, y_pred, n, p):
    """Return R² adjusted for sample size `n` and predictor count `p`.

    Evaluates ``1 - (1 - R²) * (n - 1) / (n - p - 1)`` with R² taken from
    ``r2_score(y_true, y_pred)``.
    """
    unexplained_share = 1 - r2_score(y_true, y_pred)
    scaled = unexplained_share * (n - 1)
    return 1 - scaled / (n - p - 1)

# Utility: Calculate OOS R² for NN1
def calculate_oos_r2_nn1(model, X_test, y_test):
    """Score `model` on held-out data: R² of ``model.predict(X_test)`` against `y_test`."""
    predictions = model.predict(X_test)
    oos_r2 = r2_score(y_test, predictions)
    return oos_r2

# Neural Network (NN1) model with hyperparameter tuning
def nn1_model_with_tuning(X_train, y_train):
    """Grid-search an MLPRegressor with one 32-unit hidden layer and return the best fit.

    The search covers activation ('relu', 'tanh'), max_iter (200, 500) and
    learning_rate_init (0.001, 0.01), scored by negative mean squared error
    with 5-fold cross-validation on all available cores.
    """
    search_space = dict(
        hidden_layer_sizes=[(32,)],  # One hidden layer with 32 neurons
        activation=['relu', 'tanh'],
        max_iter=[200, 500],
        learning_rate_init=[0.001, 0.01],
    )
    base_network = MLPRegressor(random_state=42)
    tuner = GridSearchCV(
        base_network,
        search_space,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    tuner.fit(X_train, y_train)
    return tuner.best_estimator_

# NN1 model with OOS R² loss and tuning
def nn1_model_with_tuning_loss(train_df, test_df, lag=5):
    """Tune NN1 on a single lag feature and report its out-of-sample R².

    The only regressor is the column ``f'lag_{lag}'`` and the target is
    'directional_target'; rows missing either are dropped from both frames.
    Returns a one-row DataFrame with the columns 'Lag' and 'OOS R²'.
    """
    feature = f'lag_{lag}'
    required = [feature, 'directional_target']
    usable_train = train_df.dropna(subset=required)
    usable_test = test_df.dropna(subset=required)

    # Tune and train NN1 model
    fitted = nn1_model_with_tuning(usable_train[[feature]], usable_train['directional_target'])

    # Calculate OOS R²
    score = calculate_oos_r2_nn1(fitted, usable_test[[feature]], usable_test['directional_target'])

    return pd.DataFrame({'Lag': [lag], 'OOS R²': [score]})

# Main function to calculate OOS R² for NN1 with market cap segmentation
def calculate_oos_r2_market_cap_nn1(train_df, test_df, valid_lags):
    """Tabulate NN1 out-of-sample R² per lag for three training universes.

    For every lag in `valid_lags` a model is tuned on (a) `train_df`, (b) the
    rows of `merged_df` at or above the 75th percentile of 'market_cap' and
    (c) the rows at or below the 25th percentile. Each model is scored on the
    full `test_df`. Returns a DataFrame with the columns 'Lag',
    'All Stocks R²', 'Top 25% R²' and 'Bottom 25% R²'.
    """
    # NOTE(review): the size buckets are cut from the module-level `merged_df`,
    # not from `train_df`, and every model is scored on the unsegmented
    # `test_df` -- confirm both are intended.
    cap = merged_df['market_cap']
    upper_cut = cap.quantile(0.75)
    lower_cut = cap.quantile(0.25)
    large_caps = merged_df[cap >= upper_cut]
    small_caps = merged_df[cap <= lower_cut]

    # Output column label -> training universe, in reporting order.
    segments = {
        'All Stocks R²': train_df,
        'Top 25% R²': large_caps,
        'Bottom 25% R²': small_caps,
    }
    collected = {label: pd.DataFrame() for label in segments}

    for lag in valid_lags:
        for label, segment_train in segments.items():
            scored = nn1_model_with_tuning_loss(segment_train, test_df, lag)
            collected[label] = pd.concat([collected[label], scored], ignore_index=True)

    # Assemble one row per lag with one R² column per universe.
    summary = {'Lag': valid_lags}
    for label, frame in collected.items():
        summary[label] = frame['OOS R²'].values
    return pd.DataFrame(summary)

# Lags to evaluate; each one selects the feature column f'lag_{lag}'
valid_lags = [5, 21, 252, 512]

# Build the OOS R² table for NN1 (all stocks, top 25% and bottom 25% by market cap)
nn1_oos_r2_results = calculate_oos_r2_market_cap_nn1(crsp_train_lagged, crsp_test_lagged, valid_lags)

# Render the table: one row per lag, one R² column per universe
display(nn1_oos_r2_results)


## NN1 Portfolio

### Lag 5

In [None]:
# Flat transaction cost of 10 bps (0.001)
def calculate_transaction_cost(row):
    """Return the transaction cost charged for one row.

    The value is a constant 0.001 (10 bps); `row` is accepted only so the
    function has the shape expected by a row-wise callback and is never read.
    """
    flat_cost = 0.001  # same 10 bps for small and large cap stocks
    return flat_cost

crsp_test_lagged['transaction_cost'] = crsp_test_lagged.apply(calculate_transaction_cost, axis=1)

# Use the previously defined NN1 model to predict excess returns
def nn1_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run NN1 for lag 5 and store its predictions under a lag-specific column.

    NOTE(review): ``neural_network_nn1`` is expected to write its predictions
    into ``crsp_test_lagged['predicted_excess_returns']`` in place; its return
    value is not used here — confirm against that function.

    Args:
        crsp_train_lagged: Training frame passed to ``neural_network_nn1``.
        crsp_test_lagged: Test frame passed to ``neural_network_nn1``; it is
            modified in place and also returned.
        lags: List of lags passed to ``neural_network_nn1``. Defaults to
            ``[5]``; a fresh list is created per call so the default can never
            be shared or mutated across calls.

    Returns:
        ``crsp_test_lagged`` with the added column
        ``nn1_5_predicted_excess_returns``.
    """
    if lags is None:
        lags = [5]

    # Called for its effect on crsp_test_lagged (see NOTE above)
    neural_network_nn1(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the generic prediction column to a lag-specific one, so a later
    # run for another lag cannot overwrite the values used for this portfolio
    crsp_test_lagged['nn1_5_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Get predicted excess returns using NN1 model
# (rebinds crsp_test_lagged to the frame returned by the helper, which now carries
# the 'nn1_5_predicted_excess_returns' column used for the lag-5 portfolio sort)
crsp_test_lagged = nn1_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group):
    """Compute one cross-section's long, short and long-short log returns.

    The rows of ``group`` are ranked on ``nn1_5_predicted_excess_returns``.
    The top 10% form the long leg and the bottom 10% the short leg. Each leg
    is evaluated equal-weighted and value-weighted (weights from
    ``market_cap_merged``), with and without the mean ``transaction_cost`` of
    the leg deducted. Leg returns are averages of ``log1p(adjusted_ret)``.

    Args:
        group: DataFrame with the columns ``nn1_5_predicted_excess_returns``,
            ``adjusted_ret``, ``market_cap_merged`` and ``transaction_cost``.

    Returns:
        dict with 12 entries: ``{equal,value}_{long,short,long_short}_log_return_{with,without}_cost``.
        Short-leg figures are sign-flipped, so a falling short leg is a gain.
    """
    signal_column = 'nn1_5_predicted_excess_returns'

    # int() truncates, so a group with fewer than 10 rows would otherwise give
    # empty legs and NaN returns that poison every later cumulative sum.
    leg_size = max(1, int(0.1 * len(group)))
    top_positive = group.nlargest(leg_size, signal_column)
    top_negative = group.nsmallest(leg_size, signal_column)

    def _leg_figures(leg):
        # Equal-weighted log return, cap-weighted log return and mean cost of one leg
        log_returns = np.log1p(leg['adjusted_ret'])
        market_caps = leg['market_cap_merged']
        equal_weighted = log_returns.mean()
        value_weighted = (log_returns * market_caps).sum() / market_caps.sum()
        return equal_weighted, value_weighted, leg['transaction_cost'].mean()

    long_equal, long_value, long_cost = _leg_figures(top_positive)
    short_equal, short_value, short_cost = _leg_figures(top_negative)

    # Long leg: the cost reduces the return
    equal_long_log_return_without_cost = long_equal
    equal_long_log_return_with_cost = long_equal - long_cost
    value_long_log_return_without_cost = long_value
    value_long_log_return_with_cost = long_value - long_cost

    # Short leg: the position gains when the stocks fall; the cost still reduces the return
    equal_short_log_return_without_cost = -short_equal
    equal_short_log_return_with_cost = -short_equal - short_cost
    value_short_log_return_without_cost = -short_value
    value_short_log_return_with_cost = -(short_value + short_cost)

    # Long-short: sum of both legs (so the with-cost variant pays the cost on each leg)
    equal_long_short_log_return_with_cost = equal_long_log_return_with_cost + equal_short_log_return_with_cost
    equal_long_short_log_return_without_cost = equal_long_log_return_without_cost + equal_short_log_return_without_cost
    value_long_short_log_return_with_cost = value_long_log_return_with_cost + value_short_log_return_with_cost
    value_long_short_log_return_without_cost = value_long_log_return_without_cost + value_short_log_return_without_cost

    return {
        'equal_long_log_return_with_cost': equal_long_log_return_with_cost,
        'equal_short_log_return_with_cost': equal_short_log_return_with_cost,
        'equal_long_short_log_return_with_cost': equal_long_short_log_return_with_cost,
        'equal_long_log_return_without_cost': equal_long_log_return_without_cost,
        'equal_short_log_return_without_cost': equal_short_log_return_without_cost,
        'equal_long_short_log_return_without_cost': equal_long_short_log_return_without_cost,
        'value_long_log_return_with_cost': value_long_log_return_with_cost,
        'value_short_log_return_with_cost': value_short_log_return_with_cost,
        'value_long_short_log_return_with_cost': value_long_short_log_return_with_cost,
        'value_long_log_return_without_cost': value_long_log_return_without_cost,
        'value_short_log_return_without_cost': value_short_log_return_without_cost,
        'value_long_short_log_return_without_cost': value_long_short_log_return_without_cost
    }

# Compute cumulative returns for each date with daily rebalancing.
# Output column -> key of the matching daily figure returned by compute_returns().
# The insertion order fixes the column order of the resulting DataFrame.
_nn15_return_keys = {
    'cum_EL_return_5_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_5_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_5_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_5_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_5_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_5_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_5_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_5_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_5_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_5_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_5_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_5_without_cost': 'value_long_short_log_return_without_cost',
}

cumulative_log_returns_by_date_nn15 = {'date': []}
cumulative_log_returns_by_date_nn15.update({column: [] for column in _nn15_return_keys})

# Running sums of the daily log returns for lag 5, all starting at 0
_nn15_running = dict.fromkeys(_nn15_return_keys, 0)

# Iterate over each date to compute returns for lag 5 portfolios.
# groupby splits the frame in a single pass instead of re-scanning every row
# for every date; sort=False keeps the dates in order of first appearance.
# Rows whose date is missing are not assigned to any group.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 5
    returns = compute_returns(group)

    # Update the running totals and record the level reached on this date
    cumulative_log_returns_by_date_nn15['date'].append(date)
    for column, key in _nn15_return_keys.items():
        _nn15_running[column] += returns[key]
        cumulative_log_returns_by_date_nn15[column].append(_nn15_running[column])

# Expose the final totals under their individual names
cum_EL_return_5_with_cost = _nn15_running['cum_EL_return_5_with_cost']
cum_ES_return_5_with_cost = _nn15_running['cum_ES_return_5_with_cost']
cum_ELS_return_5_with_cost = _nn15_running['cum_ELS_return_5_with_cost']
cum_VL_return_5_with_cost = _nn15_running['cum_VL_return_5_with_cost']
cum_VS_return_5_with_cost = _nn15_running['cum_VS_return_5_with_cost']
cum_VLS_return_5_with_cost = _nn15_running['cum_VLS_return_5_with_cost']

cum_EL_return_5_without_cost = _nn15_running['cum_EL_return_5_without_cost']
cum_ES_return_5_without_cost = _nn15_running['cum_ES_return_5_without_cost']
cum_ELS_return_5_without_cost = _nn15_running['cum_ELS_return_5_without_cost']
cum_VL_return_5_without_cost = _nn15_running['cum_VL_return_5_without_cost']
cum_VS_return_5_without_cost = _nn15_running['cum_VS_return_5_without_cost']
cum_VLS_return_5_without_cost = _nn15_running['cum_VLS_return_5_without_cost']

# Convert to DataFrame for lag 5
cumulative_log_returns_nn1_lag_5 = pd.DataFrame(cumulative_log_returns_by_date_nn15)

# Display the cumulative returns DataFrame for lag 5
display(cumulative_log_returns_nn1_lag_5.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period returns into a single total return.

    Every value ``r`` becomes a growth factor ``1 + r``; the product of all
    factors minus one is returned. An empty input yields 0.
    """
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize a series of per-period returns by geometric compounding.

    Args:
        daily_returns: Array of per-period returns.
        periods: Number of periods per year (252 by default).

    Returns:
        The compounded growth raised to ``periods / len(daily_returns)``,
        minus one. NaN when ``daily_returns`` is empty, because no
        per-period growth rate can be derived from zero observations
        (previously this raised ``ZeroDivisionError``).
    """
    n_obs = len(daily_returns)
    if n_obs == 0:
        return float('nan')

    # Total growth factor over the sample, then rescaled to one year
    total_growth = np.prod(1 + daily_returns)
    return total_growth ** (periods / n_obs) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized Sharpe ratio of per-period returns.

    The annual ``risk_free_rate`` is spread evenly over ``periods`` and
    subtracted from every return. The mean of the excess returns divided by
    their standard deviation is then scaled by ``sqrt(periods)``. The standard
    deviation comes from the input's own ``.std()`` method.
    """
    per_period_rf = risk_free_rate / periods  # Assuming `periods` trading days per year
    excess = daily_returns - per_period_rf
    mean_excess = excess.mean()
    dispersion = excess.std()
    return np.sqrt(periods) * mean_excess / dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    return np.std(daily_returns) * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Largest peak-to-trough decline of the compounded wealth path.

    The wealth path starts at 1.0 before the first return, so a loss in the
    very first period counts as a drawdown from the initial capital. Without
    that starting point the first observation is always its own peak and an
    opening loss is never measured.

    Args:
        daily_returns: Array of per-period simple returns.

    Returns:
        The most negative value of ``(wealth - running_peak) / running_peak``;
        0.0 when the path never falls below a previous peak (including for an
        empty input).
    """
    # Prepend the initial capital so it takes part in the running maximum
    wealth = np.concatenate(([1.0], np.cumprod(1 + daily_returns)))
    peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - peak) / peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return the returns with a constant ``transaction_cost`` deducted from each period."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by ``1 - transaction_cost_percentage``.

    NOTE(review): this shrinks gains and losses alike towards zero rather
    than deducting a cost — confirm this is the intended cost model.
    """
    retained_fraction = 1 - transaction_cost_percentage
    return daily_returns * retained_fraction

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_5_with_cost', 'cum_ES_return_5_with_cost', 'cum_ELS_return_5_with_cost',
    'cum_VL_return_5_with_cost', 'cum_VS_return_5_with_cost', 'cum_VLS_return_5_with_cost',
]

# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_5_without_cost', 'cum_ES_return_5_without_cost', 'cum_ELS_return_5_without_cost',
    'cum_VL_return_5_without_cost', 'cum_VS_return_5_without_cost', 'cum_VLS_return_5_without_cost',
]


def _nn15_metrics(portfolio_columns):
    """Compute performance metrics for columns of ``cumulative_log_returns_nn1_lag_5``.

    Args:
        portfolio_columns: Names of the cumulative-log-return columns to evaluate.

    Returns:
        dict of lists (one entry per portfolio) with the keys ``Portfolio``,
        ``Annualized Return``, ``Sharpe Ratio``, ``Volatility``,
        ``Standard Deviation``, ``Max Drawdown`` and ``Cumulative Return``.
    """
    table = {
        'Portfolio': [],
        'Annualized Return': [],
        'Sharpe Ratio': [],
        'Volatility': [],
        'Standard Deviation': [],
        'Max Drawdown': [],
        'Cumulative Return': []
    }

    for portfolio in portfolio_columns:
        cumulative_log_returns = cumulative_log_returns_nn1_lag_5[portfolio].values

        # The first cumulative value already is the first day's return, so
        # difference against a starting level of 0; a plain diff would drop
        # that first day from every metric.
        daily_log_returns = np.diff(cumulative_log_returns, prepend=0.0)

        # The metric helpers compound with prod(1 + r), i.e. they expect simple
        # returns, so convert the log returns first.
        daily_returns = np.expm1(daily_log_returns)

        # Store results
        table['Portfolio'].append(portfolio)
        table['Annualized Return'].append(annualized_return(daily_returns))
        table['Sharpe Ratio'].append(sharpe_ratio(daily_returns))
        table['Volatility'].append(calculate_volatility(daily_returns))
        table['Standard Deviation'].append(np.std(daily_returns))
        table['Max Drawdown'].append(maximum_drawdown(daily_returns))
        table['Cumulative Return'].append(cumulative_return(daily_returns))

    return table


# Calculate metrics for each portfolio with transaction cost.
# The '*_with_cost' columns were already reduced by the transaction cost when
# the daily portfolio returns were built, so no further deduction is applied
# here; deducting again would charge the cost twice.
metrics = _nn15_metrics(portfolios_with_cost)

# Convert the results into a DataFrame for analysis
metrics_nn15_c = pd.DataFrame(metrics)
display(metrics_nn15_c)


# Calculate metrics for each portfolio without transaction cost
metrics_wc = _nn15_metrics(portfolios_without_cost)

# Convert the results into a DataFrame for analysis
metrics_nn15_wc = pd.DataFrame(metrics_wc)
display(metrics_nn15_wc)


### Lag 21

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost of 10 bps.

    Args:
        row: Ignored. Kept so the function can still be used as a row-wise
            ``DataFrame.apply`` callback.

    Returns:
        float: 0.001, identical for every stock.
    """
    return 0.001  # 10 bps for both small and large cap stocks


# The cost does not depend on the row, so broadcast the scalar to the whole
# column instead of invoking the function once per row via apply(axis=1).
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined NN1 model to predict excess returns
def nn1_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run NN1 for lag 21 and store its predictions under a lag-specific column.

    NOTE(review): ``neural_network_nn1`` is expected to write its predictions
    into ``crsp_test_lagged['predicted_excess_returns']`` in place; its return
    value is not used here — confirm against that function.

    Args:
        crsp_train_lagged: Training frame passed to ``neural_network_nn1``.
        crsp_test_lagged: Test frame passed to ``neural_network_nn1``; it is
            modified in place and also returned.
        lags: List of lags passed to ``neural_network_nn1``. Defaults to
            ``[21]``; a fresh list is created per call so the default can
            never be shared or mutated across calls.

    Returns:
        ``crsp_test_lagged`` with the added column
        ``nn1_21_predicted_excess_returns``.
    """
    if lags is None:
        lags = [21]

    # Called for its effect on crsp_test_lagged (see NOTE above)
    neural_network_nn1(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the generic prediction column to a lag-specific one, so a later
    # run for another lag cannot overwrite the values used for this portfolio
    crsp_test_lagged['nn1_21_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Get predicted excess returns using NN1 model
# (rebinds crsp_test_lagged to the frame returned by the helper, which now carries
# the 'nn1_21_predicted_excess_returns' column used for the lag-21 portfolio sort)
crsp_test_lagged = nn1_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group):
    """Compute one cross-section's long, short and long-short log returns.

    The rows of ``group`` are ranked on ``nn1_21_predicted_excess_returns``.
    The top 10% form the long leg and the bottom 10% the short leg. Each leg
    is evaluated equal-weighted and value-weighted (weights from
    ``market_cap_merged``), with and without the mean ``transaction_cost`` of
    the leg deducted. Leg returns are averages of ``log1p(adjusted_ret)``.

    Args:
        group: DataFrame with the columns ``nn1_21_predicted_excess_returns``,
            ``adjusted_ret``, ``market_cap_merged`` and ``transaction_cost``.

    Returns:
        dict with 12 entries: ``{equal,value}_{long,short,long_short}_log_return_{with,without}_cost``.
        Short-leg figures are sign-flipped, so a falling short leg is a gain.
    """
    signal_column = 'nn1_21_predicted_excess_returns'

    # int() truncates, so a group with fewer than 10 rows would otherwise give
    # empty legs and NaN returns that poison every later cumulative sum.
    leg_size = max(1, int(0.1 * len(group)))
    top_positive = group.nlargest(leg_size, signal_column)
    top_negative = group.nsmallest(leg_size, signal_column)

    def _leg_figures(leg):
        # Equal-weighted log return, cap-weighted log return and mean cost of one leg
        log_returns = np.log1p(leg['adjusted_ret'])
        market_caps = leg['market_cap_merged']
        equal_weighted = log_returns.mean()
        value_weighted = (log_returns * market_caps).sum() / market_caps.sum()
        return equal_weighted, value_weighted, leg['transaction_cost'].mean()

    long_equal, long_value, long_cost = _leg_figures(top_positive)
    short_equal, short_value, short_cost = _leg_figures(top_negative)

    # Long leg: the cost reduces the return
    equal_long_log_return_without_cost = long_equal
    equal_long_log_return_with_cost = long_equal - long_cost
    value_long_log_return_without_cost = long_value
    value_long_log_return_with_cost = long_value - long_cost

    # Short leg: the position gains when the stocks fall; the cost still reduces the return
    equal_short_log_return_without_cost = -short_equal
    equal_short_log_return_with_cost = -short_equal - short_cost
    value_short_log_return_without_cost = -short_value
    value_short_log_return_with_cost = -(short_value + short_cost)

    # Long-short: sum of both legs (so the with-cost variant pays the cost on each leg)
    equal_long_short_log_return_with_cost = equal_long_log_return_with_cost + equal_short_log_return_with_cost
    equal_long_short_log_return_without_cost = equal_long_log_return_without_cost + equal_short_log_return_without_cost
    value_long_short_log_return_with_cost = value_long_log_return_with_cost + value_short_log_return_with_cost
    value_long_short_log_return_without_cost = value_long_log_return_without_cost + value_short_log_return_without_cost

    return {
        'equal_long_log_return_with_cost': equal_long_log_return_with_cost,
        'equal_short_log_return_with_cost': equal_short_log_return_with_cost,
        'equal_long_short_log_return_with_cost': equal_long_short_log_return_with_cost,
        'equal_long_log_return_without_cost': equal_long_log_return_without_cost,
        'equal_short_log_return_without_cost': equal_short_log_return_without_cost,
        'equal_long_short_log_return_without_cost': equal_long_short_log_return_without_cost,
        'value_long_log_return_with_cost': value_long_log_return_with_cost,
        'value_short_log_return_with_cost': value_short_log_return_with_cost,
        'value_long_short_log_return_with_cost': value_long_short_log_return_with_cost,
        'value_long_log_return_without_cost': value_long_log_return_without_cost,
        'value_short_log_return_without_cost': value_short_log_return_without_cost,
        'value_long_short_log_return_without_cost': value_long_short_log_return_without_cost
    }

# Compute cumulative returns for each date with daily rebalancing.
# Output column -> key of the matching daily figure returned by compute_returns().
# The insertion order fixes the column order of the resulting DataFrame.
_nn121_return_keys = {
    'cum_EL_return_21_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_21_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_21_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_21_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_21_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_21_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_21_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_21_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_21_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_21_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_21_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_21_without_cost': 'value_long_short_log_return_without_cost',
}

cumulative_log_returns_by_date_nn121 = {'date': []}
cumulative_log_returns_by_date_nn121.update({column: [] for column in _nn121_return_keys})

# Running sums of the daily log returns for lag 21, all starting at 0
_nn121_running = dict.fromkeys(_nn121_return_keys, 0)

# Iterate over each date to compute returns for lag 21 portfolios.
# groupby splits the frame in a single pass instead of re-scanning every row
# for every date; sort=False keeps the dates in order of first appearance.
# Rows whose date is missing are not assigned to any group.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 21
    returns = compute_returns(group)

    # Update the running totals and record the level reached on this date
    cumulative_log_returns_by_date_nn121['date'].append(date)
    for column, key in _nn121_return_keys.items():
        _nn121_running[column] += returns[key]
        cumulative_log_returns_by_date_nn121[column].append(_nn121_running[column])

# Expose the final totals under their individual names
cum_EL_return_21_with_cost = _nn121_running['cum_EL_return_21_with_cost']
cum_ES_return_21_with_cost = _nn121_running['cum_ES_return_21_with_cost']
cum_ELS_return_21_with_cost = _nn121_running['cum_ELS_return_21_with_cost']
cum_VL_return_21_with_cost = _nn121_running['cum_VL_return_21_with_cost']
cum_VS_return_21_with_cost = _nn121_running['cum_VS_return_21_with_cost']
cum_VLS_return_21_with_cost = _nn121_running['cum_VLS_return_21_with_cost']

cum_EL_return_21_without_cost = _nn121_running['cum_EL_return_21_without_cost']
cum_ES_return_21_without_cost = _nn121_running['cum_ES_return_21_without_cost']
cum_ELS_return_21_without_cost = _nn121_running['cum_ELS_return_21_without_cost']
cum_VL_return_21_without_cost = _nn121_running['cum_VL_return_21_without_cost']
cum_VS_return_21_without_cost = _nn121_running['cum_VS_return_21_without_cost']
cum_VLS_return_21_without_cost = _nn121_running['cum_VLS_return_21_without_cost']

# Convert to DataFrame for lag 21
cumulative_log_returns_nn1_lag_21 = pd.DataFrame(cumulative_log_returns_by_date_nn121)

# Display the cumulative returns DataFrame for lag 21
display(cumulative_log_returns_nn1_lag_21.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period returns into a single total return.

    Every value ``r`` becomes a growth factor ``1 + r``; the product of all
    factors minus one is returned. An empty input yields 0.
    """
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize a series of per-period returns by geometric compounding.

    Args:
        daily_returns: Array of per-period returns.
        periods: Number of periods per year (252 by default).

    Returns:
        The compounded growth raised to ``periods / len(daily_returns)``,
        minus one. NaN when ``daily_returns`` is empty, because no
        per-period growth rate can be derived from zero observations
        (previously this raised ``ZeroDivisionError``).
    """
    n_obs = len(daily_returns)
    if n_obs == 0:
        return float('nan')

    # Total growth factor over the sample, then rescaled to one year
    total_growth = np.prod(1 + daily_returns)
    return total_growth ** (periods / n_obs) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized Sharpe ratio of per-period returns.

    The annual ``risk_free_rate`` is spread evenly over ``periods`` and
    subtracted from every return. The mean of the excess returns divided by
    their standard deviation is then scaled by ``sqrt(periods)``. The standard
    deviation comes from the input's own ``.std()`` method.
    """
    per_period_rf = risk_free_rate / periods  # Assuming `periods` trading days per year
    excess = daily_returns - per_period_rf
    mean_excess = excess.mean()
    dispersion = excess.std()
    return np.sqrt(periods) * mean_excess / dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    return np.std(daily_returns) * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Largest peak-to-trough decline of the compounded wealth path.

    The wealth path starts at 1.0 before the first return, so a loss in the
    very first period counts as a drawdown from the initial capital. Without
    that starting point the first observation is always its own peak and an
    opening loss is never measured.

    Args:
        daily_returns: Array of per-period simple returns.

    Returns:
        The most negative value of ``(wealth - running_peak) / running_peak``;
        0.0 when the path never falls below a previous peak (including for an
        empty input).
    """
    # Prepend the initial capital so it takes part in the running maximum
    wealth = np.concatenate(([1.0], np.cumprod(1 + daily_returns)))
    peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - peak) / peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return the returns with a constant ``transaction_cost`` deducted from each period."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by ``1 - transaction_cost_percentage``.

    NOTE(review): this shrinks gains and losses alike towards zero rather
    than deducting a cost — confirm this is the intended cost model.
    """
    retained_fraction = 1 - transaction_cost_percentage
    return daily_returns * retained_fraction

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_21_with_cost', 'cum_ES_return_21_with_cost', 'cum_ELS_return_21_with_cost',
    'cum_VL_return_21_with_cost', 'cum_VS_return_21_with_cost', 'cum_VLS_return_21_with_cost',
]

# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_21_without_cost', 'cum_ES_return_21_without_cost', 'cum_ELS_return_21_without_cost',
    'cum_VL_return_21_without_cost', 'cum_VS_return_21_without_cost', 'cum_VLS_return_21_without_cost',
]


def _nn121_metrics(portfolio_columns):
    """Compute performance metrics for columns of ``cumulative_log_returns_nn1_lag_21``.

    Args:
        portfolio_columns: Names of the cumulative-log-return columns to evaluate.

    Returns:
        dict of lists (one entry per portfolio) with the keys ``Portfolio``,
        ``Annualized Return``, ``Sharpe Ratio``, ``Volatility``,
        ``Standard Deviation``, ``Max Drawdown`` and ``Cumulative Return``.
    """
    table = {
        'Portfolio': [],
        'Annualized Return': [],
        'Sharpe Ratio': [],
        'Volatility': [],
        'Standard Deviation': [],
        'Max Drawdown': [],
        'Cumulative Return': []
    }

    for portfolio in portfolio_columns:
        cumulative_log_returns = cumulative_log_returns_nn1_lag_21[portfolio].values

        # The first cumulative value already is the first day's return, so
        # difference against a starting level of 0; a plain diff would drop
        # that first day from every metric.
        daily_log_returns = np.diff(cumulative_log_returns, prepend=0.0)

        # The metric helpers compound with prod(1 + r), i.e. they expect simple
        # returns, so convert the log returns first.
        daily_returns = np.expm1(daily_log_returns)

        # Store results
        table['Portfolio'].append(portfolio)
        table['Annualized Return'].append(annualized_return(daily_returns))
        table['Sharpe Ratio'].append(sharpe_ratio(daily_returns))
        table['Volatility'].append(calculate_volatility(daily_returns))
        table['Standard Deviation'].append(np.std(daily_returns))
        table['Max Drawdown'].append(maximum_drawdown(daily_returns))
        table['Cumulative Return'].append(cumulative_return(daily_returns))

    return table


# Calculate metrics for each portfolio with transaction cost.
# The '*_with_cost' columns were already reduced by the transaction cost when
# the daily portfolio returns were built, so no further deduction is applied
# here; deducting again would charge the cost twice.
metrics = _nn121_metrics(portfolios_with_cost)

# Convert the results into a DataFrame for analysis
metrics_nn121_c = pd.DataFrame(metrics)
display(metrics_nn121_c)


# Calculate metrics for each portfolio without transaction cost
metrics_wc = _nn121_metrics(portfolios_without_cost)

# Convert the results into a DataFrame for analysis
metrics_nn121_wc = pd.DataFrame(metrics_wc)
display(metrics_nn121_wc)

### Lag 252

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost of 10 bps.

    Args:
        row: Ignored. Kept so the function can still be used as a row-wise
            ``DataFrame.apply`` callback.

    Returns:
        float: 0.001, identical for every stock.
    """
    return 0.001  # 10 bps for both small and large cap stocks


# The cost does not depend on the row, so broadcast the scalar to the whole
# column instead of invoking the function once per row via apply(axis=1).
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined NN1 model to predict excess returns
def nn1_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run NN1 for lag 252 and store its predictions under a lag-specific column.

    NOTE(review): ``neural_network_nn1`` is expected to write its predictions
    into ``crsp_test_lagged['predicted_excess_returns']`` in place; its return
    value is not used here — confirm against that function.

    Args:
        crsp_train_lagged: Training frame passed to ``neural_network_nn1``.
        crsp_test_lagged: Test frame passed to ``neural_network_nn1``; it is
            modified in place and also returned.
        lags: List of lags passed to ``neural_network_nn1``. Defaults to
            ``[252]``; a fresh list is created per call so the default can
            never be shared or mutated across calls.

    Returns:
        ``crsp_test_lagged`` with the added column
        ``nn1_252_predicted_excess_returns``.
    """
    if lags is None:
        lags = [252]

    # Called for its effect on crsp_test_lagged (see NOTE above)
    neural_network_nn1(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the generic prediction column to a lag-specific one, so a later
    # run for another lag cannot overwrite the values used for this portfolio
    crsp_test_lagged['nn1_252_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Get predicted excess returns using NN1 model
# (rebinds crsp_test_lagged to the frame returned by the helper, which now carries
# the 'nn1_252_predicted_excess_returns' column used for the lag-252 portfolio sort)
crsp_test_lagged = nn1_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group):
    """Compute one cross-section's long, short and long-short log returns.

    The long leg is the top 10% of rows by ``'nn1_252_predicted_excess_returns'``
    and the short leg the bottom 10% (leg size is ``int(0.1 * len(group))``).
    Each leg is returned equal-weighted and value-weighted (weights from
    ``'market_cap_merged'``), with and without the mean ``'transaction_cost'``
    of the leg deducted.

    Args:
        group: DataFrame with the columns ``'nn1_252_predicted_excess_returns'``,
            ``'adjusted_ret'``, ``'transaction_cost'`` and ``'market_cap_merged'``.

    Returns:
        dict with twelve entries keyed
        ``'{equal|value}_{long|short|long_short}_log_return_{with|without}_cost'``.
        With fewer than 10 rows both legs are empty and the values are NaN.
    """
    signal_col = 'nn1_252_predicted_excess_returns'
    leg_size = int(0.1 * len(group))
    longs = group.nlargest(leg_size, signal_col)
    shorts = group.nsmallest(leg_size, signal_col)

    # Per-stock log returns and the average cost of each leg, computed once.
    long_log = np.log1p(longs['adjusted_ret'])
    short_log = np.log1p(shorts['adjusted_ret'])
    long_cost = longs['transaction_cost'].mean()
    short_cost = shorts['transaction_cost'].mean()

    # Equal-weighted legs; the short leg earns the negative of the stock return.
    el_gross = long_log.mean()
    el_net = el_gross - long_cost
    es_gross = -short_log.mean()
    es_net = es_gross - short_cost

    # Value-weighted legs: market-cap-weighted average of the log returns.
    long_caps = longs['market_cap_merged']
    short_caps = shorts['market_cap_merged']
    vl_gross = (long_log * long_caps).sum() / long_caps.sum()
    vl_net = vl_gross - long_cost
    short_weighted = (short_log * short_caps).sum() / short_caps.sum()
    vs_gross = -short_weighted
    vs_net = -(short_weighted + short_cost)

    # Long-short is the sum of both legs, so the net version pays cost twice.
    return {
        'equal_long_log_return_with_cost': el_net,
        'equal_short_log_return_with_cost': es_net,
        'equal_long_short_log_return_with_cost': el_net + es_net,
        'equal_long_log_return_without_cost': el_gross,
        'equal_short_log_return_without_cost': es_gross,
        'equal_long_short_log_return_without_cost': el_gross + es_gross,
        'value_long_log_return_with_cost': vl_net,
        'value_short_log_return_with_cost': vs_net,
        'value_long_short_log_return_with_cost': vl_net + vs_net,
        'value_long_log_return_without_cost': vl_gross,
        'value_short_log_return_without_cost': vs_gross,
        'value_long_short_log_return_without_cost': vl_gross + vs_gross,
    }

# Running (cumulative) log returns per date with daily rebalancing, lag 252.
# Each output column is fed by one compute_returns key
# (E/V = equal-/value-weighted; L/S/LS = long, short, long-short).
_nn1_252_return_keys = {
    'cum_EL_return_252_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_252_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_252_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_252_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_252_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_252_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_252_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_252_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_252_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_252_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_252_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_252_without_cost': 'value_long_short_log_return_without_cost',
}

# One list per output column (plus the dates) and one running total per column.
cumulative_log_returns_by_date_nn1252 = {'date': []}
_nn1_252_running = {}
for _col in _nn1_252_return_keys:
    cumulative_log_returns_by_date_nn1252[_col] = []
    _nn1_252_running[_col] = 0

# Walk the dates in order of first appearance and accumulate each day's returns.
for date in crsp_test_lagged['date'].unique():
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]

    # Daily long/short returns for the lag 252 portfolios
    returns = compute_returns(group)

    cumulative_log_returns_by_date_nn1252['date'].append(date)
    for _col, _key in _nn1_252_return_keys.items():
        _nn1_252_running[_col] += returns[_key]
        cumulative_log_returns_by_date_nn1252[_col].append(_nn1_252_running[_col])

# Expose the final running totals under their individual names.
cum_EL_return_252_with_cost = _nn1_252_running['cum_EL_return_252_with_cost']
cum_ES_return_252_with_cost = _nn1_252_running['cum_ES_return_252_with_cost']
cum_ELS_return_252_with_cost = _nn1_252_running['cum_ELS_return_252_with_cost']
cum_VL_return_252_with_cost = _nn1_252_running['cum_VL_return_252_with_cost']
cum_VS_return_252_with_cost = _nn1_252_running['cum_VS_return_252_with_cost']
cum_VLS_return_252_with_cost = _nn1_252_running['cum_VLS_return_252_with_cost']

cum_EL_return_252_without_cost = _nn1_252_running['cum_EL_return_252_without_cost']
cum_ES_return_252_without_cost = _nn1_252_running['cum_ES_return_252_without_cost']
cum_ELS_return_252_without_cost = _nn1_252_running['cum_ELS_return_252_without_cost']
cum_VL_return_252_without_cost = _nn1_252_running['cum_VL_return_252_without_cost']
cum_VS_return_252_without_cost = _nn1_252_running['cum_VS_return_252_without_cost']
cum_VLS_return_252_without_cost = _nn1_252_running['cum_VLS_return_252_without_cost']

# Tabulate the per-date running totals for lag 252 and preview them
cumulative_log_returns_nn1_lag_252 = pd.DataFrame(cumulative_log_returns_by_date_nn1252)
display(cumulative_log_returns_nn1_lag_252.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period returns into a single total return.

    Args:
        daily_returns: Array-like of per-period returns that supports ``1 + x``.

    Returns:
        The product of ``1 + r`` over all periods, minus 1 (0.0 for empty input).
    """
    growth_factors = 1 + daily_returns
    return np.prod(growth_factors) - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of a per-period return series.

    Args:
        daily_returns: Array-like of per-period returns; must be non-empty.
        periods: Number of periods per year (252 by default).

    Returns:
        ``(1 + total) ** (periods / n) - 1`` where ``total`` is the compounded
        return over the ``n`` observations.

    Raises:
        ZeroDivisionError: If ``daily_returns`` is empty.
    """
    total = np.prod(1 + daily_returns) - 1
    years_exponent = periods / len(daily_returns)
    return (1 + total) ** years_exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized Sharpe ratio of a per-period return series.

    Args:
        daily_returns: Per-period returns; must support subtraction of a
            scalar and expose ``.mean()`` / ``.std()`` (e.g. a NumPy array).
        risk_free_rate: Annual risk-free rate, spread evenly over ``periods``.
        periods: Number of periods per year (252 by default).

    Returns:
        ``sqrt(periods) * mean(excess) / std(excess)``, where the standard
        deviation is whatever the input's own ``.std()`` computes.
    """
    per_period_rf = risk_free_rate / periods
    excess = daily_returns - per_period_rf
    return np.sqrt(periods) * excess.mean() / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Annualized volatility of a per-period return series.

    Args:
        daily_returns: Array-like of per-period returns.
        periods: Number of periods per year (252 by default).

    Returns:
        ``np.std(daily_returns)`` scaled by ``sqrt(periods)``.
    """
    per_period_sigma = np.std(daily_returns)
    return per_period_sigma * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Largest peak-to-trough decline of the compounded return path.

    Args:
        daily_returns: Non-empty array-like of per-period returns.

    Returns:
        The minimum of ``(value - running_peak) / running_peak`` along the
        compounded path: 0 when the path never falls below its peak,
        negative otherwise.
    """
    equity_curve = np.cumprod(1 + daily_returns)
    running_high = np.maximum.accumulate(equity_curve)
    return np.min((equity_curve - running_high) / running_high)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct a fixed cost from every per-period return.

    Args:
        daily_returns: Array-like of per-period returns.
        transaction_cost: Amount subtracted from each return (default 0.001).

    Returns:
        ``daily_returns - transaction_cost``, element-wise.
    """
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every per-period return down by a proportional cost.

    Args:
        daily_returns: Array-like of per-period returns.
        transaction_cost_percentage: Fraction of each return given up
            (default 0.001).

    Returns:
        ``daily_returns * (1 - transaction_cost_percentage)``, element-wise.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Portfolio columns whose daily values already have transaction costs netted out
portfolios_with_cost = [
    'cum_EL_return_252_with_cost', 'cum_ES_return_252_with_cost', 'cum_ELS_return_252_with_cost',
    'cum_VL_return_252_with_cost', 'cum_VS_return_252_with_cost', 'cum_VLS_return_252_with_cost',
]

# One list per reported metric; filled in portfolio order below
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_nn1252_c = cumulative_log_returns_nn1_lag_252[portfolio].values

    # Recover the per-day values from the running sum. The running sum starts
    # from 0, so its first entry is already day 1's return; prepend=0.0 keeps
    # that day, which a plain np.diff would drop.
    daily_returns = np.diff(cumulative_returns_nn1252_c, prepend=0.0)

    # No further cost is applied here: the *_with_cost columns were built by
    # compute_returns with the 10 bps cost already deducted from every leg, so
    # subtracting it again would charge the cost twice. All metrics, including
    # volatility and standard deviation, use this one net series.
    # NOTE(review): these are log returns, while the metric helpers compound
    # with (1 + r); confirm whether a conversion to simple returns is wanted.
    cum_return_after_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard deviation of the net daily returns (not annualized)
    std_dev = np.std(daily_returns)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_nn1252_c = pd.DataFrame(metrics)
display(metrics_nn1252_c)

# Same calculations for the portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_252_without_cost', 'cum_ES_return_252_without_cost', 'cum_ELS_return_252_without_cost',
    'cum_VL_return_252_without_cost', 'cum_VS_return_252_without_cost', 'cum_VLS_return_252_without_cost',
]

# One list per reported metric; filled in portfolio order below
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_nn1252_wc = cumulative_log_returns_nn1_lag_252[portfolio].values

    # Recover the per-day values from the running sum. The running sum starts
    # from 0, so its first entry is already day 1's return; prepend=0.0 keeps
    # that day, which a plain np.diff would drop.
    # NOTE(review): these are log returns, while the metric helpers compound
    # with (1 + r); confirm whether a conversion to simple returns is wanted.
    daily_returns = np.diff(cumulative_returns_nn1252_wc, prepend=0.0)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard deviation of the daily returns (not annualized)
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_nn1252_wc = pd.DataFrame(metrics_wc)
display(metrics_nn1252_wc)

### Lag 512

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost charged on a row.

    Args:
        row: The DataFrame row handed in by ``DataFrame.apply``; it is not
            inspected, so every row gets the same cost.

    Returns:
        0.001, i.e. 10 basis points.
    """
    flat_cost = 0.001  # 10 bps for both small and large cap stocks
    return flat_cost

# Store the per-row cost on the test frame; compute_returns below reads it
# back from the 'transaction_cost' column when netting costs out of each leg.
crsp_test_lagged['transaction_cost'] = crsp_test_lagged.apply(calculate_transaction_cost, axis=1)

# Use the previously defined `NN1` to predict excess returns
def nn1_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Copy the NN1 lag-512 predictions into a dedicated column of the test frame.

    Args:
        crsp_train_lagged: Training frame, passed through to ``neural_network_nn1``.
        crsp_test_lagged: Test frame; it is modified in place.
        lags: Lag list forwarded to ``neural_network_nn1``. Defaults to ``[512]``.

    Returns:
        The same ``crsp_test_lagged`` object, with the
        ``'nn1_512_predicted_excess_returns'`` column added.

    Raises:
        KeyError: If ``'predicted_excess_returns'`` is not present on the test
            frame after the model call.
    """
    # Build the default per call so no mutable list is shared between calls.
    if lags is None:
        lags = [512]

    # The model's return value is not used: the predictions are read from the
    # 'predicted_excess_returns' column of the test frame afterwards
    # (presumably written there by neural_network_nn1 -- TODO confirm).
    neural_network_nn1(crsp_train_lagged, crsp_test_lagged, lags)

    # Keep a lag-specific copy so later model runs do not overwrite it.
    crsp_test_lagged['nn1_512_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )
    return crsp_test_lagged

# Run the NN1 lag-512 wrapper; it returns the test frame with the
# 'nn1_512_predicted_excess_returns' column added, which is rebound here.
# NOTE(review): an earlier comment said the predictions should be binary; that
# cannot be verified from this cell -- confirm against neural_network_nn1.
crsp_test_lagged = nn1_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group):
    """Compute one cross-section's long, short and long-short log returns.

    The long leg is the top 10% of rows by ``'nn1_512_predicted_excess_returns'``
    and the short leg the bottom 10% (leg size is ``int(0.1 * len(group))``).
    Each leg is returned equal-weighted and value-weighted (weights from
    ``'market_cap_merged'``), with and without the mean ``'transaction_cost'``
    of the leg deducted.

    Args:
        group: DataFrame with the columns ``'nn1_512_predicted_excess_returns'``,
            ``'adjusted_ret'``, ``'transaction_cost'`` and ``'market_cap_merged'``.

    Returns:
        dict with twelve entries keyed
        ``'{equal|value}_{long|short|long_short}_log_return_{with|without}_cost'``.
        With fewer than 10 rows both legs are empty and the values are NaN.
    """
    signal_col = 'nn1_512_predicted_excess_returns'
    leg_size = int(0.1 * len(group))
    longs = group.nlargest(leg_size, signal_col)
    shorts = group.nsmallest(leg_size, signal_col)

    # Per-stock log returns and the average cost of each leg, computed once.
    long_log = np.log1p(longs['adjusted_ret'])
    short_log = np.log1p(shorts['adjusted_ret'])
    long_cost = longs['transaction_cost'].mean()
    short_cost = shorts['transaction_cost'].mean()

    # Equal-weighted legs; the short leg earns the negative of the stock return.
    el_gross = long_log.mean()
    el_net = el_gross - long_cost
    es_gross = -short_log.mean()
    es_net = es_gross - short_cost

    # Value-weighted legs: market-cap-weighted average of the log returns.
    long_caps = longs['market_cap_merged']
    short_caps = shorts['market_cap_merged']
    vl_gross = (long_log * long_caps).sum() / long_caps.sum()
    vl_net = vl_gross - long_cost
    short_weighted = (short_log * short_caps).sum() / short_caps.sum()
    vs_gross = -short_weighted
    vs_net = -(short_weighted + short_cost)

    # Long-short is the sum of both legs, so the net version pays cost twice.
    return {
        'equal_long_log_return_with_cost': el_net,
        'equal_short_log_return_with_cost': es_net,
        'equal_long_short_log_return_with_cost': el_net + es_net,
        'equal_long_log_return_without_cost': el_gross,
        'equal_short_log_return_without_cost': es_gross,
        'equal_long_short_log_return_without_cost': el_gross + es_gross,
        'value_long_log_return_with_cost': vl_net,
        'value_short_log_return_with_cost': vs_net,
        'value_long_short_log_return_with_cost': vl_net + vs_net,
        'value_long_log_return_without_cost': vl_gross,
        'value_short_log_return_without_cost': vs_gross,
        'value_long_short_log_return_without_cost': vl_gross + vs_gross,
    }

# Running (cumulative) log returns per date with daily rebalancing, lag 512.
# Each output column is fed by one compute_returns key
# (E/V = equal-/value-weighted; L/S/LS = long, short, long-short).
_nn1_512_return_keys = {
    'cum_EL_return_512_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_512_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_512_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_512_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_512_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_512_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_512_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_512_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_512_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_512_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_512_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_512_without_cost': 'value_long_short_log_return_without_cost',
}

# One list per output column (plus the dates) and one running total per column.
cumulative_log_returns_by_date_nn1512 = {'date': []}
_nn1_512_running = {}
for _col in _nn1_512_return_keys:
    cumulative_log_returns_by_date_nn1512[_col] = []
    _nn1_512_running[_col] = 0

# Walk the dates in order of first appearance and accumulate each day's returns.
for date in crsp_test_lagged['date'].unique():
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]

    # Daily long/short returns for the lag 512 portfolios
    returns = compute_returns(group)

    cumulative_log_returns_by_date_nn1512['date'].append(date)
    for _col, _key in _nn1_512_return_keys.items():
        _nn1_512_running[_col] += returns[_key]
        cumulative_log_returns_by_date_nn1512[_col].append(_nn1_512_running[_col])

# Expose the final running totals under their individual names.
cum_EL_return_512_with_cost = _nn1_512_running['cum_EL_return_512_with_cost']
cum_ES_return_512_with_cost = _nn1_512_running['cum_ES_return_512_with_cost']
cum_ELS_return_512_with_cost = _nn1_512_running['cum_ELS_return_512_with_cost']
cum_VL_return_512_with_cost = _nn1_512_running['cum_VL_return_512_with_cost']
cum_VS_return_512_with_cost = _nn1_512_running['cum_VS_return_512_with_cost']
cum_VLS_return_512_with_cost = _nn1_512_running['cum_VLS_return_512_with_cost']

cum_EL_return_512_without_cost = _nn1_512_running['cum_EL_return_512_without_cost']
cum_ES_return_512_without_cost = _nn1_512_running['cum_ES_return_512_without_cost']
cum_ELS_return_512_without_cost = _nn1_512_running['cum_ELS_return_512_without_cost']
cum_VL_return_512_without_cost = _nn1_512_running['cum_VL_return_512_without_cost']
cum_VS_return_512_without_cost = _nn1_512_running['cum_VS_return_512_without_cost']
cum_VLS_return_512_without_cost = _nn1_512_running['cum_VLS_return_512_without_cost']

# Tabulate the per-date running totals for lag 512 and preview them
cumulative_log_returns_nn1_lag_512 = pd.DataFrame(cumulative_log_returns_by_date_nn1512)
display(cumulative_log_returns_nn1_lag_512.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period returns into a single total return.

    Args:
        daily_returns: Array-like of per-period returns that supports ``1 + x``.

    Returns:
        The product of ``1 + r`` over all periods, minus 1 (0.0 for empty input).
    """
    growth_factors = 1 + daily_returns
    return np.prod(growth_factors) - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of a per-period return series.

    Args:
        daily_returns: Array-like of per-period returns; must be non-empty.
        periods: Number of periods per year (252 by default).

    Returns:
        ``(1 + total) ** (periods / n) - 1`` where ``total`` is the compounded
        return over the ``n`` observations.

    Raises:
        ZeroDivisionError: If ``daily_returns`` is empty.
    """
    total = np.prod(1 + daily_returns) - 1
    years_exponent = periods / len(daily_returns)
    return (1 + total) ** years_exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized Sharpe ratio of a per-period return series.

    Args:
        daily_returns: Per-period returns; must support subtraction of a
            scalar and expose ``.mean()`` / ``.std()`` (e.g. a NumPy array).
        risk_free_rate: Annual risk-free rate, spread evenly over ``periods``.
        periods: Number of periods per year (252 by default).

    Returns:
        ``sqrt(periods) * mean(excess) / std(excess)``, where the standard
        deviation is whatever the input's own ``.std()`` computes.
    """
    per_period_rf = risk_free_rate / periods
    excess = daily_returns - per_period_rf
    return np.sqrt(periods) * excess.mean() / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Annualized volatility of a per-period return series.

    Args:
        daily_returns: Array-like of per-period returns.
        periods: Number of periods per year (252 by default).

    Returns:
        ``np.std(daily_returns)`` scaled by ``sqrt(periods)``.
    """
    per_period_sigma = np.std(daily_returns)
    return per_period_sigma * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Largest peak-to-trough decline of the compounded return path.

    Args:
        daily_returns: Non-empty array-like of per-period returns.

    Returns:
        The minimum of ``(value - running_peak) / running_peak`` along the
        compounded path: 0 when the path never falls below its peak,
        negative otherwise.
    """
    equity_curve = np.cumprod(1 + daily_returns)
    running_high = np.maximum.accumulate(equity_curve)
    return np.min((equity_curve - running_high) / running_high)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct a fixed cost from every per-period return.

    Args:
        daily_returns: Array-like of per-period returns.
        transaction_cost: Amount subtracted from each return (default 0.001).

    Returns:
        ``daily_returns - transaction_cost``, element-wise.
    """
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every per-period return down by a proportional cost.

    Args:
        daily_returns: Array-like of per-period returns.
        transaction_cost_percentage: Fraction of each return given up
            (default 0.001).

    Returns:
        ``daily_returns * (1 - transaction_cost_percentage)``, element-wise.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Portfolio columns whose daily values already have transaction costs netted out
portfolios_with_cost = [
    'cum_EL_return_512_with_cost', 'cum_ES_return_512_with_cost', 'cum_ELS_return_512_with_cost',
    'cum_VL_return_512_with_cost', 'cum_VS_return_512_with_cost', 'cum_VLS_return_512_with_cost',
]

# One list per reported metric; filled in portfolio order below
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_nn1512_c = cumulative_log_returns_nn1_lag_512[portfolio].values

    # Recover the per-day values from the running sum. The running sum starts
    # from 0, so its first entry is already day 1's return; prepend=0.0 keeps
    # that day, which a plain np.diff would drop.
    daily_returns = np.diff(cumulative_returns_nn1512_c, prepend=0.0)

    # No further cost is applied here: the *_with_cost columns were built by
    # compute_returns with the 10 bps cost already deducted from every leg, so
    # subtracting it again would charge the cost twice. All metrics, including
    # volatility and standard deviation, use this one net series.
    # NOTE(review): these are log returns, while the metric helpers compound
    # with (1 + r); confirm whether a conversion to simple returns is wanted.
    cum_return_after_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard deviation of the net daily returns (not annualized)
    std_dev = np.std(daily_returns)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_nn1512_c = pd.DataFrame(metrics)
display(metrics_nn1512_c)


# Same calculations for the portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_512_without_cost', 'cum_ES_return_512_without_cost', 'cum_ELS_return_512_without_cost',
    'cum_VL_return_512_without_cost', 'cum_VS_return_512_without_cost', 'cum_VLS_return_512_without_cost',
]

# One list per reported metric; filled in portfolio order below
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_nn1512_wc = cumulative_log_returns_nn1_lag_512[portfolio].values

    # Recover the per-day values from the running sum. The running sum starts
    # from 0, so its first entry is already day 1's return; prepend=0.0 keeps
    # that day, which a plain np.diff would drop.
    # NOTE(review): these are log returns, while the metric helpers compound
    # with (1 + r); confirm whether a conversion to simple returns is wanted.
    daily_returns = np.diff(cumulative_returns_nn1512_wc, prepend=0.0)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard deviation of the daily returns (not annualized)
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_nn1512_wc = pd.DataFrame(metrics_wc)
display(metrics_nn1512_wc)

## NN2

In [None]:
# NN2, lag 5: call rolling_forecast_model with model_type='nn2' and lag=[5],
# then print a heading and display the returned result.
nn2_lag5 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='nn2', lag=[5])
print("NN2 Results (Lag 5):")
display(nn2_lag5)

In [None]:
# NN2, lag 21: call rolling_forecast_model with model_type='nn2' and lag=[21],
# then print a heading and display the returned result.
nn2_lag21 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='nn2', lag=[21])
print("NN2 Results (Lag 21):")
display(nn2_lag21)

In [None]:
# NN2, lag 252: call rolling_forecast_model with model_type='nn2' and lag=[252],
# then print a heading and display the returned result.
nn2_lag252 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='nn2', lag=[252])
print("NN2 Results (Lag 252):")
display(nn2_lag252)

In [None]:
# NN2, lag 512: call rolling_forecast_model with model_type='nn2' and lag=[512],
# then print a heading and display the returned result.
nn2_lag512 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='nn2', lag=[512])
print("NN2 Results (Lag 512):")
display(nn2_lag512)

### OOS R2

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd

# Utility: Adjusted R²
def adjusted_r2_score(y_true, y_pred, n, p):
    """Adjusted R² of a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values.
        n: Number of observations.
        p: Number of predictors.

    Returns:
        ``1 - (1 - R²) * (n - 1) / (n - p - 1)`` with R² from ``r2_score``.

    Raises:
        ZeroDivisionError: If ``n - p - 1`` is zero (for plain Python ints).
    """
    plain_r2 = r2_score(y_true, y_pred)
    shrinkage = (1 - plain_r2) * (n - 1) / (n - p - 1)
    return 1 - shrinkage

# Utility: Calculate OOS R² for NN2
def calculate_oos_r2_nn2(model, X_test, y_test):
    """Score a fitted model on held-out data with R².

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Held-out features.
        y_test: Held-out targets.

    Returns:
        ``r2_score(y_test, model.predict(X_test))``.
    """
    return r2_score(y_test, model.predict(X_test))

# NN2 Model with hyperparameter tuning
def nn2_model_with_tuning(X_train, y_train):
    """Grid-search an MLPRegressor with two hidden layers (32 and 16 neurons).

    The search covers activation, iteration cap and initial learning rate,
    scored by negative mean squared error with 5-fold cross-validation.

    Args:
        X_train: Training features.
        y_train: Training targets.

    Returns:
        The best estimator found by the grid search.
    """
    search_space = dict(
        hidden_layer_sizes=[(32, 16)],
        activation=['relu', 'tanh'],
        max_iter=[200, 500],
        learning_rate_init=[0.001, 0.01],
    )
    base_model = MLPRegressor(random_state=42)
    tuner = GridSearchCV(
        base_model,
        search_space,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,  # use all available cores
    )
    tuner.fit(X_train, y_train)
    return tuner.best_estimator_

# NN2 Model with OOS R² calculation
def nn2_model_with_tuning_loss(train_df, test_df, lag=5):
    """Tune NN2 on one lag feature and report its out-of-sample R².

    Rows with a missing ``lag_{lag}`` feature or ``'directional_target'`` are
    dropped from both frames before fitting and scoring.

    Args:
        train_df: Training frame with ``lag_{lag}`` and ``'directional_target'``.
        test_df: Test frame with the same columns.
        lag: Lag whose ``lag_{lag}`` column is the single feature (default 5).

    Returns:
        One-row DataFrame with the columns ``'Lag'`` and ``'OOS R²'``.
    """
    feature = f'lag_{lag}'
    required = [feature, 'directional_target']
    clean_train = train_df.dropna(subset=required)
    clean_test = test_df.dropna(subset=required)

    # Fit on the training rows, then score on the held-out rows.
    model = nn2_model_with_tuning(clean_train[[feature]], clean_train['directional_target'])
    oos_r2 = calculate_oos_r2_nn2(model, clean_test[[feature]], clean_test['directional_target'])

    return pd.DataFrame({'Lag': [lag], 'OOS R²': [oos_r2]})

# Main function to calculate OOS R² for NN2 with market cap segmentation
def calculate_oos_r2_market_cap_nn2(train_df, test_df, valid_lags):
    """Out-of-sample R² of NN2 per lag for all, large-cap and small-cap training sets.

    For every lag, three models are tuned via ``nn2_model_with_tuning_loss``:
    one trained on ``train_df``, one on the top market-cap quartile and one on
    the bottom quartile.

    Args:
        train_df: Training frame used for the "All Stocks" model.
        test_df: Test frame used to score all three models.
        valid_lags: Iterable of lags to evaluate.

    Returns:
        DataFrame with one row per lag and the columns ``'Lag'``,
        ``'All Stocks R²'``, ``'Top 25% R²'`` and ``'Bottom 25% R²'``
        (zero rows when ``valid_lags`` is empty).
    """
    # The quartile split comes from the notebook-level `merged_df`, not from
    # `train_df`.
    # NOTE(review): confirm that `merged_df` carries the lag/target columns and
    # does not overlap the test period; the segment models are also scored on
    # the full `test_df` rather than on the matching test segment.
    market_caps = merged_df['market_cap']
    upper_cutoff = market_caps.quantile(0.75)
    lower_cutoff = market_caps.quantile(0.25)
    top_25_stocks = merged_df[market_caps >= upper_cutoff]
    bottom_25_stocks = merged_df[market_caps <= lower_cutoff]

    training_sets = {
        'All Stocks R²': train_df,
        'Top 25% R²': top_25_stocks,
        'Bottom 25% R²': bottom_25_stocks,
    }

    # Collect scores in plain lists and build the frame once, instead of
    # re-concatenating three growing DataFrames on every iteration.
    lags = list(valid_lags)
    scores = {label: [] for label in training_sets}
    for lag in lags:
        for label, frame in training_sets.items():
            result = nn2_model_with_tuning_loss(frame, test_df, lag)
            scores[label].append(result['OOS R²'].iloc[0])

    return pd.DataFrame({'Lag': lags, **scores})

# Lags to evaluate
valid_lags = [5, 21, 252, 512]

# OOS R² for NN2 per lag: one column each for the models trained on all
# stocks, the top 25% and the bottom 25% by market cap
nn2_oos_r2_results = calculate_oos_r2_market_cap_nn2(crsp_train_lagged, crsp_test_lagged, valid_lags)

# Show the per-lag results table
display(nn2_oos_r2_results)


## NN2 Portfolio

### Lag 5

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost charged on a row.

    Args:
        row: The DataFrame row handed in by ``DataFrame.apply``; it is not
            inspected, so every row gets the same cost.

    Returns:
        0.001, i.e. 10 basis points.
    """
    flat_cost = 0.001  # 10 bps for both small and large cap stocks
    return flat_cost

# Store the per-row cost on the test frame; compute_returns below reads it
# back from the 'transaction_cost' column when netting costs out of each leg.
crsp_test_lagged['transaction_cost'] = crsp_test_lagged.apply(calculate_transaction_cost, axis=1)

# Use the previously defined `NN2` to predict excess returns
def nn2_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the NN2 model for lag 5 and keep its predictions in a dedicated column.

    Args:
        crsp_train_lagged: Training DataFrame forwarded to ``neural_network_nn2``.
        crsp_test_lagged: Test DataFrame; it is modified in place and returned.
        lags: List of lags forwarded to ``neural_network_nn2``. Defaults to ``[5]``.

    Returns:
        ``crsp_test_lagged`` with the added column ``'nn2_5_predicted_excess_returns'``.

    Raises:
        KeyError: If the test frame has no ``'predicted_excess_returns'`` column
            after the model call.
    """
    # Build the default inside the call so no mutable list is shared between calls
    if lags is None:
        lags = [5]

    # NOTE(review): the value returned by neural_network_nn2 is not used. The predictions
    # are read from the 'predicted_excess_returns' column, which the model call presumably
    # writes into crsp_test_lagged in place — confirm, otherwise stale values are copied.
    neural_network_nn2(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the predictions into a lag-specific column used for portfolio construction
    crsp_test_lagged['nn2_5_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Get predicted excess returns using NN2 model
crsp_test_lagged = nn2_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, pred_col='nn2_5_predicted_excess_returns', fraction=0.1):
    """Compute one day's long, short and long-short log returns for a cross-section.

    The stocks with the highest predictions form the long leg and those with the
    lowest predictions form the short leg. Each leg is reported equal-weighted and
    value-weighted (weights from ``'market_cap_merged'``), with and without the
    average ``'transaction_cost'`` of the leg deducted.

    Args:
        group: DataFrame holding the rows of a single date. It must contain
            ``pred_col``, ``'adjusted_ret'``, ``'market_cap_merged'`` and
            ``'transaction_cost'``.
        pred_col: Column with the predicted excess returns used for ranking.
            Defaults to the lag-5 NN2 predictions.
        fraction: Share of the cross-section placed in each leg (default 10%).

    Returns:
        dict: Twelve entries keyed ``'{equal|value}_{long|short|long_short}_log_return_{with|without}_cost'``.
        When ``int(fraction * len(group))`` is 0 both legs are empty and every
        entry is NaN.
    """
    # NOTE(review): each leg averages per-stock log returns, which only approximates
    # the log return of the weighted portfolio — confirm this is the intended definition.
    n_select = int(fraction * len(group))
    long_leg = group.nlargest(n_select, pred_col)
    short_leg = group.nsmallest(n_select, pred_col)

    # Per-leg log returns and average cost are computed once and reused below
    long_log = np.log1p(long_leg['adjusted_ret'])
    short_log = np.log1p(short_leg['adjusted_ret'])
    long_cost = long_leg['transaction_cost'].mean()
    short_cost = short_leg['transaction_cost'].mean()

    # Equal-weighted legs; the short leg earns the negative of its stocks' return
    equal_long = long_log.mean()
    equal_short = -short_log.mean()

    # Value-weighted legs, weights proportional to market cap within the leg
    long_caps = long_leg['market_cap_merged']
    short_caps = short_leg['market_cap_merged']
    value_long = (long_log * long_caps).sum() / long_caps.sum()
    value_short_raw = (short_log * short_caps).sum() / short_caps.sum()
    value_short = -value_short_raw

    # Net-of-cost legs: the cost reduces the return of both the long and the short leg
    equal_long_net = equal_long - long_cost
    equal_short_net = equal_short - short_cost
    value_long_net = value_long - long_cost
    value_short_net = -(value_short_raw + short_cost)

    # Long-short return is the sum of the two legs
    return {
        'equal_long_log_return_with_cost': equal_long_net,
        'equal_short_log_return_with_cost': equal_short_net,
        'equal_long_short_log_return_with_cost': equal_long_net + equal_short_net,
        'equal_long_log_return_without_cost': equal_long,
        'equal_short_log_return_without_cost': equal_short,
        'equal_long_short_log_return_without_cost': equal_long + equal_short,
        'value_long_log_return_with_cost': value_long_net,
        'value_short_log_return_with_cost': value_short_net,
        'value_long_short_log_return_with_cost': value_long_net + value_short_net,
        'value_long_log_return_without_cost': value_long,
        'value_short_log_return_without_cost': value_short,
        'value_long_short_log_return_without_cost': value_long + value_short
    }

# Compute cumulative returns for each date with daily rebalancing.
# Every list holds the running sum of daily log returns, one entry per date.
# Column codes: E/V = equal-/value-weighted, L/S/LS = long, short, long-short.
cumulative_log_returns_by_date_nn25 = {
    'date': [],
    'cum_EL_return_5_with_cost': [],
    'cum_ES_return_5_with_cost': [],
    'cum_ELS_return_5_with_cost': [],
    'cum_VL_return_5_with_cost': [],
    'cum_VS_return_5_with_cost': [],
    'cum_VLS_return_5_with_cost': [],
    'cum_EL_return_5_without_cost': [],
    'cum_ES_return_5_without_cost': [],
    'cum_ELS_return_5_without_cost': [],
    'cum_VL_return_5_without_cost': [],
    'cum_VS_return_5_without_cost': [],
    'cum_VLS_return_5_without_cost': []
}

# Initialize cumulative returns for lag 5
cum_EL_return_5_with_cost = 0
cum_ES_return_5_with_cost = 0
cum_ELS_return_5_with_cost = 0
cum_VL_return_5_with_cost = 0
cum_VS_return_5_with_cost = 0
cum_VLS_return_5_with_cost = 0

cum_EL_return_5_without_cost = 0
cum_ES_return_5_without_cost = 0
cum_ELS_return_5_without_cost = 0
cum_VL_return_5_without_cost = 0
cum_VS_return_5_without_cost = 0
cum_VLS_return_5_without_cost = 0

# Iterate over each date to compute returns for lag 5 portfolios.
# One groupby pass hands over each date's rows directly instead of re-scanning the whole
# frame with a boolean mask per date. sort=False keeps the dates in order of first
# appearance, which is the order crsp_test_lagged['date'].unique() would produce.
# Rows with a missing date are skipped by groupby.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 5
    returns = compute_returns(group)

    # Update cumulative returns with daily values for lag 5
    cum_EL_return_5_with_cost += returns['equal_long_log_return_with_cost']
    cum_ES_return_5_with_cost += returns['equal_short_log_return_with_cost']
    cum_ELS_return_5_with_cost += returns['equal_long_short_log_return_with_cost']
    cum_VL_return_5_with_cost += returns['value_long_log_return_with_cost']
    cum_VS_return_5_with_cost += returns['value_short_log_return_with_cost']
    cum_VLS_return_5_with_cost += returns['value_long_short_log_return_with_cost']

    cum_EL_return_5_without_cost += returns['equal_long_log_return_without_cost']
    cum_ES_return_5_without_cost += returns['equal_short_log_return_without_cost']
    cum_ELS_return_5_without_cost += returns['equal_long_short_log_return_without_cost']
    cum_VL_return_5_without_cost += returns['value_long_log_return_without_cost']
    cum_VS_return_5_without_cost += returns['value_short_log_return_without_cost']
    cum_VLS_return_5_without_cost += returns['value_long_short_log_return_without_cost']

    # Append results for the day for lag 5 portfolios
    cumulative_log_returns_by_date_nn25['date'].append(date)
    cumulative_log_returns_by_date_nn25['cum_EL_return_5_with_cost'].append(cum_EL_return_5_with_cost)
    cumulative_log_returns_by_date_nn25['cum_ES_return_5_with_cost'].append(cum_ES_return_5_with_cost)
    cumulative_log_returns_by_date_nn25['cum_ELS_return_5_with_cost'].append(cum_ELS_return_5_with_cost)
    cumulative_log_returns_by_date_nn25['cum_VL_return_5_with_cost'].append(cum_VL_return_5_with_cost)
    cumulative_log_returns_by_date_nn25['cum_VS_return_5_with_cost'].append(cum_VS_return_5_with_cost)
    cumulative_log_returns_by_date_nn25['cum_VLS_return_5_with_cost'].append(cum_VLS_return_5_with_cost)
    cumulative_log_returns_by_date_nn25['cum_EL_return_5_without_cost'].append(cum_EL_return_5_without_cost)
    cumulative_log_returns_by_date_nn25['cum_ES_return_5_without_cost'].append(cum_ES_return_5_without_cost)
    cumulative_log_returns_by_date_nn25['cum_ELS_return_5_without_cost'].append(cum_ELS_return_5_without_cost)
    cumulative_log_returns_by_date_nn25['cum_VL_return_5_without_cost'].append(cum_VL_return_5_without_cost)
    cumulative_log_returns_by_date_nn25['cum_VS_return_5_without_cost'].append(cum_VS_return_5_without_cost)
    cumulative_log_returns_by_date_nn25['cum_VLS_return_5_without_cost'].append(cum_VLS_return_5_without_cost)

# Convert to DataFrame for lag 5
cumulative_log_returns_nn2_lag_5 = pd.DataFrame(cumulative_log_returns_by_date_nn25)

# Display the cumulative returns DataFrame for lag 5
display(cumulative_log_returns_nn2_lag_5.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound a sequence of simple per-period returns into one total return.

    Args:
        daily_returns: Array-like supporting ``+ 1`` (e.g. a NumPy array) of simple returns.

    Returns:
        The product of ``(1 + r)`` over all periods, minus 1.
    """
    growth_factors = daily_returns + 1
    terminal_wealth = np.prod(growth_factors)
    return terminal_wealth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of a sequence of simple per-period returns.

    Args:
        daily_returns: Array-like of simple returns (list, NumPy array, Series).
        periods: Number of return periods per year (252 by default).

    Returns:
        The compounded growth raised to ``periods / number_of_observations``, minus 1.
        NaN when there are no observations, since the exponent is undefined then.
    """
    daily_returns = np.asarray(daily_returns, dtype=float)
    n_obs = len(daily_returns)
    if n_obs == 0:
        # Avoid dividing by zero observations; an empty sample has no annualized return
        return np.nan
    total_growth = np.prod(1 + daily_returns)
    return total_growth ** (periods / n_obs) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of a series of per-period returns.

    Args:
        daily_returns: Array of per-period returns exposing ``.mean()`` and ``.std()``.
        risk_free_rate: Annual risk-free rate, spread evenly over ``periods``.
        periods: Number of return periods per year (252 by default).

    Returns:
        ``sqrt(periods)`` times the mean excess return divided by the standard
        deviation of the excess returns, as given by the input's own ``.std()``.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    surplus = daily_returns - per_period_rf
    mean_surplus = surplus.mean()
    dispersion = surplus.std()
    return np.sqrt(periods) * mean_surplus / dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Return the annualized volatility of a series of per-period returns.

    Args:
        daily_returns: Array-like of per-period returns.
        periods: Number of return periods per year (252 by default).

    Returns:
        ``np.std(daily_returns)`` scaled by ``sqrt(periods)``.
    """
    per_period_sigma = np.std(daily_returns)
    annualization = np.sqrt(periods)
    return per_period_sigma * annualization

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded wealth path.

    Args:
        daily_returns: Array-like of simple per-period returns.

    Returns:
        The most negative value of ``(wealth - running_peak) / running_peak``;
        0.0 when wealth never falls below a previous peak or the input is empty.
    """
    daily_returns = np.asarray(daily_returns, dtype=float)
    # Start the wealth path at the initial capital of 1.0 so that a loss in the very
    # first periods is measured against it instead of being treated as the first peak.
    wealth = np.concatenate(([1.0], np.cumprod(1 + daily_returns)))
    running_peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct a constant cost from every per-period return.

    Args:
        daily_returns: Scalar or array of per-period returns.
        transaction_cost: Amount subtracted from each return (0.001 by default).

    Returns:
        The returns with ``transaction_cost`` subtracted element-wise.
    """
    net_of_cost = daily_returns - transaction_cost
    return net_of_cost

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every per-period return down by a proportional cost.

    Args:
        daily_returns: Scalar or array of per-period returns.
        transaction_cost_percentage: Fraction of each return given up as cost.

    Returns:
        The returns multiplied by ``1 - transaction_cost_percentage``.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_5_with_cost', 'cum_ES_return_5_with_cost', 'cum_ELS_return_5_with_cost',
    'cum_VL_return_5_with_cost', 'cum_VS_return_5_with_cost', 'cum_VLS_return_5_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_nn25_c = cumulative_log_returns_nn2_lag_5[portfolio].values

    # The first difference of the cumulative log series gives the daily log returns.
    # prepend=0.0 keeps the first day's return, which a plain np.diff would drop.
    daily_log_returns = np.diff(cumulative_returns_nn25_c, prepend=0.0)

    # The metric helpers compound with (1 + r), so convert log returns to simple returns first
    daily_returns = np.expm1(daily_log_returns)

    # The *_with_cost series already have the transaction cost deducted when the daily
    # portfolio returns were built in compute_returns. Deducting it again here would
    # double-count it, so both names below refer to the same net-of-cost returns.
    daily_returns_after_cost_fixed = daily_returns
    daily_returns_after_cost_percentage = daily_returns

    # Calculate cumulative returns after transaction cost
    cum_return_after_cost = cumulative_return(daily_returns_after_cost_fixed)

    # Metrics calculations
    ann_return = annualized_return(daily_returns_after_cost_fixed)
    sharpe = sharpe_ratio(daily_returns_after_cost_fixed)
    vol = calculate_volatility(daily_returns_after_cost_percentage)
    max_draw = maximum_drawdown(daily_returns_after_cost_fixed)

    # Standard Deviation of daily returns after cost
    std_dev = np.std(daily_returns_after_cost_percentage)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_nn25_c = pd.DataFrame(metrics)
display(metrics_nn25_c)


# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_5_without_cost', 'cum_ES_return_5_without_cost', 'cum_ELS_return_5_without_cost',
    'cum_VL_return_5_without_cost', 'cum_VS_return_5_without_cost', 'cum_VLS_return_5_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_nn25_wc = cumulative_log_returns_nn2_lag_5[portfolio].values

    # Daily log returns from the cumulative log series; prepend=0.0 keeps the first day
    daily_log_returns = np.diff(cumulative_returns_nn25_wc, prepend=0.0)

    # Convert to simple returns, which is what the (1 + r) compounding helpers expect
    daily_returns = np.expm1(daily_log_returns)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_nn25_wc = pd.DataFrame(metrics_wc)
display(metrics_nn25_wc)

### Lag 21

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost charged on every stock.

    Args:
        row: A row of the test DataFrame. It is not used, because the cost is
            the same for small- and large-cap stocks; the parameter is kept so
            the function can still be passed to ``DataFrame.apply(..., axis=1)``.

    Returns:
        float: 0.001, i.e. 10 basis points.
    """
    return 0.001  # 10 bps for both small and large cap stocks

# The cost does not depend on the row, so broadcast the scalar to the whole column
# instead of running a Python-level row-wise apply over the full test set.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `NN2` to predict excess returns
def nn2_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the NN2 model for lag 21 and keep its predictions in a dedicated column.

    Args:
        crsp_train_lagged: Training DataFrame forwarded to ``neural_network_nn2``.
        crsp_test_lagged: Test DataFrame; it is modified in place and returned.
        lags: List of lags forwarded to ``neural_network_nn2``. Defaults to ``[21]``.

    Returns:
        ``crsp_test_lagged`` with the added column ``'nn2_21_predicted_excess_returns'``.

    Raises:
        KeyError: If the test frame has no ``'predicted_excess_returns'`` column
            after the model call.
    """
    # Build the default inside the call so no mutable list is shared between calls
    if lags is None:
        lags = [21]

    # NOTE(review): the value returned by neural_network_nn2 is not used. The predictions
    # are read from the 'predicted_excess_returns' column, which the model call presumably
    # writes into crsp_test_lagged in place — confirm, otherwise stale values are copied.
    neural_network_nn2(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the predictions into a lag-specific column used for portfolio construction
    crsp_test_lagged['nn2_21_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Get predicted excess returns using NN2 model
crsp_test_lagged = nn2_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, pred_col='nn2_21_predicted_excess_returns', fraction=0.1):
    """Compute one day's long, short and long-short log returns for a cross-section.

    The stocks with the highest predictions form the long leg and those with the
    lowest predictions form the short leg. Each leg is reported equal-weighted and
    value-weighted (weights from ``'market_cap_merged'``), with and without the
    average ``'transaction_cost'`` of the leg deducted.

    Args:
        group: DataFrame holding the rows of a single date. It must contain
            ``pred_col``, ``'adjusted_ret'``, ``'market_cap_merged'`` and
            ``'transaction_cost'``.
        pred_col: Column with the predicted excess returns used for ranking.
            Defaults to the lag-21 NN2 predictions.
        fraction: Share of the cross-section placed in each leg (default 10%).

    Returns:
        dict: Twelve entries keyed ``'{equal|value}_{long|short|long_short}_log_return_{with|without}_cost'``.
        When ``int(fraction * len(group))`` is 0 both legs are empty and every
        entry is NaN.
    """
    # NOTE(review): each leg averages per-stock log returns, which only approximates
    # the log return of the weighted portfolio — confirm this is the intended definition.
    n_select = int(fraction * len(group))
    long_leg = group.nlargest(n_select, pred_col)
    short_leg = group.nsmallest(n_select, pred_col)

    # Per-leg log returns and average cost are computed once and reused below
    long_log = np.log1p(long_leg['adjusted_ret'])
    short_log = np.log1p(short_leg['adjusted_ret'])
    long_cost = long_leg['transaction_cost'].mean()
    short_cost = short_leg['transaction_cost'].mean()

    # Equal-weighted legs; the short leg earns the negative of its stocks' return
    equal_long = long_log.mean()
    equal_short = -short_log.mean()

    # Value-weighted legs, weights proportional to market cap within the leg
    long_caps = long_leg['market_cap_merged']
    short_caps = short_leg['market_cap_merged']
    value_long = (long_log * long_caps).sum() / long_caps.sum()
    value_short_raw = (short_log * short_caps).sum() / short_caps.sum()
    value_short = -value_short_raw

    # Net-of-cost legs: the cost reduces the return of both the long and the short leg
    equal_long_net = equal_long - long_cost
    equal_short_net = equal_short - short_cost
    value_long_net = value_long - long_cost
    value_short_net = -(value_short_raw + short_cost)

    # Long-short return is the sum of the two legs
    return {
        'equal_long_log_return_with_cost': equal_long_net,
        'equal_short_log_return_with_cost': equal_short_net,
        'equal_long_short_log_return_with_cost': equal_long_net + equal_short_net,
        'equal_long_log_return_without_cost': equal_long,
        'equal_short_log_return_without_cost': equal_short,
        'equal_long_short_log_return_without_cost': equal_long + equal_short,
        'value_long_log_return_with_cost': value_long_net,
        'value_short_log_return_with_cost': value_short_net,
        'value_long_short_log_return_with_cost': value_long_net + value_short_net,
        'value_long_log_return_without_cost': value_long,
        'value_short_log_return_without_cost': value_short,
        'value_long_short_log_return_without_cost': value_long + value_short
    }

# Compute cumulative returns for each date with daily rebalancing.
# Every list holds the running sum of daily log returns, one entry per date.
# Column codes: E/V = equal-/value-weighted, L/S/LS = long, short, long-short.
cumulative_log_returns_by_date_nn221 = {
    'date': [],
    'cum_EL_return_21_with_cost': [],
    'cum_ES_return_21_with_cost': [],
    'cum_ELS_return_21_with_cost': [],
    'cum_VL_return_21_with_cost': [],
    'cum_VS_return_21_with_cost': [],
    'cum_VLS_return_21_with_cost': [],
    'cum_EL_return_21_without_cost': [],
    'cum_ES_return_21_without_cost': [],
    'cum_ELS_return_21_without_cost': [],
    'cum_VL_return_21_without_cost': [],
    'cum_VS_return_21_without_cost': [],
    'cum_VLS_return_21_without_cost': []
}

# Initialize cumulative returns for lag 21
cum_EL_return_21_with_cost = 0
cum_ES_return_21_with_cost = 0
cum_ELS_return_21_with_cost = 0
cum_VL_return_21_with_cost = 0
cum_VS_return_21_with_cost = 0
cum_VLS_return_21_with_cost = 0

cum_EL_return_21_without_cost = 0
cum_ES_return_21_without_cost = 0
cum_ELS_return_21_without_cost = 0
cum_VL_return_21_without_cost = 0
cum_VS_return_21_without_cost = 0
cum_VLS_return_21_without_cost = 0

# Iterate over each date to compute returns for lag 21 portfolios.
# One groupby pass hands over each date's rows directly instead of re-scanning the whole
# frame with a boolean mask per date. sort=False keeps the dates in order of first
# appearance, which is the order crsp_test_lagged['date'].unique() would produce.
# Rows with a missing date are skipped by groupby.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 21
    returns = compute_returns(group)

    # Update cumulative returns with daily values for lag 21
    cum_EL_return_21_with_cost += returns['equal_long_log_return_with_cost']
    cum_ES_return_21_with_cost += returns['equal_short_log_return_with_cost']
    cum_ELS_return_21_with_cost += returns['equal_long_short_log_return_with_cost']
    cum_VL_return_21_with_cost += returns['value_long_log_return_with_cost']
    cum_VS_return_21_with_cost += returns['value_short_log_return_with_cost']
    cum_VLS_return_21_with_cost += returns['value_long_short_log_return_with_cost']

    cum_EL_return_21_without_cost += returns['equal_long_log_return_without_cost']
    cum_ES_return_21_without_cost += returns['equal_short_log_return_without_cost']
    cum_ELS_return_21_without_cost += returns['equal_long_short_log_return_without_cost']
    cum_VL_return_21_without_cost += returns['value_long_log_return_without_cost']
    cum_VS_return_21_without_cost += returns['value_short_log_return_without_cost']
    cum_VLS_return_21_without_cost += returns['value_long_short_log_return_without_cost']

    # Append results for the day for lag 21 portfolios
    cumulative_log_returns_by_date_nn221['date'].append(date)
    cumulative_log_returns_by_date_nn221['cum_EL_return_21_with_cost'].append(cum_EL_return_21_with_cost)
    cumulative_log_returns_by_date_nn221['cum_ES_return_21_with_cost'].append(cum_ES_return_21_with_cost)
    cumulative_log_returns_by_date_nn221['cum_ELS_return_21_with_cost'].append(cum_ELS_return_21_with_cost)
    cumulative_log_returns_by_date_nn221['cum_VL_return_21_with_cost'].append(cum_VL_return_21_with_cost)
    cumulative_log_returns_by_date_nn221['cum_VS_return_21_with_cost'].append(cum_VS_return_21_with_cost)
    cumulative_log_returns_by_date_nn221['cum_VLS_return_21_with_cost'].append(cum_VLS_return_21_with_cost)
    cumulative_log_returns_by_date_nn221['cum_EL_return_21_without_cost'].append(cum_EL_return_21_without_cost)
    cumulative_log_returns_by_date_nn221['cum_ES_return_21_without_cost'].append(cum_ES_return_21_without_cost)
    cumulative_log_returns_by_date_nn221['cum_ELS_return_21_without_cost'].append(cum_ELS_return_21_without_cost)
    cumulative_log_returns_by_date_nn221['cum_VL_return_21_without_cost'].append(cum_VL_return_21_without_cost)
    cumulative_log_returns_by_date_nn221['cum_VS_return_21_without_cost'].append(cum_VS_return_21_without_cost)
    cumulative_log_returns_by_date_nn221['cum_VLS_return_21_without_cost'].append(cum_VLS_return_21_without_cost)

# Convert to DataFrame for lag 21
cumulative_log_returns_nn2_lag_21 = pd.DataFrame(cumulative_log_returns_by_date_nn221)

# Display the cumulative returns DataFrame for lag 21
display(cumulative_log_returns_nn2_lag_21.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound a sequence of simple per-period returns into one total return.

    Args:
        daily_returns: Array-like supporting ``+ 1`` (e.g. a NumPy array) of simple returns.

    Returns:
        The product of ``(1 + r)`` over all periods, minus 1.
    """
    growth_factors = daily_returns + 1
    terminal_wealth = np.prod(growth_factors)
    return terminal_wealth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of a sequence of simple per-period returns.

    Args:
        daily_returns: Array-like of simple returns (list, NumPy array, Series).
        periods: Number of return periods per year (252 by default).

    Returns:
        The compounded growth raised to ``periods / number_of_observations``, minus 1.
        NaN when there are no observations, since the exponent is undefined then.
    """
    daily_returns = np.asarray(daily_returns, dtype=float)
    n_obs = len(daily_returns)
    if n_obs == 0:
        # Avoid dividing by zero observations; an empty sample has no annualized return
        return np.nan
    total_growth = np.prod(1 + daily_returns)
    return total_growth ** (periods / n_obs) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of a series of per-period returns.

    Args:
        daily_returns: Array of per-period returns exposing ``.mean()`` and ``.std()``.
        risk_free_rate: Annual risk-free rate, spread evenly over ``periods``.
        periods: Number of return periods per year (252 by default).

    Returns:
        ``sqrt(periods)`` times the mean excess return divided by the standard
        deviation of the excess returns, as given by the input's own ``.std()``.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    surplus = daily_returns - per_period_rf
    mean_surplus = surplus.mean()
    dispersion = surplus.std()
    return np.sqrt(periods) * mean_surplus / dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Return the annualized volatility of a series of per-period returns.

    Args:
        daily_returns: Array-like of per-period returns.
        periods: Number of return periods per year (252 by default).

    Returns:
        ``np.std(daily_returns)`` scaled by ``sqrt(periods)``.
    """
    per_period_sigma = np.std(daily_returns)
    annualization = np.sqrt(periods)
    return per_period_sigma * annualization

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded wealth path.

    Args:
        daily_returns: Array-like of simple per-period returns.

    Returns:
        The most negative value of ``(wealth - running_peak) / running_peak``;
        0.0 when wealth never falls below a previous peak or the input is empty.
    """
    daily_returns = np.asarray(daily_returns, dtype=float)
    # Start the wealth path at the initial capital of 1.0 so that a loss in the very
    # first periods is measured against it instead of being treated as the first peak.
    wealth = np.concatenate(([1.0], np.cumprod(1 + daily_returns)))
    running_peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns (for other metrics)
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct a constant cost from every per-period return.

    Args:
        daily_returns: Scalar or array of per-period returns.
        transaction_cost: Amount subtracted from each return (0.001 by default).

    Returns:
        The returns with ``transaction_cost`` subtracted element-wise.
    """
    net_of_cost = daily_returns - transaction_cost
    return net_of_cost

# Apply percentage-based transaction cost to the daily returns (for volatility and standard deviation)
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every per-period return down by a proportional cost.

    Args:
        daily_returns: Scalar or array of per-period returns.
        transaction_cost_percentage: Fraction of each return given up as cost.

    Returns:
        The returns multiplied by ``1 - transaction_cost_percentage``.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_21_with_cost', 'cum_ES_return_21_with_cost', 'cum_ELS_return_21_with_cost',
    'cum_VL_return_21_with_cost', 'cum_VS_return_21_with_cost', 'cum_VLS_return_21_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_nn221_c = cumulative_log_returns_nn2_lag_21[portfolio].values

    # The first difference of the cumulative log series gives the daily log returns.
    # prepend=0.0 keeps the first day's return, which a plain np.diff would drop.
    daily_log_returns = np.diff(cumulative_returns_nn221_c, prepend=0.0)

    # The metric helpers compound with (1 + r), so convert log returns to simple returns first
    daily_returns = np.expm1(daily_log_returns)

    # The *_with_cost series already have the transaction cost deducted when the daily
    # portfolio returns were built in compute_returns. Deducting it again here would
    # double-count it, so both names below refer to the same net-of-cost returns.
    daily_returns_after_cost_fixed = daily_returns
    daily_returns_after_cost_percentage = daily_returns

    # Calculate cumulative returns after transaction cost
    cum_return_after_cost = cumulative_return(daily_returns_after_cost_fixed)

    # Metrics calculations
    ann_return = annualized_return(daily_returns_after_cost_fixed)
    sharpe = sharpe_ratio(daily_returns_after_cost_fixed)
    vol = calculate_volatility(daily_returns_after_cost_percentage)
    max_draw = maximum_drawdown(daily_returns_after_cost_fixed)

    # Standard Deviation of daily returns after cost
    std_dev = np.std(daily_returns_after_cost_percentage)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_nn221_c = pd.DataFrame(metrics)
display(metrics_nn221_c)


# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_21_without_cost', 'cum_ES_return_21_without_cost', 'cum_ELS_return_21_without_cost',
    'cum_VL_return_21_without_cost', 'cum_VS_return_21_without_cost', 'cum_VLS_return_21_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_nn221_wc = cumulative_log_returns_nn2_lag_21[portfolio].values

    # Daily log returns from the cumulative log series; prepend=0.0 keeps the first day
    daily_log_returns = np.diff(cumulative_returns_nn221_wc, prepend=0.0)

    # Convert to simple returns, which is what the (1 + r) compounding helpers expect
    daily_returns = np.expm1(daily_log_returns)

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_nn221_wc = pd.DataFrame(metrics_wc)
display(metrics_nn221_wc)

### Lag 252

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost charged on every stock.

    Args:
        row: A row of the test DataFrame. It is not used, because the cost is
            the same for small- and large-cap stocks; the parameter is kept so
            the function can still be passed to ``DataFrame.apply(..., axis=1)``.

    Returns:
        float: 0.001, i.e. 10 basis points.
    """
    return 0.001  # 10 bps for both small and large cap stocks

# The cost does not depend on the row, so broadcast the scalar to the whole column
# instead of running a Python-level row-wise apply over the full test set.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `NN2` to predict excess returns
def nn2_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the NN2 model for lag 252 and keep its predictions in a dedicated column.

    Args:
        crsp_train_lagged: Training DataFrame forwarded to ``neural_network_nn2``.
        crsp_test_lagged: Test DataFrame; it is modified in place and returned.
        lags: List of lags forwarded to ``neural_network_nn2``. Defaults to ``[252]``.

    Returns:
        ``crsp_test_lagged`` with the added column ``'nn2_252_predicted_excess_returns'``.

    Raises:
        KeyError: If the test frame has no ``'predicted_excess_returns'`` column
            after the model call.
    """
    # Build the default inside the call so no mutable list is shared between calls
    if lags is None:
        lags = [252]

    # NOTE(review): the value returned by neural_network_nn2 is not used. The predictions
    # are read from the 'predicted_excess_returns' column, which the model call presumably
    # writes into crsp_test_lagged in place — confirm, otherwise stale values are copied.
    neural_network_nn2(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the predictions into a lag-specific column used for portfolio construction
    crsp_test_lagged['nn2_252_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Get predicted excess returns using NN2 model
crsp_test_lagged = nn2_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group):
    """Compute one cross-section's long, short and long-short log returns.

    The rows of ``group`` are ranked on ``'nn2_252_predicted_excess_returns'``;
    the top ``int(0.1 * len(group))`` rows form the long leg and the bottom
    ``int(0.1 * len(group))`` rows form the short leg. Each leg is reported
    equal-weighted and weighted by ``'market_cap_merged'``, with and without
    the mean of the ``'transaction_cost'`` column deducted.

    Returns:
        dict with twelve entries keyed ``'{equal|value}_{long|short|long_short}
        _log_return_{with|without}_cost'``.
    """
    signal_col = 'nn2_252_predicted_excess_returns'
    leg_size = int(0.1 * len(group))

    # Long leg = highest predictions, short leg = lowest predictions
    long_leg = group.nlargest(leg_size, signal_col)
    short_leg = group.nsmallest(leg_size, signal_col)

    # Per-stock log returns and the average cost of each leg
    long_log = np.log1p(long_leg['adjusted_ret'])
    short_log = np.log1p(short_leg['adjusted_ret'])
    long_cost = long_leg['transaction_cost'].mean()
    short_cost = short_leg['transaction_cost'].mean()

    # Equal-weighted legs (short leg earns the negative of the stocks' return)
    eq_long_gross = long_log.mean()
    eq_long_net = eq_long_gross - long_cost
    eq_short_gross = -short_log.mean()
    eq_short_net = eq_short_gross - short_cost

    # Value-weighted legs, weights proportional to market cap
    long_caps = long_leg['market_cap_merged']
    long_cap_total = long_caps.sum()
    vw_long_gross = (long_log * long_caps).sum() / long_cap_total
    vw_long_net = vw_long_gross - long_cost

    short_caps = short_leg['market_cap_merged']
    short_cap_total = short_caps.sum()
    vw_short_core = (short_log * short_caps).sum() / short_cap_total
    vw_short_gross = -(vw_short_core)
    vw_short_net = -(vw_short_core + short_cost)

    # Long-short = long leg + short leg (cost is therefore charged on both legs)
    result = {}
    result['equal_long_log_return_with_cost'] = eq_long_net
    result['equal_short_log_return_with_cost'] = eq_short_net
    result['equal_long_short_log_return_with_cost'] = eq_long_net + eq_short_net
    result['equal_long_log_return_without_cost'] = eq_long_gross
    result['equal_short_log_return_without_cost'] = eq_short_gross
    result['equal_long_short_log_return_without_cost'] = eq_long_gross + eq_short_gross
    result['value_long_log_return_with_cost'] = vw_long_net
    result['value_short_log_return_with_cost'] = vw_short_net
    result['value_long_short_log_return_with_cost'] = vw_long_net + vw_short_net
    result['value_long_log_return_without_cost'] = vw_long_gross
    result['value_short_log_return_without_cost'] = vw_short_gross
    result['value_long_short_log_return_without_cost'] = vw_long_gross + vw_short_gross
    return result

# Compute cumulative returns for each date with daily rebalancing
# Each key holds one list that receives the running total after every date;
# the lists are turned into a DataFrame once the loop has finished.
cumulative_log_returns_by_date_nn2252  = {
    'date': [],
    'cum_EL_return_252_with_cost': [],
    'cum_ES_return_252_with_cost': [],
    'cum_ELS_return_252_with_cost': [],
    'cum_VL_return_252_with_cost': [],
    'cum_VS_return_252_with_cost': [],
    'cum_VLS_return_252_with_cost': [],
    'cum_EL_return_252_without_cost': [],
    'cum_ES_return_252_without_cost': [],
    'cum_ELS_return_252_without_cost': [],
    'cum_VL_return_252_without_cost': [],
    'cum_VS_return_252_without_cost': [],
    'cum_VLS_return_252_without_cost': []
}

# Initialize cumulative returns for lag 252
# (E/V = equal-/value-weighted, L/S/LS = long/short/long-short, matching the
# keys returned by compute_returns)
cum_EL_return_252_with_cost = 0
cum_ES_return_252_with_cost = 0
cum_ELS_return_252_with_cost = 0
cum_VL_return_252_with_cost = 0
cum_VS_return_252_with_cost = 0
cum_VLS_return_252_with_cost = 0

cum_EL_return_252_without_cost = 0
cum_ES_return_252_without_cost = 0
cum_ELS_return_252_without_cost = 0
cum_VL_return_252_without_cost = 0
cum_VS_return_252_without_cost = 0
cum_VLS_return_252_without_cost = 0

# Iterate over each date to compute returns for lag 252 portfolios
# NOTE(review): dates are visited in the order unique() yields them (order of
# first appearance); the running sums only form a time series if
# crsp_test_lagged is sorted by date — confirm.
# NOTE(review): compute_returns selects int(0.1 * len(group)) rows per leg; on a
# date with fewer than 10 rows that is 0 and the resulting NaN would carry
# through every later += below — confirm no such dates exist.
for date in crsp_test_lagged['date'].unique():
    # Boolean-mask the full frame down to this date's cross-section
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]

    # Compute returns for lag 252
    returns = compute_returns(group)

    # Update cumulative returns with daily values for lag 252
    # (log returns are additive, so a running sum is the cumulative log return)
    cum_EL_return_252_with_cost += returns['equal_long_log_return_with_cost']
    cum_ES_return_252_with_cost += returns['equal_short_log_return_with_cost']
    cum_ELS_return_252_with_cost += returns['equal_long_short_log_return_with_cost']
    cum_VL_return_252_with_cost += returns['value_long_log_return_with_cost']
    cum_VS_return_252_with_cost += returns['value_short_log_return_with_cost']
    cum_VLS_return_252_with_cost += returns['value_long_short_log_return_with_cost']

    cum_EL_return_252_without_cost += returns['equal_long_log_return_without_cost']
    cum_ES_return_252_without_cost += returns['equal_short_log_return_without_cost']
    cum_ELS_return_252_without_cost += returns['equal_long_short_log_return_without_cost']
    cum_VL_return_252_without_cost += returns['value_long_log_return_without_cost']
    cum_VS_return_252_without_cost += returns['value_short_log_return_without_cost']
    cum_VLS_return_252_without_cost += returns['value_long_short_log_return_without_cost']

    # Append results for the day for lag 252 portfolios
    cumulative_log_returns_by_date_nn2252['date'].append(date)
    cumulative_log_returns_by_date_nn2252['cum_EL_return_252_with_cost'].append(cum_EL_return_252_with_cost)
    cumulative_log_returns_by_date_nn2252['cum_ES_return_252_with_cost'].append(cum_ES_return_252_with_cost)
    cumulative_log_returns_by_date_nn2252['cum_ELS_return_252_with_cost'].append(cum_ELS_return_252_with_cost)
    cumulative_log_returns_by_date_nn2252['cum_VL_return_252_with_cost'].append(cum_VL_return_252_with_cost)
    cumulative_log_returns_by_date_nn2252['cum_VS_return_252_with_cost'].append(cum_VS_return_252_with_cost)
    cumulative_log_returns_by_date_nn2252['cum_VLS_return_252_with_cost'].append(cum_VLS_return_252_with_cost)
    cumulative_log_returns_by_date_nn2252['cum_EL_return_252_without_cost'].append(cum_EL_return_252_without_cost)
    cumulative_log_returns_by_date_nn2252['cum_ES_return_252_without_cost'].append(cum_ES_return_252_without_cost)
    cumulative_log_returns_by_date_nn2252['cum_ELS_return_252_without_cost'].append(cum_ELS_return_252_without_cost)
    cumulative_log_returns_by_date_nn2252['cum_VL_return_252_without_cost'].append(cum_VL_return_252_without_cost)
    cumulative_log_returns_by_date_nn2252['cum_VS_return_252_without_cost'].append(cum_VS_return_252_without_cost)
    cumulative_log_returns_by_date_nn2252['cum_VLS_return_252_without_cost'].append(cum_VLS_return_252_without_cost)

# Convert to DataFrame for lag 252
cumulative_log_returns_nn2_lag_252 = pd.DataFrame(cumulative_log_returns_by_date_nn2252)

# Display the cumulative returns DataFrame for lag 252
display(cumulative_log_returns_nn2_lag_252.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound a series of per-period returns into one total return.

    Each return ``r`` is turned into a growth factor ``1 + r``; the product of
    the factors minus one is returned.
    """
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualise the compounded return of ``daily_returns``.

    The total compounded return is scaled geometrically from
    ``len(daily_returns)`` observations to ``periods`` observations per year.
    An empty input raises ``ZeroDivisionError``.
    """
    total_return = np.prod(1 + daily_returns) - 1
    growth = 1 + total_return
    exponent = periods / len(daily_returns)
    return growth ** exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualised Sharpe ratio of ``daily_returns``.

    ``risk_free_rate`` is an annual rate; it is spread evenly over ``periods``
    observations before being subtracted. The ratio of mean to standard
    deviation of the excess returns is scaled by ``sqrt(periods)``.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    surplus = daily_returns - per_period_rf
    mean_surplus = surplus.mean()
    spread = surplus.std()
    return np.sqrt(periods) * mean_surplus / spread

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    return np.std(daily_returns) * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the most negative peak-to-trough decline of the compounded path.

    The wealth path is the running product of ``1 + r``; each point is compared
    with the highest value reached so far and the smallest relative gap is
    returned (0 when the path never falls below a previous high).
    """
    # NOTE(review): the running high starts at the first compounded value, so a
    # loss on the very first observation is not measured against the initial
    # capital of 1 — confirm this is intended.
    wealth = np.cumprod(1 + daily_returns)
    running_high = np.maximum.accumulate(wealth)
    relative_gap = (wealth - running_high) / running_high
    return np.min(relative_gap)

# Apply fixed transaction cost to the daily returns (for other metrics)
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return ``daily_returns`` with a constant ``transaction_cost`` subtracted from every element."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns (for volatility and standard deviation)
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Return ``daily_returns`` scaled by ``1 - transaction_cost_percentage``."""
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_252_with_cost', 'cum_ES_return_252_with_cost', 'cum_ELS_return_252_with_cost',
    'cum_VL_return_252_with_cost', 'cum_VS_return_252_with_cost', 'cum_VLS_return_252_with_cost',
]

# Initialize metrics container
# (one list per output column; a row is appended per portfolio in the loop)
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_nn2252_c = cumulative_log_returns_nn2_lag_252[portfolio].values

    # Calculate daily returns from cumulative returns
    # NOTE(review): np.diff yields len - 1 values, so the first date's return
    # is not part of any metric below — confirm this is acceptable.
    # NOTE(review): these increments are log returns, while the metric helpers
    # compound them as 1 + r (simple returns) — confirm the approximation is intended.
    daily_returns = np.diff(cumulative_returns_nn2252_c)  # Compute daily returns from cumulative log returns

    # Apply fixed transaction cost for other metrics (e.g., cumulative returns, Sharpe ratio)
    # NOTE(review): the '*_with_cost' series already had 'transaction_cost'
    # subtracted per leg inside compute_returns, so this deducts a cost a
    # second time — confirm whether the double deduction is intended.
    daily_returns_after_cost_fixed = apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001)

    # Apply percentage-based transaction cost for volatility and standard deviation
    daily_returns_after_cost_percentage = apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001)

    # Calculate cumulative returns after fixed transaction cost
    cum_return_after_cost = cumulative_return(daily_returns_after_cost_fixed)

    # Metrics calculations
    ann_return = annualized_return(daily_returns_after_cost_fixed)
    sharpe = sharpe_ratio(daily_returns_after_cost_fixed)
    vol = calculate_volatility(daily_returns_after_cost_percentage)  # Using percentage-based cost for volatility
    max_draw = maximum_drawdown(daily_returns_after_cost_fixed)

    # Standard Deviation of daily returns after cost (using percentage-based cost for standard deviation)
    std_dev = np.std(daily_returns_after_cost_percentage)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_nn2252_c = pd.DataFrame(metrics)
display(metrics_nn2252_c)


# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_252_without_cost', 'cum_ES_return_252_without_cost', 'cum_ELS_return_252_without_cost',
    'cum_VL_return_252_without_cost', 'cum_VS_return_252_without_cost', 'cum_VLS_return_252_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
# (one list per output column; a row is appended per portfolio in the loop)
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_nn2252_wc = cumulative_log_returns_nn2_lag_252[portfolio].values

    # Calculate daily returns from cumulative returns
    # NOTE(review): np.diff yields len - 1 values, so the first date's return
    # is not part of any metric below — confirm this is acceptable.
    # NOTE(review): these increments are log returns, while the metric helpers
    # compound them as 1 + r (simple returns) — confirm the approximation is intended.
    daily_returns = np.diff(cumulative_returns_nn2252_wc)  # Compute daily returns from cumulative log returns

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_nn2252_wc = pd.DataFrame(metrics_wc)
display(metrics_nn2252_wc)

### Lag 512

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost charged for a row.

    ``row`` is accepted only so the function can be handed to
    ``DataFrame.apply(..., axis=1)``; its contents are never inspected.
    """
    flat_cost = 0.001  # 10 bps for both small and large cap stocks
    return flat_cost

# Store the flat cost on every test row; compute_returns (below in this cell)
# averages this column over each portfolio leg.
crsp_test_lagged['transaction_cost'] = crsp_test_lagged.apply(calculate_transaction_cost, axis=1)

# Use the previously defined `NN2` to predict excess returns
def nn2_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the NN2 model for lag 512 and expose its predictions under a lag-specific column.

    Parameters:
        crsp_train_lagged: training frame forwarded to ``neural_network_nn2``.
        crsp_test_lagged: test frame forwarded to ``neural_network_nn2``; it is
            modified in place (a new column is added) and also returned.
        lags: list of lags forwarded to ``neural_network_nn2``. Defaults to
            ``[512]``; a fresh list is built on every call so the default is
            never shared between calls.

    Returns:
        ``crsp_test_lagged`` with the extra column
        ``'nn2_512_predicted_excess_returns'``.

    Raises:
        KeyError: if ``'predicted_excess_returns'`` is not present on
            ``crsp_test_lagged`` after the model call.
    """
    if lags is None:
        lags = [512]

    # The model's return value is not needed here: this function relies on
    # neural_network_nn2 having written 'predicted_excess_returns' onto
    # crsp_test_lagged, which is read back on the next line.
    neural_network_nn2(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the generic prediction column to a lag-specific name so later model
    # runs that overwrite 'predicted_excess_returns' do not clobber it.
    crsp_test_lagged['nn2_512_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Get predicted excess returns using NN2 model
# (the helper returns the same frame it was given, now carrying the
# 'nn2_512_predicted_excess_returns' column)
# NOTE(review): this comment previously said the call "should generate binary
# outcomes"; that cannot be verified from this cell — confirm against
# neural_network_nn2.
crsp_test_lagged = nn2_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group):
    """Compute one cross-section's long, short and long-short log returns.

    The rows of ``group`` are ranked on ``'nn2_512_predicted_excess_returns'``;
    the top ``int(0.1 * len(group))`` rows form the long leg and the bottom
    ``int(0.1 * len(group))`` rows form the short leg. Each leg is reported
    equal-weighted and weighted by ``'market_cap_merged'``, with and without
    the mean of the ``'transaction_cost'`` column deducted.

    Returns:
        dict with twelve entries keyed ``'{equal|value}_{long|short|long_short}
        _log_return_{with|without}_cost'``.
    """
    signal_col = 'nn2_512_predicted_excess_returns'
    leg_size = int(0.1 * len(group))

    # Long leg = highest predictions, short leg = lowest predictions
    long_leg = group.nlargest(leg_size, signal_col)
    short_leg = group.nsmallest(leg_size, signal_col)

    # Per-stock log returns and the average cost of each leg
    long_log = np.log1p(long_leg['adjusted_ret'])
    short_log = np.log1p(short_leg['adjusted_ret'])
    long_cost = long_leg['transaction_cost'].mean()
    short_cost = short_leg['transaction_cost'].mean()

    # Equal-weighted legs (short leg earns the negative of the stocks' return)
    eq_long_gross = long_log.mean()
    eq_long_net = eq_long_gross - long_cost
    eq_short_gross = -short_log.mean()
    eq_short_net = eq_short_gross - short_cost

    # Value-weighted legs, weights proportional to market cap
    long_caps = long_leg['market_cap_merged']
    long_cap_total = long_caps.sum()
    vw_long_gross = (long_log * long_caps).sum() / long_cap_total
    vw_long_net = vw_long_gross - long_cost

    short_caps = short_leg['market_cap_merged']
    short_cap_total = short_caps.sum()
    vw_short_core = (short_log * short_caps).sum() / short_cap_total
    vw_short_gross = -(vw_short_core)
    vw_short_net = -(vw_short_core + short_cost)

    # Long-short = long leg + short leg (cost is therefore charged on both legs)
    result = {}
    result['equal_long_log_return_with_cost'] = eq_long_net
    result['equal_short_log_return_with_cost'] = eq_short_net
    result['equal_long_short_log_return_with_cost'] = eq_long_net + eq_short_net
    result['equal_long_log_return_without_cost'] = eq_long_gross
    result['equal_short_log_return_without_cost'] = eq_short_gross
    result['equal_long_short_log_return_without_cost'] = eq_long_gross + eq_short_gross
    result['value_long_log_return_with_cost'] = vw_long_net
    result['value_short_log_return_with_cost'] = vw_short_net
    result['value_long_short_log_return_with_cost'] = vw_long_net + vw_short_net
    result['value_long_log_return_without_cost'] = vw_long_gross
    result['value_short_log_return_without_cost'] = vw_short_gross
    result['value_long_short_log_return_without_cost'] = vw_long_gross + vw_short_gross
    return result

# Compute cumulative returns for each date with daily rebalancing
# Each key holds one list that receives the running total after every date;
# the lists are turned into a DataFrame once the loop has finished.
cumulative_log_returns_by_date_nn2512  = {
    'date': [],
    'cum_EL_return_512_with_cost': [],
    'cum_ES_return_512_with_cost': [],
    'cum_ELS_return_512_with_cost': [],
    'cum_VL_return_512_with_cost': [],
    'cum_VS_return_512_with_cost': [],
    'cum_VLS_return_512_with_cost': [],
    'cum_EL_return_512_without_cost': [],
    'cum_ES_return_512_without_cost': [],
    'cum_ELS_return_512_without_cost': [],
    'cum_VL_return_512_without_cost': [],
    'cum_VS_return_512_without_cost': [],
    'cum_VLS_return_512_without_cost': []
}

# Initialize cumulative returns for lag 512
# (E/V = equal-/value-weighted, L/S/LS = long/short/long-short, matching the
# keys returned by compute_returns)
cum_EL_return_512_with_cost = 0
cum_ES_return_512_with_cost = 0
cum_ELS_return_512_with_cost = 0
cum_VL_return_512_with_cost = 0
cum_VS_return_512_with_cost = 0
cum_VLS_return_512_with_cost = 0

cum_EL_return_512_without_cost = 0
cum_ES_return_512_without_cost = 0
cum_ELS_return_512_without_cost = 0
cum_VL_return_512_without_cost = 0
cum_VS_return_512_without_cost = 0
cum_VLS_return_512_without_cost = 0

# Iterate over each date to compute returns for lag 512 portfolios
# NOTE(review): dates are visited in the order unique() yields them (order of
# first appearance); the running sums only form a time series if
# crsp_test_lagged is sorted by date — confirm.
# NOTE(review): compute_returns selects int(0.1 * len(group)) rows per leg; on a
# date with fewer than 10 rows that is 0 and the resulting NaN would carry
# through every later += below — confirm no such dates exist.
for date in crsp_test_lagged['date'].unique():
    # Boolean-mask the full frame down to this date's cross-section
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]

    # Compute returns for lag 512
    returns = compute_returns(group)

    # Update cumulative returns with daily values for lag 512
    # (log returns are additive, so a running sum is the cumulative log return)
    cum_EL_return_512_with_cost += returns['equal_long_log_return_with_cost']
    cum_ES_return_512_with_cost += returns['equal_short_log_return_with_cost']
    cum_ELS_return_512_with_cost += returns['equal_long_short_log_return_with_cost']
    cum_VL_return_512_with_cost += returns['value_long_log_return_with_cost']
    cum_VS_return_512_with_cost += returns['value_short_log_return_with_cost']
    cum_VLS_return_512_with_cost += returns['value_long_short_log_return_with_cost']

    cum_EL_return_512_without_cost += returns['equal_long_log_return_without_cost']
    cum_ES_return_512_without_cost += returns['equal_short_log_return_without_cost']
    cum_ELS_return_512_without_cost += returns['equal_long_short_log_return_without_cost']
    cum_VL_return_512_without_cost += returns['value_long_log_return_without_cost']
    cum_VS_return_512_without_cost += returns['value_short_log_return_without_cost']
    cum_VLS_return_512_without_cost += returns['value_long_short_log_return_without_cost']

    # Append results for the day for lag 512 portfolios
    cumulative_log_returns_by_date_nn2512['date'].append(date)
    cumulative_log_returns_by_date_nn2512['cum_EL_return_512_with_cost'].append(cum_EL_return_512_with_cost)
    cumulative_log_returns_by_date_nn2512['cum_ES_return_512_with_cost'].append(cum_ES_return_512_with_cost)
    cumulative_log_returns_by_date_nn2512['cum_ELS_return_512_with_cost'].append(cum_ELS_return_512_with_cost)
    cumulative_log_returns_by_date_nn2512['cum_VL_return_512_with_cost'].append(cum_VL_return_512_with_cost)
    cumulative_log_returns_by_date_nn2512['cum_VS_return_512_with_cost'].append(cum_VS_return_512_with_cost)
    cumulative_log_returns_by_date_nn2512['cum_VLS_return_512_with_cost'].append(cum_VLS_return_512_with_cost)
    cumulative_log_returns_by_date_nn2512['cum_EL_return_512_without_cost'].append(cum_EL_return_512_without_cost)
    cumulative_log_returns_by_date_nn2512['cum_ES_return_512_without_cost'].append(cum_ES_return_512_without_cost)
    cumulative_log_returns_by_date_nn2512['cum_ELS_return_512_without_cost'].append(cum_ELS_return_512_without_cost)
    cumulative_log_returns_by_date_nn2512['cum_VL_return_512_without_cost'].append(cum_VL_return_512_without_cost)
    cumulative_log_returns_by_date_nn2512['cum_VS_return_512_without_cost'].append(cum_VS_return_512_without_cost)
    cumulative_log_returns_by_date_nn2512['cum_VLS_return_512_without_cost'].append(cum_VLS_return_512_without_cost)

# Convert to DataFrame for lag 512
cumulative_log_returns_nn2_lag_512 = pd.DataFrame(cumulative_log_returns_by_date_nn2512)

# Display the cumulative returns DataFrame for lag 512
display(cumulative_log_returns_nn2_lag_512.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound a series of per-period returns into one total return.

    Each return ``r`` is turned into a growth factor ``1 + r``; the product of
    the factors minus one is returned.
    """
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualise the compounded return of ``daily_returns``.

    The total compounded return is scaled geometrically from
    ``len(daily_returns)`` observations to ``periods`` observations per year.
    An empty input raises ``ZeroDivisionError``.
    """
    total_return = np.prod(1 + daily_returns) - 1
    growth = 1 + total_return
    exponent = periods / len(daily_returns)
    return growth ** exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualised Sharpe ratio of ``daily_returns``.

    ``risk_free_rate`` is an annual rate; it is spread evenly over ``periods``
    observations before being subtracted. The ratio of mean to standard
    deviation of the excess returns is scaled by ``sqrt(periods)``.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    surplus = daily_returns - per_period_rf
    mean_surplus = surplus.mean()
    spread = surplus.std()
    return np.sqrt(periods) * mean_surplus / spread

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    return np.std(daily_returns) * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the most negative peak-to-trough decline of the compounded path.

    The wealth path is the running product of ``1 + r``; each point is compared
    with the highest value reached so far and the smallest relative gap is
    returned (0 when the path never falls below a previous high).
    """
    # NOTE(review): the running high starts at the first compounded value, so a
    # loss on the very first observation is not measured against the initial
    # capital of 1 — confirm this is intended.
    wealth = np.cumprod(1 + daily_returns)
    running_high = np.maximum.accumulate(wealth)
    relative_gap = (wealth - running_high) / running_high
    return np.min(relative_gap)

# Apply fixed transaction cost to the daily returns (for other metrics)
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return ``daily_returns`` with a constant ``transaction_cost`` subtracted from every element."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns (for volatility and standard deviation)
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Return ``daily_returns`` scaled by ``1 - transaction_cost_percentage``."""
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_512_with_cost', 'cum_ES_return_512_with_cost', 'cum_ELS_return_512_with_cost',
    'cum_VL_return_512_with_cost', 'cum_VS_return_512_with_cost', 'cum_VLS_return_512_with_cost',
]

# Initialize metrics container
# (one list per output column; a row is appended per portfolio in the loop)
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_nn2512_c = cumulative_log_returns_nn2_lag_512[portfolio].values

    # Calculate daily returns from cumulative returns
    # NOTE(review): np.diff yields len - 1 values, so the first date's return
    # is not part of any metric below — confirm this is acceptable.
    # NOTE(review): these increments are log returns, while the metric helpers
    # compound them as 1 + r (simple returns) — confirm the approximation is intended.
    daily_returns = np.diff(cumulative_returns_nn2512_c)  # Compute daily returns from cumulative log returns

    # Apply fixed transaction cost for other metrics (e.g., cumulative returns, Sharpe ratio)
    # NOTE(review): the '*_with_cost' series already had 'transaction_cost'
    # subtracted per leg inside compute_returns, so this deducts a cost a
    # second time — confirm whether the double deduction is intended.
    daily_returns_after_cost_fixed = apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001)

    # Apply percentage-based transaction cost for volatility and standard deviation
    daily_returns_after_cost_percentage = apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001)

    # Calculate cumulative returns after fixed transaction cost
    cum_return_after_cost = cumulative_return(daily_returns_after_cost_fixed)

    # Metrics calculations
    ann_return = annualized_return(daily_returns_after_cost_fixed)
    sharpe = sharpe_ratio(daily_returns_after_cost_fixed)
    vol = calculate_volatility(daily_returns_after_cost_percentage)  # Using percentage-based cost for volatility
    max_draw = maximum_drawdown(daily_returns_after_cost_fixed)

    # Standard Deviation of daily returns after cost (using percentage-based cost for standard deviation)
    std_dev = np.std(daily_returns_after_cost_percentage)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_nn2512_c = pd.DataFrame(metrics)
display(metrics_nn2512_c)


# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_512_without_cost', 'cum_ES_return_512_without_cost', 'cum_ELS_return_512_without_cost',
    'cum_VL_return_512_without_cost', 'cum_VS_return_512_without_cost', 'cum_VLS_return_512_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
# (one list per output column; a row is appended per portfolio in the loop)
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_nn2512_wc = cumulative_log_returns_nn2_lag_512[portfolio].values

    # Calculate daily returns from cumulative returns
    # NOTE(review): np.diff yields len - 1 values, so the first date's return
    # is not part of any metric below — confirm this is acceptable.
    # NOTE(review): these increments are log returns, while the metric helpers
    # compound them as 1 + r (simple returns) — confirm the approximation is intended.
    daily_returns = np.diff(cumulative_returns_nn2512_wc)  # Compute daily returns from cumulative log returns

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_nn2512_wc = pd.DataFrame(metrics_wc)
display(metrics_nn2512_wc)

## NN3

In [None]:
# lag 5
# Run the rolling forecast with model_type='nn3' on the lag-5 feature and show
# the result. rolling_forecast_model is defined outside this section.
nn3_lag5 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='nn3', lag=[5])
print("NN3 Results (Lag 5):")
display(nn3_lag5)

In [None]:
# lag 21
# Run the rolling forecast with model_type='nn3' on the lag-21 feature and show
# the result. rolling_forecast_model is defined outside this section.
nn3_lag21 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='nn3', lag=[21])
print("NN3 Results (Lag 21):")
display(nn3_lag21)

In [None]:
# lag 252
# Run the rolling forecast with model_type='nn3' on the lag-252 feature and show
# the result. rolling_forecast_model is defined outside this section.
nn3_lag252 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='nn3', lag=[252])
print("NN3 Results (Lag 252):")
display(nn3_lag252)

In [None]:
# lag 512
# Run the rolling forecast with model_type='nn3' on the lag-512 feature and show
# the result. rolling_forecast_model is defined outside this section.
nn3_lag512 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='nn3', lag=[512])
print("NN3 Results (Lag 512):")
display(nn3_lag512)

### OOS R2

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd

# Adjusted R² calculation
def adjusted_r2_score(y_true, y_pred, n, p):
    """Return R² adjusted for the number of predictors.

    Parameters:
        y_true, y_pred: observed and predicted targets, passed to ``r2_score``.
        n: number of observations.
        p: number of predictors.

    Raises:
        ZeroDivisionError: when ``n - p - 1`` is zero (plain Python numbers).
    """
    plain_r2 = r2_score(y_true, y_pred)
    # Penalise the unexplained share by (n - 1) / (n - p - 1)
    scaled_unexplained = (1 - plain_r2) * (n - 1)
    return 1 - scaled_unexplained / (n - p - 1)

# Utility: Calculate OOS R² for NN3
def calculate_oos_r2_nn3(model, X_test, y_test):
    """Score a fitted model out of sample: R² of ``model.predict(X_test)`` against ``y_test``."""
    predictions = model.predict(X_test)
    oos_score = r2_score(y_test, predictions)
    return oos_score

# NN3 Model with hyperparameter tuning
def nn3_model_with_tuning(X_train, y_train):
    """Grid-search an ``MLPRegressor`` with three hidden layers and return the best estimator.

    The search uses 5-fold cross-validation scored by negative mean squared
    error and runs on all available cores (``n_jobs=-1``).
    """
    search_space = dict(
        hidden_layer_sizes=[(32, 16, 8)],  # Three hidden layers: 32, 16, and 8 neurons
        activation=['relu', 'tanh'],
        max_iter=[200, 500],
        learning_rate_init=[0.001, 0.01],
    )

    base_network = MLPRegressor(random_state=42)
    tuner = GridSearchCV(
        base_network,
        search_space,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    tuner.fit(X_train, y_train)
    return tuner.best_estimator_

# NN3 Model with OOS R² loss and tuning
def nn3_model_with_tuning_loss(train_df, test_df, lag=5):
    """Tune NN3 on a single lag feature and report its out-of-sample R².

    Rows with a missing ``lag_{lag}`` feature or a missing
    ``'directional_target'`` are dropped from both frames before fitting and
    scoring.

    Returns:
        One-row DataFrame with columns ``'Lag'`` and ``'OOS R²'``.
    """
    feature = f'lag_{lag}'
    target = 'directional_target'
    needed = [feature, target]

    usable_train = train_df.dropna(subset=needed)
    usable_test = test_df.dropna(subset=needed)

    # Tune and train NN3 model on the single lag feature
    fitted = nn3_model_with_tuning(usable_train[[feature]], usable_train[target])

    # Calculate OOS R² on the cleaned test rows
    score = calculate_oos_r2_nn3(fitted, usable_test[[feature]], usable_test[target])

    return pd.DataFrame([{'Lag': lag, 'OOS R²': score}])

# Main function to calculate OOS R² for NN3 with market cap segmentation
def calculate_oos_r2_market_cap_nn3(train_df, test_df, valid_lags):
    """Tabulate NN3 out-of-sample R² per lag for three training sets.

    For every lag in ``valid_lags`` a model is tuned on (a) ``train_df``,
    (b) the rows of ``merged_df`` at or above the 75th market-cap percentile
    and (c) the rows at or below the 25th percentile; each model is scored on
    ``test_df``.

    Returns:
        DataFrame with columns ``'Lag'``, ``'All Stocks R²'``, ``'Top 25% R²'``
        and ``'Bottom 25% R²'``.
    """
    # NOTE(review): the market-cap segments are cut from the module-level
    # merged_df rather than from train_df — confirm merged_df carries the lag
    # features and does not overlap the test period.
    caps = merged_df['market_cap']
    upper_cut = caps.quantile(0.75)
    lower_cut = caps.quantile(0.25)

    large_caps = merged_df[merged_df['market_cap'] >= upper_cut]
    small_caps = merged_df[merged_df['market_cap'] <= lower_cut]

    # Output column label -> training frame, in the order they are fitted
    training_sets = {
        'All Stocks R²': train_df,
        'Top 25% R²': large_caps,
        'Bottom 25% R²': small_caps,
    }
    collected = {label: pd.DataFrame() for label in training_sets}

    # One tuned model per (lag, training set); every model is scored on test_df
    for lag in valid_lags:
        for label, frame in training_sets.items():
            one_row = nn3_model_with_tuning_loss(frame, test_df, lag)
            collected[label] = pd.concat([collected[label], one_row], ignore_index=True)

    # Merge the per-segment scores into a single table keyed by lag
    table = {'Lag': valid_lags}
    for label, scores in collected.items():
        table[label] = scores['OOS R²'].values
    results_df = pd.DataFrame(table)

    return results_df

# Example: Define lags
# (each value selects the 'lag_<n>' feature column used by nn3_model_with_tuning_loss)
valid_lags = [5, 21, 252, 512]

# Calculate OOS R² for NN3 with all stocks, top 25%, and bottom 25%
# One grid search is run per lag and per training set, i.e. 3 * len(valid_lags) in total.
nn3_oos_r2_results = calculate_oos_r2_market_cap_nn3(crsp_train_lagged, crsp_test_lagged, valid_lags)

# Display the results in the required format
display(nn3_oos_r2_results)


## NN3 Portfolio

### Lag 5

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost applied to every stock.

    `row` is accepted so the function can be passed to
    `DataFrame.apply(..., axis=1)`; its contents are not inspected.
    """
    flat_cost = 0.001  # 10 bps for both small and large cap stocks
    return flat_cost

# The cost is the same constant for every row, so broadcast the scalar
# instead of calling the Python function once per row via `apply(axis=1)`.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `NN3` to predict excess returns
def nn3_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run NN3 and store its predictions under a lag-5-specific column.

    Args:
        crsp_train_lagged: Training frame passed through to `neural_network_nn3`.
        crsp_test_lagged: Test frame; it is modified in place.
        lags: Lag list passed to `neural_network_nn3`. Defaults to `[5]`
            (built per call so no list is shared between calls).

    Returns:
        The same `crsp_test_lagged` object, with the column
        `nn3_5_predicted_excess_returns` added.
    """
    if lags is None:
        lags = [5]

    # The model's return value is not needed here; the predictions are read
    # back from the test frame.
    # NOTE(review): this presumes `neural_network_nn3` writes a
    # 'predicted_excess_returns' column into `crsp_test_lagged` in place — confirm.
    neural_network_nn3(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the predictions to a lag-specific name so a later NN3 run with
    # another lag does not overwrite them.
    crsp_test_lagged['nn3_5_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Run NN3 with the default lag list ([5]) and rebind the test frame, which now
# also carries the `nn3_5_predicted_excess_returns` column
crsp_test_lagged = nn3_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, prediction_col='nn3_5_predicted_excess_returns', fraction=0.1):
    """Compute one cross-section's long, short and long-short log returns.

    The rows with the highest `prediction_col` values form the long leg and
    the rows with the lowest values form the short leg. Each leg is
    summarised equal-weighted and value-weighted (by `market_cap_merged`),
    with and without the leg's mean `transaction_cost` deducted.

    Args:
        group: DataFrame with the rows of a single rebalancing date. Must
            contain `prediction_col`, `adjusted_ret`, `market_cap_merged`
            and `transaction_cost`.
        prediction_col: Column used to rank the rows.
        fraction: Share of the rows placed in each leg.

    Returns:
        dict with the twelve keys
        `{equal,value}_{long,short,long_short}_log_return_{with,without}_cost`.
        Short-leg values are already sign-flipped, so each long-short value
        is long + short.
    """
    # Hold at least one name per leg. With fewer than 1/fraction rows the
    # truncated count would be 0, every mean would be NaN, and one such date
    # would turn the caller's running cumulative sums into NaN for good.
    n_names = max(1, int(fraction * len(group)))
    long_leg = group.nlargest(n_names, prediction_col)
    short_leg = group.nsmallest(n_names, prediction_col)

    def leg_summary(leg):
        """Return (equal-weighted log return, value-weighted log return, mean cost)."""
        log_ret = np.log1p(leg['adjusted_ret'])
        caps = leg['market_cap_merged']
        return log_ret.mean(), (log_ret * caps).sum() / caps.sum(), leg['transaction_cost'].mean()

    equal_long, value_long, long_cost = leg_summary(long_leg)
    equal_short, value_short, short_cost = leg_summary(short_leg)

    # A short position earns the negative of the leg's return.
    equal_short = -equal_short
    value_short = -value_short

    # Costs reduce the return of both legs, whatever the position's direction.
    equal_long_net = equal_long - long_cost
    equal_short_net = equal_short - short_cost
    value_long_net = value_long - long_cost
    value_short_net = value_short - short_cost

    return {
        'equal_long_log_return_with_cost': equal_long_net,
        'equal_short_log_return_with_cost': equal_short_net,
        'equal_long_short_log_return_with_cost': equal_long_net + equal_short_net,
        'equal_long_log_return_without_cost': equal_long,
        'equal_short_log_return_without_cost': equal_short,
        'equal_long_short_log_return_without_cost': equal_long + equal_short,
        'value_long_log_return_with_cost': value_long_net,
        'value_short_log_return_with_cost': value_short_net,
        'value_long_short_log_return_with_cost': value_long_net + value_short_net,
        'value_long_log_return_without_cost': value_long,
        'value_short_log_return_without_cost': value_short,
        'value_long_short_log_return_without_cost': value_long + value_short
    }

# Compute cumulative returns for each date with daily rebalancing.
# Each list receives one entry per date: the running sum of that portfolio's
# daily log returns up to and including the date.
cumulative_log_returns_by_date_nn35 = {
    'date': [],
    'cum_EL_return_5_with_cost': [],
    'cum_ES_return_5_with_cost': [],
    'cum_ELS_return_5_with_cost': [],
    'cum_VL_return_5_with_cost': [],
    'cum_VS_return_5_with_cost': [],
    'cum_VLS_return_5_with_cost': [],
    'cum_EL_return_5_without_cost': [],
    'cum_ES_return_5_without_cost': [],
    'cum_ELS_return_5_without_cost': [],
    'cum_VL_return_5_without_cost': [],
    'cum_VS_return_5_without_cost': [],
    'cum_VLS_return_5_without_cost': []
}

# Initialize cumulative returns for lag 5
# (E/V = equal-/value-weighted, L/S/LS = long/short/long-short)
cum_EL_return_5_with_cost = 0
cum_ES_return_5_with_cost = 0
cum_ELS_return_5_with_cost = 0
cum_VL_return_5_with_cost = 0
cum_VS_return_5_with_cost = 0
cum_VLS_return_5_with_cost = 0

cum_EL_return_5_without_cost = 0
cum_ES_return_5_without_cost = 0
cum_ELS_return_5_without_cost = 0
cum_VL_return_5_without_cost = 0
cum_VS_return_5_without_cost = 0
cum_VLS_return_5_without_cost = 0

# Iterate over each date to compute returns for lag 5 portfolios.
# groupby(..., sort=False) yields the dates in order of first appearance (the
# same order `unique()` gives) but splits the frame once, instead of scanning
# every row with a boolean mask for each date. Rows with a missing date are
# skipped by groupby rather than producing an empty, all-NaN cross-section.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 5
    returns = compute_returns(group)

    # Update cumulative returns with daily values for lag 5
    cum_EL_return_5_with_cost += returns['equal_long_log_return_with_cost']
    cum_ES_return_5_with_cost += returns['equal_short_log_return_with_cost']
    cum_ELS_return_5_with_cost += returns['equal_long_short_log_return_with_cost']
    cum_VL_return_5_with_cost += returns['value_long_log_return_with_cost']
    cum_VS_return_5_with_cost += returns['value_short_log_return_with_cost']
    cum_VLS_return_5_with_cost += returns['value_long_short_log_return_with_cost']

    cum_EL_return_5_without_cost += returns['equal_long_log_return_without_cost']
    cum_ES_return_5_without_cost += returns['equal_short_log_return_without_cost']
    cum_ELS_return_5_without_cost += returns['equal_long_short_log_return_without_cost']
    cum_VL_return_5_without_cost += returns['value_long_log_return_without_cost']
    cum_VS_return_5_without_cost += returns['value_short_log_return_without_cost']
    cum_VLS_return_5_without_cost += returns['value_long_short_log_return_without_cost']

    # Append results for the day for lag 5 portfolios
    cumulative_log_returns_by_date_nn35['date'].append(date)
    cumulative_log_returns_by_date_nn35['cum_EL_return_5_with_cost'].append(cum_EL_return_5_with_cost)
    cumulative_log_returns_by_date_nn35['cum_ES_return_5_with_cost'].append(cum_ES_return_5_with_cost)
    cumulative_log_returns_by_date_nn35['cum_ELS_return_5_with_cost'].append(cum_ELS_return_5_with_cost)
    cumulative_log_returns_by_date_nn35['cum_VL_return_5_with_cost'].append(cum_VL_return_5_with_cost)
    cumulative_log_returns_by_date_nn35['cum_VS_return_5_with_cost'].append(cum_VS_return_5_with_cost)
    cumulative_log_returns_by_date_nn35['cum_VLS_return_5_with_cost'].append(cum_VLS_return_5_with_cost)
    cumulative_log_returns_by_date_nn35['cum_EL_return_5_without_cost'].append(cum_EL_return_5_without_cost)
    cumulative_log_returns_by_date_nn35['cum_ES_return_5_without_cost'].append(cum_ES_return_5_without_cost)
    cumulative_log_returns_by_date_nn35['cum_ELS_return_5_without_cost'].append(cum_ELS_return_5_without_cost)
    cumulative_log_returns_by_date_nn35['cum_VL_return_5_without_cost'].append(cum_VL_return_5_without_cost)
    cumulative_log_returns_by_date_nn35['cum_VS_return_5_without_cost'].append(cum_VS_return_5_without_cost)
    cumulative_log_returns_by_date_nn35['cum_VLS_return_5_without_cost'].append(cum_VLS_return_5_without_cost)

# Convert to DataFrame for lag 5
cumulative_log_returns_nn3_lag_5 = pd.DataFrame(cumulative_log_returns_by_date_nn35)

# Display the cumulative returns DataFrame for lag 5
display(cumulative_log_returns_nn3_lag_5.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period simple returns into a single total return."""
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Scale the compounded return of the sample to a `periods`-long year.

    Args:
        daily_returns: Array of per-period simple returns (must be non-empty).
        periods: Number of return periods per year.

    Returns:
        The compounded return raised to `periods / len(daily_returns)`,
        expressed as a rate.
    """
    total_return = np.prod(1 + daily_returns) - 1
    # Fraction of a year covered by the sample, inverted into an exponent.
    annualizing_exponent = periods / len(daily_returns)
    return (1 + total_return) ** annualizing_exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of a per-period return array.

    Args:
        daily_returns: Array of per-period returns.
        risk_free_rate: Annual risk-free rate, spread evenly over `periods`.
        periods: Number of return periods per year.

    Returns:
        sqrt(periods) * mean(excess) / std(excess), where the standard
        deviation is whatever the input's own `.std()` computes.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    excess = daily_returns - per_period_rf
    annualizer = np.sqrt(periods)
    return annualizer * excess.mean() / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Annualize the standard deviation of per-period returns.

    The per-period standard deviation (`np.std`) is scaled by the square
    root of `periods`.
    """
    per_period_std = np.std(daily_returns)
    return per_period_std * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded wealth curve.

    The curve starts from an initial wealth of 1.0, so a loss that happens
    before any gain is measured against the starting capital instead of
    being ignored.

    Args:
        daily_returns: Sequence of per-period simple returns.

    Returns:
        The most negative drawdown, as a fraction of the running peak. It is
        0.0 when the curve never falls below an earlier peak, which includes
        empty input.
    """
    period_returns = np.asarray(daily_returns, dtype=float)
    # Seed the curve with the starting capital so the first peak is 1.0.
    wealth = np.concatenate(([1.0], np.cumprod(1 + period_returns)))
    running_peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct the same absolute `transaction_cost` from every return."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by `1 - transaction_cost_percentage`.

    Because the cost is proportional to the return itself, negative returns
    move toward zero as well.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_5_with_cost', 'cum_ES_return_5_with_cost', 'cum_ELS_return_5_with_cost',
    'cum_VL_return_5_with_cost', 'cum_VS_return_5_with_cost', 'cum_VLS_return_5_with_cost',
]

# Initialize metrics container (one list per output column)
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_nn35_c = cumulative_log_returns_nn3_lag_5[portfolio].values

    # Recover the daily series from the running sum of log returns.
    # - prepend=0.0 keeps the first day: the cumulative series starts at the
    #   first day's return, so a plain diff would silently drop it.
    # - expm1 converts log returns to simple returns, because the metric
    #   helpers below compound with (1 + r).
    daily_returns = np.expm1(np.diff(cumulative_returns_nn35_c, prepend=0.0))

    # NOTE(review): the `*_with_cost` columns already had the per-leg
    # transaction cost deducted when they were built, so the deduction below
    # charges costs a second time — confirm this is intended.
    # Apply fixed transaction cost for other metrics (e.g., cumulative returns, Sharpe ratio)
    daily_returns_after_cost_fixed = apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001)

    # Apply percentage-based transaction cost for volatility and standard deviation
    daily_returns_after_cost_percentage = apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001)

    # Calculate cumulative returns after fixed transaction cost
    cum_return_after_cost = cumulative_return(daily_returns_after_cost_fixed)

    # Metrics calculations
    ann_return = annualized_return(daily_returns_after_cost_fixed)
    sharpe = sharpe_ratio(daily_returns_after_cost_fixed)
    vol = calculate_volatility(daily_returns_after_cost_percentage)  # Using percentage-based cost for volatility
    max_draw = maximum_drawdown(daily_returns_after_cost_fixed)

    # Standard Deviation of daily returns after cost (using percentage-based cost for standard deviation)
    std_dev = np.std(daily_returns_after_cost_percentage)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_nn35_c = pd.DataFrame(metrics)
display(metrics_nn35_c)


# Same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_5_without_cost', 'cum_ES_return_5_without_cost', 'cum_ELS_return_5_without_cost',
    'cum_VL_return_5_without_cost', 'cum_VS_return_5_without_cost', 'cum_VLS_return_5_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_nn35_wc = cumulative_log_returns_nn3_lag_5[portfolio].values

    # Recover the daily series from the running sum of log returns.
    # - prepend=0.0 keeps the first day: the cumulative series starts at the
    #   first day's return, so a plain diff would silently drop it.
    # - expm1 converts log returns to simple returns, because the metric
    #   helpers below compound with (1 + r).
    daily_returns = np.expm1(np.diff(cumulative_returns_nn35_wc, prepend=0.0))

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_nn35_wc = pd.DataFrame(metrics_wc)
display(metrics_nn35_wc)

### Lag 21

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost applied to every stock.

    `row` is accepted so the function can be passed to
    `DataFrame.apply(..., axis=1)`; its contents are not inspected.
    """
    flat_cost = 0.001  # 10 bps for both small and large cap stocks
    return flat_cost

# The cost is the same constant for every row, so broadcast the scalar
# instead of calling the Python function once per row via `apply(axis=1)`.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `NN3` to predict excess returns
def nn3_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run NN3 and store its predictions under a lag-21-specific column.

    Args:
        crsp_train_lagged: Training frame passed through to `neural_network_nn3`.
        crsp_test_lagged: Test frame; it is modified in place.
        lags: Lag list passed to `neural_network_nn3`. Defaults to `[21]`
            (built per call so no list is shared between calls).

    Returns:
        The same `crsp_test_lagged` object, with the column
        `nn3_21_predicted_excess_returns` added.
    """
    if lags is None:
        lags = [21]

    # The model's return value is not needed here; the predictions are read
    # back from the test frame.
    # NOTE(review): this presumes `neural_network_nn3` writes a
    # 'predicted_excess_returns' column into `crsp_test_lagged` in place — confirm.
    neural_network_nn3(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the predictions to a lag-specific name so a later NN3 run with
    # another lag does not overwrite them.
    crsp_test_lagged['nn3_21_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Run NN3 with the default lag list ([21]) and rebind the test frame, which now
# also carries the `nn3_21_predicted_excess_returns` column
crsp_test_lagged = nn3_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, prediction_col='nn3_21_predicted_excess_returns', fraction=0.1):
    """Compute one cross-section's long, short and long-short log returns.

    The rows with the highest `prediction_col` values form the long leg and
    the rows with the lowest values form the short leg. Each leg is
    summarised equal-weighted and value-weighted (by `market_cap_merged`),
    with and without the leg's mean `transaction_cost` deducted.

    Args:
        group: DataFrame with the rows of a single rebalancing date. Must
            contain `prediction_col`, `adjusted_ret`, `market_cap_merged`
            and `transaction_cost`.
        prediction_col: Column used to rank the rows.
        fraction: Share of the rows placed in each leg.

    Returns:
        dict with the twelve keys
        `{equal,value}_{long,short,long_short}_log_return_{with,without}_cost`.
        Short-leg values are already sign-flipped, so each long-short value
        is long + short.
    """
    # Hold at least one name per leg. With fewer than 1/fraction rows the
    # truncated count would be 0, every mean would be NaN, and one such date
    # would turn the caller's running cumulative sums into NaN for good.
    n_names = max(1, int(fraction * len(group)))
    long_leg = group.nlargest(n_names, prediction_col)
    short_leg = group.nsmallest(n_names, prediction_col)

    def leg_summary(leg):
        """Return (equal-weighted log return, value-weighted log return, mean cost)."""
        log_ret = np.log1p(leg['adjusted_ret'])
        caps = leg['market_cap_merged']
        return log_ret.mean(), (log_ret * caps).sum() / caps.sum(), leg['transaction_cost'].mean()

    equal_long, value_long, long_cost = leg_summary(long_leg)
    equal_short, value_short, short_cost = leg_summary(short_leg)

    # A short position earns the negative of the leg's return.
    equal_short = -equal_short
    value_short = -value_short

    # Costs reduce the return of both legs, whatever the position's direction.
    equal_long_net = equal_long - long_cost
    equal_short_net = equal_short - short_cost
    value_long_net = value_long - long_cost
    value_short_net = value_short - short_cost

    return {
        'equal_long_log_return_with_cost': equal_long_net,
        'equal_short_log_return_with_cost': equal_short_net,
        'equal_long_short_log_return_with_cost': equal_long_net + equal_short_net,
        'equal_long_log_return_without_cost': equal_long,
        'equal_short_log_return_without_cost': equal_short,
        'equal_long_short_log_return_without_cost': equal_long + equal_short,
        'value_long_log_return_with_cost': value_long_net,
        'value_short_log_return_with_cost': value_short_net,
        'value_long_short_log_return_with_cost': value_long_net + value_short_net,
        'value_long_log_return_without_cost': value_long,
        'value_short_log_return_without_cost': value_short,
        'value_long_short_log_return_without_cost': value_long + value_short
    }

# Compute cumulative returns for each date with daily rebalancing.
# Each list receives one entry per date: the running sum of that portfolio's
# daily log returns up to and including the date.
cumulative_log_returns_by_date_nn321 = {
    'date': [],
    'cum_EL_return_21_with_cost': [],
    'cum_ES_return_21_with_cost': [],
    'cum_ELS_return_21_with_cost': [],
    'cum_VL_return_21_with_cost': [],
    'cum_VS_return_21_with_cost': [],
    'cum_VLS_return_21_with_cost': [],
    'cum_EL_return_21_without_cost': [],
    'cum_ES_return_21_without_cost': [],
    'cum_ELS_return_21_without_cost': [],
    'cum_VL_return_21_without_cost': [],
    'cum_VS_return_21_without_cost': [],
    'cum_VLS_return_21_without_cost': []
}

# Initialize cumulative returns for lag 21
# (E/V = equal-/value-weighted, L/S/LS = long/short/long-short)
cum_EL_return_21_with_cost = 0
cum_ES_return_21_with_cost = 0
cum_ELS_return_21_with_cost = 0
cum_VL_return_21_with_cost = 0
cum_VS_return_21_with_cost = 0
cum_VLS_return_21_with_cost = 0

cum_EL_return_21_without_cost = 0
cum_ES_return_21_without_cost = 0
cum_ELS_return_21_without_cost = 0
cum_VL_return_21_without_cost = 0
cum_VS_return_21_without_cost = 0
cum_VLS_return_21_without_cost = 0

# Iterate over each date to compute returns for lag 21 portfolios.
# groupby(..., sort=False) yields the dates in order of first appearance (the
# same order `unique()` gives) but splits the frame once, instead of scanning
# every row with a boolean mask for each date. Rows with a missing date are
# skipped by groupby rather than producing an empty, all-NaN cross-section.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 21
    returns = compute_returns(group)

    # Update cumulative returns with daily values for lag 21
    cum_EL_return_21_with_cost += returns['equal_long_log_return_with_cost']
    cum_ES_return_21_with_cost += returns['equal_short_log_return_with_cost']
    cum_ELS_return_21_with_cost += returns['equal_long_short_log_return_with_cost']
    cum_VL_return_21_with_cost += returns['value_long_log_return_with_cost']
    cum_VS_return_21_with_cost += returns['value_short_log_return_with_cost']
    cum_VLS_return_21_with_cost += returns['value_long_short_log_return_with_cost']

    cum_EL_return_21_without_cost += returns['equal_long_log_return_without_cost']
    cum_ES_return_21_without_cost += returns['equal_short_log_return_without_cost']
    cum_ELS_return_21_without_cost += returns['equal_long_short_log_return_without_cost']
    cum_VL_return_21_without_cost += returns['value_long_log_return_without_cost']
    cum_VS_return_21_without_cost += returns['value_short_log_return_without_cost']
    cum_VLS_return_21_without_cost += returns['value_long_short_log_return_without_cost']

    # Append results for the day for lag 21 portfolios
    cumulative_log_returns_by_date_nn321['date'].append(date)
    cumulative_log_returns_by_date_nn321['cum_EL_return_21_with_cost'].append(cum_EL_return_21_with_cost)
    cumulative_log_returns_by_date_nn321['cum_ES_return_21_with_cost'].append(cum_ES_return_21_with_cost)
    cumulative_log_returns_by_date_nn321['cum_ELS_return_21_with_cost'].append(cum_ELS_return_21_with_cost)
    cumulative_log_returns_by_date_nn321['cum_VL_return_21_with_cost'].append(cum_VL_return_21_with_cost)
    cumulative_log_returns_by_date_nn321['cum_VS_return_21_with_cost'].append(cum_VS_return_21_with_cost)
    cumulative_log_returns_by_date_nn321['cum_VLS_return_21_with_cost'].append(cum_VLS_return_21_with_cost)
    cumulative_log_returns_by_date_nn321['cum_EL_return_21_without_cost'].append(cum_EL_return_21_without_cost)
    cumulative_log_returns_by_date_nn321['cum_ES_return_21_without_cost'].append(cum_ES_return_21_without_cost)
    cumulative_log_returns_by_date_nn321['cum_ELS_return_21_without_cost'].append(cum_ELS_return_21_without_cost)
    cumulative_log_returns_by_date_nn321['cum_VL_return_21_without_cost'].append(cum_VL_return_21_without_cost)
    cumulative_log_returns_by_date_nn321['cum_VS_return_21_without_cost'].append(cum_VS_return_21_without_cost)
    cumulative_log_returns_by_date_nn321['cum_VLS_return_21_without_cost'].append(cum_VLS_return_21_without_cost)

# Convert to DataFrame for lag 21
cumulative_log_returns_nn3_lag_21 = pd.DataFrame(cumulative_log_returns_by_date_nn321)

# Display the cumulative returns DataFrame for lag 21
display(cumulative_log_returns_nn3_lag_21.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period simple returns into a single total return."""
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Scale the compounded return of the sample to a `periods`-long year.

    Args:
        daily_returns: Array of per-period simple returns (must be non-empty).
        periods: Number of return periods per year.

    Returns:
        The compounded return raised to `periods / len(daily_returns)`,
        expressed as a rate.
    """
    total_return = np.prod(1 + daily_returns) - 1
    # Fraction of a year covered by the sample, inverted into an exponent.
    annualizing_exponent = periods / len(daily_returns)
    return (1 + total_return) ** annualizing_exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of a per-period return array.

    Args:
        daily_returns: Array of per-period returns.
        risk_free_rate: Annual risk-free rate, spread evenly over `periods`.
        periods: Number of return periods per year.

    Returns:
        sqrt(periods) * mean(excess) / std(excess), where the standard
        deviation is whatever the input's own `.std()` computes.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    excess = daily_returns - per_period_rf
    annualizer = np.sqrt(periods)
    return annualizer * excess.mean() / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Annualize the standard deviation of per-period returns.

    The per-period standard deviation (`np.std`) is scaled by the square
    root of `periods`.
    """
    per_period_std = np.std(daily_returns)
    return per_period_std * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded wealth curve.

    The curve starts from an initial wealth of 1.0, so a loss that happens
    before any gain is measured against the starting capital instead of
    being ignored.

    Args:
        daily_returns: Sequence of per-period simple returns.

    Returns:
        The most negative drawdown, as a fraction of the running peak. It is
        0.0 when the curve never falls below an earlier peak, which includes
        empty input.
    """
    period_returns = np.asarray(daily_returns, dtype=float)
    # Seed the curve with the starting capital so the first peak is 1.0.
    wealth = np.concatenate(([1.0], np.cumprod(1 + period_returns)))
    running_peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct the same absolute `transaction_cost` from every return."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by `1 - transaction_cost_percentage`.

    Because the cost is proportional to the return itself, negative returns
    move toward zero as well.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_21_with_cost', 'cum_ES_return_21_with_cost', 'cum_ELS_return_21_with_cost',
    'cum_VL_return_21_with_cost', 'cum_VS_return_21_with_cost', 'cum_VLS_return_21_with_cost',
]

# Initialize metrics container (one list per output column)
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_nn321_c = cumulative_log_returns_nn3_lag_21[portfolio].values

    # Recover the daily series from the running sum of log returns.
    # - prepend=0.0 keeps the first day: the cumulative series starts at the
    #   first day's return, so a plain diff would silently drop it.
    # - expm1 converts log returns to simple returns, because the metric
    #   helpers below compound with (1 + r).
    daily_returns = np.expm1(np.diff(cumulative_returns_nn321_c, prepend=0.0))

    # NOTE(review): the `*_with_cost` columns already had the per-leg
    # transaction cost deducted when they were built, so the deduction below
    # charges costs a second time — confirm this is intended.
    # Apply fixed transaction cost for other metrics (e.g., cumulative returns, Sharpe ratio)
    daily_returns_after_cost_fixed = apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001)

    # Apply percentage-based transaction cost for volatility and standard deviation
    daily_returns_after_cost_percentage = apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001)

    # Calculate cumulative returns after fixed transaction cost
    cum_return_after_cost = cumulative_return(daily_returns_after_cost_fixed)

    # Metrics calculations
    ann_return = annualized_return(daily_returns_after_cost_fixed)
    sharpe = sharpe_ratio(daily_returns_after_cost_fixed)
    vol = calculate_volatility(daily_returns_after_cost_percentage)  # Using percentage-based cost for volatility
    max_draw = maximum_drawdown(daily_returns_after_cost_fixed)

    # Standard Deviation of daily returns after cost (using percentage-based cost for standard deviation)
    std_dev = np.std(daily_returns_after_cost_percentage)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_nn321_c = pd.DataFrame(metrics)
display(metrics_nn321_c)


# Same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_21_without_cost', 'cum_ES_return_21_without_cost', 'cum_ELS_return_21_without_cost',
    'cum_VL_return_21_without_cost', 'cum_VS_return_21_without_cost', 'cum_VLS_return_21_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_nn321_wc = cumulative_log_returns_nn3_lag_21[portfolio].values

    # Recover the daily series from the running sum of log returns.
    # - prepend=0.0 keeps the first day: the cumulative series starts at the
    #   first day's return, so a plain diff would silently drop it.
    # - expm1 converts log returns to simple returns, because the metric
    #   helpers below compound with (1 + r).
    daily_returns = np.expm1(np.diff(cumulative_returns_nn321_wc, prepend=0.0))

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_nn321_wc = pd.DataFrame(metrics_wc)
display(metrics_nn321_wc)

### Lag 252

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost applied to every stock.

    `row` is accepted so the function can be passed to
    `DataFrame.apply(..., axis=1)`; its contents are not inspected.
    """
    flat_cost = 0.001  # 10 bps for both small and large cap stocks
    return flat_cost

# The cost is the same constant for every row, so broadcast the scalar
# instead of calling the Python function once per row via `apply(axis=1)`.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Step 4: Use the previously defined `NN3` to predict excess returns
def nn3_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run NN3 and store its predictions under a lag-252-specific column.

    Args:
        crsp_train_lagged: Training frame passed through to `neural_network_nn3`.
        crsp_test_lagged: Test frame; it is modified in place.
        lags: Lag list passed to `neural_network_nn3`. Defaults to `[252]`
            (built per call so no list is shared between calls).

    Returns:
        The same `crsp_test_lagged` object, with the column
        `nn3_252_predicted_excess_returns` added.
    """
    if lags is None:
        lags = [252]

    # The model's return value is not needed here; the predictions are read
    # back from the test frame.
    # NOTE(review): this presumes `neural_network_nn3` writes a
    # 'predicted_excess_returns' column into `crsp_test_lagged` in place — confirm.
    neural_network_nn3(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the predictions to a lag-specific name so a later NN3 run with
    # another lag does not overwrite them.
    crsp_test_lagged['nn3_252_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Run NN3 with the default lag list ([252]) and rebind the test frame, which now
# also carries the `nn3_252_predicted_excess_returns` column
crsp_test_lagged = nn3_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def _leg_log_returns(selection, sign):
    """Return equal/value-weighted log returns of one leg, with and without cost.

    ``sign`` is +1 for a long leg and -1 for a short leg; the cost is always
    subtracted, whatever the direction of the position.
    """
    log_ret = np.log1p(selection['adjusted_ret'])
    caps = selection['market_cap_merged']
    cost = selection['transaction_cost'].mean()

    equal = sign * log_ret.mean()
    value = sign * ((log_ret * caps).sum() / caps.sum())
    return {
        ('equal', 'with_cost'): equal - cost,
        ('equal', 'without_cost'): equal,
        ('value', 'with_cost'): value - cost,
        ('value', 'without_cost'): value,
    }


def compute_returns(group, prediction_col='nn3_252_predicted_excess_returns', fraction=0.1):
    """Compute one rebalancing date's long, short and long-short log returns.

    Parameters
    ----------
    group : DataFrame
        Rows of a single date. Must contain ``prediction_col``,
        ``'adjusted_ret'``, ``'transaction_cost'`` and ``'market_cap_merged'``.
    prediction_col : str, optional
        Column used to rank the stocks. Defaults to the NN3 lag-252
        predictions, which was the hard-coded column before.
    fraction : float, optional
        Share of the rows placed in each leg (default 0.1, i.e. top and
        bottom 10%). The count is truncated with ``int``.

    Returns
    -------
    dict
        Twelve entries named ``{equal|value}_{long|short|long_short}_log_return_
        {with_cost|without_cost}``. The long-short value is the sum of both
        legs, so with cost it carries the cost of both legs.

    Notes
    -----
    When ``int(fraction * len(group))`` is 0 both legs are empty and every
    entry evaluates to NaN.
    """
    n_select = int(fraction * len(group))
    long_leg = _leg_log_returns(group.nlargest(n_select, prediction_col), 1)
    short_leg = _leg_log_returns(group.nsmallest(n_select, prediction_col), -1)

    results = {}
    for weighting in ('equal', 'value'):
        for cost_tag in ('with_cost', 'without_cost'):
            long_ret = long_leg[weighting, cost_tag]
            short_ret = short_leg[weighting, cost_tag]
            results[f'{weighting}_long_log_return_{cost_tag}'] = long_ret
            results[f'{weighting}_short_log_return_{cost_tag}'] = short_ret
            results[f'{weighting}_long_short_log_return_{cost_tag}'] = long_ret + short_ret
    return results

# Compute cumulative returns for each date with daily rebalancing.
# Each output column accumulates one entry of the dict returned by compute_returns.
return_key_by_column_252 = {
    'cum_EL_return_252_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_252_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_252_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_252_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_252_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_252_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_252_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_252_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_252_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_252_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_252_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_252_without_cost': 'value_long_short_log_return_without_cost',
}

# Per-date history: one list per output column, plus the dates themselves.
cumulative_log_returns_by_date_nn3252 = {'date': []}
for column in return_key_by_column_252:
    cumulative_log_returns_by_date_nn3252[column] = []

# Running sum of the daily log returns of every portfolio.
running_totals_252 = dict.fromkeys(return_key_by_column_252, 0)

# Iterate over each date to compute returns for lag 252 portfolios.
# groupby hands over each date's rows in a single pass instead of re-filtering
# the whole frame once per date; sort=False keeps the dates in order of first
# appearance, as unique() did. Rows without a date are not part of any group.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    returns = compute_returns(group)

    cumulative_log_returns_by_date_nn3252['date'].append(date)
    for column, return_key in return_key_by_column_252.items():
        running_totals_252[column] += returns[return_key]
        cumulative_log_returns_by_date_nn3252[column].append(running_totals_252[column])

# Keep the final totals available under their original scalar names
# (dicts preserve insertion order, so the values line up with the names).
(
    cum_EL_return_252_with_cost,
    cum_ES_return_252_with_cost,
    cum_ELS_return_252_with_cost,
    cum_VL_return_252_with_cost,
    cum_VS_return_252_with_cost,
    cum_VLS_return_252_with_cost,
    cum_EL_return_252_without_cost,
    cum_ES_return_252_without_cost,
    cum_ELS_return_252_without_cost,
    cum_VL_return_252_without_cost,
    cum_VS_return_252_without_cost,
    cum_VLS_return_252_without_cost,
) = running_totals_252.values()

# Convert to DataFrame for lag 252
cumulative_log_returns_nn3_lag_252 = pd.DataFrame(cumulative_log_returns_by_date_nn3252)

# Display the cumulative returns DataFrame for lag 252
display(cumulative_log_returns_nn3_lag_252.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period simple returns into a single total return."""
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Scale the compounded return of the sample to ``periods`` observations.

    The total growth over ``len(daily_returns)`` observations is raised to the
    power ``periods / len(daily_returns)``.
    """
    total_return = np.prod(1 + daily_returns) - 1
    exponent = periods / len(daily_returns)
    annual_growth = (1 + total_return) ** exponent
    return annual_growth - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized mean-to-deviation ratio of returns in excess of the risk-free rate.

    ``risk_free_rate`` is an annual figure; it is spread evenly over
    ``periods`` observations before being subtracted.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    excess = daily_returns - per_period_rf
    annualisation = np.sqrt(periods)
    return annualisation * excess.mean() / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Standard deviation of the returns scaled by ``sqrt(periods)``."""
    per_period_std = np.std(daily_returns)
    scale = np.sqrt(periods)
    return per_period_std * scale

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded return path.

    Parameters
    ----------
    daily_returns : array-like
        Per-period simple returns.

    Returns
    -------
    float
        A value <= 0 (e.g. -0.25 for a 25% drawdown); 0.0 when the path never
        falls below a previous peak or when no returns are given.
    """
    returns = np.asarray(daily_returns, dtype=float)
    if returns.size == 0:
        return 0.0

    wealth = np.cumprod(1 + returns)
    # The path starts from a wealth of 1.0 before the first return, so that
    # starting value is a valid peak too. Without it, a loss on the very
    # first days would be measured against itself and reported as 0.
    running_peak = np.maximum(np.maximum.accumulate(wealth), 1.0)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Lower every return by the same absolute ``transaction_cost``."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by ``1 - transaction_cost_percentage``."""
    kept_share = 1 - transaction_cost_percentage
    return daily_returns * kept_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_252_with_cost', 'cum_ES_return_252_with_cost', 'cum_ELS_return_252_with_cost',
    'cum_VL_return_252_with_cost', 'cum_VS_return_252_with_cost', 'cum_VLS_return_252_with_cost',
]

# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_252_without_cost', 'cum_ES_return_252_without_cost', 'cum_ELS_return_252_without_cost',
    'cum_VL_return_252_without_cost', 'cum_VS_return_252_without_cost', 'cum_VLS_return_252_without_cost',
]

# Initialize one metrics container per portfolio family
metric_names = [
    'Portfolio', 'Annualized Return', 'Sharpe Ratio', 'Volatility',
    'Standard Deviation', 'Max Drawdown', 'Cumulative Return',
]
metrics = {name: [] for name in metric_names}
metrics_wc = {name: [] for name in metric_names}

# Both families go through the same calculation. The *_with_cost columns are
# already net of the transaction cost (compute_returns subtracts it from every
# daily return before accumulation), so no further cost is deducted here -
# deducting it again would count the cost twice.
for portfolios, container in ((portfolios_with_cost, metrics), (portfolios_without_cost, metrics_wc)):
    for portfolio in portfolios:
        cumulative_log = cumulative_log_returns_nn3_lag_252[portfolio].values

        # The cumulative series starts at the first day's return (not at 0),
        # so prepend 0 to recover that day instead of dropping it.
        daily_log_returns = np.diff(cumulative_log, prepend=0.0)

        # The metric helpers compound `1 + r`, i.e. they expect simple
        # returns; convert the log returns accordingly.
        daily_returns = np.expm1(daily_log_returns)

        # Store results
        container['Portfolio'].append(portfolio)
        container['Annualized Return'].append(annualized_return(daily_returns))
        container['Sharpe Ratio'].append(sharpe_ratio(daily_returns))
        container['Volatility'].append(calculate_volatility(daily_returns))
        container['Standard Deviation'].append(np.std(daily_returns))
        container['Max Drawdown'].append(maximum_drawdown(daily_returns))
        container['Cumulative Return'].append(cumulative_return(daily_returns))

# Convert the results into DataFrames for analysis
metrics_nn3252_c = pd.DataFrame(metrics)
display(metrics_nn3252_c)

metrics_nn3252_wc = pd.DataFrame(metrics_wc)
display(metrics_nn3252_wc)


### Lag 512

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost charged for one stock row.

    ``row`` is accepted so the function can be handed to
    ``DataFrame.apply(..., axis=1)``; its contents are not inspected.
    """
    flat_cost = 0.001  # 10 bps for both small and large cap stocks
    return flat_cost

# Store the per-row cost returned by calculate_transaction_cost in a
# 'transaction_cost' column of the test frame (evaluated once per row).
crsp_test_lagged['transaction_cost'] = crsp_test_lagged.apply(calculate_transaction_cost, axis=1)

# Use the previously defined `NN3` to predict excess returns
def nn3_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the NN3 model and copy its predictions into a lag-512 column.

    Parameters
    ----------
    crsp_train_lagged, crsp_test_lagged : DataFrame
        Passed unchanged to ``neural_network_nn3``.
    lags : list, optional
        Lags forwarded to ``neural_network_nn3``; defaults to ``[512]``.

    Returns
    -------
    DataFrame
        ``crsp_test_lagged`` itself (modified in place) with the extra column
        ``'nn3_512_predicted_excess_returns'``.
    """
    # Build the default per call so a shared list can never be mutated
    # across invocations.
    if lags is None:
        lags = [512]

    # NOTE(review): the model's return value is not used. The predictions are
    # read from the 'predicted_excess_returns' column of the test frame, which
    # presumably neural_network_nn3 writes in place - confirm, otherwise this
    # copies predictions left behind by an earlier model run.
    neural_network_nn3(crsp_train_lagged, crsp_test_lagged, lags)

    # .values drops the index so the assignment is purely positional.
    crsp_test_lagged['nn3_512_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Get predicted excess returns using NN3 model
# (called with the helper's default lags; the name crsp_test_lagged is rebound
# to the frame the helper returns)
crsp_test_lagged = nn3_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def _leg_log_returns(selection, sign):
    """Return equal/value-weighted log returns of one leg, with and without cost.

    ``sign`` is +1 for a long leg and -1 for a short leg; the cost is always
    subtracted, whatever the direction of the position.
    """
    log_ret = np.log1p(selection['adjusted_ret'])
    caps = selection['market_cap_merged']
    cost = selection['transaction_cost'].mean()

    equal = sign * log_ret.mean()
    value = sign * ((log_ret * caps).sum() / caps.sum())
    return {
        ('equal', 'with_cost'): equal - cost,
        ('equal', 'without_cost'): equal,
        ('value', 'with_cost'): value - cost,
        ('value', 'without_cost'): value,
    }


def compute_returns(group, prediction_col='nn3_512_predicted_excess_returns', fraction=0.1):
    """Compute one rebalancing date's long, short and long-short log returns.

    Parameters
    ----------
    group : DataFrame
        Rows of a single date. Must contain ``prediction_col``,
        ``'adjusted_ret'``, ``'transaction_cost'`` and ``'market_cap_merged'``.
    prediction_col : str, optional
        Column used to rank the stocks. Defaults to the NN3 lag-512
        predictions, which was the hard-coded column before.
    fraction : float, optional
        Share of the rows placed in each leg (default 0.1, i.e. top and
        bottom 10%). The count is truncated with ``int``.

    Returns
    -------
    dict
        Twelve entries named ``{equal|value}_{long|short|long_short}_log_return_
        {with_cost|without_cost}``. The long-short value is the sum of both
        legs, so with cost it carries the cost of both legs.

    Notes
    -----
    When ``int(fraction * len(group))`` is 0 both legs are empty and every
    entry evaluates to NaN.
    """
    n_select = int(fraction * len(group))
    long_leg = _leg_log_returns(group.nlargest(n_select, prediction_col), 1)
    short_leg = _leg_log_returns(group.nsmallest(n_select, prediction_col), -1)

    results = {}
    for weighting in ('equal', 'value'):
        for cost_tag in ('with_cost', 'without_cost'):
            long_ret = long_leg[weighting, cost_tag]
            short_ret = short_leg[weighting, cost_tag]
            results[f'{weighting}_long_log_return_{cost_tag}'] = long_ret
            results[f'{weighting}_short_log_return_{cost_tag}'] = short_ret
            results[f'{weighting}_long_short_log_return_{cost_tag}'] = long_ret + short_ret
    return results

# Compute cumulative returns for each date with daily rebalancing.
# Each output column accumulates one entry of the dict returned by compute_returns.
return_key_by_column_512 = {
    'cum_EL_return_512_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_512_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_512_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_512_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_512_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_512_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_512_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_512_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_512_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_512_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_512_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_512_without_cost': 'value_long_short_log_return_without_cost',
}

# Per-date history: one list per output column, plus the dates themselves.
cumulative_log_returns_by_date_nn3512 = {'date': []}
for column in return_key_by_column_512:
    cumulative_log_returns_by_date_nn3512[column] = []

# Running sum of the daily log returns of every portfolio.
running_totals_512 = dict.fromkeys(return_key_by_column_512, 0)

# Iterate over each date to compute returns for lag 512 portfolios.
# groupby hands over each date's rows in a single pass instead of re-filtering
# the whole frame once per date; sort=False keeps the dates in order of first
# appearance, as unique() did. Rows without a date are not part of any group.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    returns = compute_returns(group)

    cumulative_log_returns_by_date_nn3512['date'].append(date)
    for column, return_key in return_key_by_column_512.items():
        running_totals_512[column] += returns[return_key]
        cumulative_log_returns_by_date_nn3512[column].append(running_totals_512[column])

# Keep the final totals available under their original scalar names
# (dicts preserve insertion order, so the values line up with the names).
(
    cum_EL_return_512_with_cost,
    cum_ES_return_512_with_cost,
    cum_ELS_return_512_with_cost,
    cum_VL_return_512_with_cost,
    cum_VS_return_512_with_cost,
    cum_VLS_return_512_with_cost,
    cum_EL_return_512_without_cost,
    cum_ES_return_512_without_cost,
    cum_ELS_return_512_without_cost,
    cum_VL_return_512_without_cost,
    cum_VS_return_512_without_cost,
    cum_VLS_return_512_without_cost,
) = running_totals_512.values()

# Convert to DataFrame for lag 512
cumulative_log_returns_nn3_lag_512 = pd.DataFrame(cumulative_log_returns_by_date_nn3512)

# Display the cumulative returns DataFrame for lag 512
display(cumulative_log_returns_nn3_lag_512.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period simple returns into a single total return."""
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Scale the compounded return of the sample to ``periods`` observations.

    The total growth over ``len(daily_returns)`` observations is raised to the
    power ``periods / len(daily_returns)``.
    """
    total_return = np.prod(1 + daily_returns) - 1
    exponent = periods / len(daily_returns)
    annual_growth = (1 + total_return) ** exponent
    return annual_growth - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized mean-to-deviation ratio of returns in excess of the risk-free rate.

    ``risk_free_rate`` is an annual figure; it is spread evenly over
    ``periods`` observations before being subtracted.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    excess = daily_returns - per_period_rf
    annualisation = np.sqrt(periods)
    return annualisation * excess.mean() / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Standard deviation of the returns scaled by ``sqrt(periods)``."""
    per_period_std = np.std(daily_returns)
    scale = np.sqrt(periods)
    return per_period_std * scale

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded return path.

    Parameters
    ----------
    daily_returns : array-like
        Per-period simple returns.

    Returns
    -------
    float
        A value <= 0 (e.g. -0.25 for a 25% drawdown); 0.0 when the path never
        falls below a previous peak or when no returns are given.
    """
    returns = np.asarray(daily_returns, dtype=float)
    if returns.size == 0:
        return 0.0

    wealth = np.cumprod(1 + returns)
    # The path starts from a wealth of 1.0 before the first return, so that
    # starting value is a valid peak too. Without it, a loss on the very
    # first days would be measured against itself and reported as 0.
    running_peak = np.maximum(np.maximum.accumulate(wealth), 1.0)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Lower every return by the same absolute ``transaction_cost``."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by ``1 - transaction_cost_percentage``."""
    kept_share = 1 - transaction_cost_percentage
    return daily_returns * kept_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_512_with_cost', 'cum_ES_return_512_with_cost', 'cum_ELS_return_512_with_cost',
    'cum_VL_return_512_with_cost', 'cum_VS_return_512_with_cost', 'cum_VLS_return_512_with_cost',
]

# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_512_without_cost', 'cum_ES_return_512_without_cost', 'cum_ELS_return_512_without_cost',
    'cum_VL_return_512_without_cost', 'cum_VS_return_512_without_cost', 'cum_VLS_return_512_without_cost',
]

# Initialize one metrics container per portfolio family
metric_names = [
    'Portfolio', 'Annualized Return', 'Sharpe Ratio', 'Volatility',
    'Standard Deviation', 'Max Drawdown', 'Cumulative Return',
]
metrics = {name: [] for name in metric_names}
metrics_wc = {name: [] for name in metric_names}

# Both families go through the same calculation. The *_with_cost columns are
# already net of the transaction cost (compute_returns subtracts it from every
# daily return before accumulation), so no further cost is deducted here -
# deducting it again would count the cost twice.
for portfolios, container in ((portfolios_with_cost, metrics), (portfolios_without_cost, metrics_wc)):
    for portfolio in portfolios:
        cumulative_log = cumulative_log_returns_nn3_lag_512[portfolio].values

        # The cumulative series starts at the first day's return (not at 0),
        # so prepend 0 to recover that day instead of dropping it.
        daily_log_returns = np.diff(cumulative_log, prepend=0.0)

        # The metric helpers compound `1 + r`, i.e. they expect simple
        # returns; convert the log returns accordingly.
        daily_returns = np.expm1(daily_log_returns)

        # Store results
        container['Portfolio'].append(portfolio)
        container['Annualized Return'].append(annualized_return(daily_returns))
        container['Sharpe Ratio'].append(sharpe_ratio(daily_returns))
        container['Volatility'].append(calculate_volatility(daily_returns))
        container['Standard Deviation'].append(np.std(daily_returns))
        container['Max Drawdown'].append(maximum_drawdown(daily_returns))
        container['Cumulative Return'].append(cumulative_return(daily_returns))

# Convert the results into DataFrames for analysis
metrics_nn3512_c = pd.DataFrame(metrics)
display(metrics_nn3512_c)

metrics_nn3512_wc = pd.DataFrame(metrics_wc)
display(metrics_nn3512_wc)

## NN4

In [None]:
# lag 5
# Run rolling_forecast_model with model_type='nn4' and lag=[5] on the lagged
# train/test frames, keep the result in nn4_lag5 and show it.
nn4_lag5 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='nn4', lag=[5])
print("NN4 Results (Lag 5):")
display(nn4_lag5)

In [None]:
# lag 21
# Run rolling_forecast_model with model_type='nn4' and lag=[21] on the lagged
# train/test frames, keep the result in nn4_lag21 and show it.
nn4_lag21 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='nn4', lag=[21])
print("NN4 Results (Lag 21):")
display(nn4_lag21)

In [None]:
# lag 252
# Run rolling_forecast_model with model_type='nn4' and lag=[252] on the lagged
# train/test frames, keep the result in nn4_lag252 and show it.
nn4_lag252 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='nn4', lag=[252])
print("NN4 Results (Lag 252):")
display(nn4_lag252)

In [None]:
# lag 512
# Run rolling_forecast_model with model_type='nn4' and lag=[512] on the lagged
# train/test frames, keep the result in nn4_lag512 and show it.
nn4_lag512 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='nn4', lag=[512])
print("NN4 Results (Lag 512):")
display(nn4_lag512)

### OOS R2

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd

# Utility: Adjusted R² calculation
def adjusted_r2_score(y_true, y_pred, n, p):
    """Adjust the plain R² of ``y_pred`` for ``p`` predictors and ``n`` observations."""
    r_squared = r2_score(y_true, y_pred)
    residual_share = 1 - r_squared
    dof_total = n - 1
    dof_residual = n - p - 1
    return 1 - residual_share * dof_total / dof_residual

# Utility: Calculate OOS R² for NN4
def calculate_oos_r2_nn4(model, X_test, y_test):
    """Score ``model`` on held-out data: R² of its predictions for ``X_test`` against ``y_test``."""
    predictions = model.predict(X_test)
    oos_r2 = r2_score(y_test, predictions)
    return oos_r2

# NN4 Model with hyperparameter tuning (4 hidden layers)
def nn4_model_with_tuning(X_train, y_train):
    """Grid-search an MLPRegressor with four hidden layers and return the best fit.

    The search varies activation, iteration cap and initial learning rate; the
    architecture is fixed. Candidates are compared on negative mean squared
    error with ``cv=5``.
    """
    search_space = dict(
        hidden_layer_sizes=[(32, 16, 8, 4)],  # Four hidden layers: 32, 16, 8, and 4 neurons
        activation=['relu', 'tanh'],
        max_iter=[200, 500],
        learning_rate_init=[0.001, 0.01],
    )

    base_model = MLPRegressor(random_state=42)
    tuner = GridSearchCV(
        estimator=base_model,
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
    )
    tuner.fit(X_train, y_train)
    return tuner.best_estimator_

# NN4 Model with OOS R² loss and tuning
def nn4_model_with_tuning_loss(train_df, test_df, lag=5):
    """Tune NN4 on one lag feature and report its out-of-sample R².

    The single predictor is the column ``lag_<lag>`` and the target is
    ``'directional_target'``; rows missing either are dropped from both
    frames. Returns a one-row DataFrame with the columns ``'Lag'`` and
    ``'OOS R²'``.
    """
    feature = f'lag_{lag}'
    required = [feature, 'directional_target']
    usable_train = train_df.dropna(subset=required)
    usable_test = test_df.dropna(subset=required)

    # Tune and train NN4 model
    fitted = nn4_model_with_tuning(usable_train[[feature]], usable_train['directional_target'])

    # Calculate OOS R²
    score = calculate_oos_r2_nn4(fitted, usable_test[[feature]], usable_test['directional_target'])

    return pd.DataFrame([{'Lag': lag, 'OOS R²': score}])

# Main function to calculate OOS R² for NN4 with market cap segmentation
def calculate_oos_r2_market_cap_nn4(train_df, test_df, valid_lags, market_cap_df=None):
    """Compare NN4 out-of-sample R² for all, large-cap and small-cap training sets.

    Parameters
    ----------
    train_df : DataFrame
        Training frame used for the "All Stocks" model.
    test_df : DataFrame
        Frame every model is scored on.
    valid_lags : sequence of int
        Lags to evaluate; one result row is produced per lag.
    market_cap_df : DataFrame, optional
        Frame with a ``'market_cap'`` column from which the top and bottom
        quartile training sets are cut. Defaults to the module-level
        ``merged_df``, which is what the function always used before.

    Returns
    -------
    DataFrame
        Columns ``'Lag'``, ``'All Stocks R²'``, ``'Top 25% R²'`` and
        ``'Bottom 25% R²'``.
    """
    if market_cap_df is None:
        market_cap_df = merged_df

    # NOTE(review): the quartile training sets are cut from market_cap_df, not
    # from train_df, and all three models are scored on the full test_df.
    # Confirm that market_cap_df holds training-period rows only and that
    # scoring on the unsegmented test set is intended.
    upper_cut = market_cap_df['market_cap'].quantile(0.75)
    lower_cut = market_cap_df['market_cap'].quantile(0.25)
    training_sets = {
        'All Stocks R²': train_df,
        'Top 25% R²': market_cap_df[market_cap_df['market_cap'] >= upper_cut],
        'Bottom 25% R²': market_cap_df[market_cap_df['market_cap'] <= lower_cut],
    }

    # Collect plain scores per segment instead of growing DataFrames with
    # pd.concat inside the loop.
    scores = {column: [] for column in training_sets}
    for lag in valid_lags:
        for column, training_frame in training_sets.items():
            result = nn4_model_with_tuning_loss(training_frame, test_df, lag)
            scores[column].append(result['OOS R²'].iloc[0])

    # Merge the results into a single DataFrame, one row per lag
    return pd.DataFrame({'Lag': valid_lags, **scores})

# Example: Define lags
# (each lag n is looked up as the feature column 'lag_<n>' by
# nn4_model_with_tuning_loss)
valid_lags = [5, 21, 252, 512]

# Calculate OOS R² for NN4 with all stocks, top 25%, and bottom 25%
nn4_oos_r2_results = calculate_oos_r2_market_cap_nn4(crsp_train_lagged, crsp_test_lagged, valid_lags)

# Display the results in the required format
display(nn4_oos_r2_results)


## NN4 Portfolio

### Lag 5

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost charged for one stock row.

    ``row`` is accepted so the function can be handed to
    ``DataFrame.apply(..., axis=1)``; its contents are not inspected.
    """
    flat_cost = 0.001  # 10 bps for both small and large cap stocks
    return flat_cost

crsp_test_lagged['transaction_cost'] = crsp_test_lagged.apply(calculate_transaction_cost, axis=1)

# Use the previously defined `NN4` to predict excess returns
def nn4_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Call the NN4 model for lag 5 and keep its predictions in their own column.

    Args:
        crsp_train_lagged: Training frame forwarded to ``neural_network_nn4``.
        crsp_test_lagged: Test frame forwarded to ``neural_network_nn4``; it
            gains the column ``'nn4_5_predicted_excess_returns'``.
        lags: Lags forwarded to ``neural_network_nn4``. Defaults to ``[5]``.
            ``None`` is used as the sentinel so no list object is shared
            between calls.

    Returns:
        The same ``crsp_test_lagged`` object, with the new column added.

    Raises:
        KeyError: If ``'predicted_excess_returns'`` is absent after the model
            call.
    """
    if lags is None:
        lags = [5]

    # Called for its side effect; the frame it returns is not used here.
    # NOTE(review): this relies on neural_network_nn4 writing the column
    # 'predicted_excess_returns' into crsp_test_lagged in place -- confirm.
    neural_network_nn4(crsp_train_lagged, crsp_test_lagged, lags)

    # Snapshot the generic prediction column under a lag-specific name so a
    # later model run that overwrites it does not disturb this lag's portfolio.
    crsp_test_lagged['nn4_5_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Get predicted excess returns using NN4 model
crsp_test_lagged = nn4_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, prediction_col='nn4_5_predicted_excess_returns', fraction=0.1):
    """Compute one cross-section's long, short and long-short log returns.

    The rows with the highest ``prediction_col`` values form the long leg and
    the rows with the lowest values form the short leg. Each leg is evaluated
    equal-weighted and value-weighted (by ``'market_cap_merged'``), with and
    without the mean ``'transaction_cost'`` of the leg deducted.

    Args:
        group: DataFrame for a single date. Must contain ``prediction_col``,
            ``'adjusted_ret'``, ``'market_cap_merged'`` and
            ``'transaction_cost'``.
        prediction_col: Column used to rank the stocks. The default keeps the
            original lag-5 behaviour.
        fraction: Share of the rows placed in each leg (default 10%). The leg
            size is ``int(fraction * len(group))``.

    Returns:
        Dict with twelve entries keyed
        ``{equal,value}_{long,short,long_short}_log_return_{with,without}_cost``.
        If the leg size truncates to zero the means are taken over empty
        selections and the values are NaN.
    """
    leg_size = int(fraction * len(group))
    long_leg = group.nlargest(leg_size, prediction_col)
    short_leg = group.nsmallest(leg_size, prediction_col)

    def _leg_stats(leg):
        # Equal-weighted mean log return, cap-weighted mean log return, mean cost.
        log_ret = np.log1p(leg['adjusted_ret'])
        caps = leg['market_cap_merged']
        return log_ret.mean(), (log_ret * caps).sum() / caps.sum(), leg['transaction_cost'].mean()

    long_equal, long_value, long_cost = _leg_stats(long_leg)
    short_equal, short_value, short_cost = _leg_stats(short_leg)

    # Long legs earn the return; the cost is deducted from it.
    equal_long_log_return_with_cost = long_equal - long_cost
    equal_long_log_return_without_cost = long_equal
    value_long_log_return_with_cost = long_value - long_cost
    value_long_log_return_without_cost = long_value

    # Short legs earn the negated return; the cost is still a deduction.
    equal_short_log_return_with_cost = -short_equal - short_cost
    equal_short_log_return_without_cost = -short_equal
    value_short_log_return_with_cost = -(short_value + short_cost)
    value_short_log_return_without_cost = -short_value

    # Long-short is the sum of both legs, so the with-cost variant carries the
    # cost of each leg.
    return {
        'equal_long_log_return_with_cost': equal_long_log_return_with_cost,
        'equal_short_log_return_with_cost': equal_short_log_return_with_cost,
        'equal_long_short_log_return_with_cost': equal_long_log_return_with_cost + equal_short_log_return_with_cost,
        'equal_long_log_return_without_cost': equal_long_log_return_without_cost,
        'equal_short_log_return_without_cost': equal_short_log_return_without_cost,
        'equal_long_short_log_return_without_cost': equal_long_log_return_without_cost + equal_short_log_return_without_cost,
        'value_long_log_return_with_cost': value_long_log_return_with_cost,
        'value_short_log_return_with_cost': value_short_log_return_with_cost,
        'value_long_short_log_return_with_cost': value_long_log_return_with_cost + value_short_log_return_with_cost,
        'value_long_log_return_without_cost': value_long_log_return_without_cost,
        'value_short_log_return_without_cost': value_short_log_return_without_cost,
        'value_long_short_log_return_without_cost': value_long_log_return_without_cost + value_short_log_return_without_cost
    }

# Compute cumulative returns for each date with daily rebalancing
# One list per output column; every list receives one entry per date in the loop below.
# Column prefixes: EL/ES/ELS = equal-weighted long / short / long-short,
# VL/VS/VLS = value-weighted long / short / long-short.
cumulative_log_returns_by_date_nn45  = {
    'date': [],
    'cum_EL_return_5_with_cost': [],
    'cum_ES_return_5_with_cost': [],
    'cum_ELS_return_5_with_cost': [],
    'cum_VL_return_5_with_cost': [],
    'cum_VS_return_5_with_cost': [],
    'cum_VLS_return_5_with_cost': [],
    'cum_EL_return_5_without_cost': [],
    'cum_ES_return_5_without_cost': [],
    'cum_ELS_return_5_without_cost': [],
    'cum_VL_return_5_without_cost': [],
    'cum_VS_return_5_without_cost': [],
    'cum_VLS_return_5_without_cost': []
}

# Initialize cumulative returns for lag 5
# Running totals start at zero; log returns are accumulated by summation.
cum_EL_return_5_with_cost = 0
cum_ES_return_5_with_cost = 0
cum_ELS_return_5_with_cost = 0
cum_VL_return_5_with_cost = 0
cum_VS_return_5_with_cost = 0
cum_VLS_return_5_with_cost = 0

cum_EL_return_5_without_cost = 0
cum_ES_return_5_without_cost = 0
cum_ELS_return_5_without_cost = 0
cum_VL_return_5_without_cost = 0
cum_VS_return_5_without_cost = 0
cum_VLS_return_5_without_cost = 0

# Iterate over each date to compute returns for lag 5 portfolios
# Dates are visited in the order unique() yields them (order of first appearance).
# Assumes crsp_test_lagged is already in chronological order -- TODO confirm.
for date in crsp_test_lagged['date'].unique():
    # Cross-section of all rows belonging to this date
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]

    # Compute returns for lag 5
    returns = compute_returns(group)

    # Update cumulative returns with daily values for lag 5
    # NOTE(review): a NaN returned for any single date propagates into every
    # later running total, because the totals are plain sums.
    cum_EL_return_5_with_cost += returns['equal_long_log_return_with_cost']
    cum_ES_return_5_with_cost += returns['equal_short_log_return_with_cost']
    cum_ELS_return_5_with_cost += returns['equal_long_short_log_return_with_cost']
    cum_VL_return_5_with_cost += returns['value_long_log_return_with_cost']
    cum_VS_return_5_with_cost += returns['value_short_log_return_with_cost']
    cum_VLS_return_5_with_cost += returns['value_long_short_log_return_with_cost']

    cum_EL_return_5_without_cost += returns['equal_long_log_return_without_cost']
    cum_ES_return_5_without_cost += returns['equal_short_log_return_without_cost']
    cum_ELS_return_5_without_cost += returns['equal_long_short_log_return_without_cost']
    cum_VL_return_5_without_cost += returns['value_long_log_return_without_cost']
    cum_VS_return_5_without_cost += returns['value_short_log_return_without_cost']
    cum_VLS_return_5_without_cost += returns['value_long_short_log_return_without_cost']

    # Append results for the day for lag 5 portfolios
    # The value stored is the running total up to and including this date,
    # so the first stored entry equals the first date's own return.
    cumulative_log_returns_by_date_nn45['date'].append(date)
    cumulative_log_returns_by_date_nn45['cum_EL_return_5_with_cost'].append(cum_EL_return_5_with_cost)
    cumulative_log_returns_by_date_nn45['cum_ES_return_5_with_cost'].append(cum_ES_return_5_with_cost)
    cumulative_log_returns_by_date_nn45['cum_ELS_return_5_with_cost'].append(cum_ELS_return_5_with_cost)
    cumulative_log_returns_by_date_nn45['cum_VL_return_5_with_cost'].append(cum_VL_return_5_with_cost)
    cumulative_log_returns_by_date_nn45['cum_VS_return_5_with_cost'].append(cum_VS_return_5_with_cost)
    cumulative_log_returns_by_date_nn45['cum_VLS_return_5_with_cost'].append(cum_VLS_return_5_with_cost)
    cumulative_log_returns_by_date_nn45['cum_EL_return_5_without_cost'].append(cum_EL_return_5_without_cost)
    cumulative_log_returns_by_date_nn45['cum_ES_return_5_without_cost'].append(cum_ES_return_5_without_cost)
    cumulative_log_returns_by_date_nn45['cum_ELS_return_5_without_cost'].append(cum_ELS_return_5_without_cost)
    cumulative_log_returns_by_date_nn45['cum_VL_return_5_without_cost'].append(cum_VL_return_5_without_cost)
    cumulative_log_returns_by_date_nn45['cum_VS_return_5_without_cost'].append(cum_VS_return_5_without_cost)
    cumulative_log_returns_by_date_nn45['cum_VLS_return_5_without_cost'].append(cum_VLS_return_5_without_cost)

# Convert to DataFrame for lag 5
# One row per date, one column per cumulative series.
cumulative_log_returns_nn4_lag_5 = pd.DataFrame(cumulative_log_returns_by_date_nn45)

# Display the cumulative returns DataFrame for lag 5
display(cumulative_log_returns_nn4_lag_5.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period returns into a single total return.

    Args:
        daily_returns: Array-like of per-period returns that supports
            ``1 + daily_returns``.

    Returns:
        The product of ``(1 + r)`` over all periods, minus one.
    """
    growth_factors = 1 + daily_returns
    terminal_wealth = np.prod(growth_factors)
    return terminal_wealth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of a per-period return series.

    Args:
        daily_returns: Array-like of per-period returns.
        periods: Number of periods that make up one year (default 252).

    Returns:
        ``(1 + total) ** (periods / n) - 1`` where ``total`` is the compounded
        return and ``n`` the number of observations.

    Raises:
        ZeroDivisionError: If ``daily_returns`` is empty.
    """
    total = np.prod(1 + daily_returns) - 1
    years_exponent = periods / len(daily_returns)
    annual_growth = (1 + total) ** years_exponent
    return annual_growth - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized Sharpe ratio of a per-period return series.

    Args:
        daily_returns: Array-like of per-period returns exposing ``.mean()``
            and ``.std()`` after subtraction of a scalar.
        risk_free_rate: Annual risk-free rate (default 1%).
        periods: Periods per year, used both to de-annualize the risk-free
            rate and to scale the ratio (default 252).

    Returns:
        ``sqrt(periods) * mean(excess) / std(excess)``.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    surplus = daily_returns - per_period_rf
    mean_surplus = surplus.mean()
    dispersion = surplus.std()
    return np.sqrt(periods) * mean_surplus / dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Annualize the standard deviation of per-period returns.

    Args:
        daily_returns: Array-like of per-period returns.
        periods: Periods per year (default 252).

    Returns:
        ``np.std(daily_returns)`` scaled by ``sqrt(periods)``.
    """
    per_period_sigma = np.std(daily_returns)
    annualization = np.sqrt(periods)
    return per_period_sigma * annualization

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded wealth path.

    The wealth path starts from an initial capital of 1.0, and that starting
    value counts as the first peak. Without it, a series that loses money
    from the very first period would be measured against its own already
    reduced first value and the loss from inception would go unreported.

    Args:
        daily_returns: Array-like of per-period returns.

    Returns:
        The most negative drawdown as a fraction (e.g. ``-0.5`` for a 50%
        loss), or ``0.0`` if the path never falls below a previous peak or
        the input is empty.
    """
    returns = np.asarray(daily_returns, dtype=float)
    if returns.size == 0:
        # No observations means no drawdown; np.min would raise on an empty array.
        return 0.0

    wealth = np.cumprod(1 + returns)
    # Running maximum, floored at the initial capital of 1.0.
    peak = np.maximum(np.maximum.accumulate(wealth), 1.0)
    drawdown = (wealth - peak) / peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct a constant cost from every per-period return.

    Args:
        daily_returns: Array-like of per-period returns.
        transaction_cost: Amount subtracted from each return (default 0.001).

    Returns:
        ``daily_returns`` shifted down by ``transaction_cost``.
    """
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every per-period return by ``1 - transaction_cost_percentage``.

    Args:
        daily_returns: Array-like of per-period returns.
        transaction_cost_percentage: Fraction of each return removed
            (default 0.001).

    Returns:
        ``daily_returns`` multiplied by the retained share.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_5_with_cost', 'cum_ES_return_5_with_cost', 'cum_ELS_return_5_with_cost',
    'cum_VL_return_5_with_cost', 'cum_VS_return_5_with_cost', 'cum_VLS_return_5_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_nn45_c = cumulative_log_returns_nn4_lag_5[portfolio].values

    # Recover the per-day values from the running sum. The first cumulative
    # entry already equals the first day's return, so difference against a
    # leading 0.0; a bare np.diff would silently drop that first observation.
    daily_returns = np.diff(cumulative_returns_nn45_c, prepend=0.0)

    # Apply fixed transaction cost for other metrics (e.g., cumulative returns, Sharpe ratio)
    # NOTE(review): the *_with_cost series already have the per-leg cost
    # deducted when the portfolio returns are computed, so this subtracts the
    # cost a second time -- confirm whether that is intended.
    daily_returns_after_cost_fixed = apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001)

    # Apply percentage-based transaction cost for volatility and standard deviation
    daily_returns_after_cost_percentage = apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001)

    # Calculate cumulative returns after fixed transaction cost
    # NOTE(review): these are differences of cumulative *log* returns, while
    # the metric helpers compound them as (1 + r) -- confirm whether a
    # conversion to simple returns is wanted.
    cum_return_after_cost = cumulative_return(daily_returns_after_cost_fixed)

    # Metrics calculations
    ann_return = annualized_return(daily_returns_after_cost_fixed)
    sharpe = sharpe_ratio(daily_returns_after_cost_fixed)
    vol = calculate_volatility(daily_returns_after_cost_percentage)  # Using percentage-based cost for volatility
    max_draw = maximum_drawdown(daily_returns_after_cost_fixed)

    # Standard Deviation of daily returns after cost (using percentage-based cost for standard deviation)
    std_dev = np.std(daily_returns_after_cost_percentage)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_nn45_c = pd.DataFrame(metrics)
display(metrics_nn45_c)


# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_5_without_cost', 'cum_ES_return_5_without_cost', 'cum_ELS_return_5_without_cost',
    'cum_VL_return_5_without_cost', 'cum_VS_return_5_without_cost', 'cum_VLS_return_5_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_nn45_wc = cumulative_log_returns_nn4_lag_5[portfolio].values

    # Recover the per-day values from the running sum. The first cumulative
    # entry already equals the first day's return, so difference against a
    # leading 0.0; a bare np.diff would silently drop that first observation.
    daily_returns = np.diff(cumulative_returns_nn45_wc, prepend=0.0)

    # Calculate cumulative returns without transaction cost
    # NOTE(review): these are differences of cumulative *log* returns, while
    # the metric helpers compound them as (1 + r) -- confirm whether a
    # conversion to simple returns is wanted.
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_nn45_wc = pd.DataFrame(metrics_wc)
display(metrics_nn45_wc)

### Lag 21

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost charged per position.

    Args:
        row: Ignored. Kept so the function still works as a callback for
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        ``0.001`` (10 bps), identical for every stock.
    """
    return 0.001  # 10 bps for both small and large cap stocks

# The cost is the same constant for every row, so broadcast the scalar instead
# of invoking the function once per row through ``apply(axis=1)``.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `NN4` to predict excess returns
def nn4_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Call the NN4 model for lag 21 and keep its predictions in their own column.

    Args:
        crsp_train_lagged: Training frame forwarded to ``neural_network_nn4``.
        crsp_test_lagged: Test frame forwarded to ``neural_network_nn4``; it
            gains the column ``'nn4_21_predicted_excess_returns'``.
        lags: Lags forwarded to ``neural_network_nn4``. Defaults to ``[21]``.
            ``None`` is used as the sentinel so no list object is shared
            between calls.

    Returns:
        The same ``crsp_test_lagged`` object, with the new column added.

    Raises:
        KeyError: If ``'predicted_excess_returns'`` is absent after the model
            call.
    """
    if lags is None:
        lags = [21]

    # Called for its side effect; the frame it returns is not used here.
    # NOTE(review): this relies on neural_network_nn4 writing the column
    # 'predicted_excess_returns' into crsp_test_lagged in place -- confirm.
    neural_network_nn4(crsp_train_lagged, crsp_test_lagged, lags)

    # Snapshot the generic prediction column under a lag-specific name so a
    # later model run that overwrites it does not disturb this lag's portfolio.
    crsp_test_lagged['nn4_21_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Get predicted excess returns using NN4 model
crsp_test_lagged = nn4_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, prediction_col='nn4_21_predicted_excess_returns', fraction=0.1):
    """Compute one cross-section's long, short and long-short log returns.

    The rows with the highest ``prediction_col`` values form the long leg and
    the rows with the lowest values form the short leg. Each leg is evaluated
    equal-weighted and value-weighted (by ``'market_cap_merged'``), with and
    without the mean ``'transaction_cost'`` of the leg deducted.

    Args:
        group: DataFrame for a single date. Must contain ``prediction_col``,
            ``'adjusted_ret'``, ``'market_cap_merged'`` and
            ``'transaction_cost'``.
        prediction_col: Column used to rank the stocks. The default keeps the
            original lag-21 behaviour.
        fraction: Share of the rows placed in each leg (default 10%). The leg
            size is ``int(fraction * len(group))``.

    Returns:
        Dict with twelve entries keyed
        ``{equal,value}_{long,short,long_short}_log_return_{with,without}_cost``.
        If the leg size truncates to zero the means are taken over empty
        selections and the values are NaN.
    """
    leg_size = int(fraction * len(group))
    long_leg = group.nlargest(leg_size, prediction_col)
    short_leg = group.nsmallest(leg_size, prediction_col)

    def _leg_stats(leg):
        # Equal-weighted mean log return, cap-weighted mean log return, mean cost.
        log_ret = np.log1p(leg['adjusted_ret'])
        caps = leg['market_cap_merged']
        return log_ret.mean(), (log_ret * caps).sum() / caps.sum(), leg['transaction_cost'].mean()

    long_equal, long_value, long_cost = _leg_stats(long_leg)
    short_equal, short_value, short_cost = _leg_stats(short_leg)

    # Long legs earn the return; the cost is deducted from it.
    equal_long_log_return_with_cost = long_equal - long_cost
    equal_long_log_return_without_cost = long_equal
    value_long_log_return_with_cost = long_value - long_cost
    value_long_log_return_without_cost = long_value

    # Short legs earn the negated return; the cost is still a deduction.
    equal_short_log_return_with_cost = -short_equal - short_cost
    equal_short_log_return_without_cost = -short_equal
    value_short_log_return_with_cost = -(short_value + short_cost)
    value_short_log_return_without_cost = -short_value

    # Long-short is the sum of both legs, so the with-cost variant carries the
    # cost of each leg.
    return {
        'equal_long_log_return_with_cost': equal_long_log_return_with_cost,
        'equal_short_log_return_with_cost': equal_short_log_return_with_cost,
        'equal_long_short_log_return_with_cost': equal_long_log_return_with_cost + equal_short_log_return_with_cost,
        'equal_long_log_return_without_cost': equal_long_log_return_without_cost,
        'equal_short_log_return_without_cost': equal_short_log_return_without_cost,
        'equal_long_short_log_return_without_cost': equal_long_log_return_without_cost + equal_short_log_return_without_cost,
        'value_long_log_return_with_cost': value_long_log_return_with_cost,
        'value_short_log_return_with_cost': value_short_log_return_with_cost,
        'value_long_short_log_return_with_cost': value_long_log_return_with_cost + value_short_log_return_with_cost,
        'value_long_log_return_without_cost': value_long_log_return_without_cost,
        'value_short_log_return_without_cost': value_short_log_return_without_cost,
        'value_long_short_log_return_without_cost': value_long_log_return_without_cost + value_short_log_return_without_cost
    }

# Compute cumulative returns for each date with daily rebalancing
# One list per output column; every list receives one entry per date in the loop below.
# Column prefixes: EL/ES/ELS = equal-weighted long / short / long-short,
# VL/VS/VLS = value-weighted long / short / long-short.
cumulative_log_returns_by_date_nn421  = {
    'date': [],
    'cum_EL_return_21_with_cost': [],
    'cum_ES_return_21_with_cost': [],
    'cum_ELS_return_21_with_cost': [],
    'cum_VL_return_21_with_cost': [],
    'cum_VS_return_21_with_cost': [],
    'cum_VLS_return_21_with_cost': [],
    'cum_EL_return_21_without_cost': [],
    'cum_ES_return_21_without_cost': [],
    'cum_ELS_return_21_without_cost': [],
    'cum_VL_return_21_without_cost': [],
    'cum_VS_return_21_without_cost': [],
    'cum_VLS_return_21_without_cost': []
}

# Initialize cumulative returns for lag 21
# Running totals start at zero; log returns are accumulated by summation.
cum_EL_return_21_with_cost = 0
cum_ES_return_21_with_cost = 0
cum_ELS_return_21_with_cost = 0
cum_VL_return_21_with_cost = 0
cum_VS_return_21_with_cost = 0
cum_VLS_return_21_with_cost = 0

cum_EL_return_21_without_cost = 0
cum_ES_return_21_without_cost = 0
cum_ELS_return_21_without_cost = 0
cum_VL_return_21_without_cost = 0
cum_VS_return_21_without_cost = 0
cum_VLS_return_21_without_cost = 0

# Iterate over each date to compute returns for lag 21 portfolios
# Dates are visited in the order unique() yields them (order of first appearance).
# Assumes crsp_test_lagged is already in chronological order -- TODO confirm.
for date in crsp_test_lagged['date'].unique():
    # Cross-section of all rows belonging to this date
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]

    # Compute returns for lag 21
    returns = compute_returns(group)

    # Update cumulative returns with daily values for lag 21
    # NOTE(review): a NaN returned for any single date propagates into every
    # later running total, because the totals are plain sums.
    cum_EL_return_21_with_cost += returns['equal_long_log_return_with_cost']
    cum_ES_return_21_with_cost += returns['equal_short_log_return_with_cost']
    cum_ELS_return_21_with_cost += returns['equal_long_short_log_return_with_cost']
    cum_VL_return_21_with_cost += returns['value_long_log_return_with_cost']
    cum_VS_return_21_with_cost += returns['value_short_log_return_with_cost']
    cum_VLS_return_21_with_cost += returns['value_long_short_log_return_with_cost']

    cum_EL_return_21_without_cost += returns['equal_long_log_return_without_cost']
    cum_ES_return_21_without_cost += returns['equal_short_log_return_without_cost']
    cum_ELS_return_21_without_cost += returns['equal_long_short_log_return_without_cost']
    cum_VL_return_21_without_cost += returns['value_long_log_return_without_cost']
    cum_VS_return_21_without_cost += returns['value_short_log_return_without_cost']
    cum_VLS_return_21_without_cost += returns['value_long_short_log_return_without_cost']

    # Append results for the day for lag 21 portfolios
    # The value stored is the running total up to and including this date,
    # so the first stored entry equals the first date's own return.
    cumulative_log_returns_by_date_nn421['date'].append(date)
    cumulative_log_returns_by_date_nn421['cum_EL_return_21_with_cost'].append(cum_EL_return_21_with_cost)
    cumulative_log_returns_by_date_nn421['cum_ES_return_21_with_cost'].append(cum_ES_return_21_with_cost)
    cumulative_log_returns_by_date_nn421['cum_ELS_return_21_with_cost'].append(cum_ELS_return_21_with_cost)
    cumulative_log_returns_by_date_nn421['cum_VL_return_21_with_cost'].append(cum_VL_return_21_with_cost)
    cumulative_log_returns_by_date_nn421['cum_VS_return_21_with_cost'].append(cum_VS_return_21_with_cost)
    cumulative_log_returns_by_date_nn421['cum_VLS_return_21_with_cost'].append(cum_VLS_return_21_with_cost)
    cumulative_log_returns_by_date_nn421['cum_EL_return_21_without_cost'].append(cum_EL_return_21_without_cost)
    cumulative_log_returns_by_date_nn421['cum_ES_return_21_without_cost'].append(cum_ES_return_21_without_cost)
    cumulative_log_returns_by_date_nn421['cum_ELS_return_21_without_cost'].append(cum_ELS_return_21_without_cost)
    cumulative_log_returns_by_date_nn421['cum_VL_return_21_without_cost'].append(cum_VL_return_21_without_cost)
    cumulative_log_returns_by_date_nn421['cum_VS_return_21_without_cost'].append(cum_VS_return_21_without_cost)
    cumulative_log_returns_by_date_nn421['cum_VLS_return_21_without_cost'].append(cum_VLS_return_21_without_cost)

# Convert to DataFrame for lag 21
# One row per date, one column per cumulative series.
cumulative_log_returns_nn4_lag_21 = pd.DataFrame(cumulative_log_returns_by_date_nn421)

# Display the cumulative returns DataFrame for lag 21
display(cumulative_log_returns_nn4_lag_21.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period returns into a single total return.

    Args:
        daily_returns: Array-like of per-period returns that supports
            ``1 + daily_returns``.

    Returns:
        The product of ``(1 + r)`` over all periods, minus one.
    """
    growth_factors = 1 + daily_returns
    terminal_wealth = np.prod(growth_factors)
    return terminal_wealth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of a per-period return series.

    Args:
        daily_returns: Array-like of per-period returns.
        periods: Number of periods that make up one year (default 252).

    Returns:
        ``(1 + total) ** (periods / n) - 1`` where ``total`` is the compounded
        return and ``n`` the number of observations.

    Raises:
        ZeroDivisionError: If ``daily_returns`` is empty.
    """
    total = np.prod(1 + daily_returns) - 1
    years_exponent = periods / len(daily_returns)
    annual_growth = (1 + total) ** years_exponent
    return annual_growth - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized Sharpe ratio of a per-period return series.

    Args:
        daily_returns: Array-like of per-period returns exposing ``.mean()``
            and ``.std()`` after subtraction of a scalar.
        risk_free_rate: Annual risk-free rate (default 1%).
        periods: Periods per year, used both to de-annualize the risk-free
            rate and to scale the ratio (default 252).

    Returns:
        ``sqrt(periods) * mean(excess) / std(excess)``.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    surplus = daily_returns - per_period_rf
    mean_surplus = surplus.mean()
    dispersion = surplus.std()
    return np.sqrt(periods) * mean_surplus / dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Annualize the standard deviation of per-period returns.

    Args:
        daily_returns: Array-like of per-period returns.
        periods: Periods per year (default 252).

    Returns:
        ``np.std(daily_returns)`` scaled by ``sqrt(periods)``.
    """
    per_period_sigma = np.std(daily_returns)
    annualization = np.sqrt(periods)
    return per_period_sigma * annualization

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the largest peak-to-trough loss of the compounded wealth path.

    The wealth path starts from an initial capital of 1.0, and that starting
    value counts as the first peak. Without it, a series that loses money
    from the very first period would be measured against its own already
    reduced first value and the loss from inception would go unreported.

    Args:
        daily_returns: Array-like of per-period returns.

    Returns:
        The most negative drawdown as a fraction (e.g. ``-0.5`` for a 50%
        loss), or ``0.0`` if the path never falls below a previous peak or
        the input is empty.
    """
    returns = np.asarray(daily_returns, dtype=float)
    if returns.size == 0:
        # No observations means no drawdown; np.min would raise on an empty array.
        return 0.0

    wealth = np.cumprod(1 + returns)
    # Running maximum, floored at the initial capital of 1.0.
    peak = np.maximum(np.maximum.accumulate(wealth), 1.0)
    drawdown = (wealth - peak) / peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct a constant cost from every per-period return.

    Args:
        daily_returns: Array-like of per-period returns.
        transaction_cost: Amount subtracted from each return (default 0.001).

    Returns:
        ``daily_returns`` shifted down by ``transaction_cost``.
    """
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every per-period return by ``1 - transaction_cost_percentage``.

    Args:
        daily_returns: Array-like of per-period returns.
        transaction_cost_percentage: Fraction of each return removed
            (default 0.001).

    Returns:
        ``daily_returns`` multiplied by the retained share.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_21_with_cost', 'cum_ES_return_21_with_cost', 'cum_ELS_return_21_with_cost',
    'cum_VL_return_21_with_cost', 'cum_VS_return_21_with_cost', 'cum_VLS_return_21_with_cost',
]

# Initialize metrics container
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_nn421_c = cumulative_log_returns_nn4_lag_21[portfolio].values

    # Recover the per-day values from the running sum. The first cumulative
    # entry already equals the first day's return, so difference against a
    # leading 0.0; a bare np.diff would silently drop that first observation.
    daily_returns = np.diff(cumulative_returns_nn421_c, prepend=0.0)

    # Apply fixed transaction cost for other metrics (e.g., cumulative returns, Sharpe ratio)
    # NOTE(review): the *_with_cost series already have the per-leg cost
    # deducted when the portfolio returns are computed, so this subtracts the
    # cost a second time -- confirm whether that is intended.
    daily_returns_after_cost_fixed = apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001)

    # Apply percentage-based transaction cost for volatility and standard deviation
    daily_returns_after_cost_percentage = apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001)

    # Calculate cumulative returns after fixed transaction cost
    # NOTE(review): these are differences of cumulative *log* returns, while
    # the metric helpers compound them as (1 + r) -- confirm whether a
    # conversion to simple returns is wanted.
    cum_return_after_cost = cumulative_return(daily_returns_after_cost_fixed)

    # Metrics calculations
    ann_return = annualized_return(daily_returns_after_cost_fixed)
    sharpe = sharpe_ratio(daily_returns_after_cost_fixed)
    vol = calculate_volatility(daily_returns_after_cost_percentage)  # Using percentage-based cost for volatility
    max_draw = maximum_drawdown(daily_returns_after_cost_fixed)

    # Standard Deviation of daily returns after cost (using percentage-based cost for standard deviation)
    std_dev = np.std(daily_returns_after_cost_percentage)

    # Store results
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis
metrics_nn421_c = pd.DataFrame(metrics)
display(metrics_nn421_c)


# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_21_without_cost', 'cum_ES_return_21_without_cost', 'cum_ELS_return_21_without_cost',
    'cum_VL_return_21_without_cost', 'cum_VS_return_21_without_cost', 'cum_VLS_return_21_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_nn421_wc = cumulative_log_returns_nn4_lag_21[portfolio].values

    # Recover the per-day values from the running sum. The first cumulative
    # entry already equals the first day's return, so difference against a
    # leading 0.0; a bare np.diff would silently drop that first observation.
    daily_returns = np.diff(cumulative_returns_nn421_wc, prepend=0.0)

    # Calculate cumulative returns without transaction cost
    # NOTE(review): these are differences of cumulative *log* returns, while
    # the metric helpers compound them as (1 + r) -- confirm whether a
    # conversion to simple returns is wanted.
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis
metrics_nn421_wc = pd.DataFrame(metrics_wc)
display(metrics_nn421_wc)

### Lag 252

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost charged per position.

    Args:
        row: Ignored. Kept so the function still works as a callback for
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        ``0.001`` (10 bps), identical for every stock.
    """
    return 0.001  # 10 bps for both small and large cap stocks

# The cost is the same constant for every row, so broadcast the scalar instead
# of invoking the function once per row through ``apply(axis=1)``.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `NN4` to predict excess returns
def nn4_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Call the NN4 model for lag 252 and keep its predictions in their own column.

    Args:
        crsp_train_lagged: Training frame forwarded to ``neural_network_nn4``.
        crsp_test_lagged: Test frame forwarded to ``neural_network_nn4``; it
            gains the column ``'nn4_252_predicted_excess_returns'``.
        lags: Lags forwarded to ``neural_network_nn4``. Defaults to ``[252]``.
            ``None`` is used as the sentinel so no list object is shared
            between calls.

    Returns:
        The same ``crsp_test_lagged`` object, with the new column added.

    Raises:
        KeyError: If ``'predicted_excess_returns'`` is absent after the model
            call.
    """
    if lags is None:
        lags = [252]

    # Called for its side effect; the frame it returns is not used here.
    # NOTE(review): this relies on neural_network_nn4 writing the column
    # 'predicted_excess_returns' into crsp_test_lagged in place -- confirm.
    neural_network_nn4(crsp_train_lagged, crsp_test_lagged, lags)

    # Snapshot the generic prediction column under a lag-specific name so a
    # later model run that overwrites it does not disturb this lag's portfolio.
    crsp_test_lagged['nn4_252_predicted_excess_returns'] = crsp_test_lagged['predicted_excess_returns'].values

    return crsp_test_lagged

# Get predicted excess returns using NN4 model
crsp_test_lagged = nn4_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group):
    """Return the long, short and long-short log returns for one cross-section.

    Rows of ``group`` are ranked on ``'nn4_252_predicted_excess_returns'``.
    The top ``int(0.1 * len(group))`` rows form the long leg and the same
    number of bottom rows form the short leg. Each leg is aggregated both
    equal-weighted and weighted by ``'market_cap_merged'``, gross and net of
    the leg's mean ``'transaction_cost'``. Short-leg returns are sign-flipped.

    Returns:
        dict with twelve entries keyed
        ``'{equal|value}_{long|short|long_short}_log_return_{with|without}_cost'``.
    """
    signal = 'nn4_252_predicted_excess_returns'
    leg_size = int(0.1 * len(group))
    # NOTE(review): leg_size is 0 when the group has fewer than 10 rows, which
    # leaves both legs empty and presumably yields NaN results -- confirm that
    # every date has enough stocks.
    longs = group.nlargest(leg_size, signal)
    shorts = group.nsmallest(leg_size, signal)

    # Per-stock log returns and the average cost of each leg, computed once.
    long_log = np.log1p(longs['adjusted_ret'])
    short_log = np.log1p(shorts['adjusted_ret'])
    long_cost = longs['transaction_cost'].mean()
    short_cost = shorts['transaction_cost'].mean()

    # Equal-weighted legs: plain mean of the log returns.
    eq_long_gross = long_log.mean()
    eq_long_net = eq_long_gross - long_cost
    eq_short_gross = -short_log.mean()
    eq_short_net = eq_short_gross - short_cost

    # Value-weighted legs: market-cap weighted mean of the log returns.
    long_caps = longs['market_cap_merged']
    short_caps = shorts['market_cap_merged']
    vw_long_gross = (long_log * long_caps).sum() / long_caps.sum()
    vw_long_net = vw_long_gross - long_cost
    vw_short_raw = (short_log * short_caps).sum() / short_caps.sum()
    vw_short_gross = -vw_short_raw
    vw_short_net = -(vw_short_raw + short_cost)

    # Long-short is the sum of the long leg and the sign-flipped short leg.
    return {
        'equal_long_log_return_with_cost': eq_long_net,
        'equal_short_log_return_with_cost': eq_short_net,
        'equal_long_short_log_return_with_cost': eq_long_net + eq_short_net,
        'equal_long_log_return_without_cost': eq_long_gross,
        'equal_short_log_return_without_cost': eq_short_gross,
        'equal_long_short_log_return_without_cost': eq_long_gross + eq_short_gross,
        'value_long_log_return_with_cost': vw_long_net,
        'value_short_log_return_with_cost': vw_short_net,
        'value_long_short_log_return_with_cost': vw_long_net + vw_short_net,
        'value_long_log_return_without_cost': vw_long_gross,
        'value_short_log_return_without_cost': vw_short_gross,
        'value_long_short_log_return_without_cost': vw_long_gross + vw_short_gross
    }

# Compute cumulative returns for each date with daily rebalancing.
# For every date the portfolio log returns from compute_returns are added to
# running totals, and the totals are recorded as one row per date.
# Column naming: E/V = equal-/value-weighted, L/S/LS = long / short / long-short.
cumulative_log_returns_by_date_nn4252  = {
    'date': [],
    'cum_EL_return_252_with_cost': [],
    'cum_ES_return_252_with_cost': [],
    'cum_ELS_return_252_with_cost': [],
    'cum_VL_return_252_with_cost': [],
    'cum_VS_return_252_with_cost': [],
    'cum_VLS_return_252_with_cost': [],
    'cum_EL_return_252_without_cost': [],
    'cum_ES_return_252_without_cost': [],
    'cum_ELS_return_252_without_cost': [],
    'cum_VL_return_252_without_cost': [],
    'cum_VS_return_252_without_cost': [],
    'cum_VLS_return_252_without_cost': []
}

# Initialize cumulative returns for lag 252.
# The totals start at 0 and are updated before being recorded, so the first
# stored row already contains the first date's return.
cum_EL_return_252_with_cost = 0
cum_ES_return_252_with_cost = 0
cum_ELS_return_252_with_cost = 0
cum_VL_return_252_with_cost = 0
cum_VS_return_252_with_cost = 0
cum_VLS_return_252_with_cost = 0

cum_EL_return_252_without_cost = 0
cum_ES_return_252_without_cost = 0
cum_ELS_return_252_without_cost = 0
cum_VL_return_252_without_cost = 0
cum_VS_return_252_without_cost = 0
cum_VLS_return_252_without_cost = 0

# Iterate over each date to compute returns for lag 252 portfolios.
# Dates are visited in the order .unique() returns them.
# NOTE(review): assumes crsp_test_lagged is already in chronological order -- TODO confirm.
for date in crsp_test_lagged['date'].unique():
    # All rows (stocks) observed on this date
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]

    # Compute returns for lag 252
    returns = compute_returns(group)

    # Update cumulative returns with daily values for lag 252.
    # A NaN returned for any date carries into every later running total.
    cum_EL_return_252_with_cost += returns['equal_long_log_return_with_cost']
    cum_ES_return_252_with_cost += returns['equal_short_log_return_with_cost']
    cum_ELS_return_252_with_cost += returns['equal_long_short_log_return_with_cost']
    cum_VL_return_252_with_cost += returns['value_long_log_return_with_cost']
    cum_VS_return_252_with_cost += returns['value_short_log_return_with_cost']
    cum_VLS_return_252_with_cost += returns['value_long_short_log_return_with_cost']

    cum_EL_return_252_without_cost += returns['equal_long_log_return_without_cost']
    cum_ES_return_252_without_cost += returns['equal_short_log_return_without_cost']
    cum_ELS_return_252_without_cost += returns['equal_long_short_log_return_without_cost']
    cum_VL_return_252_without_cost += returns['value_long_log_return_without_cost']
    cum_VS_return_252_without_cost += returns['value_short_log_return_without_cost']
    cum_VLS_return_252_without_cost += returns['value_long_short_log_return_without_cost']

    # Append results for the day for lag 252 portfolios (snapshot of the running totals)
    cumulative_log_returns_by_date_nn4252['date'].append(date)
    cumulative_log_returns_by_date_nn4252['cum_EL_return_252_with_cost'].append(cum_EL_return_252_with_cost)
    cumulative_log_returns_by_date_nn4252['cum_ES_return_252_with_cost'].append(cum_ES_return_252_with_cost)
    cumulative_log_returns_by_date_nn4252['cum_ELS_return_252_with_cost'].append(cum_ELS_return_252_with_cost)
    cumulative_log_returns_by_date_nn4252['cum_VL_return_252_with_cost'].append(cum_VL_return_252_with_cost)
    cumulative_log_returns_by_date_nn4252['cum_VS_return_252_with_cost'].append(cum_VS_return_252_with_cost)
    cumulative_log_returns_by_date_nn4252['cum_VLS_return_252_with_cost'].append(cum_VLS_return_252_with_cost)
    cumulative_log_returns_by_date_nn4252['cum_EL_return_252_without_cost'].append(cum_EL_return_252_without_cost)
    cumulative_log_returns_by_date_nn4252['cum_ES_return_252_without_cost'].append(cum_ES_return_252_without_cost)
    cumulative_log_returns_by_date_nn4252['cum_ELS_return_252_without_cost'].append(cum_ELS_return_252_without_cost)
    cumulative_log_returns_by_date_nn4252['cum_VL_return_252_without_cost'].append(cum_VL_return_252_without_cost)
    cumulative_log_returns_by_date_nn4252['cum_VS_return_252_without_cost'].append(cum_VS_return_252_without_cost)
    cumulative_log_returns_by_date_nn4252['cum_VLS_return_252_without_cost'].append(cum_VLS_return_252_without_cost)

# Convert to DataFrame for lag 252 (one row per date, one column per portfolio)
cumulative_log_returns_nn4_lag_252 = pd.DataFrame(cumulative_log_returns_by_date_nn4252)

# Display the cumulative returns DataFrame for lag 252
display(cumulative_log_returns_nn4_lag_252.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound a sequence of per-period returns into one total return.

    ``daily_returns`` must support ``1 + daily_returns`` element-wise
    (e.g. a NumPy array). The result is the product of the growth factors
    minus one.
    """
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of a per-period return series.

    The total compounded return is rescaled geometrically from
    ``len(daily_returns)`` observations to ``periods`` observations per year.
    """
    total_return = np.prod(1 + daily_returns) - 1
    growth = 1 + total_return
    years_exponent = periods / len(daily_returns)
    return growth ** years_exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of a per-period return series.

    The annual ``risk_free_rate`` is spread evenly over ``periods``
    observations and subtracted from each return. The mean excess return is
    divided by the excess returns' own ``.std()`` and scaled by
    ``sqrt(periods)``.
    """
    per_period_rf = risk_free_rate / periods  # Assuming `periods` trading days per year
    over_rf = daily_returns - per_period_rf
    mean_excess = over_rf.mean()
    dispersion = over_rf.std()
    return np.sqrt(periods) * mean_excess / dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Return the annualized volatility of a per-period return series.

    This is ``np.std`` of the returns scaled by ``sqrt(periods)``.
    """
    per_period_sigma = np.std(daily_returns)
    annualizer = np.sqrt(periods)
    return per_period_sigma * annualizer

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the maximum drawdown of a per-period return series.

    The returns are compounded into a wealth curve that starts from an
    initial capital of 1.0. The drawdown at each step is the relative
    distance below the highest wealth reached so far, *including* the
    starting capital, so a loss on the very first day counts as a drawdown.

    Args:
        daily_returns: Sequence of per-period returns supporting
            ``1 + daily_returns`` element-wise (e.g. a NumPy array).

    Returns:
        The most negative drawdown (``<= 0``); ``0.0`` for an empty series
        or one that never falls below its running peak.
    """
    # Nothing invested over no periods: no drawdown (np.min would raise on empty input).
    if len(daily_returns) == 0:
        return 0.0

    wealth = np.cumprod(1 + daily_returns)
    # The running peak must never fall below the initial capital of 1.0;
    # otherwise a series that starts with losses reports too small a drawdown.
    peak = np.maximum(np.maximum.accumulate(wealth), 1.0)
    drawdown = (wealth - peak) / peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return the returns with a fixed cost deducted from every period.

    The same ``transaction_cost`` is subtracted from each element of
    ``daily_returns``.
    """
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Return the returns scaled down by a proportional cost.

    Every element of ``daily_returns`` is multiplied by
    ``1 - transaction_cost_percentage``, i.e. the cost is taken as a fraction
    of the return itself.
    """
    retained_fraction = 1 - transaction_cost_percentage
    return daily_returns * retained_fraction

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_252_with_cost', 'cum_ES_return_252_with_cost', 'cum_ELS_return_252_with_cost',
    'cum_VL_return_252_with_cost', 'cum_VS_return_252_with_cost', 'cum_VLS_return_252_with_cost',
]

# Initialize metrics container (one list per column of the summary table)
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_nn4252_c = cumulative_log_returns_nn4_lag_252[portfolio].values

    # Recover the daily log returns from the running sum. The cumulative series
    # starts at the first day's return (its accumulator starts at 0 and is
    # recorded after each update), so the implicit starting level 0 is prepended;
    # a plain np.diff would silently drop the first trading day.
    daily_returns = np.diff(cumulative_returns_nn4252_c, prepend=0.0)

    # NOTE(review): the *_with_cost series already have each leg's mean
    # 'transaction_cost' deducted when they are built, so subtracting 0.001
    # again below looks like the cost is charged twice -- confirm intent.
    # NOTE(review): daily_returns are log returns, while the metric helpers
    # compound them as simple returns (prod(1 + r)) -- confirm intent.

    # Apply fixed transaction cost for other metrics (e.g., cumulative returns, Sharpe ratio)
    daily_returns_after_cost_fixed = apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001)

    # Apply percentage-based transaction cost for volatility and standard deviation
    daily_returns_after_cost_percentage = apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001)

    # Calculate cumulative returns after fixed transaction cost
    cum_return_after_cost = cumulative_return(daily_returns_after_cost_fixed)

    # Metrics calculations
    ann_return = annualized_return(daily_returns_after_cost_fixed)
    sharpe = sharpe_ratio(daily_returns_after_cost_fixed)
    vol = calculate_volatility(daily_returns_after_cost_percentage)  # Using percentage-based cost for volatility
    max_draw = maximum_drawdown(daily_returns_after_cost_fixed)

    # Standard Deviation of daily returns after cost (using percentage-based cost for standard deviation)
    std_dev = np.std(daily_returns_after_cost_percentage)

    # Store results (one entry per column, so the lists stay aligned)
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis (one row per portfolio)
metrics_nn4252_c = pd.DataFrame(metrics)
display(metrics_nn4252_c)


# Same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_252_without_cost', 'cum_ES_return_252_without_cost', 'cum_ELS_return_252_without_cost',
    'cum_VL_return_252_without_cost', 'cum_VS_return_252_without_cost', 'cum_VLS_return_252_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
# (one list per column of the summary table)
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_nn4252_wc = cumulative_log_returns_nn4_lag_252[portfolio].values

    # Recover the daily log returns from the running sum. The cumulative series
    # starts at the first day's return (its accumulator starts at 0 and is
    # recorded after each update), so the implicit starting level 0 is prepended;
    # a plain np.diff would silently drop the first trading day.
    daily_returns = np.diff(cumulative_returns_nn4252_wc, prepend=0.0)

    # NOTE(review): daily_returns are log returns, while the metric helpers
    # compound them as simple returns (prod(1 + r)) -- confirm intent.

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results (one entry per column, so the lists stay aligned)
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis (one row per portfolio)
metrics_nn4252_wc = pd.DataFrame(metrics_wc)
display(metrics_nn4252_wc)

### Lag 512

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost charged for one row.

    The ``row`` argument exists only so the function fits
    ``DataFrame.apply(..., axis=1)``; its contents are ignored and every
    stock, small or large cap, is charged 10 bps.
    """
    cost_per_row = 0.001  # 10 bps as a decimal fraction
    return cost_per_row

# Write the per-row cost into the test frame in place as the 'transaction_cost' column.
# NOTE(review): calculate_transaction_cost ignores its row argument, so this row-wise
# apply only broadcasts a constant; a scalar assignment would presumably be much
# cheaper on a large frame -- confirm before changing.
crsp_test_lagged['transaction_cost'] = crsp_test_lagged.apply(calculate_transaction_cost, axis=1)

# Step 4: Use the previously defined `NN4` to predict excess returns
def nn4_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the NN4 model for the 512-day lag and store its predictions.

    Args:
        crsp_train_lagged: Training frame forwarded to ``neural_network_nn4``.
        crsp_test_lagged: Test frame forwarded to ``neural_network_nn4``. A
            ``'nn4_512_predicted_excess_returns'`` column is written to it
            in place.
        lags: Lags forwarded to the model. Defaults to ``[512]``; the list is
            created per call so no default object is shared between calls.

    Returns:
        The same ``crsp_test_lagged`` object, with the new column attached.
    """
    if lags is None:
        lags = [512]

    # The value returned by the model is not used; the predictions are read
    # back from the test frame instead.
    # NOTE(review): this assumes neural_network_nn4 writes
    # 'predicted_excess_returns' into crsp_test_lagged in place -- confirm,
    # otherwise the column read below still holds an earlier model's output.
    neural_network_nn4(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the generic prediction column under a model/lag-specific name so a
    # later model run cannot overwrite it.
    crsp_test_lagged['nn4_512_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Get predicted excess returns using NN4 model (lag 512). The helper adds the
# 'nn4_512_predicted_excess_returns' column and returns the test frame, which is
# rebound to the same name here.
crsp_test_lagged = nn4_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group):
    """Return the long, short and long-short log returns for one cross-section.

    Rows of ``group`` are ranked on ``'nn4_512_predicted_excess_returns'``.
    The top ``int(0.1 * len(group))`` rows form the long leg and the same
    number of bottom rows form the short leg. Each leg is aggregated both
    equal-weighted and weighted by ``'market_cap_merged'``, gross and net of
    the leg's mean ``'transaction_cost'``. Short-leg returns are sign-flipped.

    Returns:
        dict with twelve entries keyed
        ``'{equal|value}_{long|short|long_short}_log_return_{with|without}_cost'``.
    """
    signal = 'nn4_512_predicted_excess_returns'
    leg_size = int(0.1 * len(group))
    # NOTE(review): leg_size is 0 when the group has fewer than 10 rows, which
    # leaves both legs empty and presumably yields NaN results -- confirm that
    # every date has enough stocks.
    longs = group.nlargest(leg_size, signal)
    shorts = group.nsmallest(leg_size, signal)

    # Per-stock log returns and the average cost of each leg, computed once.
    long_log = np.log1p(longs['adjusted_ret'])
    short_log = np.log1p(shorts['adjusted_ret'])
    long_cost = longs['transaction_cost'].mean()
    short_cost = shorts['transaction_cost'].mean()

    # Equal-weighted legs: plain mean of the log returns.
    eq_long_gross = long_log.mean()
    eq_long_net = eq_long_gross - long_cost
    eq_short_gross = -short_log.mean()
    eq_short_net = eq_short_gross - short_cost

    # Value-weighted legs: market-cap weighted mean of the log returns.
    long_caps = longs['market_cap_merged']
    short_caps = shorts['market_cap_merged']
    vw_long_gross = (long_log * long_caps).sum() / long_caps.sum()
    vw_long_net = vw_long_gross - long_cost
    vw_short_raw = (short_log * short_caps).sum() / short_caps.sum()
    vw_short_gross = -vw_short_raw
    vw_short_net = -(vw_short_raw + short_cost)

    # Long-short is the sum of the long leg and the sign-flipped short leg.
    return {
        'equal_long_log_return_with_cost': eq_long_net,
        'equal_short_log_return_with_cost': eq_short_net,
        'equal_long_short_log_return_with_cost': eq_long_net + eq_short_net,
        'equal_long_log_return_without_cost': eq_long_gross,
        'equal_short_log_return_without_cost': eq_short_gross,
        'equal_long_short_log_return_without_cost': eq_long_gross + eq_short_gross,
        'value_long_log_return_with_cost': vw_long_net,
        'value_short_log_return_with_cost': vw_short_net,
        'value_long_short_log_return_with_cost': vw_long_net + vw_short_net,
        'value_long_log_return_without_cost': vw_long_gross,
        'value_short_log_return_without_cost': vw_short_gross,
        'value_long_short_log_return_without_cost': vw_long_gross + vw_short_gross
    }

# Compute cumulative returns for each date with daily rebalancing.
# For every date the portfolio log returns from compute_returns are added to
# running totals, and the totals are recorded as one row per date.
# Column naming: E/V = equal-/value-weighted, L/S/LS = long / short / long-short.
cumulative_log_returns_by_date_nn4512  = {
    'date': [],
    'cum_EL_return_512_with_cost': [],
    'cum_ES_return_512_with_cost': [],
    'cum_ELS_return_512_with_cost': [],
    'cum_VL_return_512_with_cost': [],
    'cum_VS_return_512_with_cost': [],
    'cum_VLS_return_512_with_cost': [],
    'cum_EL_return_512_without_cost': [],
    'cum_ES_return_512_without_cost': [],
    'cum_ELS_return_512_without_cost': [],
    'cum_VL_return_512_without_cost': [],
    'cum_VS_return_512_without_cost': [],
    'cum_VLS_return_512_without_cost': []
}

# Initialize cumulative returns for lag 512.
# The totals start at 0 and are updated before being recorded, so the first
# stored row already contains the first date's return.
cum_EL_return_512_with_cost = 0
cum_ES_return_512_with_cost = 0
cum_ELS_return_512_with_cost = 0
cum_VL_return_512_with_cost = 0
cum_VS_return_512_with_cost = 0
cum_VLS_return_512_with_cost = 0

cum_EL_return_512_without_cost = 0
cum_ES_return_512_without_cost = 0
cum_ELS_return_512_without_cost = 0
cum_VL_return_512_without_cost = 0
cum_VS_return_512_without_cost = 0
cum_VLS_return_512_without_cost = 0

# Iterate over each date to compute returns for lag 512 portfolios.
# Dates are visited in the order .unique() returns them.
# NOTE(review): assumes crsp_test_lagged is already in chronological order -- TODO confirm.
for date in crsp_test_lagged['date'].unique():
    # All rows (stocks) observed on this date
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]

    # Compute returns for lag 512
    returns = compute_returns(group)

    # Update cumulative returns with daily values for lag 512.
    # A NaN returned for any date carries into every later running total.
    cum_EL_return_512_with_cost += returns['equal_long_log_return_with_cost']
    cum_ES_return_512_with_cost += returns['equal_short_log_return_with_cost']
    cum_ELS_return_512_with_cost += returns['equal_long_short_log_return_with_cost']
    cum_VL_return_512_with_cost += returns['value_long_log_return_with_cost']
    cum_VS_return_512_with_cost += returns['value_short_log_return_with_cost']
    cum_VLS_return_512_with_cost += returns['value_long_short_log_return_with_cost']

    cum_EL_return_512_without_cost += returns['equal_long_log_return_without_cost']
    cum_ES_return_512_without_cost += returns['equal_short_log_return_without_cost']
    cum_ELS_return_512_without_cost += returns['equal_long_short_log_return_without_cost']
    cum_VL_return_512_without_cost += returns['value_long_log_return_without_cost']
    cum_VS_return_512_without_cost += returns['value_short_log_return_without_cost']
    cum_VLS_return_512_without_cost += returns['value_long_short_log_return_without_cost']

    # Append results for the day for lag 512 portfolios (snapshot of the running totals)
    cumulative_log_returns_by_date_nn4512['date'].append(date)
    cumulative_log_returns_by_date_nn4512['cum_EL_return_512_with_cost'].append(cum_EL_return_512_with_cost)
    cumulative_log_returns_by_date_nn4512['cum_ES_return_512_with_cost'].append(cum_ES_return_512_with_cost)
    cumulative_log_returns_by_date_nn4512['cum_ELS_return_512_with_cost'].append(cum_ELS_return_512_with_cost)
    cumulative_log_returns_by_date_nn4512['cum_VL_return_512_with_cost'].append(cum_VL_return_512_with_cost)
    cumulative_log_returns_by_date_nn4512['cum_VS_return_512_with_cost'].append(cum_VS_return_512_with_cost)
    cumulative_log_returns_by_date_nn4512['cum_VLS_return_512_with_cost'].append(cum_VLS_return_512_with_cost)
    cumulative_log_returns_by_date_nn4512['cum_EL_return_512_without_cost'].append(cum_EL_return_512_without_cost)
    cumulative_log_returns_by_date_nn4512['cum_ES_return_512_without_cost'].append(cum_ES_return_512_without_cost)
    cumulative_log_returns_by_date_nn4512['cum_ELS_return_512_without_cost'].append(cum_ELS_return_512_without_cost)
    cumulative_log_returns_by_date_nn4512['cum_VL_return_512_without_cost'].append(cum_VL_return_512_without_cost)
    cumulative_log_returns_by_date_nn4512['cum_VS_return_512_without_cost'].append(cum_VS_return_512_without_cost)
    cumulative_log_returns_by_date_nn4512['cum_VLS_return_512_without_cost'].append(cum_VLS_return_512_without_cost)

# Convert to DataFrame for lag 512 (one row per date, one column per portfolio)
cumulative_log_returns_nn4_lag_512 = pd.DataFrame(cumulative_log_returns_by_date_nn4512)

# Display the cumulative returns DataFrame for lag 512
display(cumulative_log_returns_nn4_lag_512.head())

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound a sequence of per-period returns into one total return.

    ``daily_returns`` must support ``1 + daily_returns`` element-wise
    (e.g. a NumPy array). The result is the product of the growth factors
    minus one.
    """
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of a per-period return series.

    The total compounded return is rescaled geometrically from
    ``len(daily_returns)`` observations to ``periods`` observations per year.
    """
    total_return = np.prod(1 + daily_returns) - 1
    growth = 1 + total_return
    years_exponent = periods / len(daily_returns)
    return growth ** years_exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Return the annualized Sharpe ratio of a per-period return series.

    The annual ``risk_free_rate`` is spread evenly over ``periods``
    observations and subtracted from each return. The mean excess return is
    divided by the excess returns' own ``.std()`` and scaled by
    ``sqrt(periods)``.
    """
    per_period_rf = risk_free_rate / periods  # Assuming `periods` trading days per year
    over_rf = daily_returns - per_period_rf
    mean_excess = over_rf.mean()
    dispersion = over_rf.std()
    return np.sqrt(periods) * mean_excess / dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Return the annualized volatility of a per-period return series.

    This is ``np.std`` of the returns scaled by ``sqrt(periods)``.
    """
    per_period_sigma = np.std(daily_returns)
    annualizer = np.sqrt(periods)
    return per_period_sigma * annualizer

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Return the maximum drawdown of a per-period return series.

    The returns are compounded into a wealth curve that starts from an
    initial capital of 1.0. The drawdown at each step is the relative
    distance below the highest wealth reached so far, *including* the
    starting capital, so a loss on the very first day counts as a drawdown.

    Args:
        daily_returns: Sequence of per-period returns supporting
            ``1 + daily_returns`` element-wise (e.g. a NumPy array).

    Returns:
        The most negative drawdown (``<= 0``); ``0.0`` for an empty series
        or one that never falls below its running peak.
    """
    # Nothing invested over no periods: no drawdown (np.min would raise on empty input).
    if len(daily_returns) == 0:
        return 0.0

    wealth = np.cumprod(1 + daily_returns)
    # The running peak must never fall below the initial capital of 1.0;
    # otherwise a series that starts with losses reports too small a drawdown.
    peak = np.maximum(np.maximum.accumulate(wealth), 1.0)
    drawdown = (wealth - peak) / peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Return the returns with a fixed cost deducted from every period.

    The same ``transaction_cost`` is subtracted from each element of
    ``daily_returns``.
    """
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Return the returns scaled down by a proportional cost.

    Every element of ``daily_returns`` is multiplied by
    ``1 - transaction_cost_percentage``, i.e. the cost is taken as a fraction
    of the return itself.
    """
    retained_fraction = 1 - transaction_cost_percentage
    return daily_returns * retained_fraction

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_512_with_cost', 'cum_ES_return_512_with_cost', 'cum_ELS_return_512_with_cost',
    'cum_VL_return_512_with_cost', 'cum_VS_return_512_with_cost', 'cum_VLS_return_512_with_cost',
]

# Initialize metrics container (one list per column of the summary table)
metrics = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio with transaction cost
for portfolio in portfolios_with_cost:
    cumulative_returns_nn4512_c = cumulative_log_returns_nn4_lag_512[portfolio].values

    # Recover the daily log returns from the running sum. The cumulative series
    # starts at the first day's return (its accumulator starts at 0 and is
    # recorded after each update), so the implicit starting level 0 is prepended;
    # a plain np.diff would silently drop the first trading day.
    daily_returns = np.diff(cumulative_returns_nn4512_c, prepend=0.0)

    # NOTE(review): the *_with_cost series already have each leg's mean
    # 'transaction_cost' deducted when they are built, so subtracting 0.001
    # again below looks like the cost is charged twice -- confirm intent.
    # NOTE(review): daily_returns are log returns, while the metric helpers
    # compound them as simple returns (prod(1 + r)) -- confirm intent.

    # Apply fixed transaction cost for other metrics (e.g., cumulative returns, Sharpe ratio)
    daily_returns_after_cost_fixed = apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001)

    # Apply percentage-based transaction cost for volatility and standard deviation
    daily_returns_after_cost_percentage = apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001)

    # Calculate cumulative returns after fixed transaction cost
    cum_return_after_cost = cumulative_return(daily_returns_after_cost_fixed)

    # Metrics calculations
    ann_return = annualized_return(daily_returns_after_cost_fixed)
    sharpe = sharpe_ratio(daily_returns_after_cost_fixed)
    vol = calculate_volatility(daily_returns_after_cost_percentage)  # Using percentage-based cost for volatility
    max_draw = maximum_drawdown(daily_returns_after_cost_fixed)

    # Standard Deviation of daily returns after cost (using percentage-based cost for standard deviation)
    std_dev = np.std(daily_returns_after_cost_percentage)

    # Store results (one entry per column, so the lists stay aligned)
    metrics['Portfolio'].append(portfolio)
    metrics['Annualized Return'].append(ann_return)
    metrics['Sharpe Ratio'].append(sharpe)
    metrics['Volatility'].append(vol)
    metrics['Standard Deviation'].append(std_dev)
    metrics['Max Drawdown'].append(max_draw)
    metrics['Cumulative Return'].append(cum_return_after_cost)

# Convert the results into a DataFrame for analysis (one row per portfolio)
metrics_nn4512_c = pd.DataFrame(metrics)
display(metrics_nn4512_c)

# Same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_512_without_cost', 'cum_ES_return_512_without_cost', 'cum_ELS_return_512_without_cost',
    'cum_VL_return_512_without_cost', 'cum_VS_return_512_without_cost', 'cum_VLS_return_512_without_cost',
]

# Initialize metrics container for portfolios without transaction cost
# (one list per column of the summary table)
metrics_wc = {
    'Portfolio': [],
    'Annualized Return': [],
    'Sharpe Ratio': [],
    'Volatility': [],
    'Standard Deviation': [],
    'Max Drawdown': [],
    'Cumulative Return': []
}

# Calculate metrics for each portfolio without transaction cost
for portfolio in portfolios_without_cost:
    cumulative_returns_nn4512_wc = cumulative_log_returns_nn4_lag_512[portfolio].values

    # Recover the daily log returns from the running sum. The cumulative series
    # starts at the first day's return (its accumulator starts at 0 and is
    # recorded after each update), so the implicit starting level 0 is prepended;
    # a plain np.diff would silently drop the first trading day.
    daily_returns = np.diff(cumulative_returns_nn4512_wc, prepend=0.0)

    # NOTE(review): daily_returns are log returns, while the metric helpers
    # compound them as simple returns (prod(1 + r)) -- confirm intent.

    # Calculate cumulative returns without transaction cost
    cum_return_without_cost = cumulative_return(daily_returns)

    # Metrics calculations
    ann_return = annualized_return(daily_returns)
    sharpe = sharpe_ratio(daily_returns)
    vol = calculate_volatility(daily_returns)
    max_draw = maximum_drawdown(daily_returns)

    # Standard Deviation of daily returns without cost
    std_dev = np.std(daily_returns)

    # Store results (one entry per column, so the lists stay aligned)
    metrics_wc['Portfolio'].append(portfolio)
    metrics_wc['Annualized Return'].append(ann_return)
    metrics_wc['Sharpe Ratio'].append(sharpe)
    metrics_wc['Volatility'].append(vol)
    metrics_wc['Standard Deviation'].append(std_dev)
    metrics_wc['Max Drawdown'].append(max_draw)
    metrics_wc['Cumulative Return'].append(cum_return_without_cost)

# Convert the results into a DataFrame for analysis (one row per portfolio)
metrics_nn4512_wc = pd.DataFrame(metrics_wc)
display(metrics_nn4512_wc)

## NN5

In [None]:
# lag 5
# Keep the lag-5 results under their own name; nn5_lag21 is assigned by the
# lag-21 run, so storing them there would lose them as soon as that run executes.
nn5_lag5 = rolling_forecast_model(crsp_train_lagged, crsp_test_lagged, model_type='nn5', lag=[5])
print("NN5 Results (Lag 5):")
display(nn5_lag5)

In [None]:
# lag 21: rolling NN5 forecast, results kept in nn5_lag21 and shown below
nn5_lag21 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='nn5',
    lag=[21],
)
print("NN5 Results (Lag 21):")
display(nn5_lag21)

In [None]:
# lag 252: rolling NN5 forecast, results kept in nn5_lag252 and shown below
nn5_lag252 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='nn5',
    lag=[252],
)
print("NN5 Results (Lag 252):")
display(nn5_lag252)

In [None]:
# lag 512: rolling NN5 forecast, results kept in nn5_lag512 and shown below
nn5_lag512 = rolling_forecast_model(
    crsp_train_lagged,
    crsp_test_lagged,
    model_type='nn5',
    lag=[512],
)
print("NN5 Results (Lag 512):")
display(nn5_lag512)

## NN5 Portfolio

### Lag 5

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost charged for one row.

    ``row`` is part of the signature only so the function can be used with
    ``DataFrame.apply(..., axis=1)``; it is not read. Small and large cap
    stocks are charged the same 10 bps.
    """
    ten_bps = 0.001  # 10 bps as a decimal fraction
    return ten_bps

# Write the per-row cost into the test frame in place as the 'transaction_cost' column.
# NOTE(review): calculate_transaction_cost ignores its row argument, so this row-wise
# apply only broadcasts a constant; a scalar assignment would presumably be much
# cheaper on a large frame -- confirm before changing.
crsp_test_lagged['transaction_cost'] = crsp_test_lagged.apply(calculate_transaction_cost, axis=1)

# Use the previously defined `NN5` to predict excess returns
def nn5_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the NN5 model for the 5-day lag and store its predictions.

    Args:
        crsp_train_lagged: Training frame forwarded to ``neural_network_nn5``.
        crsp_test_lagged: Test frame forwarded to ``neural_network_nn5``. A
            ``'nn5_5_predicted_excess_returns'`` column is written to it
            in place.
        lags: Lags forwarded to the model. Defaults to ``[5]``; the list is
            created per call so no default object is shared between calls.

    Returns:
        The same ``crsp_test_lagged`` object, with the new column attached.
    """
    if lags is None:
        lags = [5]

    # The value returned by the model is not used; the predictions are read
    # back from the test frame instead.
    # NOTE(review): this assumes neural_network_nn5 writes
    # 'predicted_excess_returns' into crsp_test_lagged in place -- confirm,
    # otherwise the column read below still holds an earlier model's output.
    neural_network_nn5(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the generic prediction column under a model/lag-specific name so a
    # later model run cannot overwrite it.
    crsp_test_lagged['nn5_5_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Get predicted excess returns using NN5 model (lag 5). The helper adds the
# 'nn5_5_predicted_excess_returns' column and returns the test frame, which is
# rebound to the same name here.
crsp_test_lagged = nn5_5_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, prediction_col='nn5_5_predicted_excess_returns', fraction=0.1):
    """Compute one period's long, short and long-short portfolio log returns.

    The long leg holds the ``fraction`` of rows with the largest values in
    ``prediction_col``; the short leg holds the ``fraction`` with the smallest.
    Each leg is reported equal-weighted and value-weighted (by
    ``'market_cap_merged'``), with and without the mean ``'transaction_cost'``
    of the leg deducted.

    Args:
        group: DataFrame for a single rebalancing date. Must contain
            ``prediction_col``, ``'adjusted_ret'``, ``'transaction_cost'`` and
            ``'market_cap_merged'``.
        prediction_col: Column used to rank the stocks. The default keeps the
            original lag-5 behaviour.
        fraction: Share of the group placed in each leg (default 10%). The
            leg size is ``int(fraction * len(group))``, so groups that are too
            small yield empty legs and NaN returns.

    Returns:
        dict: twelve floats keyed ``'{equal|value}_{long|short|long_short}_log_return_{with|without}_cost'``.

    NOTE(review): leg returns are averages of per-stock log returns, which is
    not the log of the portfolio's simple return; kept unchanged so reported
    numbers do not move -- confirm this is intended.
    """
    leg_size = int(fraction * len(group))
    long_leg = group.nlargest(leg_size, prediction_col)
    short_leg = group.nsmallest(leg_size, prediction_col)

    def _leg_stats(leg):
        # Compute the log returns once per leg and reuse them for both weightings.
        log_returns = np.log1p(leg['adjusted_ret'])
        market_cap = leg['market_cap_merged']
        equal_weighted = log_returns.mean()
        value_weighted = (log_returns * market_cap).sum() / market_cap.sum()
        return equal_weighted, value_weighted, leg['transaction_cost'].mean()

    long_equal, long_value, long_cost = _leg_stats(long_leg)
    short_equal, short_value, short_cost = _leg_stats(short_leg)

    # Long legs earn the return; short legs earn its negative. Costs always reduce the result.
    equal_long_with_cost = long_equal - long_cost
    equal_long_without_cost = long_equal
    equal_short_with_cost = -short_equal - short_cost
    equal_short_without_cost = -short_equal

    value_long_with_cost = long_value - long_cost
    value_long_without_cost = long_value
    value_short_with_cost = -(short_value + short_cost)
    value_short_without_cost = -(short_value)

    return {
        'equal_long_log_return_with_cost': equal_long_with_cost,
        'equal_short_log_return_with_cost': equal_short_with_cost,
        'equal_long_short_log_return_with_cost': equal_long_with_cost + equal_short_with_cost,
        'equal_long_log_return_without_cost': equal_long_without_cost,
        'equal_short_log_return_without_cost': equal_short_without_cost,
        'equal_long_short_log_return_without_cost': equal_long_without_cost + equal_short_without_cost,
        'value_long_log_return_with_cost': value_long_with_cost,
        'value_short_log_return_with_cost': value_short_with_cost,
        'value_long_short_log_return_with_cost': value_long_with_cost + value_short_with_cost,
        'value_long_log_return_without_cost': value_long_without_cost,
        'value_short_log_return_without_cost': value_short_without_cost,
        'value_long_short_log_return_without_cost': value_long_without_cost + value_short_without_cost
    }

# Compute cumulative returns for each date with daily rebalancing.
# Short tag used in the output column names -> key prefix returned by compute_returns().
nn55_legs = {
    'EL': 'equal_long', 'ES': 'equal_short', 'ELS': 'equal_long_short',
    'VL': 'value_long', 'VS': 'value_short', 'VLS': 'value_long_short',
}
# (output column, compute_returns key) pairs: all with-cost columns first,
# then all without-cost columns.
nn55_columns = [
    (f'cum_{tag}_return_5_{cost}', f'{leg}_log_return_{cost}')
    for cost in ('with_cost', 'without_cost')
    for tag, leg in nn55_legs.items()
]

cumulative_log_returns_by_date_nn55 = {'date': []}
cumulative_log_returns_by_date_nn55.update({column: [] for column, _ in nn55_columns})

# Running cumulative log return of every lag 5 portfolio, starting at 0
nn55_running_totals = {column: 0 for column, _ in nn55_columns}

# One pass over the frame: groupby splits the rows by date once instead of
# re-filtering the whole frame for every date. sort=False keeps the dates in
# order of first appearance, as `unique()` did. Rows with a missing date are
# skipped by groupby rather than producing an empty (all-NaN) group.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 5
    returns = compute_returns(group)

    # Update the running totals and record the cumulative value for the day
    cumulative_log_returns_by_date_nn55['date'].append(date)
    for column, key in nn55_columns:
        nn55_running_totals[column] += returns[key]
        cumulative_log_returns_by_date_nn55[column].append(nn55_running_totals[column])

# Keep the final cumulative values available under their original names
(
    cum_EL_return_5_with_cost, cum_ES_return_5_with_cost, cum_ELS_return_5_with_cost,
    cum_VL_return_5_with_cost, cum_VS_return_5_with_cost, cum_VLS_return_5_with_cost,
    cum_EL_return_5_without_cost, cum_ES_return_5_without_cost, cum_ELS_return_5_without_cost,
    cum_VL_return_5_without_cost, cum_VS_return_5_without_cost, cum_VLS_return_5_without_cost,
) = (nn55_running_totals[column] for column, _ in nn55_columns)

# Convert to DataFrame for lag 5
cumulative_log_returns_nn5_lag_5 = pd.DataFrame(cumulative_log_returns_by_date_nn55)

# Display the cumulative returns DataFrame for lag 5
display(cumulative_log_returns_nn5_lag_5.head())

# Saving the DataFrame as a CSV file
cumulative_log_returns_nn5_lag_5.to_csv("cumulative_log_returns_nn5_lag_5.csv", index=False)

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound a sequence of simple per-period returns into one total return.

    Args:
        daily_returns: Array-like supporting ``1 + daily_returns``.

    Returns:
        The product of ``(1 + r)`` over all periods, minus 1.
    """
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of a series of simple returns.

    Args:
        daily_returns: Array-like of simple per-period returns supporting
            ``1 + daily_returns`` and ``len()``.
        periods: Number of periods per year (252 trading days by default).

    Returns:
        ``(1 + total_return) ** (periods / n) - 1`` where ``n`` is the number
        of observations, or NaN when there are no observations (previously a
        ``ZeroDivisionError``).
    """
    n_observations = len(daily_returns)
    if n_observations == 0:
        # No data: there is nothing to annualize.
        return float('nan')

    cumulative_return_value = np.prod(1 + daily_returns) - 1
    return (1 + cumulative_return_value) ** (periods / n_observations) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized Sharpe ratio of a return series.

    Args:
        daily_returns: Per-period returns; must support subtraction of a
            scalar and expose ``.mean()`` / ``.std()``.
        risk_free_rate: Annual risk-free rate, spread evenly over ``periods``.
        periods: Number of periods per year (252 trading days by default).

    Returns:
        ``sqrt(periods) * mean(excess) / std(excess)``.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    net_of_rf = daily_returns - per_period_rf
    mean_excess = net_of_rf.mean()
    dispersion = net_of_rf.std()
    return np.sqrt(periods) * mean_excess / dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Scale the per-period standard deviation to an annual figure.

    Args:
        daily_returns: Per-period returns accepted by ``np.std``.
        periods: Number of periods per year (252 trading days by default).

    Returns:
        ``np.std(daily_returns) * sqrt(periods)``.
    """
    per_period_sigma = np.std(daily_returns)
    annualization_factor = np.sqrt(periods)
    return per_period_sigma * annualization_factor

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Largest peak-to-trough loss of the wealth path implied by the returns.

    Wealth starts at 1.0 and compounds with ``(1 + r)``. The starting capital
    counts as the first peak, so a loss from inception is reported as a
    drawdown (previously the first observation was its own peak and such
    losses were missed).

    Args:
        daily_returns: Array-like of simple per-period returns.

    Returns:
        float: The most negative ``(wealth - peak) / peak`` (0.0 if wealth
        never falls below a previous peak, or if the input is empty).
    """
    wealth = np.cumprod(1 + np.asarray(daily_returns, dtype=float))
    if wealth.size == 0:
        # np.min would raise on an empty array; no data means no drawdown.
        return 0.0

    # The running high-water mark can never be below the initial capital of 1.0
    peak = np.maximum(np.maximum.accumulate(wealth), 1.0)
    drawdown = (wealth - peak) / peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct the same flat cost from every per-period return.

    Args:
        daily_returns: Scalar or array of per-period returns.
        transaction_cost: Amount subtracted from each return (default 0.001).

    Returns:
        ``daily_returns - transaction_cost``.
    """
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every per-period return down by a proportional cost.

    Args:
        daily_returns: Scalar or array of per-period returns.
        transaction_cost_percentage: Share of each return given up (default 0.001).

    Returns:
        ``daily_returns * (1 - transaction_cost_percentage)``.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_5_with_cost', 'cum_ES_return_5_with_cost', 'cum_ELS_return_5_with_cost',
    'cum_VL_return_5_with_cost', 'cum_VS_return_5_with_cost', 'cum_VLS_return_5_with_cost',
]

# Same portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_5_without_cost', 'cum_ES_return_5_without_cost', 'cum_ELS_return_5_without_cost',
    'cum_VL_return_5_without_cost', 'cum_VS_return_5_without_cost', 'cum_VLS_return_5_without_cost',
]


def _portfolio_metrics_from_cumulative_log_returns(cumulative_log_returns, portfolio_columns):
    """Build the performance table (dict of lists) for the given columns.

    Each column holds a cumulative *log* return series. For every column:

    * Daily log returns are recovered with ``np.diff(..., prepend=0.0)`` so
      the first trading day is kept (a plain diff silently drops it).
    * Log returns are converted to simple returns with ``np.expm1`` because
      the metric helpers compound with ``prod(1 + r)``.
    * No further transaction cost is deducted: the ``*_with_cost`` series
      already include it, so subtracting it again would double count.

    Args:
        cumulative_log_returns: DataFrame with one cumulative log-return
            column per portfolio.
        portfolio_columns: Names of the columns to summarise.

    Returns:
        dict: metric name -> list with one entry per portfolio.
    """
    summary = {
        'Portfolio': [],
        'Annualized Return': [],
        'Sharpe Ratio': [],
        'Volatility': [],
        'Standard Deviation': [],
        'Max Drawdown': [],
        'Cumulative Return': []
    }

    for portfolio in portfolio_columns:
        cumulative_log = cumulative_log_returns[portfolio].values

        # Daily log returns, including the first day (series starts from 0)
        daily_log_returns = np.diff(cumulative_log, prepend=0.0)
        # Simple returns, as expected by the helper functions
        daily_returns = np.expm1(daily_log_returns)

        summary['Portfolio'].append(portfolio)
        summary['Annualized Return'].append(annualized_return(daily_returns))
        summary['Sharpe Ratio'].append(sharpe_ratio(daily_returns))
        summary['Volatility'].append(calculate_volatility(daily_returns))
        summary['Standard Deviation'].append(np.std(daily_returns))
        summary['Max Drawdown'].append(maximum_drawdown(daily_returns))
        summary['Cumulative Return'].append(cumulative_return(daily_returns))

    return summary


# Metrics for each portfolio with transaction cost
metrics = _portfolio_metrics_from_cumulative_log_returns(
    cumulative_log_returns_nn5_lag_5, portfolios_with_cost
)
metrics_nn55_c = pd.DataFrame(metrics)
display(metrics_nn55_c)

# Metrics for each portfolio without transaction cost
metrics_wc = _portfolio_metrics_from_cumulative_log_returns(
    cumulative_log_returns_nn5_lag_5, portfolios_without_cost
)
metrics_nn55_wc = pd.DataFrame(metrics_wc)
display(metrics_nn55_wc)

### Lag 21

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost charged to every stock.

    Args:
        row: Ignored. Kept so the function can still be used as a row-wise
            ``DataFrame.apply`` callback.

    Returns:
        float: 0.001 (10 bps), the same for small and large cap stocks.
    """
    return 0.001  # 10 bps for both small and large cap stocks


# The cost does not depend on the row, so broadcast the scalar instead of
# invoking a Python callback once per row with DataFrame.apply(axis=1).
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `NN5` to predict excess returns
def nn5_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the NN5 model and store its predictions in a lag-21 specific column.

    Args:
        crsp_train_lagged: Training frame passed through to ``neural_network_nn5``.
        crsp_test_lagged: Test frame passed through to ``neural_network_nn5``;
            it gains the column ``'nn5_21_predicted_excess_returns'``.
        lags: Lags handed to ``neural_network_nn5``. Defaults to ``[21]``
            (a fresh list per call rather than a shared mutable default).

    Returns:
        The same ``crsp_test_lagged`` object, modified in place.
    """
    if lags is None:
        lags = [21]

    # The model's return value is not used: predictions are read back from
    # the 'predicted_excess_returns' column of the test frame.
    # NOTE(review): this presumably relies on neural_network_nn5 writing that
    # column into crsp_test_lagged in place -- confirm against its definition.
    neural_network_nn5(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the predictions into a dedicated column for portfolio construction
    crsp_test_lagged['nn5_21_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged


# Get predicted excess returns using NN5 model
crsp_test_lagged = nn5_21_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group, prediction_col='nn5_21_predicted_excess_returns', fraction=0.1):
    """Compute one period's long, short and long-short portfolio log returns.

    The long leg holds the ``fraction`` of rows with the largest values in
    ``prediction_col``; the short leg holds the ``fraction`` with the smallest.
    Each leg is reported equal-weighted and value-weighted (by
    ``'market_cap_merged'``), with and without the mean ``'transaction_cost'``
    of the leg deducted.

    Args:
        group: DataFrame for a single rebalancing date. Must contain
            ``prediction_col``, ``'adjusted_ret'``, ``'transaction_cost'`` and
            ``'market_cap_merged'``.
        prediction_col: Column used to rank the stocks. The default keeps the
            original lag-21 behaviour.
        fraction: Share of the group placed in each leg (default 10%). The
            leg size is ``int(fraction * len(group))``, so groups that are too
            small yield empty legs and NaN returns.

    Returns:
        dict: twelve floats keyed ``'{equal|value}_{long|short|long_short}_log_return_{with|without}_cost'``.

    NOTE(review): leg returns are averages of per-stock log returns, which is
    not the log of the portfolio's simple return; kept unchanged so reported
    numbers do not move -- confirm this is intended.
    """
    leg_size = int(fraction * len(group))
    long_leg = group.nlargest(leg_size, prediction_col)
    short_leg = group.nsmallest(leg_size, prediction_col)

    def _leg_stats(leg):
        # Compute the log returns once per leg and reuse them for both weightings.
        log_returns = np.log1p(leg['adjusted_ret'])
        market_cap = leg['market_cap_merged']
        equal_weighted = log_returns.mean()
        value_weighted = (log_returns * market_cap).sum() / market_cap.sum()
        return equal_weighted, value_weighted, leg['transaction_cost'].mean()

    long_equal, long_value, long_cost = _leg_stats(long_leg)
    short_equal, short_value, short_cost = _leg_stats(short_leg)

    # Long legs earn the return; short legs earn its negative. Costs always reduce the result.
    equal_long_with_cost = long_equal - long_cost
    equal_long_without_cost = long_equal
    equal_short_with_cost = -short_equal - short_cost
    equal_short_without_cost = -short_equal

    value_long_with_cost = long_value - long_cost
    value_long_without_cost = long_value
    value_short_with_cost = -(short_value + short_cost)
    value_short_without_cost = -(short_value)

    return {
        'equal_long_log_return_with_cost': equal_long_with_cost,
        'equal_short_log_return_with_cost': equal_short_with_cost,
        'equal_long_short_log_return_with_cost': equal_long_with_cost + equal_short_with_cost,
        'equal_long_log_return_without_cost': equal_long_without_cost,
        'equal_short_log_return_without_cost': equal_short_without_cost,
        'equal_long_short_log_return_without_cost': equal_long_without_cost + equal_short_without_cost,
        'value_long_log_return_with_cost': value_long_with_cost,
        'value_short_log_return_with_cost': value_short_with_cost,
        'value_long_short_log_return_with_cost': value_long_with_cost + value_short_with_cost,
        'value_long_log_return_without_cost': value_long_without_cost,
        'value_short_log_return_without_cost': value_short_without_cost,
        'value_long_short_log_return_without_cost': value_long_without_cost + value_short_without_cost
    }

# Compute cumulative returns for each date with daily rebalancing.
# Short tag used in the output column names -> key prefix returned by compute_returns().
nn521_legs = {
    'EL': 'equal_long', 'ES': 'equal_short', 'ELS': 'equal_long_short',
    'VL': 'value_long', 'VS': 'value_short', 'VLS': 'value_long_short',
}
# (output column, compute_returns key) pairs: all with-cost columns first,
# then all without-cost columns.
nn521_columns = [
    (f'cum_{tag}_return_21_{cost}', f'{leg}_log_return_{cost}')
    for cost in ('with_cost', 'without_cost')
    for tag, leg in nn521_legs.items()
]

cumulative_log_returns_by_date_nn521 = {'date': []}
cumulative_log_returns_by_date_nn521.update({column: [] for column, _ in nn521_columns})

# Running cumulative log return of every lag 21 portfolio, starting at 0
nn521_running_totals = {column: 0 for column, _ in nn521_columns}

# One pass over the frame: groupby splits the rows by date once instead of
# re-filtering the whole frame for every date. sort=False keeps the dates in
# order of first appearance, as `unique()` did. Rows with a missing date are
# skipped by groupby rather than producing an empty (all-NaN) group.
for date, group in crsp_test_lagged.groupby('date', sort=False):
    # Compute returns for lag 21
    returns = compute_returns(group)

    # Update the running totals and record the cumulative value for the day
    cumulative_log_returns_by_date_nn521['date'].append(date)
    for column, key in nn521_columns:
        nn521_running_totals[column] += returns[key]
        cumulative_log_returns_by_date_nn521[column].append(nn521_running_totals[column])

# Keep the final cumulative values available under their original names
(
    cum_EL_return_21_with_cost, cum_ES_return_21_with_cost, cum_ELS_return_21_with_cost,
    cum_VL_return_21_with_cost, cum_VS_return_21_with_cost, cum_VLS_return_21_with_cost,
    cum_EL_return_21_without_cost, cum_ES_return_21_without_cost, cum_ELS_return_21_without_cost,
    cum_VL_return_21_without_cost, cum_VS_return_21_without_cost, cum_VLS_return_21_without_cost,
) = (nn521_running_totals[column] for column, _ in nn521_columns)

# Convert to DataFrame for lag 21
cumulative_log_returns_nn5_lag_21 = pd.DataFrame(cumulative_log_returns_by_date_nn521)

# Display the cumulative returns DataFrame for lag 21
display(cumulative_log_returns_nn5_lag_21.head())

# Saving the DataFrame as a CSV file
cumulative_log_returns_nn5_lag_21.to_csv("cumulative_log_returns_nn5_lag_21.csv", index=False)

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound a sequence of simple per-period returns into one total return.

    Args:
        daily_returns: Array-like supporting ``1 + daily_returns``.

    Returns:
        The product of ``(1 + r)`` over all periods, minus 1.
    """
    growth_factors = 1 + daily_returns
    total_growth = np.prod(growth_factors)
    return total_growth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Annualize the compounded return of a series of simple returns.

    Args:
        daily_returns: Array-like of simple per-period returns supporting
            ``1 + daily_returns`` and ``len()``.
        periods: Number of periods per year (252 trading days by default).

    Returns:
        ``(1 + total_return) ** (periods / n) - 1`` where ``n`` is the number
        of observations, or NaN when there are no observations (previously a
        ``ZeroDivisionError``).
    """
    n_observations = len(daily_returns)
    if n_observations == 0:
        # No data: there is nothing to annualize.
        return float('nan')

    cumulative_return_value = np.prod(1 + daily_returns) - 1
    return (1 + cumulative_return_value) ** (periods / n_observations) - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualized Sharpe ratio of a return series.

    Args:
        daily_returns: Per-period returns; must support subtraction of a
            scalar and expose ``.mean()`` / ``.std()``.
        risk_free_rate: Annual risk-free rate, spread evenly over ``periods``.
        periods: Number of periods per year (252 trading days by default).

    Returns:
        ``sqrt(periods) * mean(excess) / std(excess)``.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    net_of_rf = daily_returns - per_period_rf
    mean_excess = net_of_rf.mean()
    dispersion = net_of_rf.std()
    return np.sqrt(periods) * mean_excess / dispersion

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Scale the per-period standard deviation to an annual figure.

    Args:
        daily_returns: Per-period returns accepted by ``np.std``.
        periods: Number of periods per year (252 trading days by default).

    Returns:
        ``np.std(daily_returns) * sqrt(periods)``.
    """
    per_period_sigma = np.std(daily_returns)
    annualization_factor = np.sqrt(periods)
    return per_period_sigma * annualization_factor

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Largest peak-to-trough loss of the wealth path implied by the returns.

    Wealth starts at 1.0 and compounds with ``(1 + r)``. The starting capital
    counts as the first peak, so a loss from inception is reported as a
    drawdown (previously the first observation was its own peak and such
    losses were missed).

    Args:
        daily_returns: Array-like of simple per-period returns.

    Returns:
        float: The most negative ``(wealth - peak) / peak`` (0.0 if wealth
        never falls below a previous peak, or if the input is empty).
    """
    wealth = np.cumprod(1 + np.asarray(daily_returns, dtype=float))
    if wealth.size == 0:
        # np.min would raise on an empty array; no data means no drawdown.
        return 0.0

    # The running high-water mark can never be below the initial capital of 1.0
    peak = np.maximum(np.maximum.accumulate(wealth), 1.0)
    drawdown = (wealth - peak) / peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct the same flat cost from every per-period return.

    Args:
        daily_returns: Scalar or array of per-period returns.
        transaction_cost: Amount subtracted from each return (default 0.001).

    Returns:
        ``daily_returns - transaction_cost``.
    """
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every per-period return down by a proportional cost.

    Args:
        daily_returns: Scalar or array of per-period returns.
        transaction_cost_percentage: Share of each return given up (default 0.001).

    Returns:
        ``daily_returns * (1 - transaction_cost_percentage)``.
    """
    retained_share = 1 - transaction_cost_percentage
    return daily_returns * retained_share

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_21_with_cost', 'cum_ES_return_21_with_cost', 'cum_ELS_return_21_with_cost',
    'cum_VL_return_21_with_cost', 'cum_VS_return_21_with_cost', 'cum_VLS_return_21_with_cost',
]

# Same portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_21_without_cost', 'cum_ES_return_21_without_cost', 'cum_ELS_return_21_without_cost',
    'cum_VL_return_21_without_cost', 'cum_VS_return_21_without_cost', 'cum_VLS_return_21_without_cost',
]


def _portfolio_metrics_from_cumulative_log_returns(cumulative_log_returns, portfolio_columns):
    """Build the performance table (dict of lists) for the given columns.

    Each column holds a cumulative *log* return series. For every column:

    * Daily log returns are recovered with ``np.diff(..., prepend=0.0)`` so
      the first trading day is kept (a plain diff silently drops it).
    * Log returns are converted to simple returns with ``np.expm1`` because
      the metric helpers compound with ``prod(1 + r)``.
    * No further transaction cost is deducted: the ``*_with_cost`` series
      already include it, so subtracting it again would double count.

    Args:
        cumulative_log_returns: DataFrame with one cumulative log-return
            column per portfolio.
        portfolio_columns: Names of the columns to summarise.

    Returns:
        dict: metric name -> list with one entry per portfolio.
    """
    summary = {
        'Portfolio': [],
        'Annualized Return': [],
        'Sharpe Ratio': [],
        'Volatility': [],
        'Standard Deviation': [],
        'Max Drawdown': [],
        'Cumulative Return': []
    }

    for portfolio in portfolio_columns:
        cumulative_log = cumulative_log_returns[portfolio].values

        # Daily log returns, including the first day (series starts from 0)
        daily_log_returns = np.diff(cumulative_log, prepend=0.0)
        # Simple returns, as expected by the helper functions
        daily_returns = np.expm1(daily_log_returns)

        summary['Portfolio'].append(portfolio)
        summary['Annualized Return'].append(annualized_return(daily_returns))
        summary['Sharpe Ratio'].append(sharpe_ratio(daily_returns))
        summary['Volatility'].append(calculate_volatility(daily_returns))
        summary['Standard Deviation'].append(np.std(daily_returns))
        summary['Max Drawdown'].append(maximum_drawdown(daily_returns))
        summary['Cumulative Return'].append(cumulative_return(daily_returns))

    return summary


# Metrics for each portfolio with transaction cost
metrics = _portfolio_metrics_from_cumulative_log_returns(
    cumulative_log_returns_nn5_lag_21, portfolios_with_cost
)
metrics_nn521_c = pd.DataFrame(metrics)
display(metrics_nn521_c)

# Metrics for each portfolio without transaction cost
metrics_wc = _portfolio_metrics_from_cumulative_log_returns(
    cumulative_log_returns_nn5_lag_21, portfolios_without_cost
)
metrics_nn521_wc = pd.DataFrame(metrics_wc)
display(metrics_nn521_wc)

### Lag 252

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat transaction cost charged to every stock.

    Args:
        row: Ignored. Kept so the function can still be used as a row-wise
            ``DataFrame.apply`` callback.

    Returns:
        float: 0.001 (10 bps), the same for small and large cap stocks.
    """
    return 0.001  # 10 bps for both small and large cap stocks


# The cost does not depend on the row, so broadcast the scalar instead of
# invoking a Python callback once per row with DataFrame.apply(axis=1).
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `NN5` to predict excess returns
def nn5_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the NN5 model and store its predictions in a lag-252 specific column.

    Args:
        crsp_train_lagged: Training frame passed through to ``neural_network_nn5``.
        crsp_test_lagged: Test frame passed through to ``neural_network_nn5``;
            it gains the column ``'nn5_252_predicted_excess_returns'``.
        lags: Lags handed to ``neural_network_nn5``. Defaults to ``[252]``
            (a fresh list per call rather than a shared mutable default).

    Returns:
        The same ``crsp_test_lagged`` object, modified in place.
    """
    if lags is None:
        lags = [252]

    # The model's return value is not used: predictions are read back from
    # the 'predicted_excess_returns' column of the test frame.
    # NOTE(review): this presumably relies on neural_network_nn5 writing that
    # column into crsp_test_lagged in place -- confirm against its definition.
    neural_network_nn5(crsp_train_lagged, crsp_test_lagged, lags)

    # Copy the predictions into a dedicated column for portfolio construction
    crsp_test_lagged['nn5_252_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged


# Get predicted excess returns using NN5 model
crsp_test_lagged = nn5_252_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group):
    """Return one cross-section's long, short and long-short log returns (lag-252 NN5 signal).

    ``group`` must provide the columns ``nn5_252_predicted_excess_returns``,
    ``adjusted_ret``, ``transaction_cost`` and ``market_cap_merged``.

    The long leg is the top ``int(0.1 * len(group))`` rows by predicted
    return and the short leg the bottom ones. For each leg the equal-weighted
    and market-cap-weighted mean of ``log1p(adjusted_ret)`` is computed; the
    short leg's return is the negative of that mean. "With cost" variants
    subtract the leg's mean ``transaction_cost``, so a long-short figure
    carries the cost of both legs.

    Returns a dict with twelve entries keyed
    ``{equal,value}_{long,short,long_short}_log_return_{with,without}_cost``.
    """
    signal = 'nn5_252_predicted_excess_returns'
    # int() truncates, so a cross-section with fewer than 10 rows has empty legs
    leg_size = int(0.1 * len(group))

    def _leg_stats(leg):
        # (equal-weighted mean, cap-weighted mean, mean cost) of one leg
        log_ret = np.log1p(leg['adjusted_ret'])
        caps = leg['market_cap_merged']
        cap_weighted = (log_ret * caps).sum() / caps.sum()
        return log_ret.mean(), cap_weighted, leg['transaction_cost'].mean()

    long_ew, long_vw, long_cost = _leg_stats(group.nlargest(leg_size, signal))
    short_ew, short_vw, short_cost = _leg_stats(group.nsmallest(leg_size, signal))

    out = {}

    # Equal-weighted, net of cost
    out['equal_long_log_return_with_cost'] = long_ew - long_cost
    out['equal_short_log_return_with_cost'] = -short_ew - short_cost
    out['equal_long_short_log_return_with_cost'] = (
        out['equal_long_log_return_with_cost'] + out['equal_short_log_return_with_cost']
    )

    # Equal-weighted, gross
    out['equal_long_log_return_without_cost'] = long_ew
    out['equal_short_log_return_without_cost'] = -short_ew
    out['equal_long_short_log_return_without_cost'] = (
        out['equal_long_log_return_without_cost'] + out['equal_short_log_return_without_cost']
    )

    # Value-weighted, net of cost
    out['value_long_log_return_with_cost'] = long_vw - long_cost
    out['value_short_log_return_with_cost'] = -(short_vw + short_cost)
    out['value_long_short_log_return_with_cost'] = (
        out['value_long_log_return_with_cost'] + out['value_short_log_return_with_cost']
    )

    # Value-weighted, gross
    out['value_long_log_return_without_cost'] = long_vw
    out['value_short_log_return_without_cost'] = -short_vw
    out['value_long_short_log_return_without_cost'] = (
        out['value_long_log_return_without_cost'] + out['value_short_log_return_without_cost']
    )

    return out

# Compute cumulative returns for each date with daily rebalancing
# Output column -> key of the per-day dict returned by compute_returns.
# The insertion order fixes the column order of the resulting DataFrame.
_LAG252_SOURCE_KEYS = {
    'cum_EL_return_252_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_252_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_252_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_252_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_252_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_252_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_252_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_252_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_252_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_252_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_252_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_252_without_cost': 'value_long_short_log_return_without_cost',
}

# One list per output column, plus the date axis
cumulative_log_returns_by_date_nn5252 = {'date': []}
for _column in _LAG252_SOURCE_KEYS:
    cumulative_log_returns_by_date_nn5252[_column] = []

# Running sums of daily log returns, all starting at 0
_running_252 = dict.fromkeys(_LAG252_SOURCE_KEYS, 0)

# Walk the test dates in order of first appearance; each date's rows form
# one cross-section that is ranked and turned into portfolio returns.
# Any NaN returned by compute_returns stays in the running sum from that
# date onwards.
for date in crsp_test_lagged['date'].unique():
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]

    # Compute returns for lag 252
    returns = compute_returns(group)

    cumulative_log_returns_by_date_nn5252['date'].append(date)
    for _column, _key in _LAG252_SOURCE_KEYS.items():
        _running_252[_column] += returns[_key]
        cumulative_log_returns_by_date_nn5252[_column].append(_running_252[_column])

# Expose the final running totals under their individual names as well
(
    cum_EL_return_252_with_cost, cum_ES_return_252_with_cost, cum_ELS_return_252_with_cost,
    cum_VL_return_252_with_cost, cum_VS_return_252_with_cost, cum_VLS_return_252_with_cost,
    cum_EL_return_252_without_cost, cum_ES_return_252_without_cost, cum_ELS_return_252_without_cost,
    cum_VL_return_252_without_cost, cum_VS_return_252_without_cost, cum_VLS_return_252_without_cost,
) = _running_252.values()

# Convert to DataFrame for lag 252
cumulative_log_returns_nn5_lag_252 = pd.DataFrame(cumulative_log_returns_by_date_nn5252)

# Display the cumulative returns DataFrame for lag 252
display(cumulative_log_returns_nn5_lag_252.head())

# Saving the DataFrame as a CSV file
cumulative_log_returns_nn5_lag_252.to_csv("cumulative_log_returns_nn5_lag_252.csv", index=False)

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period simple returns into one total return.

    Returns ``prod(1 + r) - 1``; an empty input gives 0.0.
    """
    growth_factors = 1 + daily_returns
    terminal_wealth = np.prod(growth_factors)
    return terminal_wealth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Geometrically annualise a series of per-period simple returns.

    The compounded return is rescaled to ``periods`` observations per year:
    ``(1 + total) ** (periods / n) - 1``. An empty series raises
    ``ZeroDivisionError``.
    """
    total_return = np.prod(1 + daily_returns) - 1
    years_exponent = periods / len(daily_returns)
    return (1 + total_return) ** years_exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualised Sharpe ratio of per-period returns.

    ``risk_free_rate`` is an annual rate and is spread evenly over
    ``periods`` observations. The ratio of mean to standard deviation of the
    excess returns (via the input's own ``.mean()`` / ``.std()``) is scaled
    by ``sqrt(periods)``.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    excess = daily_returns - per_period_rf
    annualisation = np.sqrt(periods)
    return annualisation * excess.mean() / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Annualised volatility: ``np.std`` of the returns times ``sqrt(periods)``."""
    per_period_std = np.std(daily_returns)
    return per_period_std * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Largest peak-to-trough loss of the compounded wealth curve.

    Parameters
    ----------
    daily_returns : array-like of per-period simple returns.

    Returns
    -------
    The most negative value of ``(wealth - running_peak) / running_peak``
    (0.0 when wealth never falls below a previous peak, or for empty input).
    """
    returns = np.asarray(daily_returns, dtype=float)
    if returns.size == 0:
        return 0.0

    # Start the curve at the initial capital of 1.0. Without this anchor the
    # first compounded value becomes the first "peak", so a loss on the very
    # first day(s) would never be counted as a drawdown.
    wealth = np.concatenate(([1.0], np.cumprod(1 + returns)))
    running_peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct a flat ``transaction_cost`` from every return in the series."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by ``1 - transaction_cost_percentage``.

    This shrinks gains and losses alike by the same fraction; nothing is
    subtracted from the returns.
    """
    retained_fraction = 1 - transaction_cost_percentage
    return daily_returns * retained_fraction

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_252_with_cost', 'cum_ES_return_252_with_cost', 'cum_ELS_return_252_with_cost',
    'cum_VL_return_252_with_cost', 'cum_VS_return_252_with_cost', 'cum_VLS_return_252_with_cost',
]

# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_252_without_cost', 'cum_ES_return_252_without_cost', 'cum_ELS_return_252_without_cost',
    'cum_VL_return_252_without_cost', 'cum_VS_return_252_without_cost', 'cum_VLS_return_252_without_cost',
]


def _daily_simple_returns(cumulative_log_returns):
    """Recover per-day simple returns from a cumulative log-return series."""
    # The first cumulative value already IS the first day's return, so diff
    # against an implicit starting level of 0; a plain np.diff drops that day.
    daily_log_returns = np.diff(cumulative_log_returns, prepend=0.0)
    # The metric helpers compound with (1 + r), which is only valid for
    # simple returns - same conversion as used for the S&P 500 benchmark.
    return np.expm1(daily_log_returns)


def _portfolio_metrics_table(cumulative_frame, portfolios):
    """Build the column-oriented metrics table for the given cumulative-return columns."""
    table = {
        'Portfolio': [],
        'Annualized Return': [],
        'Sharpe Ratio': [],
        'Volatility': [],
        'Standard Deviation': [],
        'Max Drawdown': [],
        'Cumulative Return': []
    }
    for portfolio in portfolios:
        daily_returns = _daily_simple_returns(cumulative_frame[portfolio].values)

        table['Portfolio'].append(portfolio)
        table['Annualized Return'].append(annualized_return(daily_returns))
        table['Sharpe Ratio'].append(sharpe_ratio(daily_returns))
        table['Volatility'].append(calculate_volatility(daily_returns))
        table['Standard Deviation'].append(np.std(daily_returns))
        table['Max Drawdown'].append(maximum_drawdown(daily_returns))
        table['Cumulative Return'].append(cumulative_return(daily_returns))
    return table


# Calculate metrics for each portfolio with transaction cost.
# The *_with_cost columns are already net of the 10 bps cost (compute_returns
# subtracts the mean transaction_cost of each leg), so no further deduction
# is applied here - subtracting it again would charge the cost twice.
metrics = _portfolio_metrics_table(cumulative_log_returns_nn5_lag_252, portfolios_with_cost)

# Convert the results into a DataFrame for analysis
metrics_nn5252_c = pd.DataFrame(metrics)
display(metrics_nn5252_c)


# Calculate metrics for each portfolio without transaction cost
metrics_wc = _portfolio_metrics_table(cumulative_log_returns_nn5_lag_252, portfolios_without_cost)

# Convert the results into a DataFrame for analysis
metrics_nn5252_wc = pd.DataFrame(metrics_wc)
display(metrics_nn5252_wc)

### Lag 512

In [None]:
# Add transaction cost (10bps = 0.001)
def calculate_transaction_cost(row):
    """Return the flat per-trade cost (10 bps) charged for any stock.

    ``row`` is accepted for compatibility with ``DataFrame.apply`` but is
    never read: the same cost applies to small- and large-cap stocks.
    """
    return 0.001  # 10 bps for both small and large cap stocks

# The cost does not depend on the row, so broadcast the scalar instead of
# running a Python-level apply over every row of the test panel.
crsp_test_lagged['transaction_cost'] = calculate_transaction_cost(None)

# Use the previously defined `NN5` to predict excess returns
def nn5_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged, lags=None):
    """Run the NN5 model for lag 512 and store its predictions on the test frame.

    Parameters
    ----------
    crsp_train_lagged : DataFrame passed through to ``neural_network_nn5``.
    crsp_test_lagged : DataFrame passed through to ``neural_network_nn5``;
        it is modified in place (a new column is added) and also returned.
    lags : list of lags forwarded to ``neural_network_nn5``; defaults to ``[512]``.

    Returns
    -------
    ``crsp_test_lagged`` with the extra column
    ``'nn5_512_predicted_excess_returns'``.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if lags is None:
        lags = [512]

    # The return value is not used: predictions are read back from the
    # 'predicted_excess_returns' column of the test frame.
    # NOTE(review): this presumes neural_network_nn5 writes that column in
    # place; if it does not, a stale column from an earlier model run would
    # be copied silently - confirm against its definition.
    neural_network_nn5(crsp_train_lagged, crsp_test_lagged, lags)

    # Keep a lag-specific copy so later model runs cannot overwrite it
    crsp_test_lagged['nn5_512_predicted_excess_returns'] = (
        crsp_test_lagged['predicted_excess_returns'].values
    )

    return crsp_test_lagged

# Get predicted excess returns using NN5 model
crsp_test_lagged = nn5_512_predicted_excess_returns(crsp_train_lagged, crsp_test_lagged)

# Portfolio Construction (Top 10% Long, Bottom 10% Short)
def compute_returns(group):
    """Return one cross-section's long, short and long-short log returns (lag-512 NN5 signal).

    ``group`` must provide the columns ``nn5_512_predicted_excess_returns``,
    ``adjusted_ret``, ``transaction_cost`` and ``market_cap_merged``.

    The long leg is the top ``int(0.1 * len(group))`` rows by predicted
    return and the short leg the bottom ones. For each leg the equal-weighted
    and market-cap-weighted mean of ``log1p(adjusted_ret)`` is computed; the
    short leg's return is the negative of that mean. "With cost" variants
    subtract the leg's mean ``transaction_cost``, so a long-short figure
    carries the cost of both legs.

    Returns a dict with twelve entries keyed
    ``{equal,value}_{long,short,long_short}_log_return_{with,without}_cost``.
    """
    signal = 'nn5_512_predicted_excess_returns'
    # int() truncates, so a cross-section with fewer than 10 rows has empty legs
    leg_size = int(0.1 * len(group))

    def _leg_stats(leg):
        # (equal-weighted mean, cap-weighted mean, mean cost) of one leg
        log_ret = np.log1p(leg['adjusted_ret'])
        caps = leg['market_cap_merged']
        cap_weighted = (log_ret * caps).sum() / caps.sum()
        return log_ret.mean(), cap_weighted, leg['transaction_cost'].mean()

    long_ew, long_vw, long_cost = _leg_stats(group.nlargest(leg_size, signal))
    short_ew, short_vw, short_cost = _leg_stats(group.nsmallest(leg_size, signal))

    out = {}

    # Equal-weighted, net of cost
    out['equal_long_log_return_with_cost'] = long_ew - long_cost
    out['equal_short_log_return_with_cost'] = -short_ew - short_cost
    out['equal_long_short_log_return_with_cost'] = (
        out['equal_long_log_return_with_cost'] + out['equal_short_log_return_with_cost']
    )

    # Equal-weighted, gross
    out['equal_long_log_return_without_cost'] = long_ew
    out['equal_short_log_return_without_cost'] = -short_ew
    out['equal_long_short_log_return_without_cost'] = (
        out['equal_long_log_return_without_cost'] + out['equal_short_log_return_without_cost']
    )

    # Value-weighted, net of cost
    out['value_long_log_return_with_cost'] = long_vw - long_cost
    out['value_short_log_return_with_cost'] = -(short_vw + short_cost)
    out['value_long_short_log_return_with_cost'] = (
        out['value_long_log_return_with_cost'] + out['value_short_log_return_with_cost']
    )

    # Value-weighted, gross
    out['value_long_log_return_without_cost'] = long_vw
    out['value_short_log_return_without_cost'] = -short_vw
    out['value_long_short_log_return_without_cost'] = (
        out['value_long_log_return_without_cost'] + out['value_short_log_return_without_cost']
    )

    return out

# Compute cumulative returns for each date with daily rebalancing
# Output column -> key of the per-day dict returned by compute_returns.
# The insertion order fixes the column order of the resulting DataFrame.
_LAG512_SOURCE_KEYS = {
    'cum_EL_return_512_with_cost': 'equal_long_log_return_with_cost',
    'cum_ES_return_512_with_cost': 'equal_short_log_return_with_cost',
    'cum_ELS_return_512_with_cost': 'equal_long_short_log_return_with_cost',
    'cum_VL_return_512_with_cost': 'value_long_log_return_with_cost',
    'cum_VS_return_512_with_cost': 'value_short_log_return_with_cost',
    'cum_VLS_return_512_with_cost': 'value_long_short_log_return_with_cost',
    'cum_EL_return_512_without_cost': 'equal_long_log_return_without_cost',
    'cum_ES_return_512_without_cost': 'equal_short_log_return_without_cost',
    'cum_ELS_return_512_without_cost': 'equal_long_short_log_return_without_cost',
    'cum_VL_return_512_without_cost': 'value_long_log_return_without_cost',
    'cum_VS_return_512_without_cost': 'value_short_log_return_without_cost',
    'cum_VLS_return_512_without_cost': 'value_long_short_log_return_without_cost',
}

# One list per output column, plus the date axis
cumulative_log_returns_by_date_nn5512 = {'date': []}
for _column in _LAG512_SOURCE_KEYS:
    cumulative_log_returns_by_date_nn5512[_column] = []

# Running sums of daily log returns, all starting at 0
_running_512 = dict.fromkeys(_LAG512_SOURCE_KEYS, 0)

# Walk the test dates in order of first appearance; each date's rows form
# one cross-section that is ranked and turned into lag-512 portfolio returns.
# Any NaN returned by compute_returns stays in the running sum from that
# date onwards.
for date in crsp_test_lagged['date'].unique():
    group = crsp_test_lagged[crsp_test_lagged['date'] == date]

    # Compute returns for lag 512
    returns = compute_returns(group)

    cumulative_log_returns_by_date_nn5512['date'].append(date)
    for _column, _key in _LAG512_SOURCE_KEYS.items():
        _running_512[_column] += returns[_key]
        cumulative_log_returns_by_date_nn5512[_column].append(_running_512[_column])

# Expose the final running totals under their individual names as well
(
    cum_EL_return_512_with_cost, cum_ES_return_512_with_cost, cum_ELS_return_512_with_cost,
    cum_VL_return_512_with_cost, cum_VS_return_512_with_cost, cum_VLS_return_512_with_cost,
    cum_EL_return_512_without_cost, cum_ES_return_512_without_cost, cum_ELS_return_512_without_cost,
    cum_VL_return_512_without_cost, cum_VS_return_512_without_cost, cum_VLS_return_512_without_cost,
) = _running_512.values()

# Convert to DataFrame for lag 512
cumulative_log_returns_nn5_lag_512 = pd.DataFrame(cumulative_log_returns_by_date_nn5512)

# Display the cumulative returns DataFrame for lag 512
display(cumulative_log_returns_nn5_lag_512.head())

# Saving the DataFrame as a CSV file
cumulative_log_returns_nn5_lag_512.to_csv("cumulative_log_returns_nn5_lag_512.csv", index=False)

In [None]:
# Cumulative returns
def cumulative_return(daily_returns):
    """Compound per-period simple returns into one total return.

    Returns ``prod(1 + r) - 1``; an empty input gives 0.0.
    """
    growth_factors = 1 + daily_returns
    terminal_wealth = np.prod(growth_factors)
    return terminal_wealth - 1

# Annualized returns
def annualized_return(daily_returns, periods=252):
    """Geometrically annualise a series of per-period simple returns.

    The compounded return is rescaled to ``periods`` observations per year:
    ``(1 + total) ** (periods / n) - 1``. An empty series raises
    ``ZeroDivisionError``.
    """
    total_return = np.prod(1 + daily_returns) - 1
    years_exponent = periods / len(daily_returns)
    return (1 + total_return) ** years_exponent - 1

# Sharpe ratio
def sharpe_ratio(daily_returns, risk_free_rate=0.01, periods=252):
    """Annualised Sharpe ratio of per-period returns.

    ``risk_free_rate`` is an annual rate and is spread evenly over
    ``periods`` observations. The ratio of mean to standard deviation of the
    excess returns (via the input's own ``.mean()`` / ``.std()``) is scaled
    by ``sqrt(periods)``.
    """
    per_period_rf = risk_free_rate / periods  # Assuming 252 trading days
    excess = daily_returns - per_period_rf
    annualisation = np.sqrt(periods)
    return annualisation * excess.mean() / excess.std()

# Calculate volatility (standard deviation) of daily returns
def calculate_volatility(daily_returns, periods=252):
    """Annualised volatility: ``np.std`` of the returns times ``sqrt(periods)``."""
    per_period_std = np.std(daily_returns)
    return per_period_std * np.sqrt(periods)

# Calculate maximum drawdown
def maximum_drawdown(daily_returns):
    """Largest peak-to-trough loss of the compounded wealth curve.

    Parameters
    ----------
    daily_returns : array-like of per-period simple returns.

    Returns
    -------
    The most negative value of ``(wealth - running_peak) / running_peak``
    (0.0 when wealth never falls below a previous peak, or for empty input).
    """
    returns = np.asarray(daily_returns, dtype=float)
    if returns.size == 0:
        return 0.0

    # Start the curve at the initial capital of 1.0. Without this anchor the
    # first compounded value becomes the first "peak", so a loss on the very
    # first day(s) would never be counted as a drawdown.
    wealth = np.concatenate(([1.0], np.cumprod(1 + returns)))
    running_peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Apply fixed transaction cost to the daily returns
def apply_transaction_cost_fixed(daily_returns, transaction_cost=0.001):
    """Deduct a flat ``transaction_cost`` from every return in the series."""
    net_returns = daily_returns - transaction_cost
    return net_returns

# Apply percentage-based transaction cost to the daily returns
def apply_transaction_cost_percentage(daily_returns, transaction_cost_percentage=0.001):
    """Scale every return by ``1 - transaction_cost_percentage``.

    This shrinks gains and losses alike by the same fraction; nothing is
    subtracted from the returns.
    """
    retained_fraction = 1 - transaction_cost_percentage
    return daily_returns * retained_fraction

# Prepare portfolio names (with transaction cost)
portfolios_with_cost = [
    'cum_EL_return_512_with_cost', 'cum_ES_return_512_with_cost', 'cum_ELS_return_512_with_cost',
    'cum_VL_return_512_with_cost', 'cum_VS_return_512_with_cost', 'cum_VLS_return_512_with_cost',
]

# same calculations for portfolios without transaction cost
portfolios_without_cost = [
    'cum_EL_return_512_without_cost', 'cum_ES_return_512_without_cost', 'cum_ELS_return_512_without_cost',
    'cum_VL_return_512_without_cost', 'cum_VS_return_512_without_cost', 'cum_VLS_return_512_without_cost',
]


def _daily_simple_returns(cumulative_log_returns):
    """Recover per-day simple returns from a cumulative log-return series."""
    # The first cumulative value already IS the first day's return, so diff
    # against an implicit starting level of 0; a plain np.diff drops that day.
    daily_log_returns = np.diff(cumulative_log_returns, prepend=0.0)
    # The metric helpers compound with (1 + r), which is only valid for
    # simple returns - same conversion as used for the S&P 500 benchmark.
    return np.expm1(daily_log_returns)


def _portfolio_metrics_table(cumulative_frame, portfolios):
    """Build the column-oriented metrics table for the given cumulative-return columns."""
    table = {
        'Portfolio': [],
        'Annualized Return': [],
        'Sharpe Ratio': [],
        'Volatility': [],
        'Standard Deviation': [],
        'Max Drawdown': [],
        'Cumulative Return': []
    }
    for portfolio in portfolios:
        daily_returns = _daily_simple_returns(cumulative_frame[portfolio].values)

        table['Portfolio'].append(portfolio)
        table['Annualized Return'].append(annualized_return(daily_returns))
        table['Sharpe Ratio'].append(sharpe_ratio(daily_returns))
        table['Volatility'].append(calculate_volatility(daily_returns))
        table['Standard Deviation'].append(np.std(daily_returns))
        table['Max Drawdown'].append(maximum_drawdown(daily_returns))
        table['Cumulative Return'].append(cumulative_return(daily_returns))
    return table


# Calculate metrics for each portfolio with transaction cost.
# The *_with_cost columns are already net of the 10 bps cost (compute_returns
# subtracts the mean transaction_cost of each leg), so no further deduction
# is applied here - subtracting it again would charge the cost twice.
metrics = _portfolio_metrics_table(cumulative_log_returns_nn5_lag_512, portfolios_with_cost)

# Convert the results into a DataFrame for analysis
metrics_nn5512_c = pd.DataFrame(metrics)
display(metrics_nn5512_c)


# Calculate metrics for each portfolio without transaction cost
metrics_wc = _portfolio_metrics_table(cumulative_log_returns_nn5_lag_512, portfolios_without_cost)

# Convert the results into a DataFrame for analysis
metrics_nn5512_wc = pd.DataFrame(metrics_wc)
display(metrics_nn5512_wc)

# **08. Cumulative Log Return Plots**

## S&P 500 Benchmark

In [None]:
pip install yfinance

In [None]:
import yfinance as yf

# Define benchmark date range
# NOTE(review): yfinance treats `end` as exclusive, so 2024-12-31 itself is
# presumably not part of the download - confirm this is intended.
start_date = '2016-01-01'
end_date = '2024-12-31'

# Download S&P 500 index (unadjusted OHLC plus 'Adj Close')
sp500 = yf.download('^GSPC', start=start_date, end=end_date, auto_adjust=False)
if sp500.empty:
    # Fail here with a clear message instead of a KeyError further down
    raise RuntimeError("yfinance returned no data for ^GSPC")

# Flatten (field, ticker) MultiIndex columns only when they are present;
# with flat columns the tuple-style flattening would mangle the names.
if isinstance(sp500.columns, pd.MultiIndex):
    sp500.columns = [f"{col[0]}_{col[1]}" for col in sp500.columns]
    price_col = 'Adj Close_^GSPC' if 'Adj Close_^GSPC' in sp500.columns else 'Close_^GSPC'
else:
    price_col = 'Adj Close' if 'Adj Close' in sp500.columns else 'Close'

# Calculate log returns
sp500['log_return'] = np.log(sp500[price_col] / sp500[price_col].shift(1))
sp500 = sp500.dropna()  # Drop any rows with NaN values (incl. the first day)

# Calculate cumulative log returns for S&P 500
sp500['cum_SP500_log_return'] = np.cumsum(sp500['log_return'])

# Make sure the index is datetime before it becomes the 'date' column
sp500.index = pd.to_datetime(sp500.index)

# Clean up, keep relevant columns and expose the index as a 'date' column
sp500 = sp500[[price_col, 'log_return', 'cum_SP500_log_return']].dropna()
sp500 = sp500.reset_index().rename(columns={'Date': 'date', price_col: 'close'})

# Final result preview
print(sp500.head())

In [None]:
# 'log_return' is already present on sp500 and free of NaNs. Recomputing it
# from 'close' here would turn the first row into NaN and drop one more
# observation every time this cell is run.
daily_returns = sp500['log_return'].values  # These are daily log returns

# Convert log returns to simple returns
simple_returns = np.expm1(daily_returns)  # exp(log_return) - 1 = simple return

# Risk-free rate assumption
risk_free_rate = 0.01  # 1% annualized
periods = 252  # trading days

# Metric Functions
def cumulative_return(r):
    """Compound simple returns ``r`` into a total return, ``prod(1 + r) - 1``."""
    terminal_wealth = np.prod(1 + r)
    return terminal_wealth - 1

def annualized_return(r):
    """Geometrically annualise simple returns ``r`` using the module-level ``periods``."""
    total_growth = 1 + cumulative_return(r)
    exponent = periods / len(r)
    return total_growth ** exponent - 1

def sharpe_ratio(r):
    """Annualised Sharpe ratio of ``r`` using the module-level ``risk_free_rate`` and ``periods``."""
    per_period_rf = risk_free_rate / periods
    excess = r - per_period_rf
    return np.sqrt(periods) * np.mean(excess) / np.std(excess)

def calculate_volatility(r):
    """Annualised volatility: ``np.std(r)`` scaled by the square root of the module-level ``periods``."""
    per_period_std = np.std(r)
    return per_period_std * np.sqrt(periods)

def maximum_drawdown(r):
    """Largest peak-to-trough loss of the wealth curve compounded from simple returns ``r``.

    Returns the most negative value of ``(wealth - running_peak) / running_peak``
    (0.0 when wealth never falls below a previous peak, or for empty input).
    """
    returns = np.asarray(r, dtype=float)
    if returns.size == 0:
        return 0.0

    # Anchor the curve at the initial capital of 1.0; otherwise the first
    # compounded value is taken as the first peak and an opening loss is
    # never counted as a drawdown.
    wealth = np.concatenate(([1.0], np.cumprod(1 + returns)))
    running_peak = np.maximum.accumulate(wealth)
    drawdown = (wealth - running_peak) / running_peak
    return np.min(drawdown)

# Compute Metrics
# (label, function) pairs; each function is applied to the simple returns
# and the order here is the column order of the resulting table.
_sp500_metric_functions = (
    ('Cumulative Return', cumulative_return),
    ('Annualized Return', annualized_return),
    ('Sharpe Ratio', sharpe_ratio),
    ('Volatility', calculate_volatility),
    ('Standard Deviation', np.std),
    ('Max Drawdown', maximum_drawdown),
)
sp500_metrics = {
    label: metric(simple_returns) for label, metric in _sp500_metric_functions
}

# Display as DataFrame (a single row labelled 'S&P 500')
sp500_metrics_df = pd.DataFrame(sp500_metrics, index=['S&P 500'])
display(sp500_metrics_df)

In [None]:
# Count the rows currently held in the sp500 dataframe
num_observations = len(sp500.index)

print(f"Number of observations in the S&P 500 data: {num_observations}")

## Window Size 5

### Plot EW (Without TC)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns

# Define models dictionary
# Maps the legend label of each model to its lag-5 cumulative-return
# container. plot_portfolio_performance reads the entries 'date',
# 'cum_EL_return_5_without_cost' and 'cum_ES_return_5_without_cost' from
# every value, so each container must provide those three keys.
# The insertion order determines which palette colour a model gets.
models = {
    "OLS": cumulative_log_returns_by_date_ols5,
    "Lasso": cumulative_log_returns_by_date_lasso5,
    "Ridge": cumulative_log_returns_by_date_ridge5,
    "ElasticNet": cumulative_log_returns_by_date_enet5,
    "PCR": cumulative_log_returns_by_date_pcr5,
    "GLM": cumulative_log_returns_by_date_glm5,
    "Random Forest": cumulative_log_returns_by_date_rf5,
    "GBRT": cumulative_log_returns_by_date_gbrt5,
    "NN1": cumulative_log_returns_by_date_nn15,
    "NN2": cumulative_log_returns_by_date_nn25,
    "NN3": cumulative_log_returns_by_date_nn35,
    "NN4": cumulative_log_returns_by_date_nn45,
    "NN5": cumulative_log_returns_by_date_nn55
}

# Color palette
# One "tab20" colour per model, indexed by the model's position in `models`
color_palette = sns.color_palette("tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot each model's equal-weighted long and short cumulative returns.

    For every entry of ``models`` the ``cum_EL_return_5_without_cost`` column
    is drawn solid (long) and ``cum_ES_return_5_without_cost`` dashed (short)
    against that entry's ``date`` column; ``sp500_data`` is overlaid in black.
    Colours are read from the module-level ``color_palette`` by position, so
    it needs at least ``len(models)`` entries.

    Args:
        models: Mapping of legend label to a table (presumably a DataFrame)
            holding the three columns named above.
        sp500_data: Table with ``date`` and ``cum_SP500_log_return`` columns.
        save_path: When truthy, the figure is saved there; otherwise it is
            shown with ``plt.show()``.
    """
    legs = (
        ('cum_EL_return_5_without_cost', 'solid'),   # long leg
        ('cum_ES_return_5_without_cost', 'dashed'),  # short leg
    )
    frame_style = dict(frameon=True, fancybox=True, facecolor='white',
                       edgecolor='black', framealpha=0.8)

    _, ax = plt.subplots(figsize=(20, 10))

    # Both legs of a model share one colour; the line style tells them apart.
    for position, table in enumerate(models.values()):
        shade = color_palette[position]
        for column, dash in legs:
            ax.plot(table['date'], table[column], color=shade, linestyle=dash)

    # Benchmark curve, plus a dotted reference line at zero.
    benchmark, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                         color='black', linewidth=2, label='S&P 500')
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid.
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title("Equal-Weighted Portfolio Performance (Lag 5 - No Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Colour key (upper left): one proxy line per model, then the benchmark.
    colour_key = [Line2D([0], [0], color=color_palette[k], linewidth=3)
                  for k, _ in enumerate(models)]
    colour_key.append(benchmark)
    # add_artist keeps this legend on the axes when the second one is created.
    ax.add_artist(ax.legend(colour_key, [*models.keys(), 'S&P 500'],
                            loc='upper left', fontsize=10, **frame_style))

    # Line-style key (upper right).
    style_key = [Line2D([0], [0], color='black', linestyle=dash, linewidth=2)
                 for dash in ('solid', 'dashed')]
    ax.legend(style_key, ['Long', 'Short'], loc='upper right', fontsize=9,
              **frame_style)

    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Save to disk when a path was given; otherwise display the figure.
    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Render the chart for the models above and write it to a PNG file.
save_path = "Equal-Weighted_Portfolio_Performance_Lag5_NoCost.png"
plot_portfolio_performance(models=models, sp500_data=sp500, save_path=save_path)

### Plot VW (Without TC)

In [None]:
# Models compared in the window-size-5 plots: legend label -> that model's
# cumulative_log_returns_by_date_* object (presumably a pandas DataFrame built
# earlier in the notebook). Insertion order matters, because the plotting
# function below assigns colours and legend entries by position.
models = {
    "OLS": cumulative_log_returns_by_date_ols5,
    "Lasso": cumulative_log_returns_by_date_lasso5,
    "Ridge": cumulative_log_returns_by_date_ridge5,
    "ElasticNet": cumulative_log_returns_by_date_enet5,
    "PCR": cumulative_log_returns_by_date_pcr5,
    "GLM": cumulative_log_returns_by_date_glm5,
    "Random Forest": cumulative_log_returns_by_date_rf5,
    "GBRT": cumulative_log_returns_by_date_gbrt5,
    "NN1": cumulative_log_returns_by_date_nn15,
    "NN2": cumulative_log_returns_by_date_nn25,
    "NN3": cumulative_log_returns_by_date_nn35,
    "NN4": cumulative_log_returns_by_date_nn45,
    "NN5": cumulative_log_returns_by_date_nn55
}

# One "tab20" colour per model. The plotting function below reads this
# module-level name directly instead of receiving it as an argument.
color_palette = sns.color_palette("tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot each model's value-weighted long and short cumulative returns.

    For every entry of ``models`` the ``cum_VL_return_5_without_cost`` column
    is drawn solid (long) and ``cum_VS_return_5_without_cost`` dashed (short)
    against that entry's ``date`` column; ``sp500_data`` is overlaid in black.
    Colours are read from the module-level ``color_palette`` by position, so
    it needs at least ``len(models)`` entries.

    Args:
        models: Mapping of legend label to a table (presumably a DataFrame)
            holding the three columns named above.
        sp500_data: Table with ``date`` and ``cum_SP500_log_return`` columns.
        save_path: When truthy, the figure is saved there; otherwise it is
            shown with ``plt.show()``.
    """
    legs = (
        ('cum_VL_return_5_without_cost', 'solid'),   # long leg
        ('cum_VS_return_5_without_cost', 'dashed'),  # short leg
    )
    frame_style = dict(frameon=True, fancybox=True, facecolor='white',
                       edgecolor='black', framealpha=0.8)

    _, ax = plt.subplots(figsize=(20, 10))

    # Both legs of a model share one colour; the line style tells them apart.
    for position, table in enumerate(models.values()):
        shade = color_palette[position]
        for column, dash in legs:
            ax.plot(table['date'], table[column], color=shade, linestyle=dash)

    # Benchmark curve, plus a dotted reference line at zero.
    benchmark, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                         color='black', linewidth=2, label='S&P 500')
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid.
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title("Value-Weighted Portfolio Performance (Lag 5 - No Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Colour key (upper left): one proxy line per model, then the benchmark.
    colour_key = [Line2D([0], [0], color=color_palette[k], linewidth=3)
                  for k, _ in enumerate(models)]
    colour_key.append(benchmark)
    # add_artist keeps this legend on the axes when the second one is created.
    ax.add_artist(ax.legend(colour_key, [*models.keys(), 'S&P 500'],
                            loc='upper left', fontsize=10, **frame_style))

    # Line-style key (upper right).
    style_key = [Line2D([0], [0], color='black', linestyle=dash, linewidth=2)
                 for dash in ('solid', 'dashed')]
    ax.legend(style_key, ['Long', 'Short'], loc='upper right', fontsize=9,
              **frame_style)

    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Save to disk when a path was given; otherwise display the figure.
    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Render the chart for the models above and write it to a PNG file.
save_path = "Value-Weighted_Portfolio_Performance_Lag5_NoCost.png"
plot_portfolio_performance(models=models, sp500_data=sp500, save_path=save_path)

### Plot EW (With TC)

In [None]:
# Models compared in the window-size-5 plots: legend label -> that model's
# cumulative_log_returns_by_date_* object (presumably a pandas DataFrame built
# earlier in the notebook). Insertion order matters, because the plotting
# function below assigns colours and legend entries by position.
models = {
    "OLS": cumulative_log_returns_by_date_ols5,
    "Lasso": cumulative_log_returns_by_date_lasso5,
    "Ridge": cumulative_log_returns_by_date_ridge5,
    "ElasticNet": cumulative_log_returns_by_date_enet5,
    "PCR": cumulative_log_returns_by_date_pcr5,
    "GLM": cumulative_log_returns_by_date_glm5,
    "Random Forest": cumulative_log_returns_by_date_rf5,
    "GBRT": cumulative_log_returns_by_date_gbrt5,
    "NN1": cumulative_log_returns_by_date_nn15,
    "NN2": cumulative_log_returns_by_date_nn25,
    "NN3": cumulative_log_returns_by_date_nn35,
    "NN4": cumulative_log_returns_by_date_nn45,
    "NN5": cumulative_log_returns_by_date_nn55
}

# One "tab20" colour per model. The plotting function below reads this
# module-level name directly instead of receiving it as an argument.
color_palette = sns.color_palette("tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot equal-weighted long and short cumulative returns with costs.

    For every entry of ``models`` the ``cum_EL_return_5_with_cost`` column is
    drawn solid (long) and ``cum_ES_return_5_with_cost`` dashed (short)
    against that entry's ``date`` column; ``sp500_data`` is overlaid in black.
    Colours are read from the module-level ``color_palette`` by position, so
    it needs at least ``len(models)`` entries.

    Args:
        models: Mapping of legend label to a table (presumably a DataFrame)
            holding the three columns named above.
        sp500_data: Table with ``date`` and ``cum_SP500_log_return`` columns.
        save_path: When truthy, the figure is saved there; otherwise it is
            shown with ``plt.show()``.
    """
    legs = (
        ('cum_EL_return_5_with_cost', 'solid'),   # long leg
        ('cum_ES_return_5_with_cost', 'dashed'),  # short leg
    )
    frame_style = dict(frameon=True, fancybox=True, facecolor='white',
                       edgecolor='black', framealpha=0.8)

    _, ax = plt.subplots(figsize=(20, 10))

    # Both legs of a model share one colour; the line style tells them apart.
    for position, table in enumerate(models.values()):
        shade = color_palette[position]
        for column, dash in legs:
            ax.plot(table['date'], table[column], color=shade, linestyle=dash)

    # Benchmark curve, plus a dotted reference line at zero.
    benchmark, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                         color='black', linewidth=2, label='S&P 500')
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid.
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title("Equal-Weighted Portfolio Performance (Lag 5 - Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Colour key (upper left): one proxy line per model, then the benchmark.
    colour_key = [Line2D([0], [0], color=color_palette[k], linewidth=3)
                  for k, _ in enumerate(models)]
    colour_key.append(benchmark)
    # add_artist keeps this legend on the axes when the second one is created.
    ax.add_artist(ax.legend(colour_key, [*models.keys(), 'S&P 500'],
                            loc='upper left', fontsize=10, **frame_style))

    # Line-style key (upper right).
    style_key = [Line2D([0], [0], color='black', linestyle=dash, linewidth=2)
                 for dash in ('solid', 'dashed')]
    ax.legend(style_key, ['Long', 'Short'], loc='upper right', fontsize=9,
              **frame_style)

    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Save to disk when a path was given; otherwise display the figure.
    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Render the chart for the models above and write it to a PNG file.
save_path = "Equal-Weighted_Portfolio_Performance_Lag5_Cost.png"
plot_portfolio_performance(models=models, sp500_data=sp500, save_path=save_path)

### Plot VW (With TC)

In [None]:
# Models compared in the window-size-5 plots: legend label -> that model's
# cumulative_log_returns_by_date_* object (presumably a pandas DataFrame built
# earlier in the notebook). Insertion order matters, because the plotting
# function below assigns colours and legend entries by position.
models = {
    "OLS": cumulative_log_returns_by_date_ols5,
    "Lasso": cumulative_log_returns_by_date_lasso5,
    "Ridge": cumulative_log_returns_by_date_ridge5,
    "ElasticNet": cumulative_log_returns_by_date_enet5,
    "PCR": cumulative_log_returns_by_date_pcr5,
    "GLM": cumulative_log_returns_by_date_glm5,
    "Random Forest": cumulative_log_returns_by_date_rf5,
    "GBRT": cumulative_log_returns_by_date_gbrt5,
    "NN1": cumulative_log_returns_by_date_nn15,
    "NN2": cumulative_log_returns_by_date_nn25,
    "NN3": cumulative_log_returns_by_date_nn35,
    "NN4": cumulative_log_returns_by_date_nn45,
    "NN5": cumulative_log_returns_by_date_nn55
}

# One "tab20" colour per model. The plotting function below reads this
# module-level name directly instead of receiving it as an argument.
color_palette = sns.color_palette("tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot value-weighted long and short cumulative returns with costs.

    For every entry of ``models`` the ``cum_VL_return_5_with_cost`` column is
    drawn solid (long) and ``cum_VS_return_5_with_cost`` dashed (short)
    against that entry's ``date`` column; ``sp500_data`` is overlaid in black.
    Colours are read from the module-level ``color_palette`` by position, so
    it needs at least ``len(models)`` entries.

    Args:
        models: Mapping of legend label to a table (presumably a DataFrame)
            holding the three columns named above.
        sp500_data: Table with ``date`` and ``cum_SP500_log_return`` columns.
        save_path: When truthy, the figure is saved there; otherwise it is
            shown with ``plt.show()``.
    """
    legs = (
        ('cum_VL_return_5_with_cost', 'solid'),   # long leg
        ('cum_VS_return_5_with_cost', 'dashed'),  # short leg
    )
    frame_style = dict(frameon=True, fancybox=True, facecolor='white',
                       edgecolor='black', framealpha=0.8)

    _, ax = plt.subplots(figsize=(20, 10))

    # Both legs of a model share one colour; the line style tells them apart.
    for position, table in enumerate(models.values()):
        shade = color_palette[position]
        for column, dash in legs:
            ax.plot(table['date'], table[column], color=shade, linestyle=dash)

    # Benchmark curve, plus a dotted reference line at zero.
    benchmark, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                         color='black', linewidth=2, label='S&P 500')
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid.
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title("Value-Weighted Portfolio Performance (Lag 5 - Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Colour key (upper left): one proxy line per model, then the benchmark.
    colour_key = [Line2D([0], [0], color=color_palette[k], linewidth=3)
                  for k, _ in enumerate(models)]
    colour_key.append(benchmark)
    # add_artist keeps this legend on the axes when the second one is created.
    ax.add_artist(ax.legend(colour_key, [*models.keys(), 'S&P 500'],
                            loc='upper left', fontsize=10, **frame_style))

    # Line-style key (upper right).
    style_key = [Line2D([0], [0], color='black', linestyle=dash, linewidth=2)
                 for dash in ('solid', 'dashed')]
    ax.legend(style_key, ['Long', 'Short'], loc='upper right', fontsize=9,
              **frame_style)

    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Save to disk when a path was given; otherwise display the figure.
    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Render the chart for the models above and write it to a PNG file.
save_path = "Value-Weighted_Portfolio_Performance_Lag5_Cost.png"
plot_portfolio_performance(models=models, sp500_data=sp500, save_path=save_path)

### Long-short Plot EW (Without TC)

In [None]:
# Models compared in the window-size-5 plots: legend label -> that model's
# cumulative_log_returns_by_date_* object (presumably a pandas DataFrame built
# earlier in the notebook). Insertion order matters, because the plotting
# function below assigns colours and legend entries by position.
models = {
    "OLS": cumulative_log_returns_by_date_ols5,
    "Lasso": cumulative_log_returns_by_date_lasso5,
    "Ridge": cumulative_log_returns_by_date_ridge5,
    "ElasticNet": cumulative_log_returns_by_date_enet5,
    "PCR": cumulative_log_returns_by_date_pcr5,
    "GLM": cumulative_log_returns_by_date_glm5,
    "Random Forest": cumulative_log_returns_by_date_rf5,
    "GBRT": cumulative_log_returns_by_date_gbrt5,
    "NN1": cumulative_log_returns_by_date_nn15,
    "NN2": cumulative_log_returns_by_date_nn25,
    "NN3": cumulative_log_returns_by_date_nn35,
    "NN4": cumulative_log_returns_by_date_nn45,
    "NN5": cumulative_log_returns_by_date_nn55
}

# One "tab20" colour per model. The plotting function below reads this
# module-level name directly instead of receiving it as an argument.
color_palette = sns.color_palette("tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot each model's equal-weighted long-short cumulative return.

    For every entry of ``models`` the ``cum_ELS_return_5_without_cost`` column
    is drawn as a solid line against that entry's ``date`` column;
    ``sp500_data`` is overlaid in black. Colours are read from the
    module-level ``color_palette`` by position, so it needs at least
    ``len(models)`` entries.

    Args:
        models: Mapping of legend label to a table (presumably a DataFrame)
            holding the two columns named above.
        sp500_data: Table with ``date`` and ``cum_SP500_log_return`` columns.
        save_path: When truthy, the figure is saved there; otherwise it is
            shown with ``plt.show()``.
    """
    _, ax = plt.subplots(figsize=(20, 10))

    # One solid curve per model, coloured by its position in ``models``.
    for position, (model_name, table) in enumerate(models.items()):
        ax.plot(table['date'], table['cum_ELS_return_5_without_cost'],
                color=color_palette[position], linestyle='solid',
                label=f'{model_name} (Long-Short)')

    # Benchmark curve, plus a dotted reference line at zero.
    benchmark, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                         color='black', linewidth=2, label='S&P 500')
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid.
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title("Equal-Weighted Portfolio Performance (Long-Short) (Lag 5 - No Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Legend (upper left): one proxy line per model, then the benchmark. The
    # entries use the keys of ``models`` rather than the plotted lines' labels.
    colour_key = [Line2D([0], [0], color=color_palette[k], linewidth=3)
                  for k, _ in enumerate(models)]
    colour_key.append(benchmark)
    ax.add_artist(ax.legend(colour_key, [*models.keys(), 'S&P 500'],
                            loc='upper left', fontsize=10, frameon=True,
                            fancybox=True, facecolor='white',
                            edgecolor='black', framealpha=0.8))

    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Save to disk when a path was given; otherwise display the figure.
    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Render the chart for the models above and write it to a PNG file.
save_path = "Long-Short_Portfolio_Performance_Lag5_NoCost.png"
plot_portfolio_performance(models=models, sp500_data=sp500, save_path=save_path)

### Long-short Plot EW (With TC)

In [None]:
# Models compared in the window-size-5 plots: legend label -> that model's
# cumulative_log_returns_by_date_* object (presumably a pandas DataFrame built
# earlier in the notebook). Insertion order matters, because the plotting
# function below assigns colours and legend entries by position.
models = {
    "OLS": cumulative_log_returns_by_date_ols5,
    "Lasso": cumulative_log_returns_by_date_lasso5,
    "Ridge": cumulative_log_returns_by_date_ridge5,
    "ElasticNet": cumulative_log_returns_by_date_enet5,
    "PCR": cumulative_log_returns_by_date_pcr5,
    "GLM": cumulative_log_returns_by_date_glm5,
    "Random Forest": cumulative_log_returns_by_date_rf5,
    "GBRT": cumulative_log_returns_by_date_gbrt5,
    "NN1": cumulative_log_returns_by_date_nn15,
    "NN2": cumulative_log_returns_by_date_nn25,
    "NN3": cumulative_log_returns_by_date_nn35,
    "NN4": cumulative_log_returns_by_date_nn45,
    "NN5": cumulative_log_returns_by_date_nn55
}

# One "tab20" colour per model. The plotting function below reads this
# module-level name directly instead of receiving it as an argument.
color_palette = sns.color_palette("tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot each model's equal-weighted long-short cumulative return with costs.

    For every entry of ``models`` the ``cum_ELS_return_5_with_cost`` column is
    drawn as a solid line against that entry's ``date`` column;
    ``sp500_data`` is overlaid in black. Colours are read from the
    module-level ``color_palette`` by position, so it needs at least
    ``len(models)`` entries.

    Args:
        models: Mapping of legend label to a table (presumably a DataFrame)
            holding the two columns named above.
        sp500_data: Table with ``date`` and ``cum_SP500_log_return`` columns.
        save_path: When truthy, the figure is saved there; otherwise it is
            shown with ``plt.show()``.
    """
    _, ax = plt.subplots(figsize=(20, 10))

    # One solid curve per model, coloured by its position in ``models``.
    for position, (model_name, table) in enumerate(models.items()):
        ax.plot(table['date'], table['cum_ELS_return_5_with_cost'],
                color=color_palette[position], linestyle='solid',
                label=f'{model_name} (Long-Short)')

    # Benchmark curve, plus a dotted reference line at zero.
    benchmark, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                         color='black', linewidth=2, label='S&P 500')
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid.
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title("Equal-Weighted Portfolio Performance (Long-Short) (Lag 5 - Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Legend (upper left): one proxy line per model, then the benchmark. The
    # entries use the keys of ``models`` rather than the plotted lines' labels.
    colour_key = [Line2D([0], [0], color=color_palette[k], linewidth=3)
                  for k, _ in enumerate(models)]
    colour_key.append(benchmark)
    ax.add_artist(ax.legend(colour_key, [*models.keys(), 'S&P 500'],
                            loc='upper left', fontsize=10, frameon=True,
                            fancybox=True, facecolor='white',
                            edgecolor='black', framealpha=0.8))

    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Save to disk when a path was given; otherwise display the figure.
    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Render the chart for the models above and write it to a PNG file.
save_path = "Long-Short_Portfolio_Performance_Lag5_Cost.png"
plot_portfolio_performance(models=models, sp500_data=sp500, save_path=save_path)

### Long-short Plot VW (Without TC)

In [None]:
# Models compared in the window-size-5 plots: legend label -> that model's
# cumulative_log_returns_by_date_* object (presumably a pandas DataFrame built
# earlier in the notebook). Insertion order matters, because the plotting
# function below assigns colours and legend entries by position.
models = {
    "OLS": cumulative_log_returns_by_date_ols5,
    "Lasso": cumulative_log_returns_by_date_lasso5,
    "Ridge": cumulative_log_returns_by_date_ridge5,
    "ElasticNet": cumulative_log_returns_by_date_enet5,
    "PCR": cumulative_log_returns_by_date_pcr5,
    "GLM": cumulative_log_returns_by_date_glm5,
    "Random Forest": cumulative_log_returns_by_date_rf5,
    "GBRT": cumulative_log_returns_by_date_gbrt5,
    "NN1": cumulative_log_returns_by_date_nn15,
    "NN2": cumulative_log_returns_by_date_nn25,
    "NN3": cumulative_log_returns_by_date_nn35,
    "NN4": cumulative_log_returns_by_date_nn45,
    "NN5": cumulative_log_returns_by_date_nn55
}

# One "tab20" colour per model. The plotting function below reads this
# module-level name directly instead of receiving it as an argument.
color_palette = sns.color_palette("tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot each model's value-weighted long-short cumulative return.

    For every entry of ``models`` the ``cum_VLS_return_5_without_cost`` column
    is drawn as a solid line against that entry's ``date`` column;
    ``sp500_data`` is overlaid in black. Colours are read from the
    module-level ``color_palette`` by position, so it needs at least
    ``len(models)`` entries.

    Args:
        models: Mapping of legend label to a table (presumably a DataFrame)
            holding the two columns named above.
        sp500_data: Table with ``date`` and ``cum_SP500_log_return`` columns.
        save_path: When truthy, the figure is saved there; otherwise it is
            shown with ``plt.show()``.
    """
    _, ax = plt.subplots(figsize=(20, 10))

    # One solid curve per model, coloured by its position in ``models``.
    for position, (model_name, table) in enumerate(models.items()):
        ax.plot(table['date'], table['cum_VLS_return_5_without_cost'],
                color=color_palette[position], linestyle='solid',
                label=f'{model_name} (Long-Short)')

    # Benchmark curve, plus a dotted reference line at zero.
    benchmark, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                         color='black', linewidth=2, label='S&P 500')
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid.
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title("Value-Weighted Portfolio Performance (Long-Short) (Lag 5 - No Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Legend (upper left): one proxy line per model, then the benchmark. The
    # entries use the keys of ``models`` rather than the plotted lines' labels.
    colour_key = [Line2D([0], [0], color=color_palette[k], linewidth=3)
                  for k, _ in enumerate(models)]
    colour_key.append(benchmark)
    ax.add_artist(ax.legend(colour_key, [*models.keys(), 'S&P 500'],
                            loc='upper left', fontsize=10, frameon=True,
                            fancybox=True, facecolor='white',
                            edgecolor='black', framealpha=0.8))

    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Save to disk when a path was given; otherwise display the figure.
    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Render the value-weighted long-short chart and save it as a PNG. The file
# name carries a "Value-Weighted" prefix so that it does not overwrite the
# equal-weighted long-short figure, which is saved as
# "Long-Short_Portfolio_Performance_Lag5_NoCost.png".
save_path = "Value-Weighted_Long-Short_Portfolio_Performance_Lag5_NoCost.png"
plot_portfolio_performance(models, sp500, save_path)

### Long-short Plot VW (With TC)

In [None]:
# Models compared in the window-size-5 plots: legend label -> that model's
# cumulative_log_returns_by_date_* object (presumably a pandas DataFrame built
# earlier in the notebook). Insertion order matters, because the plotting
# function below assigns colours and legend entries by position.
models = {
    "OLS": cumulative_log_returns_by_date_ols5,
    "Lasso": cumulative_log_returns_by_date_lasso5,
    "Ridge": cumulative_log_returns_by_date_ridge5,
    "ElasticNet": cumulative_log_returns_by_date_enet5,
    "PCR": cumulative_log_returns_by_date_pcr5,
    "GLM": cumulative_log_returns_by_date_glm5,
    "Random Forest": cumulative_log_returns_by_date_rf5,
    "GBRT": cumulative_log_returns_by_date_gbrt5,
    "NN1": cumulative_log_returns_by_date_nn15,
    "NN2": cumulative_log_returns_by_date_nn25,
    "NN3": cumulative_log_returns_by_date_nn35,
    "NN4": cumulative_log_returns_by_date_nn45,
    "NN5": cumulative_log_returns_by_date_nn55
}

# One "tab20" colour per model. The plotting function below reads this
# module-level name directly instead of receiving it as an argument.
color_palette = sns.color_palette("tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot each model's value-weighted long-short cumulative return with costs.

    For every entry of ``models`` the ``cum_VLS_return_5_with_cost`` column is
    drawn as a solid line against that entry's ``date`` column;
    ``sp500_data`` is overlaid in black. Colours are read from the
    module-level ``color_palette`` by position, so it needs at least
    ``len(models)`` entries.

    Args:
        models: Mapping of legend label to a table (presumably a DataFrame)
            holding the two columns named above.
        sp500_data: Table with ``date`` and ``cum_SP500_log_return`` columns.
        save_path: When truthy, the figure is saved there; otherwise it is
            shown with ``plt.show()``.
    """
    _, ax = plt.subplots(figsize=(20, 10))

    # One solid curve per model, coloured by its position in ``models``.
    for position, (model_name, table) in enumerate(models.items()):
        ax.plot(table['date'], table['cum_VLS_return_5_with_cost'],
                color=color_palette[position], linestyle='solid',
                label=f'{model_name} (Long-Short)')

    # Benchmark curve, plus a dotted reference line at zero.
    benchmark, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                         color='black', linewidth=2, label='S&P 500')
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid.
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title("Value-Weighted Portfolio Performance (Long-Short) (Lag 5 - Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Legend (upper left): one proxy line per model, then the benchmark. The
    # entries use the keys of ``models`` rather than the plotted lines' labels.
    colour_key = [Line2D([0], [0], color=color_palette[k], linewidth=3)
                  for k, _ in enumerate(models)]
    colour_key.append(benchmark)
    ax.add_artist(ax.legend(colour_key, [*models.keys(), 'S&P 500'],
                            loc='upper left', fontsize=10, frameon=True,
                            fancybox=True, facecolor='white',
                            edgecolor='black', framealpha=0.8))

    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Save to disk when a path was given; otherwise display the figure.
    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Render the value-weighted long-short chart and save it as a PNG. The file
# name carries a "Value-Weighted" prefix so that it does not overwrite the
# equal-weighted long-short figure, which is saved as
# "Long-Short_Portfolio_Performance_Lag5_Cost.png".
save_path = "Value-Weighted_Long-Short_Portfolio_Performance_Lag5_Cost.png"
plot_portfolio_performance(models, sp500, save_path)

## Window Size 21

### Plot EW (Without TC)

In [None]:
# Models compared in the window-size-21 plots: legend label -> that model's
# cumulative_log_returns_by_date_* object (presumably a pandas DataFrame built
# earlier in the notebook). Insertion order matters, because the plotting
# function below assigns colours and legend entries by position.
models = {
    "OLS": cumulative_log_returns_by_date_ols21,
    "Lasso": cumulative_log_returns_by_date_lasso21,
    "Ridge": cumulative_log_returns_by_date_ridge21,
    "ElasticNet": cumulative_log_returns_by_date_enet21,
    "PCR": cumulative_log_returns_by_date_pcr21,
    "GLM": cumulative_log_returns_by_date_glm21,
    "Random Forest": cumulative_log_returns_by_date_rf21,
    "GBRT": cumulative_log_returns_by_date_gbrt21,
    "NN1": cumulative_log_returns_by_date_nn121,
    "NN2": cumulative_log_returns_by_date_nn221,
    "NN3": cumulative_log_returns_by_date_nn321,
    "NN4": cumulative_log_returns_by_date_nn421,
    "NN5": cumulative_log_returns_by_date_nn521
}

# One "tab20" colour per model. The plotting function below reads this
# module-level name directly instead of receiving it as an argument.
color_palette = sns.color_palette("tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot each model's equal-weighted long and short cumulative returns.

    For every entry of ``models`` the ``cum_EL_return_21_without_cost`` column
    is drawn solid (long) and ``cum_ES_return_21_without_cost`` dashed (short)
    against that entry's ``date`` column; ``sp500_data`` is overlaid in black.
    Colours are read from the module-level ``color_palette`` by position, so
    it needs at least ``len(models)`` entries.

    Args:
        models: Mapping of legend label to a table (presumably a DataFrame)
            holding the three columns named above.
        sp500_data: Table with ``date`` and ``cum_SP500_log_return`` columns.
        save_path: When truthy, the figure is saved there; otherwise it is
            shown with ``plt.show()``.
    """
    legs = (
        ('cum_EL_return_21_without_cost', 'solid'),   # long leg
        ('cum_ES_return_21_without_cost', 'dashed'),  # short leg
    )
    frame_style = dict(frameon=True, fancybox=True, facecolor='white',
                       edgecolor='black', framealpha=0.8)

    _, ax = plt.subplots(figsize=(20, 10))

    # Both legs of a model share one colour; the line style tells them apart.
    for position, table in enumerate(models.values()):
        shade = color_palette[position]
        for column, dash in legs:
            ax.plot(table['date'], table[column], color=shade, linestyle=dash)

    # Benchmark curve, plus a dotted reference line at zero.
    benchmark, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                         color='black', linewidth=2, label='S&P 500')
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid.
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title("Equal-Weighted Portfolio Performance (Lag 21 - No Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Colour key (upper left): one proxy line per model, then the benchmark.
    colour_key = [Line2D([0], [0], color=color_palette[k], linewidth=3)
                  for k, _ in enumerate(models)]
    colour_key.append(benchmark)
    # add_artist keeps this legend on the axes when the second one is created.
    ax.add_artist(ax.legend(colour_key, [*models.keys(), 'S&P 500'],
                            loc='upper left', fontsize=10, **frame_style))

    # Line-style key (upper right).
    style_key = [Line2D([0], [0], color='black', linestyle=dash, linewidth=2)
                 for dash in ('solid', 'dashed')]
    ax.legend(style_key, ['Long', 'Short'], loc='upper right', fontsize=9,
              **frame_style)

    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Save to disk when a path was given; otherwise display the figure.
    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Render the chart for the models above and write it to a PNG file.
save_path = "Equal-Weighted_Portfolio_Performance_Lag21_NoCost.png"
plot_portfolio_performance(models=models, sp500_data=sp500, save_path=save_path)

### Plot VW (Without TC)

In [None]:
# Models compared in the window-size-21 plots: legend label -> that model's
# cumulative_log_returns_by_date_* object (presumably a pandas DataFrame built
# earlier in the notebook). Insertion order matters, because the plotting
# function below assigns colours and legend entries by position.
models = {
    "OLS": cumulative_log_returns_by_date_ols21,
    "Lasso": cumulative_log_returns_by_date_lasso21,
    "Ridge": cumulative_log_returns_by_date_ridge21,
    "ElasticNet": cumulative_log_returns_by_date_enet21,
    "PCR": cumulative_log_returns_by_date_pcr21,
    "GLM": cumulative_log_returns_by_date_glm21,
    "Random Forest": cumulative_log_returns_by_date_rf21,
    "GBRT": cumulative_log_returns_by_date_gbrt21,
    "NN1": cumulative_log_returns_by_date_nn121,
    "NN2": cumulative_log_returns_by_date_nn221,
    "NN3": cumulative_log_returns_by_date_nn321,
    "NN4": cumulative_log_returns_by_date_nn421,
    "NN5": cumulative_log_returns_by_date_nn521
}

# One "tab20" colour per model. The plotting function below reads this
# module-level name directly instead of receiving it as an argument.
color_palette = sns.color_palette("tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot each model's value-weighted long and short cumulative returns.

    For every entry of ``models`` the ``cum_VL_return_21_without_cost`` column
    is drawn solid (long) and ``cum_VS_return_21_without_cost`` dashed (short)
    against that entry's ``date`` column; ``sp500_data`` is overlaid in black.
    Colours are read from the module-level ``color_palette`` by position, so
    it needs at least ``len(models)`` entries.

    Args:
        models: Mapping of legend label to a table (presumably a DataFrame)
            holding the three columns named above.
        sp500_data: Table with ``date`` and ``cum_SP500_log_return`` columns.
        save_path: When truthy, the figure is saved there; otherwise it is
            shown with ``plt.show()``.
    """
    legs = (
        ('cum_VL_return_21_without_cost', 'solid'),   # long leg
        ('cum_VS_return_21_without_cost', 'dashed'),  # short leg
    )
    frame_style = dict(frameon=True, fancybox=True, facecolor='white',
                       edgecolor='black', framealpha=0.8)

    _, ax = plt.subplots(figsize=(20, 10))

    # Both legs of a model share one colour; the line style tells them apart.
    for position, table in enumerate(models.values()):
        shade = color_palette[position]
        for column, dash in legs:
            ax.plot(table['date'], table[column], color=shade, linestyle=dash)

    # Benchmark curve, plus a dotted reference line at zero.
    benchmark, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                         color='black', linewidth=2, label='S&P 500')
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid.
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title("Value-Weighted Portfolio Performance (Lag 21 - No Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Colour key (upper left): one proxy line per model, then the benchmark.
    colour_key = [Line2D([0], [0], color=color_palette[k], linewidth=3)
                  for k, _ in enumerate(models)]
    colour_key.append(benchmark)
    # add_artist keeps this legend on the axes when the second one is created.
    ax.add_artist(ax.legend(colour_key, [*models.keys(), 'S&P 500'],
                            loc='upper left', fontsize=10, **frame_style))

    # Line-style key (upper right).
    style_key = [Line2D([0], [0], color='black', linestyle=dash, linewidth=2)
                 for dash in ('solid', 'dashed')]
    ax.legend(style_key, ['Long', 'Short'], loc='upper right', fontsize=9,
              **frame_style)

    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Save to disk when a path was given; otherwise display the figure.
    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Render the chart for the models above and write it to a PNG file.
save_path = "Value-Weighted_Portfolio_Performance_Lag21_NoCost.png"
plot_portfolio_performance(models=models, sp500_data=sp500, save_path=save_path)

### Plot EW (With TC)

In [None]:
# Models compared in the window-size-21 plots: legend label -> that model's
# cumulative_log_returns_by_date_* object (presumably a pandas DataFrame built
# earlier in the notebook). Insertion order matters, because the plotting
# function below assigns colours and legend entries by position.
models = {
    "OLS": cumulative_log_returns_by_date_ols21,
    "Lasso": cumulative_log_returns_by_date_lasso21,
    "Ridge": cumulative_log_returns_by_date_ridge21,
    "ElasticNet": cumulative_log_returns_by_date_enet21,
    "PCR": cumulative_log_returns_by_date_pcr21,
    "GLM": cumulative_log_returns_by_date_glm21,
    "Random Forest": cumulative_log_returns_by_date_rf21,
    "GBRT": cumulative_log_returns_by_date_gbrt21,
    "NN1": cumulative_log_returns_by_date_nn121,
    "NN2": cumulative_log_returns_by_date_nn221,
    "NN3": cumulative_log_returns_by_date_nn321,
    "NN4": cumulative_log_returns_by_date_nn421,
    "NN5": cumulative_log_returns_by_date_nn521
}

# One "tab20" colour per model. The plotting function below reads this
# module-level name directly instead of receiving it as an argument.
color_palette = sns.color_palette("tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot every model's cumulative long and short returns against the S&P 500.

    Args:
        models: Mapping of legend label to a table indexable by column name
            (presumably a pandas DataFrame) that provides 'date',
            'cum_EL_return_21_with_cost' and 'cum_ES_return_21_with_cost'.
        sp500_data: Table providing 'date' and 'cum_SP500_log_return'.
        save_path: When truthy, the figure is saved to this path; otherwise
            it is displayed with ``plt.show()``.

    Note:
        Colours are read from the module-level ``color_palette`` by each
        model's position in ``models``.
    """
    # Frame styling shared by both legends
    box_style = {'frameon': True, 'fancybox': True, 'facecolor': 'white',
                 'edgecolor': 'black', 'framealpha': 0.8}
    # (column, linestyle) per leg: long is solid, short is dashed
    legs = (('cum_EL_return_21_with_cost', 'solid'),
            ('cum_ES_return_21_with_cost', 'dashed'))

    fig, ax = plt.subplots(figsize=(20, 10))

    # Two curves per model, both in that model's palette colour
    for position, frame in enumerate(models.values()):
        shade = color_palette[position]
        for column, dash in legs:
            ax.plot(frame['date'], frame[column], color=shade, linestyle=dash)

    # Benchmark curve; its handle is reused in the colour legend
    benchmark, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                         color='black', linewidth=2, label='S&P 500')

    # Dotted reference line at zero cumulative return
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title("Equal-Weighted Portfolio Performance (Lag 21 - Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Colour legend (upper left): one proxy line per model plus the benchmark
    swatches = [Line2D([0], [0], color=color_palette[k], linewidth=3)
                for k, _ in enumerate(models)]
    colour_legend = ax.legend(swatches + [benchmark], [*models, 'S&P 500'],
                              loc='upper left', fontsize=10, **box_style)
    # Re-attach it explicitly: the ax.legend() call below replaces the Axes' legend
    ax.add_artist(colour_legend)

    # Line-style legend (upper right)
    style_keys = [Line2D([0], [0], color='black', linestyle=dash, linewidth=2)
                  for dash in ('solid', 'dashed')]
    ax.legend(style_keys, ['Long', 'Short'], loc='upper right', fontsize=9, **box_style)

    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Show when no path was given; otherwise save and report
    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Render the figure for the current `models` and write it to disk as a PNG
save_path = "Equal-Weighted_Portfolio_Performance_Lag21_Cost.png"
plot_portfolio_performance(models, sp500, save_path=save_path)

### Plot VW (With TC)

In [None]:
# Legend label -> lag-21 cumulative log-return table for each model.
# Insertion order matters: colours are assigned by position in this dict.
models = {
    "OLS": cumulative_log_returns_by_date_ols21,
    "Lasso": cumulative_log_returns_by_date_lasso21,
    "Ridge": cumulative_log_returns_by_date_ridge21,
    "ElasticNet": cumulative_log_returns_by_date_enet21,
    "PCR": cumulative_log_returns_by_date_pcr21,
    "GLM": cumulative_log_returns_by_date_glm21,
    "Random Forest": cumulative_log_returns_by_date_rf21,
    "GBRT": cumulative_log_returns_by_date_gbrt21,
    "NN1": cumulative_log_returns_by_date_nn121,
    "NN2": cumulative_log_returns_by_date_nn221,
    "NN3": cumulative_log_returns_by_date_nn321,
    "NN4": cumulative_log_returns_by_date_nn421,
    "NN5": cumulative_log_returns_by_date_nn521,
}

# One "tab20" colour per model, read by the plotting function below
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot every model's cumulative long and short returns against the S&P 500.

    Args:
        models: Mapping of legend label to a table indexable by column name
            (presumably a pandas DataFrame) that provides 'date',
            'cum_VL_return_21_with_cost' and 'cum_VS_return_21_with_cost'.
        sp500_data: Table providing 'date' and 'cum_SP500_log_return'.
        save_path: When truthy, the figure is saved to this path; otherwise
            it is displayed with ``plt.show()``.

    Note:
        Colours are read from the module-level ``color_palette`` by each
        model's position in ``models``.
    """
    # Frame styling shared by both legends
    box_style = {'frameon': True, 'fancybox': True, 'facecolor': 'white',
                 'edgecolor': 'black', 'framealpha': 0.8}
    # (column, linestyle) per leg: long is solid, short is dashed
    legs = (('cum_VL_return_21_with_cost', 'solid'),
            ('cum_VS_return_21_with_cost', 'dashed'))

    fig, ax = plt.subplots(figsize=(20, 10))

    # Two curves per model, both in that model's palette colour
    for position, frame in enumerate(models.values()):
        shade = color_palette[position]
        for column, dash in legs:
            ax.plot(frame['date'], frame[column], color=shade, linestyle=dash)

    # Benchmark curve; its handle is reused in the colour legend
    benchmark, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                         color='black', linewidth=2, label='S&P 500')

    # Dotted reference line at zero cumulative return
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title("Value-Weighted Portfolio Performance (Lag 21 - Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Colour legend (upper left): one proxy line per model plus the benchmark
    swatches = [Line2D([0], [0], color=color_palette[k], linewidth=3)
                for k, _ in enumerate(models)]
    colour_legend = ax.legend(swatches + [benchmark], [*models, 'S&P 500'],
                              loc='upper left', fontsize=10, **box_style)
    # Re-attach it explicitly: the ax.legend() call below replaces the Axes' legend
    ax.add_artist(colour_legend)

    # Line-style legend (upper right)
    style_keys = [Line2D([0], [0], color='black', linestyle=dash, linewidth=2)
                  for dash in ('solid', 'dashed')]
    ax.legend(style_keys, ['Long', 'Short'], loc='upper right', fontsize=9, **box_style)

    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Show when no path was given; otherwise save and report
    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Render the figure for the current `models` and write it to disk as a PNG
save_path = "Value-Weighted_Portfolio_Performance_Lag21_Cost.png"
plot_portfolio_performance(models, sp500, save_path=save_path)

### Long-Short Plot EW (Without TC)

In [None]:
# Legend label -> lag-21 cumulative log-return table for each model.
# Insertion order matters: colours are assigned by position in this dict.
models = {
    "OLS": cumulative_log_returns_by_date_ols21,
    "Lasso": cumulative_log_returns_by_date_lasso21,
    "Ridge": cumulative_log_returns_by_date_ridge21,
    "ElasticNet": cumulative_log_returns_by_date_enet21,
    "PCR": cumulative_log_returns_by_date_pcr21,
    "GLM": cumulative_log_returns_by_date_glm21,
    "Random Forest": cumulative_log_returns_by_date_rf21,
    "GBRT": cumulative_log_returns_by_date_gbrt21,
    "NN1": cumulative_log_returns_by_date_nn121,
    "NN2": cumulative_log_returns_by_date_nn221,
    "NN3": cumulative_log_returns_by_date_nn321,
    "NN4": cumulative_log_returns_by_date_nn421,
    "NN5": cumulative_log_returns_by_date_nn521,
}

# One "tab20" colour per model, read by the plotting function below
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot each model's cumulative long-short return against the S&P 500.

    Args:
        models: Mapping of legend label to a table indexable by column name
            (presumably a pandas DataFrame) that provides 'date' and
            'cum_ELS_return_21_without_cost'.
        sp500_data: Table providing 'date' and 'cum_SP500_log_return'.
        save_path: When truthy, the figure is saved to this path; otherwise
            it is displayed with ``plt.show()``.

    Note:
        Line colours come from the module-level ``color_palette``, indexed by
        each model's position in ``models``.
    """
    fig, ax = plt.subplots(figsize=(20, 10))

    # One solid equal-weighted long-short curve per model
    for i, (name, model_data) in enumerate(models.items()):
        ax.plot(model_data['date'], model_data['cum_ELS_return_21_without_cost'],
                color=color_palette[i], linestyle='solid', label=f'{name} (Long-Short)')

    # S&P 500 benchmark; keep the handle so it can join the legend
    sp500_line, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                          color='black', linewidth=2, label='S&P 500')

    # Horizontal zero line
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_title("Equal-Weighted Portfolio Performance (Long-Short) (Lag 21 - No Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Legend entries: a thick proxy line per model, then the benchmark line
    model_handles = [Line2D([0], [0], color=color_palette[i], linewidth=3)
                     for i, _ in enumerate(models)]
    model_handles.append(sp500_line)
    model_labels = [*models, 'S&P 500']

    # This is the only legend on the Axes, so ax.legend() alone registers it.
    # Passing it to ax.add_artist() as well would list it among the Axes'
    # children twice and render it twice, compounding the frame's 0.8 alpha.
    ax.legend(model_handles, model_labels, loc='upper left', fontsize=10,
              frameon=True, fancybox=True, facecolor='white', edgecolor='black', framealpha=0.8)

    # Layout adjustment
    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Save or show
    if save_path:
        plt.savefig(save_path, bbox_inches='tight')
        print(f"Plot saved to {save_path}")
    else:
        plt.show()

# Render the equal-weighted long-short figure and save it as a PNG.
# The file name carries the weighting scheme so the value-weighted long-short
# plot, which otherwise uses the same name, does not overwrite this file.
save_path = "Equal-Weighted_Long-Short_Portfolio_Performance_Lag21_NoCost.png"
plot_portfolio_performance(models, sp500, save_path)

### Long-Short Plot EW (With TC)

In [None]:
# Legend label -> lag-21 cumulative log-return table for each model.
# Insertion order matters: colours are assigned by position in this dict.
models = {
    "OLS": cumulative_log_returns_by_date_ols21,
    "Lasso": cumulative_log_returns_by_date_lasso21,
    "Ridge": cumulative_log_returns_by_date_ridge21,
    "ElasticNet": cumulative_log_returns_by_date_enet21,
    "PCR": cumulative_log_returns_by_date_pcr21,
    "GLM": cumulative_log_returns_by_date_glm21,
    "Random Forest": cumulative_log_returns_by_date_rf21,
    "GBRT": cumulative_log_returns_by_date_gbrt21,
    "NN1": cumulative_log_returns_by_date_nn121,
    "NN2": cumulative_log_returns_by_date_nn221,
    "NN3": cumulative_log_returns_by_date_nn321,
    "NN4": cumulative_log_returns_by_date_nn421,
    "NN5": cumulative_log_returns_by_date_nn521,
}

# One "tab20" colour per model, read by the plotting function below
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot each model's cumulative long-short return against the S&P 500.

    Args:
        models: Mapping of legend label to a table indexable by column name
            (presumably a pandas DataFrame) that provides 'date' and
            'cum_ELS_return_21_with_cost'.
        sp500_data: Table providing 'date' and 'cum_SP500_log_return'.
        save_path: When truthy, the figure is saved to this path; otherwise
            it is displayed with ``plt.show()``.

    Note:
        Line colours come from the module-level ``color_palette``, indexed by
        each model's position in ``models``.
    """
    fig, ax = plt.subplots(figsize=(20, 10))

    # One solid equal-weighted long-short curve per model
    for i, (name, model_data) in enumerate(models.items()):
        ax.plot(model_data['date'], model_data['cum_ELS_return_21_with_cost'],
                color=color_palette[i], linestyle='solid', label=f'{name} (Long-Short)')

    # S&P 500 benchmark; keep the handle so it can join the legend
    sp500_line, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                          color='black', linewidth=2, label='S&P 500')

    # Horizontal zero line
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_title("Equal-Weighted Portfolio Performance (Long-Short) (Lag 21 - Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Legend entries: a thick proxy line per model, then the benchmark line
    model_handles = [Line2D([0], [0], color=color_palette[i], linewidth=3)
                     for i, _ in enumerate(models)]
    model_handles.append(sp500_line)
    model_labels = [*models, 'S&P 500']

    # This is the only legend on the Axes, so ax.legend() alone registers it.
    # Passing it to ax.add_artist() as well would list it among the Axes'
    # children twice and render it twice, compounding the frame's 0.8 alpha.
    ax.legend(model_handles, model_labels, loc='upper left', fontsize=10,
              frameon=True, fancybox=True, facecolor='white', edgecolor='black', framealpha=0.8)

    # Layout adjustment
    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Save or show
    if save_path:
        plt.savefig(save_path, bbox_inches='tight')
        print(f"Plot saved to {save_path}")
    else:
        plt.show()

# Render the equal-weighted long-short figure and save it as a PNG.
# The file name carries the weighting scheme so the value-weighted long-short
# plot, which otherwise uses the same name, does not overwrite this file.
save_path = "Equal-Weighted_Long-Short_Portfolio_Performance_Lag21_Cost.png"
plot_portfolio_performance(models, sp500, save_path)

### Long-Short Plot VW (Without TC)

In [None]:
# Legend label -> lag-21 cumulative log-return table for each model.
# Insertion order matters: colours are assigned by position in this dict.
models = {
    "OLS": cumulative_log_returns_by_date_ols21,
    "Lasso": cumulative_log_returns_by_date_lasso21,
    "Ridge": cumulative_log_returns_by_date_ridge21,
    "ElasticNet": cumulative_log_returns_by_date_enet21,
    "PCR": cumulative_log_returns_by_date_pcr21,
    "GLM": cumulative_log_returns_by_date_glm21,
    "Random Forest": cumulative_log_returns_by_date_rf21,
    "GBRT": cumulative_log_returns_by_date_gbrt21,
    "NN1": cumulative_log_returns_by_date_nn121,
    "NN2": cumulative_log_returns_by_date_nn221,
    "NN3": cumulative_log_returns_by_date_nn321,
    "NN4": cumulative_log_returns_by_date_nn421,
    "NN5": cumulative_log_returns_by_date_nn521,
}

# One "tab20" colour per model, read by the plotting function below
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot each model's cumulative long-short return against the S&P 500.

    Args:
        models: Mapping of legend label to a table indexable by column name
            (presumably a pandas DataFrame) that provides 'date' and
            'cum_VLS_return_21_without_cost'.
        sp500_data: Table providing 'date' and 'cum_SP500_log_return'.
        save_path: When truthy, the figure is saved to this path; otherwise
            it is displayed with ``plt.show()``.

    Note:
        Line colours come from the module-level ``color_palette``, indexed by
        each model's position in ``models``.
    """
    fig, ax = plt.subplots(figsize=(20, 10))

    # One solid value-weighted long-short curve per model
    # (column cum_VLS_return_21_without_cost)
    for i, (name, model_data) in enumerate(models.items()):
        ax.plot(model_data['date'], model_data['cum_VLS_return_21_without_cost'],
                color=color_palette[i], linestyle='solid', label=f'{name} (Long-Short)')

    # S&P 500 benchmark; keep the handle so it can join the legend
    sp500_line, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                          color='black', linewidth=2, label='S&P 500')

    # Horizontal zero line
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_title("Value-Weighted Portfolio Performance (Long-Short) (Lag 21 - No Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Legend entries: a thick proxy line per model, then the benchmark line
    model_handles = [Line2D([0], [0], color=color_palette[i], linewidth=3)
                     for i, _ in enumerate(models)]
    model_handles.append(sp500_line)
    model_labels = [*models, 'S&P 500']

    # This is the only legend on the Axes, so ax.legend() alone registers it.
    # Passing it to ax.add_artist() as well would list it among the Axes'
    # children twice and render it twice, compounding the frame's 0.8 alpha.
    ax.legend(model_handles, model_labels, loc='upper left', fontsize=10,
              frameon=True, fancybox=True, facecolor='white', edgecolor='black', framealpha=0.8)

    # Layout adjustment
    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Save or show
    if save_path:
        plt.savefig(save_path, bbox_inches='tight')
        print(f"Plot saved to {save_path}")
    else:
        plt.show()

# Render the value-weighted long-short figure and save it as a PNG.
# The file name carries the weighting scheme so it does not overwrite the
# equal-weighted long-short plot, which otherwise uses the same name.
save_path = "Value-Weighted_Long-Short_Portfolio_Performance_Lag21_NoCost.png"
plot_portfolio_performance(models, sp500, save_path)

### Long-Short Plot VW (With TC)

In [None]:
# Legend label -> lag-21 cumulative log-return table for each model.
# Insertion order matters: colours are assigned by position in this dict.
models = {
    "OLS": cumulative_log_returns_by_date_ols21,
    "Lasso": cumulative_log_returns_by_date_lasso21,
    "Ridge": cumulative_log_returns_by_date_ridge21,
    "ElasticNet": cumulative_log_returns_by_date_enet21,
    "PCR": cumulative_log_returns_by_date_pcr21,
    "GLM": cumulative_log_returns_by_date_glm21,
    "Random Forest": cumulative_log_returns_by_date_rf21,
    "GBRT": cumulative_log_returns_by_date_gbrt21,
    "NN1": cumulative_log_returns_by_date_nn121,
    "NN2": cumulative_log_returns_by_date_nn221,
    "NN3": cumulative_log_returns_by_date_nn321,
    "NN4": cumulative_log_returns_by_date_nn421,
    "NN5": cumulative_log_returns_by_date_nn521,
}

# One "tab20" colour per model, read by the plotting function below
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot each model's cumulative long-short return against the S&P 500.

    Args:
        models: Mapping of legend label to a table indexable by column name
            (presumably a pandas DataFrame) that provides 'date' and
            'cum_VLS_return_21_with_cost'.
        sp500_data: Table providing 'date' and 'cum_SP500_log_return'.
        save_path: When truthy, the figure is saved to this path; otherwise
            it is displayed with ``plt.show()``.

    Note:
        Line colours come from the module-level ``color_palette``, indexed by
        each model's position in ``models``.
    """
    fig, ax = plt.subplots(figsize=(20, 10))

    # One solid value-weighted long-short curve per model
    # (column cum_VLS_return_21_with_cost)
    for i, (name, model_data) in enumerate(models.items()):
        ax.plot(model_data['date'], model_data['cum_VLS_return_21_with_cost'],
                color=color_palette[i], linestyle='solid', label=f'{name} (Long-Short)')

    # S&P 500 benchmark; keep the handle so it can join the legend
    sp500_line, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                          color='black', linewidth=2, label='S&P 500')

    # Horizontal zero line
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_title("Value-Weighted Portfolio Performance (Long-Short) (Lag 21 - Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Legend entries: a thick proxy line per model, then the benchmark line
    model_handles = [Line2D([0], [0], color=color_palette[i], linewidth=3)
                     for i, _ in enumerate(models)]
    model_handles.append(sp500_line)
    model_labels = [*models, 'S&P 500']

    # This is the only legend on the Axes, so ax.legend() alone registers it.
    # Passing it to ax.add_artist() as well would list it among the Axes'
    # children twice and render it twice, compounding the frame's 0.8 alpha.
    ax.legend(model_handles, model_labels, loc='upper left', fontsize=10,
              frameon=True, fancybox=True, facecolor='white', edgecolor='black', framealpha=0.8)

    # Layout adjustment
    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Save or show
    if save_path:
        plt.savefig(save_path, bbox_inches='tight')
        print(f"Plot saved to {save_path}")
    else:
        plt.show()

# Render the value-weighted long-short figure and save it as a PNG.
# The file name carries the weighting scheme so it does not overwrite the
# equal-weighted long-short plot, which otherwise uses the same name.
save_path = "Value-Weighted_Long-Short_Portfolio_Performance_Lag21_Cost.png"
plot_portfolio_performance(models, sp500, save_path)

## Window Size 252

### Plot EW (Without TC)

In [None]:
# Legend label -> lag-252 cumulative log-return table for each model.
# Insertion order matters: colours are assigned by position in this dict.
models = {
    "OLS": cumulative_log_returns_by_date_ols252,
    "Lasso": cumulative_log_returns_by_date_lasso252,
    "Ridge": cumulative_log_returns_by_date_ridge252,
    "ElasticNet": cumulative_log_returns_by_date_enet252,
    "PCR": cumulative_log_returns_by_date_pcr252,
    "GLM": cumulative_log_returns_by_date_glm252,
    "Random Forest": cumulative_log_returns_by_date_rf252,
    "GBRT": cumulative_log_returns_by_date_gbrt252,
    "NN1": cumulative_log_returns_by_date_nn1252,
    "NN2": cumulative_log_returns_by_date_nn2252,
    "NN3": cumulative_log_returns_by_date_nn3252,
    "NN4": cumulative_log_returns_by_date_nn4252,
    "NN5": cumulative_log_returns_by_date_nn5252,
}

# One "tab20" colour per model, read by the plotting function below
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot every model's cumulative long and short returns against the S&P 500.

    Args:
        models: Mapping of legend label to a table indexable by column name
            (presumably a pandas DataFrame) that provides 'date',
            'cum_EL_return_252_without_cost' and
            'cum_ES_return_252_without_cost'.
        sp500_data: Table providing 'date' and 'cum_SP500_log_return'.
        save_path: When truthy, the figure is saved to this path; otherwise
            it is displayed with ``plt.show()``.

    Note:
        Colours are read from the module-level ``color_palette`` by each
        model's position in ``models``.
    """
    # Frame styling shared by both legends
    box_style = {'frameon': True, 'fancybox': True, 'facecolor': 'white',
                 'edgecolor': 'black', 'framealpha': 0.8}
    # (column, linestyle) per leg: long is solid, short is dashed
    legs = (('cum_EL_return_252_without_cost', 'solid'),
            ('cum_ES_return_252_without_cost', 'dashed'))

    fig, ax = plt.subplots(figsize=(20, 10))

    # Two curves per model, both in that model's palette colour
    for position, frame in enumerate(models.values()):
        shade = color_palette[position]
        for column, dash in legs:
            ax.plot(frame['date'], frame[column], color=shade, linestyle=dash)

    # Benchmark curve; its handle is reused in the colour legend
    benchmark, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                         color='black', linewidth=2, label='S&P 500')

    # Dotted reference line at zero cumulative return
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title("Equal-Weighted Portfolio Performance (Lag 252 - No Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Colour legend (upper left): one proxy line per model plus the benchmark
    swatches = [Line2D([0], [0], color=color_palette[k], linewidth=3)
                for k, _ in enumerate(models)]
    colour_legend = ax.legend(swatches + [benchmark], [*models, 'S&P 500'],
                              loc='upper left', fontsize=10, **box_style)
    # Re-attach it explicitly: the ax.legend() call below replaces the Axes' legend
    ax.add_artist(colour_legend)

    # Line-style legend (upper right)
    style_keys = [Line2D([0], [0], color='black', linestyle=dash, linewidth=2)
                  for dash in ('solid', 'dashed')]
    ax.legend(style_keys, ['Long', 'Short'], loc='upper right', fontsize=9, **box_style)

    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Show when no path was given; otherwise save and report
    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Render the figure for the current `models` and write it to disk as a PNG
save_path = "Equal-Weighted_Portfolio_Performance_Lag252_NoCost.png"
plot_portfolio_performance(models, sp500, save_path=save_path)

### Plot VW (Without TC)

In [None]:
# Legend label -> lag-252 cumulative log-return table for each model.
# Insertion order matters: colours are assigned by position in this dict.
models = {
    "OLS": cumulative_log_returns_by_date_ols252,
    "Lasso": cumulative_log_returns_by_date_lasso252,
    "Ridge": cumulative_log_returns_by_date_ridge252,
    "ElasticNet": cumulative_log_returns_by_date_enet252,
    "PCR": cumulative_log_returns_by_date_pcr252,
    "GLM": cumulative_log_returns_by_date_glm252,
    "Random Forest": cumulative_log_returns_by_date_rf252,
    "GBRT": cumulative_log_returns_by_date_gbrt252,
    "NN1": cumulative_log_returns_by_date_nn1252,
    "NN2": cumulative_log_returns_by_date_nn2252,
    "NN3": cumulative_log_returns_by_date_nn3252,
    "NN4": cumulative_log_returns_by_date_nn4252,
    "NN5": cumulative_log_returns_by_date_nn5252,
}

# One "tab20" colour per model, read by the plotting function below
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot every model's cumulative long and short returns against the S&P 500.

    Args:
        models: Mapping of legend label to a table indexable by column name
            (presumably a pandas DataFrame) that provides 'date',
            'cum_VL_return_252_without_cost' and
            'cum_VS_return_252_without_cost'.
        sp500_data: Table providing 'date' and 'cum_SP500_log_return'.
        save_path: When truthy, the figure is saved to this path; otherwise
            it is displayed with ``plt.show()``.

    Note:
        Colours are read from the module-level ``color_palette`` by each
        model's position in ``models``.
    """
    # Frame styling shared by both legends
    box_style = {'frameon': True, 'fancybox': True, 'facecolor': 'white',
                 'edgecolor': 'black', 'framealpha': 0.8}
    # (column, linestyle) per leg: long is solid, short is dashed
    legs = (('cum_VL_return_252_without_cost', 'solid'),
            ('cum_VS_return_252_without_cost', 'dashed'))

    fig, ax = plt.subplots(figsize=(20, 10))

    # Two curves per model, both in that model's palette colour
    for position, frame in enumerate(models.values()):
        shade = color_palette[position]
        for column, dash in legs:
            ax.plot(frame['date'], frame[column], color=shade, linestyle=dash)

    # Benchmark curve; its handle is reused in the colour legend
    benchmark, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                         color='black', linewidth=2, label='S&P 500')

    # Dotted reference line at zero cumulative return
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title("Value-Weighted Portfolio Performance (Lag 252 - No Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Colour legend (upper left): one proxy line per model plus the benchmark
    swatches = [Line2D([0], [0], color=color_palette[k], linewidth=3)
                for k, _ in enumerate(models)]
    colour_legend = ax.legend(swatches + [benchmark], [*models, 'S&P 500'],
                              loc='upper left', fontsize=10, **box_style)
    # Re-attach it explicitly: the ax.legend() call below replaces the Axes' legend
    ax.add_artist(colour_legend)

    # Line-style legend (upper right)
    style_keys = [Line2D([0], [0], color='black', linestyle=dash, linewidth=2)
                  for dash in ('solid', 'dashed')]
    ax.legend(style_keys, ['Long', 'Short'], loc='upper right', fontsize=9, **box_style)

    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Show when no path was given; otherwise save and report
    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Render the figure for the current `models` and write it to disk as a PNG
save_path = "Value-Weighted_Portfolio_Performance_Lag252_NoCost.png"
plot_portfolio_performance(models, sp500, save_path=save_path)

### Plot EW (With TC)

In [None]:
# Legend label -> lag-252 cumulative log-return table for each model.
# Insertion order matters: colours are assigned by position in this dict.
models = {
    "OLS": cumulative_log_returns_by_date_ols252,
    "Lasso": cumulative_log_returns_by_date_lasso252,
    "Ridge": cumulative_log_returns_by_date_ridge252,
    "ElasticNet": cumulative_log_returns_by_date_enet252,
    "PCR": cumulative_log_returns_by_date_pcr252,
    "GLM": cumulative_log_returns_by_date_glm252,
    "Random Forest": cumulative_log_returns_by_date_rf252,
    "GBRT": cumulative_log_returns_by_date_gbrt252,
    "NN1": cumulative_log_returns_by_date_nn1252,
    "NN2": cumulative_log_returns_by_date_nn2252,
    "NN3": cumulative_log_returns_by_date_nn3252,
    "NN4": cumulative_log_returns_by_date_nn4252,
    "NN5": cumulative_log_returns_by_date_nn5252,
}

# One "tab20" colour per model, read by the plotting function below
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot every model's cumulative long and short returns against the S&P 500.

    Args:
        models: Mapping of legend label to a table indexable by column name
            (presumably a pandas DataFrame) that provides 'date',
            'cum_EL_return_252_with_cost' and 'cum_ES_return_252_with_cost'.
        sp500_data: Table providing 'date' and 'cum_SP500_log_return'.
        save_path: When truthy, the figure is saved to this path; otherwise
            it is displayed with ``plt.show()``.

    Note:
        Colours are read from the module-level ``color_palette`` by each
        model's position in ``models``.
    """
    # Frame styling shared by both legends
    box_style = {'frameon': True, 'fancybox': True, 'facecolor': 'white',
                 'edgecolor': 'black', 'framealpha': 0.8}
    # (column, linestyle) per leg: long is solid, short is dashed
    legs = (('cum_EL_return_252_with_cost', 'solid'),
            ('cum_ES_return_252_with_cost', 'dashed'))

    fig, ax = plt.subplots(figsize=(20, 10))

    # Two curves per model, both in that model's palette colour
    for position, frame in enumerate(models.values()):
        shade = color_palette[position]
        for column, dash in legs:
            ax.plot(frame['date'], frame[column], color=shade, linestyle=dash)

    # Benchmark curve; its handle is reused in the colour legend
    benchmark, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                         color='black', linewidth=2, label='S&P 500')

    # Dotted reference line at zero cumulative return
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title("Equal-Weighted Portfolio Performance (Lag 252 - Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Colour legend (upper left): one proxy line per model plus the benchmark
    swatches = [Line2D([0], [0], color=color_palette[k], linewidth=3)
                for k, _ in enumerate(models)]
    colour_legend = ax.legend(swatches + [benchmark], [*models, 'S&P 500'],
                              loc='upper left', fontsize=10, **box_style)
    # Re-attach it explicitly: the ax.legend() call below replaces the Axes' legend
    ax.add_artist(colour_legend)

    # Line-style legend (upper right)
    style_keys = [Line2D([0], [0], color='black', linestyle=dash, linewidth=2)
                  for dash in ('solid', 'dashed')]
    ax.legend(style_keys, ['Long', 'Short'], loc='upper right', fontsize=9, **box_style)

    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Show when no path was given; otherwise save and report
    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Render the figure for the current `models` and write it to disk as a PNG
save_path = "Equal-Weighted_Portfolio_Performance_Lag252_Cost.png"
plot_portfolio_performance(models, sp500, save_path=save_path)

### Plot VW (With TC)

In [None]:
# Legend label -> lag-252 cumulative log-return table for each model.
# Insertion order matters: colours are assigned by position in this dict.
models = {
    "OLS": cumulative_log_returns_by_date_ols252,
    "Lasso": cumulative_log_returns_by_date_lasso252,
    "Ridge": cumulative_log_returns_by_date_ridge252,
    "ElasticNet": cumulative_log_returns_by_date_enet252,
    "PCR": cumulative_log_returns_by_date_pcr252,
    "GLM": cumulative_log_returns_by_date_glm252,
    "Random Forest": cumulative_log_returns_by_date_rf252,
    "GBRT": cumulative_log_returns_by_date_gbrt252,
    "NN1": cumulative_log_returns_by_date_nn1252,
    "NN2": cumulative_log_returns_by_date_nn2252,
    "NN3": cumulative_log_returns_by_date_nn3252,
    "NN4": cumulative_log_returns_by_date_nn4252,
    "NN5": cumulative_log_returns_by_date_nn5252,
}

# One "tab20" colour per model, read by the plotting function below
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot every model's cumulative long and short returns against the S&P 500.

    Args:
        models: Mapping of legend label to a table indexable by column name
            (presumably a pandas DataFrame) that provides 'date',
            'cum_VL_return_252_with_cost' and 'cum_VS_return_252_with_cost'.
        sp500_data: Table providing 'date' and 'cum_SP500_log_return'.
        save_path: When truthy, the figure is saved to this path; otherwise
            it is displayed with ``plt.show()``.

    Note:
        Colours are read from the module-level ``color_palette`` by each
        model's position in ``models``.
    """
    # Frame styling shared by both legends
    box_style = {'frameon': True, 'fancybox': True, 'facecolor': 'white',
                 'edgecolor': 'black', 'framealpha': 0.8}
    # (column, linestyle) per leg: long is solid, short is dashed
    legs = (('cum_VL_return_252_with_cost', 'solid'),
            ('cum_VS_return_252_with_cost', 'dashed'))

    fig, ax = plt.subplots(figsize=(20, 10))

    # Two curves per model, both in that model's palette colour
    for position, frame in enumerate(models.values()):
        shade = color_palette[position]
        for column, dash in legs:
            ax.plot(frame['date'], frame[column], color=shade, linestyle=dash)

    # Benchmark curve; its handle is reused in the colour legend
    benchmark, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                         color='black', linewidth=2, label='S&P 500')

    # Dotted reference line at zero cumulative return
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title("Value-Weighted Portfolio Performance (Lag 252 - Transaction Cost)", fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Colour legend (upper left): one proxy line per model plus the benchmark
    swatches = [Line2D([0], [0], color=color_palette[k], linewidth=3)
                for k, _ in enumerate(models)]
    colour_legend = ax.legend(swatches + [benchmark], [*models, 'S&P 500'],
                              loc='upper left', fontsize=10, **box_style)
    # Re-attach it explicitly: the ax.legend() call below replaces the Axes' legend
    ax.add_artist(colour_legend)

    # Line-style legend (upper right)
    style_keys = [Line2D([0], [0], color='black', linestyle=dash, linewidth=2)
                  for dash in ('solid', 'dashed')]
    ax.legend(style_keys, ['Long', 'Short'], loc='upper right', fontsize=9, **box_style)

    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Show when no path was given; otherwise save and report
    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Render the figure for the current `models` and write it to disk as a PNG
save_path = "Value-Weighted_Portfolio_Performance_Lag252_Cost.png"
plot_portfolio_performance(models, sp500, save_path=save_path)

### Long-Short Plot EW (Without TC)

In [None]:
# Legend label -> lag-252 cumulative log-return table for each model.
# Insertion order matters: colours are assigned by position in this dict.
models = {
    "OLS": cumulative_log_returns_by_date_ols252,
    "Lasso": cumulative_log_returns_by_date_lasso252,
    "Ridge": cumulative_log_returns_by_date_ridge252,
    "ElasticNet": cumulative_log_returns_by_date_enet252,
    "PCR": cumulative_log_returns_by_date_pcr252,
    "GLM": cumulative_log_returns_by_date_glm252,
    "Random Forest": cumulative_log_returns_by_date_rf252,
    "GBRT": cumulative_log_returns_by_date_gbrt252,
    "NN1": cumulative_log_returns_by_date_nn1252,
    "NN2": cumulative_log_returns_by_date_nn2252,
    "NN3": cumulative_log_returns_by_date_nn3252,
    "NN4": cumulative_log_returns_by_date_nn4252,
    "NN5": cumulative_log_returns_by_date_nn5252,
}

# One "tab20" colour per model, read by the plotting function below
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot equal-weighted long-short cumulative returns (lag 252, no cost).

    Draws one solid line per entry of ``models`` from its
    ``'cum_ELS_return_252_without_cost'`` column, plus the S&P 500 series in
    black, on a single 20x10 figure.

    Args:
        models: Mapping of display label -> object indexable by ``'date'``
            and ``'cum_ELS_return_252_without_cost'``.
        sp500_data: Object indexable by ``'date'`` and
            ``'cum_SP500_log_return'``.
        save_path: When truthy, the figure is written there via
            ``plt.savefig``; otherwise it is displayed via ``plt.show``.

    Note:
        Colours are taken by position from the module-level
        ``color_palette``, which must hold at least ``len(models)`` entries.
    """
    return_column = 'cum_ELS_return_252_without_cost'
    legend_box = dict(frameon=True, fancybox=True, facecolor='white',
                      edgecolor='black', framealpha=0.8)

    _, ax = plt.subplots(figsize=(20, 10))

    # One solid long-short curve per model, coloured by palette position
    for idx, (name, frame) in enumerate(models.items()):
        shade = color_palette[idx]
        ax.plot(frame['date'], frame[return_column],
                color=shade, linestyle='solid', label=f'{name} (Long-Short)')

    # Benchmark curve; the handle is kept so it can be listed in the legend
    (sp500_line,) = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                            color='black', linewidth=2, label='S&P 500')

    # Dotted reference line at zero cumulative return
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Title, axis labels, tick size and grid
    ax.set_title("Equal-Weighted Portfolio Performance (Long-Short) (Lag 252 - No Transaction Cost)", fontsize=14)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Legend entries: one colour swatch per model, then the benchmark line
    handles = [Line2D([0], [0], color=color_palette[k], linewidth=3)
               for k, _ in enumerate(models)]
    handles.append(sp500_line)
    labels = [*models.keys(), 'S&P 500']

    model_legend = ax.legend(handles, labels, loc='upper left', fontsize=10, **legend_box)
    # NOTE(review): no second legend is created in this function, so this
    # add_artist call looks redundant; kept so the rendering is unchanged.
    ax.add_artist(model_legend)

    # Pad the layout and keep the top 10% of the figure free
    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Without a path the figure is only displayed
    if not save_path:
        plt.show()
        return

    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Write the equal-weighted long-short (lag 252, no cost) figure to disk
save_path = "Long-Short_Portfolio_Performance_Lag252_NoCost.png"
plot_portfolio_performance(models=models, sp500_data=sp500, save_path=save_path)

### Long-Short Plot EW (With TC)

In [None]:
# Display label -> that model's `cumulative_log_returns_by_date_*252` object.
# Insertion order is the plotting order and the legend order.
models = dict(
    [
        ("OLS", cumulative_log_returns_by_date_ols252),
        ("Lasso", cumulative_log_returns_by_date_lasso252),
        ("Ridge", cumulative_log_returns_by_date_ridge252),
        ("ElasticNet", cumulative_log_returns_by_date_enet252),
        ("PCR", cumulative_log_returns_by_date_pcr252),
        ("GLM", cumulative_log_returns_by_date_glm252),
        ("Random Forest", cumulative_log_returns_by_date_rf252),
        ("GBRT", cumulative_log_returns_by_date_gbrt252),
        ("NN1", cumulative_log_returns_by_date_nn1252),
        ("NN2", cumulative_log_returns_by_date_nn2252),
        ("NN3", cumulative_log_returns_by_date_nn3252),
        ("NN4", cumulative_log_returns_by_date_nn4252),
        ("NN5", cumulative_log_returns_by_date_nn5252),
    ]
)

# One tab20 colour per model; the plotting function indexes this by position
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot equal-weighted long-short cumulative returns (lag 252, with cost).

    Draws one solid line per entry of ``models`` from its
    ``'cum_ELS_return_252_with_cost'`` column, plus the S&P 500 series in
    black, on a single 20x10 figure.

    Args:
        models: Mapping of display label -> object indexable by ``'date'``
            and ``'cum_ELS_return_252_with_cost'``.
        sp500_data: Object indexable by ``'date'`` and
            ``'cum_SP500_log_return'``.
        save_path: When truthy, the figure is written there via
            ``plt.savefig``; otherwise it is displayed via ``plt.show``.

    Note:
        Colours are taken by position from the module-level
        ``color_palette``, which must hold at least ``len(models)`` entries.
    """
    return_column = 'cum_ELS_return_252_with_cost'
    legend_box = dict(frameon=True, fancybox=True, facecolor='white',
                      edgecolor='black', framealpha=0.8)

    _, ax = plt.subplots(figsize=(20, 10))

    # One solid long-short curve per model, coloured by palette position
    for idx, (name, frame) in enumerate(models.items()):
        shade = color_palette[idx]
        ax.plot(frame['date'], frame[return_column],
                color=shade, linestyle='solid', label=f'{name} (Long-Short)')

    # Benchmark curve; the handle is kept so it can be listed in the legend
    (sp500_line,) = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                            color='black', linewidth=2, label='S&P 500')

    # Dotted reference line at zero cumulative return
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Title, axis labels, tick size and grid
    ax.set_title("Equal-Weighted Portfolio Performance (Long-Short) (Lag 252 - Transaction Cost)", fontsize=14)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Legend entries: one colour swatch per model, then the benchmark line
    handles = [Line2D([0], [0], color=color_palette[k], linewidth=3)
               for k, _ in enumerate(models)]
    handles.append(sp500_line)
    labels = [*models.keys(), 'S&P 500']

    model_legend = ax.legend(handles, labels, loc='upper left', fontsize=10, **legend_box)
    # NOTE(review): no second legend is created in this function, so this
    # add_artist call looks redundant; kept so the rendering is unchanged.
    ax.add_artist(model_legend)

    # Pad the layout and keep the top 10% of the figure free
    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Without a path the figure is only displayed
    if not save_path:
        plt.show()
        return

    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Write the equal-weighted long-short (lag 252, with cost) figure to disk
save_path = "Long-Short_Portfolio_Performance_Lag252_Cost.png"
plot_portfolio_performance(models=models, sp500_data=sp500, save_path=save_path)

### Long-Short Plot VW (Without TC)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
import pandas as pd

# Display label -> that model's `cumulative_log_returns_by_date_*252` object.
# Insertion order is the plotting order and the legend order.
models = dict(
    [
        ("OLS", cumulative_log_returns_by_date_ols252),
        ("Lasso", cumulative_log_returns_by_date_lasso252),
        ("Ridge", cumulative_log_returns_by_date_ridge252),
        ("ElasticNet", cumulative_log_returns_by_date_enet252),
        ("PCR", cumulative_log_returns_by_date_pcr252),
        ("GLM", cumulative_log_returns_by_date_glm252),
        ("Random Forest", cumulative_log_returns_by_date_rf252),
        ("GBRT", cumulative_log_returns_by_date_gbrt252),
        ("NN1", cumulative_log_returns_by_date_nn1252),
        ("NN2", cumulative_log_returns_by_date_nn2252),
        ("NN3", cumulative_log_returns_by_date_nn3252),
        ("NN4", cumulative_log_returns_by_date_nn4252),
        ("NN5", cumulative_log_returns_by_date_nn5252),
    ]
)

# One tab20 colour per model; the plotting function indexes this by position
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot value-weighted long-short cumulative returns (lag 252, no cost).

    Draws one solid line per entry of ``models`` from its
    ``'cum_VLS_return_252_without_cost'`` column, plus the S&P 500 series in
    black, on a single 20x10 figure.

    Args:
        models: Mapping of display label -> object indexable by ``'date'``
            and ``'cum_VLS_return_252_without_cost'``.
        sp500_data: Object indexable by ``'date'`` and
            ``'cum_SP500_log_return'``.
        save_path: When truthy, the figure is written there via
            ``plt.savefig``; otherwise it is displayed via ``plt.show``.

    Note:
        Colours are taken by position from the module-level
        ``color_palette``, which must hold at least ``len(models)`` entries.
    """
    return_column = 'cum_VLS_return_252_without_cost'
    legend_box = dict(frameon=True, fancybox=True, facecolor='white',
                      edgecolor='black', framealpha=0.8)

    _, ax = plt.subplots(figsize=(20, 10))

    # One solid value-weighted long-short curve per model
    for idx, (name, frame) in enumerate(models.items()):
        shade = color_palette[idx]
        ax.plot(frame['date'], frame[return_column],
                color=shade, linestyle='solid', label=f'{name} (Long-Short)')

    # Benchmark curve; the handle is kept so it can be listed in the legend
    (sp500_line,) = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                            color='black', linewidth=2, label='S&P 500')

    # Dotted reference line at zero cumulative return
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Title, axis labels, tick size and grid
    ax.set_title("Value-Weighted Portfolio Performance (Long-Short) (Lag 252 - No Transaction Cost)", fontsize=14)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Legend entries: one colour swatch per model, then the benchmark line
    handles = [Line2D([0], [0], color=color_palette[k], linewidth=3)
               for k, _ in enumerate(models)]
    handles.append(sp500_line)
    labels = [*models.keys(), 'S&P 500']

    model_legend = ax.legend(handles, labels, loc='upper left', fontsize=10, **legend_box)
    # NOTE(review): no second legend is created in this function, so this
    # add_artist call looks redundant; kept so the rendering is unchanged.
    ax.add_artist(model_legend)

    # Pad the layout and keep the top 10% of the figure free
    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Without a path the figure is only displayed
    if not save_path:
        plt.show()
        return

    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Save under a value-weighted-specific name. The equal-weighted long-short
# cell already writes "Long-Short_Portfolio_Performance_Lag252_NoCost.png";
# reusing that name here would overwrite its figure.
save_path = "Value-Weighted_Long-Short_Portfolio_Performance_Lag252_NoCost.png"
plot_portfolio_performance(models, sp500, save_path)

### Long-Short Plot VW (With TC)

In [None]:
# Display label -> that model's `cumulative_log_returns_by_date_*252` object.
# Insertion order is the plotting order and the legend order.
models = dict(
    [
        ("OLS", cumulative_log_returns_by_date_ols252),
        ("Lasso", cumulative_log_returns_by_date_lasso252),
        ("Ridge", cumulative_log_returns_by_date_ridge252),
        ("ElasticNet", cumulative_log_returns_by_date_enet252),
        ("PCR", cumulative_log_returns_by_date_pcr252),
        ("GLM", cumulative_log_returns_by_date_glm252),
        ("Random Forest", cumulative_log_returns_by_date_rf252),
        ("GBRT", cumulative_log_returns_by_date_gbrt252),
        ("NN1", cumulative_log_returns_by_date_nn1252),
        ("NN2", cumulative_log_returns_by_date_nn2252),
        ("NN3", cumulative_log_returns_by_date_nn3252),
        ("NN4", cumulative_log_returns_by_date_nn4252),
        ("NN5", cumulative_log_returns_by_date_nn5252),
    ]
)

# One tab20 colour per model; the plotting function indexes this by position
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot value-weighted long-short cumulative returns (lag 252, with cost).

    Draws one solid line per entry of ``models`` from its
    ``'cum_VLS_return_252_with_cost'`` column, plus the S&P 500 series in
    black, on a single 20x10 figure.

    Args:
        models: Mapping of display label -> object indexable by ``'date'``
            and ``'cum_VLS_return_252_with_cost'``.
        sp500_data: Object indexable by ``'date'`` and
            ``'cum_SP500_log_return'``.
        save_path: When truthy, the figure is written there via
            ``plt.savefig``; otherwise it is displayed via ``plt.show``.

    Note:
        Colours are taken by position from the module-level
        ``color_palette``, which must hold at least ``len(models)`` entries.
    """
    return_column = 'cum_VLS_return_252_with_cost'
    legend_box = dict(frameon=True, fancybox=True, facecolor='white',
                      edgecolor='black', framealpha=0.8)

    _, ax = plt.subplots(figsize=(20, 10))

    # One solid value-weighted long-short curve per model
    for idx, (name, frame) in enumerate(models.items()):
        shade = color_palette[idx]
        ax.plot(frame['date'], frame[return_column],
                color=shade, linestyle='solid', label=f'{name} (Long-Short)')

    # Benchmark curve; the handle is kept so it can be listed in the legend
    (sp500_line,) = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                            color='black', linewidth=2, label='S&P 500')

    # Dotted reference line at zero cumulative return
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Title, axis labels, tick size and grid
    ax.set_title("Value-Weighted Portfolio Performance (Long-Short) (Lag 252 - Transaction Cost)", fontsize=14)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Legend entries: one colour swatch per model, then the benchmark line
    handles = [Line2D([0], [0], color=color_palette[k], linewidth=3)
               for k, _ in enumerate(models)]
    handles.append(sp500_line)
    labels = [*models.keys(), 'S&P 500']

    model_legend = ax.legend(handles, labels, loc='upper left', fontsize=10, **legend_box)
    # NOTE(review): no second legend is created in this function, so this
    # add_artist call looks redundant; kept so the rendering is unchanged.
    ax.add_artist(model_legend)

    # Pad the layout and keep the top 10% of the figure free
    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Without a path the figure is only displayed
    if not save_path:
        plt.show()
        return

    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Save under a value-weighted-specific name. The equal-weighted long-short
# cell already writes "Long-Short_Portfolio_Performance_Lag252_Cost.png";
# reusing that name here would overwrite its figure.
save_path = "Value-Weighted_Long-Short_Portfolio_Performance_Lag252_Cost.png"
plot_portfolio_performance(models, sp500, save_path)

## Window Size 512

### Plot EW (Without TC)

In [None]:
# Display label -> that model's `cumulative_log_returns_by_date_*512` object.
# Insertion order is the plotting order and the legend order.
models = dict(
    [
        ("OLS", cumulative_log_returns_by_date_ols512),
        ("Lasso", cumulative_log_returns_by_date_lasso512),
        ("Ridge", cumulative_log_returns_by_date_ridge512),
        ("ElasticNet", cumulative_log_returns_by_date_enet512),
        ("PCR", cumulative_log_returns_by_date_pcr512),
        ("GLM", cumulative_log_returns_by_date_glm512),
        ("Random Forest", cumulative_log_returns_by_date_rf512),
        ("GBRT", cumulative_log_returns_by_date_gbrt512),
        ("NN1", cumulative_log_returns_by_date_nn1512),
        ("NN2", cumulative_log_returns_by_date_nn2512),
        ("NN3", cumulative_log_returns_by_date_nn3512),
        ("NN4", cumulative_log_returns_by_date_nn4512),
        ("NN5", cumulative_log_returns_by_date_nn5512),
    ]
)

# One tab20 colour per model; the plotting function indexes this by position
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot equal-weighted long and short cumulative returns (lag 512, no cost).

    For every entry of ``models`` the ``'cum_EL_return_512_without_cost'``
    column is drawn solid and ``'cum_ES_return_512_without_cost'`` dashed, in
    the same colour. The S&P 500 series is drawn in black.

    Args:
        models: Mapping of display label -> object indexable by ``'date'``
            and the two return columns named above.
        sp500_data: Object indexable by ``'date'`` and
            ``'cum_SP500_log_return'``.
        save_path: When truthy, the figure is written there via
            ``plt.savefig``; otherwise it is displayed via ``plt.show``.

    Note:
        Colours are taken by position from the module-level
        ``color_palette``, which must hold at least ``len(models)`` entries.
    """
    # (column, line style) for the long leg, then the short leg
    legs = (('cum_EL_return_512_without_cost', 'solid'),
            ('cum_ES_return_512_without_cost', 'dashed'))
    legend_box = dict(frameon=True, fancybox=True, facecolor='white',
                      edgecolor='black', framealpha=0.8)

    _, ax = plt.subplots(figsize=(20, 10))

    # Two curves per model sharing one colour: long first, then short
    for idx, (_label, frame) in enumerate(models.items()):
        shade = color_palette[idx]
        for column, dash in legs:
            ax.plot(frame['date'], frame[column], color=shade, linestyle=dash)

    # Benchmark curve; the handle is kept so it can be listed in the legend
    (sp500_line,) = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                            color='black', linewidth=2, label='S&P 500')

    # Dotted reference line at zero cumulative return
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Title, axis labels, tick size and grid
    ax.set_title("Equal-Weighted Portfolio Performance (Lag 512 - No Transaction Cost)", fontsize=14)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Colour legend: one swatch per model, then the benchmark line
    handles = [Line2D([0], [0], color=color_palette[k], linewidth=3)
               for k, _ in enumerate(models)]
    handles.append(sp500_line)
    labels = [*models.keys(), 'S&P 500']

    model_legend = ax.legend(handles, labels, loc='upper left', fontsize=10, **legend_box)
    # Register the first legend as an artist so the second ax.legend call
    # below does not replace it
    ax.add_artist(model_legend)

    # Line-style legend: solid = Long, dashed = Short
    style_handles = [Line2D([0], [0], color='black', linestyle=dash, linewidth=2)
                     for _, dash in legs]
    ax.legend(style_handles, ['Long', 'Short'], loc='upper right', fontsize=9, **legend_box)

    # Pad the layout and keep the top 10% of the figure free
    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Without a path the figure is only displayed
    if not save_path:
        plt.show()
        return

    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Write the equal-weighted long/short (lag 512, no cost) figure to disk
save_path = "Equal-Weighted_Portfolio_Performance_Lag512_NoCost.png"
plot_portfolio_performance(models=models, sp500_data=sp500, save_path=save_path)

### Plot VW (Without TC)

In [None]:
# Display label -> that model's `cumulative_log_returns_by_date_*512` object.
# Insertion order is the plotting order and the legend order.
models = dict(
    [
        ("OLS", cumulative_log_returns_by_date_ols512),
        ("Lasso", cumulative_log_returns_by_date_lasso512),
        ("Ridge", cumulative_log_returns_by_date_ridge512),
        ("ElasticNet", cumulative_log_returns_by_date_enet512),
        ("PCR", cumulative_log_returns_by_date_pcr512),
        ("GLM", cumulative_log_returns_by_date_glm512),
        ("Random Forest", cumulative_log_returns_by_date_rf512),
        ("GBRT", cumulative_log_returns_by_date_gbrt512),
        ("NN1", cumulative_log_returns_by_date_nn1512),
        ("NN2", cumulative_log_returns_by_date_nn2512),
        ("NN3", cumulative_log_returns_by_date_nn3512),
        ("NN4", cumulative_log_returns_by_date_nn4512),
        ("NN5", cumulative_log_returns_by_date_nn5512),
    ]
)

# One tab20 colour per model; the plotting function indexes this by position
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot value-weighted long and short cumulative returns (lag 512, no cost).

    For every entry of ``models`` the ``'cum_VL_return_512_without_cost'``
    column is drawn solid and ``'cum_VS_return_512_without_cost'`` dashed, in
    the same colour. The S&P 500 series is drawn in black.

    Args:
        models: Mapping of display label -> object indexable by ``'date'``
            and the two return columns named above.
        sp500_data: Object indexable by ``'date'`` and
            ``'cum_SP500_log_return'``.
        save_path: When truthy, the figure is written there via
            ``plt.savefig``; otherwise it is displayed via ``plt.show``.

    Note:
        Colours are taken by position from the module-level
        ``color_palette``, which must hold at least ``len(models)`` entries.
    """
    # (column, line style) for the long leg, then the short leg
    legs = (('cum_VL_return_512_without_cost', 'solid'),
            ('cum_VS_return_512_without_cost', 'dashed'))
    legend_box = dict(frameon=True, fancybox=True, facecolor='white',
                      edgecolor='black', framealpha=0.8)

    _, ax = plt.subplots(figsize=(20, 10))

    # Two curves per model sharing one colour: long first, then short
    for idx, (_label, frame) in enumerate(models.items()):
        shade = color_palette[idx]
        for column, dash in legs:
            ax.plot(frame['date'], frame[column], color=shade, linestyle=dash)

    # Benchmark curve; the handle is kept so it can be listed in the legend
    (sp500_line,) = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                            color='black', linewidth=2, label='S&P 500')

    # Dotted reference line at zero cumulative return
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Title, axis labels, tick size and grid
    ax.set_title("Value-Weighted Portfolio Performance (Lag 512 - No Transaction Cost)", fontsize=14)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Colour legend: one swatch per model, then the benchmark line
    handles = [Line2D([0], [0], color=color_palette[k], linewidth=3)
               for k, _ in enumerate(models)]
    handles.append(sp500_line)
    labels = [*models.keys(), 'S&P 500']

    model_legend = ax.legend(handles, labels, loc='upper left', fontsize=10, **legend_box)
    # Register the first legend as an artist so the second ax.legend call
    # below does not replace it
    ax.add_artist(model_legend)

    # Line-style legend: solid = Long, dashed = Short
    style_handles = [Line2D([0], [0], color='black', linestyle=dash, linewidth=2)
                     for _, dash in legs]
    ax.legend(style_handles, ['Long', 'Short'], loc='upper right', fontsize=9, **legend_box)

    # Pad the layout and keep the top 10% of the figure free
    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Without a path the figure is only displayed
    if not save_path:
        plt.show()
        return

    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Write the value-weighted long/short (lag 512, no cost) figure to disk
save_path = "Value-Weighted_Portfolio_Performance_Lag512_NoCost.png"
plot_portfolio_performance(models=models, sp500_data=sp500, save_path=save_path)

### Plot EW (With TC)

In [None]:
# Display label -> that model's `cumulative_log_returns_by_date_*512` object.
# Insertion order is the plotting order and the legend order.
models = dict(
    [
        ("OLS", cumulative_log_returns_by_date_ols512),
        ("Lasso", cumulative_log_returns_by_date_lasso512),
        ("Ridge", cumulative_log_returns_by_date_ridge512),
        ("ElasticNet", cumulative_log_returns_by_date_enet512),
        ("PCR", cumulative_log_returns_by_date_pcr512),
        ("GLM", cumulative_log_returns_by_date_glm512),
        ("Random Forest", cumulative_log_returns_by_date_rf512),
        ("GBRT", cumulative_log_returns_by_date_gbrt512),
        ("NN1", cumulative_log_returns_by_date_nn1512),
        ("NN2", cumulative_log_returns_by_date_nn2512),
        ("NN3", cumulative_log_returns_by_date_nn3512),
        ("NN4", cumulative_log_returns_by_date_nn4512),
        ("NN5", cumulative_log_returns_by_date_nn5512),
    ]
)

# One tab20 colour per model; the plotting function indexes this by position
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot equal-weighted long and short cumulative returns (lag 512, with cost).

    For every entry of ``models`` the ``'cum_EL_return_512_with_cost'``
    column is drawn solid and ``'cum_ES_return_512_with_cost'`` dashed, in
    the same colour. The S&P 500 series is drawn in black.

    Args:
        models: Mapping of display label -> object indexable by ``'date'``
            and the two return columns named above.
        sp500_data: Object indexable by ``'date'`` and
            ``'cum_SP500_log_return'``.
        save_path: When truthy, the figure is written there via
            ``plt.savefig``; otherwise it is displayed via ``plt.show``.

    Note:
        Colours are taken by position from the module-level
        ``color_palette``, which must hold at least ``len(models)`` entries.
    """
    # (column, line style) for the long leg, then the short leg
    legs = (('cum_EL_return_512_with_cost', 'solid'),
            ('cum_ES_return_512_with_cost', 'dashed'))
    legend_box = dict(frameon=True, fancybox=True, facecolor='white',
                      edgecolor='black', framealpha=0.8)

    _, ax = plt.subplots(figsize=(20, 10))

    # Two curves per model sharing one colour: long first, then short
    for idx, (_label, frame) in enumerate(models.items()):
        shade = color_palette[idx]
        for column, dash in legs:
            ax.plot(frame['date'], frame[column], color=shade, linestyle=dash)

    # Benchmark curve; the handle is kept so it can be listed in the legend
    (sp500_line,) = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                            color='black', linewidth=2, label='S&P 500')

    # Dotted reference line at zero cumulative return
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Title, axis labels, tick size and grid
    ax.set_title("Equal-Weighted Portfolio Performance (Lag 512 - Transaction Cost)", fontsize=14)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Colour legend: one swatch per model, then the benchmark line
    handles = [Line2D([0], [0], color=color_palette[k], linewidth=3)
               for k, _ in enumerate(models)]
    handles.append(sp500_line)
    labels = [*models.keys(), 'S&P 500']

    model_legend = ax.legend(handles, labels, loc='upper left', fontsize=10, **legend_box)
    # Register the first legend as an artist so the second ax.legend call
    # below does not replace it
    ax.add_artist(model_legend)

    # Line-style legend: solid = Long, dashed = Short
    style_handles = [Line2D([0], [0], color='black', linestyle=dash, linewidth=2)
                     for _, dash in legs]
    ax.legend(style_handles, ['Long', 'Short'], loc='upper right', fontsize=9, **legend_box)

    # Pad the layout and keep the top 10% of the figure free
    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Without a path the figure is only displayed
    if not save_path:
        plt.show()
        return

    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Write the equal-weighted long/short (lag 512, with cost) figure to disk
save_path = "Equal-Weighted_Portfolio_Performance_Lag512_Cost.png"
plot_portfolio_performance(models=models, sp500_data=sp500, save_path=save_path)

### Plot VW (With TC)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
import pandas as pd

# Display label -> that model's `cumulative_log_returns_by_date_*512` object.
# Insertion order is the plotting order and the legend order.
models = dict(
    [
        ("OLS", cumulative_log_returns_by_date_ols512),
        ("Lasso", cumulative_log_returns_by_date_lasso512),
        ("Ridge", cumulative_log_returns_by_date_ridge512),
        ("ElasticNet", cumulative_log_returns_by_date_enet512),
        ("PCR", cumulative_log_returns_by_date_pcr512),
        ("GLM", cumulative_log_returns_by_date_glm512),
        ("Random Forest", cumulative_log_returns_by_date_rf512),
        ("GBRT", cumulative_log_returns_by_date_gbrt512),
        ("NN1", cumulative_log_returns_by_date_nn1512),
        ("NN2", cumulative_log_returns_by_date_nn2512),
        ("NN3", cumulative_log_returns_by_date_nn3512),
        ("NN4", cumulative_log_returns_by_date_nn4512),
        ("NN5", cumulative_log_returns_by_date_nn5512),
    ]
)

# One tab20 colour per model; the plotting function indexes this by position
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot value-weighted long and short cumulative returns (lag 512, with cost).

    For every entry of ``models`` the ``'cum_VL_return_512_with_cost'``
    column is drawn solid and ``'cum_VS_return_512_with_cost'`` dashed, in
    the same colour. The S&P 500 series is drawn in black.

    Args:
        models: Mapping of display label -> object indexable by ``'date'``
            and the two return columns named above.
        sp500_data: Object indexable by ``'date'`` and
            ``'cum_SP500_log_return'``.
        save_path: When truthy, the figure is written there via
            ``plt.savefig``; otherwise it is displayed via ``plt.show``.

    Note:
        Colours are taken by position from the module-level
        ``color_palette``, which must hold at least ``len(models)`` entries.
    """
    # (column, line style) for the long leg, then the short leg
    legs = (('cum_VL_return_512_with_cost', 'solid'),
            ('cum_VS_return_512_with_cost', 'dashed'))
    legend_box = dict(frameon=True, fancybox=True, facecolor='white',
                      edgecolor='black', framealpha=0.8)

    _, ax = plt.subplots(figsize=(20, 10))

    # Two curves per model sharing one colour: long first, then short
    for idx, (_label, frame) in enumerate(models.items()):
        shade = color_palette[idx]
        for column, dash in legs:
            ax.plot(frame['date'], frame[column], color=shade, linestyle=dash)

    # Benchmark curve; the handle is kept so it can be listed in the legend
    (sp500_line,) = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                            color='black', linewidth=2, label='S&P 500')

    # Dotted reference line at zero cumulative return
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Title, axis labels, tick size and grid
    ax.set_title("Value-Weighted Portfolio Performance (Lag 512 - Transaction Cost)", fontsize=14)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Colour legend (upper left): one swatch per model, then the benchmark line
    handles = [Line2D([0], [0], color=color_palette[k], linewidth=3)
               for k, _ in enumerate(models)]
    handles.append(sp500_line)
    labels = [*models.keys(), 'S&P 500']

    model_legend = ax.legend(handles, labels, loc='upper left', fontsize=10, **legend_box)
    # Register the first legend as an artist so the second ax.legend call
    # below does not replace it
    ax.add_artist(model_legend)

    # Line-style legend (upper right): solid = Long, dashed = Short
    style_handles = [Line2D([0], [0], color='black', linestyle=dash, linewidth=2)
                     for _, dash in legs]
    ax.legend(style_handles, ['Long', 'Short'], loc='upper right', fontsize=9, **legend_box)

    # Pad the layout and keep the top 10% of the figure free
    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Without a path the figure is only displayed
    if not save_path:
        plt.show()
        return

    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Write the value-weighted long/short (lag 512, with cost) figure to disk
save_path = "Value-Weighted_Portfolio_Performance_Lag512_Cost.png"
plot_portfolio_performance(models=models, sp500_data=sp500, save_path=save_path)


### Long-Short Plot EW (Without TC)

In [None]:
# Display label -> that model's `cumulative_log_returns_by_date_*512` object.
# Insertion order is the plotting order and the legend order.
models = dict(
    [
        ("OLS", cumulative_log_returns_by_date_ols512),
        ("Lasso", cumulative_log_returns_by_date_lasso512),
        ("Ridge", cumulative_log_returns_by_date_ridge512),
        ("ElasticNet", cumulative_log_returns_by_date_enet512),
        ("PCR", cumulative_log_returns_by_date_pcr512),
        ("GLM", cumulative_log_returns_by_date_glm512),
        ("Random Forest", cumulative_log_returns_by_date_rf512),
        ("GBRT", cumulative_log_returns_by_date_gbrt512),
        ("NN1", cumulative_log_returns_by_date_nn1512),
        ("NN2", cumulative_log_returns_by_date_nn2512),
        ("NN3", cumulative_log_returns_by_date_nn3512),
        ("NN4", cumulative_log_returns_by_date_nn4512),
        ("NN5", cumulative_log_returns_by_date_nn5512),
    ]
)

# One tab20 colour per model; the plotting function indexes this by position
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot equal-weighted long-short cumulative returns (lag 512, no cost).

    Draws one solid line per entry of ``models`` from its
    ``'cum_ELS_return_512_without_cost'`` column, plus the S&P 500 series in
    black, on a single 20x10 figure.

    Args:
        models: Mapping of display label -> object indexable by ``'date'``
            and ``'cum_ELS_return_512_without_cost'``.
        sp500_data: Object indexable by ``'date'`` and
            ``'cum_SP500_log_return'``.
        save_path: When truthy, the figure is written there via
            ``plt.savefig``; otherwise it is displayed via ``plt.show``.

    Note:
        Colours are taken by position from the module-level
        ``color_palette``, which must hold at least ``len(models)`` entries.
    """
    return_column = 'cum_ELS_return_512_without_cost'
    legend_box = dict(frameon=True, fancybox=True, facecolor='white',
                      edgecolor='black', framealpha=0.8)

    _, ax = plt.subplots(figsize=(20, 10))

    # One solid long-short curve per model, coloured by palette position
    for idx, (name, frame) in enumerate(models.items()):
        shade = color_palette[idx]
        ax.plot(frame['date'], frame[return_column],
                color=shade, linestyle='solid', label=f'{name} (Long-Short)')

    # Benchmark curve; the handle is kept so it can be listed in the legend
    (sp500_line,) = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                            color='black', linewidth=2, label='S&P 500')

    # Dotted reference line at zero cumulative return
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Title, axis labels, tick size and grid
    ax.set_title("Equal-Weighted Portfolio Performance (Long-Short) (Lag 512 - No Transaction Cost)", fontsize=14)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Legend entries: one colour swatch per model, then the benchmark line
    handles = [Line2D([0], [0], color=color_palette[k], linewidth=3)
               for k, _ in enumerate(models)]
    handles.append(sp500_line)
    labels = [*models.keys(), 'S&P 500']

    model_legend = ax.legend(handles, labels, loc='upper left', fontsize=10, **legend_box)
    # NOTE(review): no second legend is created in this function, so this
    # add_artist call looks redundant; kept so the rendering is unchanged.
    ax.add_artist(model_legend)

    # Pad the layout and keep the top 10% of the figure free
    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Without a path the figure is only displayed
    if not save_path:
        plt.show()
        return

    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Write the equal-weighted long-short (lag 512, no cost) figure to disk
save_path = "Long-Short_Portfolio_Performance_Lag512_NoCost.png"
plot_portfolio_performance(models=models, sp500_data=sp500, save_path=save_path)


### Long-Short Plot EW (With TC)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
import pandas as pd

# Display label -> that model's `cumulative_log_returns_by_date_*512` object.
# Insertion order is the plotting order and the legend order.
models = dict(
    [
        ("OLS", cumulative_log_returns_by_date_ols512),
        ("Lasso", cumulative_log_returns_by_date_lasso512),
        ("Ridge", cumulative_log_returns_by_date_ridge512),
        ("ElasticNet", cumulative_log_returns_by_date_enet512),
        ("PCR", cumulative_log_returns_by_date_pcr512),
        ("GLM", cumulative_log_returns_by_date_glm512),
        ("Random Forest", cumulative_log_returns_by_date_rf512),
        ("GBRT", cumulative_log_returns_by_date_gbrt512),
        ("NN1", cumulative_log_returns_by_date_nn1512),
        ("NN2", cumulative_log_returns_by_date_nn2512),
        ("NN3", cumulative_log_returns_by_date_nn3512),
        ("NN4", cumulative_log_returns_by_date_nn4512),
        ("NN5", cumulative_log_returns_by_date_nn5512),
    ]
)

# One tab20 colour per model; the plotting function indexes this by position
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(models, sp500_data, save_path=None):
    """Plot equal-weighted long-short cumulative returns (lag 512, with cost).

    Draws one solid line per entry of ``models`` from its
    ``'cum_ELS_return_512_with_cost'`` column, plus the S&P 500 series in
    black, on a single 20x10 figure.

    Args:
        models: Mapping of display label -> object indexable by ``'date'``
            and ``'cum_ELS_return_512_with_cost'``.
        sp500_data: Object indexable by ``'date'`` and
            ``'cum_SP500_log_return'``.
        save_path: When truthy, the figure is written there via
            ``plt.savefig``; otherwise it is displayed via ``plt.show``.

    Note:
        Colours are taken by position from the module-level
        ``color_palette``, which must hold at least ``len(models)`` entries.
    """
    return_column = 'cum_ELS_return_512_with_cost'
    legend_box = dict(frameon=True, fancybox=True, facecolor='white',
                      edgecolor='black', framealpha=0.8)

    _, ax = plt.subplots(figsize=(20, 10))

    # One solid long-short curve per model, coloured by palette position
    for idx, (name, frame) in enumerate(models.items()):
        shade = color_palette[idx]
        ax.plot(frame['date'], frame[return_column],
                color=shade, linestyle='solid', label=f'{name} (Long-Short)')

    # Benchmark curve; the handle is kept so it can be listed in the legend
    (sp500_line,) = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                            color='black', linewidth=2, label='S&P 500')

    # Dotted reference line at zero cumulative return
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Title, axis labels, tick size and grid
    ax.set_title("Equal-Weighted Portfolio Performance (Long-Short) (Lag 512 - Transaction Cost)", fontsize=14)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Legend entries: one colour swatch per model, then the benchmark line
    handles = [Line2D([0], [0], color=color_palette[k], linewidth=3)
               for k, _ in enumerate(models)]
    handles.append(sp500_line)
    labels = [*models.keys(), 'S&P 500']

    model_legend = ax.legend(handles, labels, loc='upper left', fontsize=10, **legend_box)
    # NOTE(review): no second legend is created in this function, so this
    # add_artist call looks redundant; kept so the rendering is unchanged.
    ax.add_artist(model_legend)

    # Pad the layout and keep the top 10% of the figure free
    plt.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Without a path the figure is only displayed
    if not save_path:
        plt.show()
        return

    plt.savefig(save_path, bbox_inches='tight')
    print(f"Plot saved to {save_path}")

# Write the equal-weighted long-short (lag 512, with cost) figure to disk
save_path = "Long-Short_Portfolio_Performance_Lag512_Cost.png"
plot_portfolio_performance(models=models, sp500_data=sp500, save_path=save_path)


### Long-Short Plot VW (Without TC)

In [None]:
# Display label -> that model's `cumulative_log_returns_by_date_*512` object.
# Insertion order is the plotting order and the legend order.
models = dict(
    [
        ("OLS", cumulative_log_returns_by_date_ols512),
        ("Lasso", cumulative_log_returns_by_date_lasso512),
        ("Ridge", cumulative_log_returns_by_date_ridge512),
        ("ElasticNet", cumulative_log_returns_by_date_enet512),
        ("PCR", cumulative_log_returns_by_date_pcr512),
        ("GLM", cumulative_log_returns_by_date_glm512),
        ("Random Forest", cumulative_log_returns_by_date_rf512),
        ("GBRT", cumulative_log_returns_by_date_gbrt512),
        ("NN1", cumulative_log_returns_by_date_nn1512),
        ("NN2", cumulative_log_returns_by_date_nn2512),
        ("NN3", cumulative_log_returns_by_date_nn3512),
        ("NN4", cumulative_log_returns_by_date_nn4512),
        ("NN5", cumulative_log_returns_by_date_nn5512),
    ]
)

# One tab20 colour per model; the plotting function indexes this by position
color_palette = sns.color_palette(palette="tab20", n_colors=len(models))

def plot_portfolio_performance(
    models,
    sp500_data,
    save_path=None,
    *,
    return_col='cum_VLS_return_512_without_cost',
    title="Value-Weighted Portfolio Performance (Long-Short) (Lag 512 - No Transaction Cost)",
    palette=None,
):
    """Plot each model's cumulative long-short return against the S&P 500.

    Parameters
    ----------
    models : dict
        Maps a legend label to a table that is indexed with ``'date'`` and
        ``return_col`` (presumably pandas DataFrames -- confirm against the
        cells that build them).
    sp500_data
        Benchmark table indexed with ``'date'`` and
        ``'cum_SP500_log_return'``.
    save_path : str, optional
        When truthy the figure is written to this path; otherwise it is
        shown with ``plt.show()``.
    return_col : str, keyword-only
        Column plotted for every model. Defaults to the value-weighted
        long-short series without transaction cost.
    title : str, keyword-only
        Axes title.
    palette : sequence of colours, keyword-only, optional
        One colour per model, matched by position. Defaults to a "tab20"
        palette with ``len(models)`` colours.

    Raises
    ------
    ValueError
        If ``palette`` holds fewer colours than there are models.
    """
    if palette is None:
        # Built here instead of reading a module-level global, so the colours
        # always match the `models` argument that was actually passed in.
        palette = sns.color_palette("tab20", n_colors=len(models))
    elif len(palette) < len(models):
        raise ValueError(
            f"palette has {len(palette)} colours but {len(models)} models were given"
        )

    fig, ax = plt.subplots(figsize=(20, 10))

    legend_handles = []
    legend_labels = []
    for (name, model_data), color in zip(models.items(), palette):
        ax.plot(model_data['date'], model_data[return_col],
                color=color, linestyle='solid', label=f'{name} (Long-Short)')
        # Proxy line with a heavier stroke so the legend swatch is easy to read.
        legend_handles.append(Line2D([0], [0], color=color, linewidth=3))
        legend_labels.append(name)

    # S&P 500 benchmark; its real line doubles as the legend handle.
    sp500_line, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                          color='black', linewidth=2, label='S&P 500')
    legend_handles.append(sp500_line)
    legend_labels.append('S&P 500')

    # Horizontal zero line
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # ax.legend() already attaches the legend to the axes. Re-adding it with
    # ax.add_artist() would draw it a second time on top of itself, which
    # defeats framealpha, so the legend is created exactly once.
    ax.legend(legend_handles, legend_labels, loc='upper left', fontsize=10,
              frameon=True, fancybox=True, facecolor='white', edgecolor='black', framealpha=0.8)

    # rect keeps the top 10% of the figure free of the axes.
    fig.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Save or show
    if save_path:
        fig.savefig(save_path, bbox_inches='tight')
        print(f"Plot saved to {save_path}")
    else:
        plt.show()

# Example usage
# The file name carries a "VW" tag so the value-weighted figure cannot
# overwrite a figure saved under the untagged
# "Long-Short_Portfolio_Performance_Lag512_NoCost.png" name.
# NOTE(review): the equal-weighted no-cost cell presumably uses that untagged
# name -- verify, and update anything that reads this PNG by its old name.
save_path = "Long-Short_VW_Portfolio_Performance_Lag512_NoCost.png"
plot_portfolio_performance(models, sp500, save_path)

### Long-short Plot VW (With TC)

In [None]:
# Legend label -> lag-512 results table (the `cumulative_log_returns_by_date_*`
# objects built earlier in the notebook). Insertion order matters: each entry
# is paired with a colour by position.
models = {
    "OLS": cumulative_log_returns_by_date_ols512,
    "Lasso": cumulative_log_returns_by_date_lasso512,
    "Ridge": cumulative_log_returns_by_date_ridge512,
    "ElasticNet": cumulative_log_returns_by_date_enet512,
    "PCR": cumulative_log_returns_by_date_pcr512,
    "GLM": cumulative_log_returns_by_date_glm512,
    "Random Forest": cumulative_log_returns_by_date_rf512,
    "GBRT": cumulative_log_returns_by_date_gbrt512,
}
# The NN1..NN5 entries follow, in the same order as before.
models.update(
    zip(
        ("NN1", "NN2", "NN3", "NN4", "NN5"),
        (
            cumulative_log_returns_by_date_nn1512,
            cumulative_log_returns_by_date_nn2512,
            cumulative_log_returns_by_date_nn3512,
            cumulative_log_returns_by_date_nn4512,
            cumulative_log_returns_by_date_nn5512,
        ),
    )
)

# One "tab20" colour per entry in `models`
color_palette = sns.color_palette("tab20", n_colors=len(models))

def plot_portfolio_performance(
    models,
    sp500_data,
    save_path=None,
    *,
    return_col='cum_VLS_return_512_with_cost',
    title="Value-Weighted Portfolio Performance (Long-Short) (Lag 512 - Transaction Cost)",
    palette=None,
):
    """Plot each model's cumulative long-short return against the S&P 500.

    Parameters
    ----------
    models : dict
        Maps a legend label to a table that is indexed with ``'date'`` and
        ``return_col`` (presumably pandas DataFrames -- confirm against the
        cells that build them).
    sp500_data
        Benchmark table indexed with ``'date'`` and
        ``'cum_SP500_log_return'``.
    save_path : str, optional
        When truthy the figure is written to this path; otherwise it is
        shown with ``plt.show()``.
    return_col : str, keyword-only
        Column plotted for every model. Defaults to the value-weighted
        long-short series with transaction cost.
    title : str, keyword-only
        Axes title.
    palette : sequence of colours, keyword-only, optional
        One colour per model, matched by position. Defaults to a "tab20"
        palette with ``len(models)`` colours.

    Raises
    ------
    ValueError
        If ``palette`` holds fewer colours than there are models.
    """
    if palette is None:
        # Built here instead of reading a module-level global, so the colours
        # always match the `models` argument that was actually passed in.
        palette = sns.color_palette("tab20", n_colors=len(models))
    elif len(palette) < len(models):
        raise ValueError(
            f"palette has {len(palette)} colours but {len(models)} models were given"
        )

    fig, ax = plt.subplots(figsize=(20, 10))

    legend_handles = []
    legend_labels = []
    for (name, model_data), color in zip(models.items(), palette):
        ax.plot(model_data['date'], model_data[return_col],
                color=color, linestyle='solid', label=f'{name} (Long-Short)')
        # Proxy line with a heavier stroke so the legend swatch is easy to read.
        legend_handles.append(Line2D([0], [0], color=color, linewidth=3))
        legend_labels.append(name)

    # S&P 500 benchmark; its real line doubles as the legend handle.
    sp500_line, = ax.plot(sp500_data['date'], sp500_data['cum_SP500_log_return'],
                          color='black', linewidth=2, label='S&P 500')
    legend_handles.append(sp500_line)
    legend_labels.append('S&P 500')

    # Horizontal zero line
    ax.axhline(0, color='black', linestyle=':', linewidth=1)

    # Axis labels, title and grid
    ax.set_ylabel("Cumulative Log Return", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

    # ax.legend() already attaches the legend to the axes. Re-adding it with
    # ax.add_artist() would draw it a second time on top of itself, which
    # defeats framealpha, so the legend is created exactly once.
    ax.legend(legend_handles, legend_labels, loc='upper left', fontsize=10,
              frameon=True, fancybox=True, facecolor='white', edgecolor='black', framealpha=0.8)

    # rect keeps the top 10% of the figure free of the axes.
    fig.tight_layout(pad=5.0, rect=[0, 0, 1, 0.9])

    # Save or show
    if save_path:
        fig.savefig(save_path, bbox_inches='tight')
        print(f"Plot saved to {save_path}")
    else:
        plt.show()

# Example usage
# The equal-weighted with-cost plot earlier in the notebook is saved as
# "Long-Short_Portfolio_Performance_Lag512_Cost.png"; reusing that exact name
# here would overwrite it, so the value-weighted figure gets a "VW" tag.
# NOTE(review): update anything that reads this PNG by its old name.
save_path = "Long-Short_VW_Portfolio_Performance_Lag512_Cost.png"
plot_portfolio_performance(models, sp500, save_path)