In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- Fama-French factor panels, one per sector ---
# Each panel is sorted by `date`; the rows from the 70% position onwards are
# kept separately as the `df_new_*` frames used further down.
# NOTE(review): the sort runs on the raw CSV `date` values, *before* they are
# parsed with pd.to_datetime. That ordering is only chronological if the file
# stores ISO-8601 style dates — confirm against the CSVs before relying on
# the 70% cut being a time-based split.
HOLDOUT_START = 0.7

df_healthcare = (
    pd.read_csv("df_ff_factors_100325.csv")
    .drop(columns=["Unnamed: 0", "crsp_portno"])
    .sort_values(by='date')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
df_new_healthcare = df_healthcare.copy().iloc[int(len(df_healthcare) * HOLDOUT_START):]

df_tech = (
    pd.read_csv("df_ff_factors_techfunds.csv")
    .drop(columns=["Unnamed: 0", "crsp_portno"])
    .sort_values(by='date')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
df_new_tech = df_tech.copy().iloc[int(len(df_tech) * HOLDOUT_START):]

# --- Alpha model training results (all models) ---

## --- SECTOR 1: HEALTHCARE --- ##
# Dataset 1: gradient-boosting run (the exported row-number column is dropped)
tuned_results_xgboost = (
    pd.read_csv("tuned_results_xgboostlgbm_yearly_tuned_every_round.csv")
    .drop(columns=["Unnamed: 0"])
)

# Dataset 2: LSTM run — only the actual alpha and the LSTM forecast are kept
tuned_results_lstm = (
    pd.read_csv("tuned_results.csv")
    .drop(columns=["Unnamed: 0"])
    .loc[:, ['rolling_alpha_5f', 'lstm']]
)

## --- SECTOR 2: TECHNOLOGY --- ##
# Dataset 1: gradient-boosting run (its own `lstm` column is discarded)
tuned_results_tech = (
    pd.read_csv("tech_funds_forecast_tuned.csv")
    .drop(columns=["Unnamed: 0", 'lstm'])
)
# Dataset 2: LSTM run
tuned_results_lstm_tech = (
    pd.read_csv("tech_funds_forecast.csv")
    .drop(columns=["Unnamed: 0"])
)

In [None]:
# rolling_alphas = "rolling_alpha_5f"

# dataset = df_new_tech[["crsp_fundno", "date", rolling_alphas]]

# merged_df_tech = pd.merge(tuned_results_tech, dataset, how="left", on=rolling_alphas)
# merged_df_no_id_tech = merged_df_tech.drop(columns=['crsp_fundno'])
# merged_df_no_id_tech = merged_df_no_id_tech.set_index('date')
# feature_cols = [col for col in merged_df_no_id_tech.columns if (col != rolling_alphas)]
# combined_df_tech = merged_df_no_id_tech[~merged_df_no_id_tech.index.isna()]
# all_results_tech

In [3]:
rolling_alphas = "rolling_alpha_5f"

# Lookup table (fund id, date, actual alpha) for the technology hold-out rows.
dataset = df_new_tech.loc[:, ["crsp_fundno", "date", rolling_alphas]]

# NOTE(review): the forecasts are re-attached to their dates by joining on the
# alpha *value* itself. Two rows sharing the same alpha would be cross-matched,
# and any rounding difference between the files would leave the date missing
# (those rows are dropped below). Confirm the value is unique per fund/date.

# --- Gradient-boosting results: attach dates, drop unmatched rows, sort ---
merged_df_xgboost = tuned_results_tech.merge(dataset, how="left", on=rolling_alphas)
merged_df_no_id_xgboost = (
    merged_df_xgboost
    .drop(columns=['crsp_fundno'])
    .set_index('date')
)
feature_cols = [name for name in merged_df_no_id_xgboost.columns if name != rolling_alphas]
all_results_tech_xgboost = merged_df_no_id_xgboost.loc[merged_df_no_id_xgboost.index.notna()]
all_results_tech_xgboost_sorted = (
    all_results_tech_xgboost
    .reset_index()
    .sort_values(by=['date', 'rolling_alpha_5f'], ascending=True)
    .reset_index(drop=True)
)

# --- LSTM results: same treatment ---
merged_df_lstm = tuned_results_lstm_tech.merge(dataset, how="left", on=rolling_alphas)
merged_df_no_id_lstm = (
    merged_df_lstm
    .drop(columns=['crsp_fundno'])
    .set_index('date')
)
feature_cols = [name for name in merged_df_no_id_lstm.columns if name != rolling_alphas]

all_results_tech_lstm = merged_df_no_id_lstm.loc[merged_df_no_id_lstm.index.notna()]

all_results_tech_lstm_sorted = (
    all_results_tech_lstm
    .reset_index()
    .sort_values(by=['date', 'rolling_alpha_5f'], ascending=True)
    .reset_index(drop=True)
)

# Put the two sorted tables side by side and keep the first occurrence of any
# repeated column name.
# NOTE(review): the rows are paired purely by position after sorting, so this
# assumes both tables contain exactly the same (date, alpha) rows — verify.
combined_df_tech = pd.concat(
    [all_results_tech_xgboost_sorted, all_results_tech_lstm_sorted],
    axis=1,
)
combined_df_tech = (
    combined_df_tech
    .loc[:, ~combined_df_tech.columns.duplicated()]
    .set_index('date')
)

Combine the LSTM results with the results of the remaining models for the HEALTHCARE sector

In [4]:
rolling_alphas = "rolling_alpha_5f"

# Lookup table (fund id, date, actual alpha) for the healthcare hold-out rows.
dataset = df_new_healthcare.loc[:, ["crsp_fundno", "date", rolling_alphas]]

# NOTE(review): the forecasts are re-attached to their dates by joining on the
# alpha *value* itself. Two rows sharing the same alpha would be cross-matched,
# and any rounding difference between the files would leave the date missing
# (those rows are dropped below). Confirm the value is unique per fund/date.

# --- Gradient-boosting results: attach dates, drop unmatched rows, sort ---
merged_df_xgboost = tuned_results_xgboost.merge(dataset, how="left", on=rolling_alphas)
merged_df_no_id_xgboost = (
    merged_df_xgboost
    .drop(columns=['crsp_fundno'])
    .set_index('date')
)
feature_cols = [name for name in merged_df_no_id_xgboost.columns if name != rolling_alphas]
all_results_healthcare_xgboost = merged_df_no_id_xgboost.loc[merged_df_no_id_xgboost.index.notna()]
all_results_healthcare_xgboost_sorted = (
    all_results_healthcare_xgboost
    .reset_index()
    .sort_values(by=['date', 'rolling_alpha_5f'], ascending=True)
    .reset_index(drop=True)
)

# --- LSTM results: same treatment ---
merged_df_lstm = tuned_results_lstm.merge(dataset, how="left", on=rolling_alphas)
merged_df_no_id_lstm = (
    merged_df_lstm
    .drop(columns=['crsp_fundno'])
    .set_index('date')
)
feature_cols = [name for name in merged_df_no_id_lstm.columns if name != rolling_alphas]

all_results_healthcare_lstm = merged_df_no_id_lstm.loc[merged_df_no_id_lstm.index.notna()]

all_results_healthcare_lstm_sorted = (
    all_results_healthcare_lstm
    .reset_index()
    .sort_values(by=['date', 'rolling_alpha_5f'], ascending=True)
    .reset_index(drop=True)
)

# Put the two sorted tables side by side and keep the first occurrence of any
# repeated column name.
# NOTE(review): the rows are paired purely by position after sorting, so this
# assumes both tables contain exactly the same (date, alpha) rows — verify.
combined_df_healthcare = pd.concat(
    [all_results_healthcare_xgboost_sorted, all_results_healthcare_lstm_sorted],
    axis=1,
)
combined_df_healthcare = (
    combined_df_healthcare
    .loc[:, ~combined_df_healthcare.columns.duplicated()]
    .set_index('date')
)

In [5]:
def overall_df_generator(combined_df, window=12, actual_col='rolling_alpha_5f'):
    """Append a trailing-mean benchmark of the realized alpha to ``combined_df``.

    The unique index values are taken in order of first appearance. For the
    value at position ``i`` the benchmark is the mean of ``actual_col`` over
    *all rows* whose index value is one of the ``window`` preceding unique
    values (the current one is excluded). The first ``window`` unique values
    have no full history and receive a benchmark of 0.

    Parameters
    ----------
    combined_df : pandas.DataFrame
        Frame indexed by date (several rows may share a date) containing
        ``actual_col``. It is not modified.
        NOTE(review): the "previous" dates are determined by row order, so the
        frame is assumed to be sorted chronologically — confirm at the caller.
    window : int, default 12
        Number of preceding unique dates pooled into the benchmark.
    actual_col : str, default 'rolling_alpha_5f'
        Column holding the realized value to average.

    Returns
    -------
    pandas.DataFrame
        Copy of ``combined_df`` (same index and row order) with an additional
        ``benchmark_alpha`` column.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    KeyError
        If ``actual_col`` is not a column of ``combined_df``.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    unique_dates = combined_df.index.unique()
    actual = combined_df[actual_col]

    benchmark_values = []
    for pos in range(len(unique_dates)):
        if pos < window:
            # Not enough history yet; 0 (rather than NaN) is kept so existing
            # downstream outputs stay unchanged.
            benchmark_values.append(0)
            continue
        # Pool every row that belongs to the `window` preceding dates.
        previous_dates = unique_dates[pos - window:pos]
        benchmark_values.append(actual[actual.index.isin(previous_dates)].mean())

    benchmark = pd.Series(benchmark_values, index=unique_dates, dtype=float)

    # Broadcast the per-date benchmark back onto every row. Assigning through
    # reindex keeps the original index (whatever its name) and row order, and
    # avoids a reset_index/merge/set_index round trip.
    combined_benchmark = combined_df.copy()
    combined_benchmark['benchmark_alpha'] = benchmark.reindex(combined_benchmark.index).to_numpy()
    return combined_benchmark

# Attach the trailing-mean benchmark column to each sector's combined table.
combined_benchmark_healthcare, combined_benchmark_tech = (
    overall_df_generator(sector_df)
    for sector_df in (combined_df_healthcare, combined_df_tech)
)
# combined_benchmark.to_csv('healthcare_results_withbenchmark_new.csv')

In [6]:
# Forecast columns produced by the individual models; default inputs of the
# combinations built in forecast_combi_fn.
individ_models = ['lasso', 'ridge', 'xgboost', 'lgbm', 'rf', 'pca', 'lstm']

def forecast_combi_fn(df, actual_col='rolling_alpha_5f', models=None, eps=1e-6):
    """Add two forecast combinations of the individual model forecasts.

    * ``simple_avg``   – row-wise mean of the model forecasts.
    * ``err_weighted`` – row-wise weighted mean where each model's weight is
      proportional to ``1 / (squared error + eps)``, normalized to sum to 1.
      The first row has no weights and falls back to the simple average.

    NOTE(review): the squared errors are computed against ``actual_col`` of
    the *same* row that is being combined, so the weights use the realized
    value of the observation they are applied to. A Bates-Granger style
    combination would normally derive the weights from earlier errors —
    confirm whether this look-ahead is intended before treating
    ``err_weighted`` as an out-of-sample forecast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the model forecast columns and ``actual_col``.
        It is not modified; its index is moved into a column.
    actual_col : str, default 'rolling_alpha_5f'
        Column with the realized value used for the error weights.
    models : sequence of str, optional
        Forecast columns to combine. Defaults to ``individ_models``.
    eps : float, default 1e-6
        Added to the squared errors to avoid division by zero.

    Returns
    -------
    pandas.DataFrame
        ``df.reset_index()`` with ``simple_avg`` and ``err_weighted`` appended.
    """
    model_cols = list(individ_models if models is None else models)

    df = df.reset_index()
    forecasts = df[model_cols]

    # Combination 1: simple average forecast
    df['simple_avg'] = forecasts.mean(axis=1)

    # Combination 2: error-weighted forecast, computed for all rows at once.
    sq_errors = forecasts.sub(df[actual_col], axis=0) ** 2
    inv_weights = 1 / (sq_errors + eps)
    norm_weights = inv_weights.div(inv_weights.sum(axis=1), axis=0)
    err_weighted = (forecasts * norm_weights).sum(axis=1)

    # First row uses the simple average (guarded so an empty frame is fine).
    if len(err_weighted) > 0:
        err_weighted.iloc[0] = df['simple_avg'].iloc[0]

    df['err_weighted'] = err_weighted
    return df

# Add the simple-average and error-weighted combinations for each sector.
combined_forecastcombi_healthcare, combined_forecastcombi_tech = (
    forecast_combi_fn(sector_benchmark)
    for sector_benchmark in (combined_benchmark_healthcare, combined_benchmark_tech)
)

In [None]:
# Persist both sector tables. to_csv is called with its defaults, so the row
# index is written as the first column of each file.
for csv_name, sector_frame in (
    ('healthcare_with_combi_1.csv', combined_forecastcombi_healthcare),
    ('tech_with_combi_1.csv', combined_forecastcombi_tech),
):
    sector_frame.to_csv(csv_name)