# Framework for predictions and portfolio forming

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import time

# import the parquet library
import pyarrow.parquet as pq

# import model libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, accuracy_score

In [2]:
# load 'basemodel.parquet'
#df = pd.read_parquet('basemodel.parquet')
df= pd.read_parquet('/kaggle/input/sign-prediction-datasets/clean_financials.parquet')
#df= pd.read_parquet('clean_financials.parquet')
prediction_cols = []
df.head()

Unnamed: 0,PERMNO,date,y,RET,LME,ME,bull_D,bear_D,bull_W,bear_W,bull_M,bear_M,SPREAD_L,LTURNOVER,IVOL,IVOL2,IVOL3,STR,LTR,IMOM,MOM,PCTHIGH,MVOL,MVOL2,MVOL3,LMKT,LMKT2,LMKT3,MMOM,MIMOM,MLTR,release_L,LAT,DPI2A_L,CTO_L,D2A_L,NOA_L,OL_L,PCM_L,FC2Y_L,INVEST_L,RNA_L,S2E_L,PROF_L,PM_L,ATO_L,ROA_L,FCF_L,A2ME_L,B2ME_L,S2P_L,ROE_L,LEV_L,TQ_L
0,10000,1987-03-31,0,-0.384615,1581.53125,973.25,0.0,0.0,0.0,1.0,0,0,0.076923,0.100694,0.000612,0.00083,0.002413,0.0,-0.297252,-1.089044,-2.079441,0.091549,0.545027,0.771919,0.43419,0.0482,0.1289,-0.0278,0.186212,0.112764,0.45911,0.0,2.115,0.0,0.055319,0.037352,0.0,0.236407,-1.495726,1.777778,0.0,0.016564,0.279904,0.064593,0.230769,0.071779,-0.061939,-0.024586,0.001067,0.000211,5.9e-05,-8.3e-05,3.7e-05,748.571277
1,10000,1987-04-30,0,-0.0625,973.25,912.44134,0.0,0.0,0.0,0.0,0,1,0.625,0.285384,0.003465,0.000612,0.00083,-0.384615,0.014185,-1.459321,-2.390877,0.0625,0.672597,0.545027,0.771919,0.0211,0.0482,0.1289,0.179935,-0.025601,0.498953,0.0,2.115,0.0,0.055319,0.037352,0.0,0.236407,-1.495726,1.777778,0.0,0.016564,0.279904,0.064593,0.230769,0.071779,-0.061939,-0.024586,0.001067,0.000211,5.9e-05,-0.000135,6e-05,460.967849
2,10000,1987-05-31,0,-0.066667,912.44134,851.59375,0.0,0.0,0.0,0.0,0,2,0.13333,0.256358,0.001893,0.003465,0.000612,-0.0625,-0.089613,-1.633155,-2.772587,0.075378,1.565461,0.672597,0.545027,-0.0167,0.0211,0.0482,0.208747,0.032263,0.488026,1.0,2.115,-0.037825,0.074232,0.028309,-0.382033,0.293803,-1.101911,0.343949,-38.20331,-0.36864,-1.121429,2.535714,-2.261146,0.163032,-0.108274,-0.146901,0.00066,-7.1e-05,7.9e-05,-0.000251,1.8e-05,699.225968
3,10000,1987-06-30,0,0.0,851.59375,851.59375,0.0,0.0,0.0,0.0,0,0,0.071451,0.20935,0.00019,0.001893,0.003465,-0.066667,-0.341485,-1.323014,-2.585254,0.070707,0.955605,1.565461,0.672597,0.0049,-0.0167,0.0211,0.142069,-0.002095,0.591159,0.0,2.115,-0.037825,0.074232,0.028309,-0.382033,0.293803,-1.101911,0.343949,-38.20331,-0.36864,-1.121429,2.535714,-2.261146,0.163032,-0.108274,-0.146901,0.00066,-7.1e-05,7.9e-05,-0.000269,1.9e-05,652.670811
4,10005,1987-03-31,0,0.0,795.11688,795.11688,0.0,0.0,0.0,0.0,0,0,0.090936,0.016213,0.000532,0.021169,1.3e-05,0.1,-0.405465,-1.163152,-0.470004,0.578954,0.545027,0.771919,0.43419,0.0482,0.1289,-0.0278,0.186212,0.112764,0.45911,0.0,1.954,-0.001485,0.023762,0.015865,-0.032673,0.044012,0.645833,1.4375,-3.267327,-0.423313,0.025263,-0.036316,-1.4375,0.294479,-0.033168,-0.018424,0.004506,0.004381,0.000111,-8.4e-05,4.7e-05,406.945179


In [3]:
# select the columns to be used for prediction
X_col = ['bull_D', 'bear_D', 'bull_W', 'bear_W', 'bull_M', 'bear_M', 'LMKT', 'IVOL', # initial columns
         'STR',	'LTURNOVER', 'IMOM', 'MOM',	'LTR', 'PCTHIGH', 'IVOL2', 'IVOL3', 'SPREAD_L',	# stock specific columns
         'MVOL', 'MVOL2', 'MVOL3', 'LMKT2',	'LMKT3', 'MMOM', 'MIMOM', 'MLTR', # market specific columns
         'LAT',	'DPI2A_L',	'CTO_L', 'D2A_L', 'NOA_L', 'OL_L',	'PCM_L', 'FC2Y_L',	
         'INVEST_L', 'RNA_L', 'S2E_L', 'PROF_L', 'PM_L', 'ATO_L', 'ROA_L', 	
         'FCF_L', 'A2ME_L',	'B2ME_L', 'S2P_L', 'ROE_L',	'LEV_L', 'TQ_L'] 

In [4]:
# scale the data for faster coeficient convergence
scaler = MinMaxScaler()

df[X_col] = scaler.fit_transform(df[X_col])

In [5]:
# Convert 'date' to datetime format (if not already done) and sort the DataFrame
df['date'] = pd.to_datetime(df['date'])
df.sort_values(by='date', inplace=True)
df.reset_index(drop=True, inplace=True)

# Create a 'year' column based on the 'date' column
df['year'] = df['date'].dt.year

# Paper Replication - OLS and Logit, Expanding Window - No Hyperparameters
- They start with out of sample forecasting in 1932
- models will be named model_default

### Linear Regression (Pooled OLS)

In [6]:
#################################
# OLS, default, exp window
#################################

model_name = 'ols_default'


# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

for year in range(df['year'].min() + 6, df['year'].max() + 1):
    start_time = time.time()  # Start timing
    
    # Define the training data up until this year
    train_data = df[df['year'] < year]
    
    X_train = train_data[X_col]
    y_train = train_data['y']
    
    # Train the Linear Regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    
    if not X_next_year.empty:
        next_year_predictions = model.predict(X_next_year)
        df.loc[df['year'] == year, model_name] = next_year_predictions
    
    end_time = time.time()  # End timing
    iteration_time = end_time - start_time  # Calculate iteration time
    
    print(f"Year {year} - Time: {iteration_time:.2f} seconds")

Year 1976 - Time: 0.25 seconds
Year 1977 - Time: 0.37 seconds
Year 1978 - Time: 0.42 seconds
Year 1979 - Time: 0.49 seconds
Year 1980 - Time: 0.52 seconds
Year 1981 - Time: 0.57 seconds
Year 1982 - Time: 0.63 seconds
Year 1983 - Time: 0.71 seconds
Year 1984 - Time: 0.79 seconds
Year 1985 - Time: 0.95 seconds
Year 1986 - Time: 1.08 seconds
Year 1987 - Time: 1.20 seconds
Year 1988 - Time: 1.59 seconds
Year 1989 - Time: 1.73 seconds
Year 1990 - Time: 1.90 seconds
Year 1991 - Time: 2.05 seconds
Year 1992 - Time: 2.22 seconds
Year 1993 - Time: 2.37 seconds
Year 1994 - Time: 2.53 seconds
Year 1995 - Time: 3.15 seconds
Year 1996 - Time: 2.86 seconds
Year 1997 - Time: 3.13 seconds
Year 1998 - Time: 3.30 seconds
Year 1999 - Time: 3.47 seconds
Year 2000 - Time: 3.71 seconds
Year 2001 - Time: 3.91 seconds
Year 2002 - Time: 4.07 seconds
Year 2003 - Time: 4.36 seconds
Year 2004 - Time: 4.45 seconds
Year 2005 - Time: 4.47 seconds
Year 2006 - Time: 4.69 seconds
Year 2007 - Time: 4.75 seconds
Year 200

# My Experiments

## Machine Learning - Hyperparameter Tuning included in the process
- models to be named 'model_clas/reg_exp/roll'

### First expanding, then rolling
start predicting for 1932, expand the window until you reach X years, then roll it

##### 5 year window

In [7]:
# set the length of the rolling window
rolling_window = 5 # years

In [8]:
#################################
# RIDGE REGRESSION
#################################


model_name = 'ridge_reg_roll5'  # Name for storing Ridge regression predictions
start_time2 = time.time()  # Start timing


# Predefined set of alpha values for hyperparameter tuning
HP1 = [0.01, 0.1, 1, 10, 100]  # Alpha for Ridge

# Update the column name for storing Ridge regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

# Define the start year for modeling based on having at least 7 years of data
start_modeling_year = df['year'].min() + 6


for year in range(start_modeling_year, df['year'].max() + 1):
    start_time = time.time()  # Start timing
    
    # Prepare training and tuning datasets
    train_start_year = max(year - rolling_window, df['year'].min())
    train_data = df[(df['year'] >= train_start_year) & (df['year'] < year)]
    
    tuning_data = train_data[train_data['year'] == year - 1]
    actual_train_data = train_data[train_data['year'] < year - 1]
    
    X_train = actual_train_data[X_col]
    y_train = actual_train_data['y']
    X_tune = tuning_data[X_col]
    y_tune = tuning_data['y']
    
    best_HP1 = None
    best_accuracy = 0  # Initialize with 0 for accuracy comparison
    
    # Hyperparameter tuning with accuracy on top and bottom deciles
    for hp1 in HP1:
        model = Ridge(alpha=hp1)
        model.fit(X_train, y_train)
        predictions = model.predict(X_tune)
        
        # Identify top and bottom deciles
        decile_thresholds = np.percentile(predictions, [10, 90])
        top_bottom_decile_mask = (predictions <= decile_thresholds[0]) | (predictions >= decile_thresholds[1])
        
        # Filter tuning dataset based on deciles
        filtered_y_tune = y_tune[top_bottom_decile_mask]
        filtered_predictions = predictions[top_bottom_decile_mask]
        
        # Assuming binary classification, convert continuous predictions to binary
        # This conversion logic might need adjustment based on your specific use case
        binary_predictions = (filtered_predictions >= 0.5).astype(int)
        
        # Calculate accuracy for filtered predictions
        decile_accuracy = accuracy_score(filtered_y_tune, binary_predictions)
        
        if decile_accuracy > best_accuracy:
            best_accuracy = decile_accuracy
            best_HP1 = hp1
    
    # Retrain with best hyperparameters on the entire training data
    model = Ridge(alpha=best_HP1)
    model.fit(X_train, y_train)
    
    # Predict for the next year and update df
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    if not X_next_year.empty:
        next_year_predictions = model.predict(X_next_year)
        df.loc[df['year'] == year, model_name] = next_year_predictions
    
    end_time = time.time()  # End timing
    iteration_time = end_time - start_time  # Calculate iteration time
    
    print(f"Year {year} - Best Alpha: {best_HP1}, Best Decile Accuracy: {round(best_accuracy, 4)}, Time: {iteration_time:.2f} seconds")

end_time2 = time.time()  # End timing
print(f"Total time: {end_time2 - start_time2:.2f} seconds")

Year 1976 - Best Alpha: 0.1, Best Decile Accuracy: 0.9705, Time: 0.45 seconds
Year 1977 - Best Alpha: 100, Best Decile Accuracy: 0.9985, Time: 0.62 seconds
Year 1978 - Best Alpha: 1, Best Decile Accuracy: 0.9985, Time: 0.67 seconds
Year 1979 - Best Alpha: 10, Best Decile Accuracy: 0.998, Time: 0.68 seconds
Year 1980 - Best Alpha: 10, Best Decile Accuracy: 0.9977, Time: 0.71 seconds
Year 1981 - Best Alpha: 1, Best Decile Accuracy: 0.9974, Time: 0.67 seconds
Year 1982 - Best Alpha: 100, Best Decile Accuracy: 0.9956, Time: 0.67 seconds
Year 1983 - Best Alpha: 10, Best Decile Accuracy: 0.9946, Time: 0.66 seconds
Year 1984 - Best Alpha: 10, Best Decile Accuracy: 0.9973, Time: 0.69 seconds
Year 1985 - Best Alpha: 100, Best Decile Accuracy: 0.9928, Time: 0.77 seconds
Year 1986 - Best Alpha: 10, Best Decile Accuracy: 0.9968, Time: 0.82 seconds
Year 1987 - Best Alpha: 10, Best Decile Accuracy: 0.9973, Time: 0.94 seconds
Year 1988 - Best Alpha: 100, Best Decile Accuracy: 0.9912, Time: 1.03 secon

In [9]:
#################################
# LASSO REGRESSION
#################################


model_name = 'lasso_reg_roll5'  # Name for storing Ridge regression predictions
start_time2 = time.time()  # Start timing


# Predefined set of alpha values for hyperparameter tuning
HP1 = [0.01, 0.1, 1, 10, 100]  # Alpha for Ridge

# Update the column name for storing Ridge regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

# Define the start year for modeling based on having at least 7 years of data
start_modeling_year = df['year'].min() + 6


for year in range(start_modeling_year, df['year'].max() + 1):
    start_time = time.time()  # Start timing
    
    # Prepare training and tuning datasets
    train_start_year = max(year - rolling_window, df['year'].min())
    train_data = df[(df['year'] >= train_start_year) & (df['year'] < year)]
    
    tuning_data = train_data[train_data['year'] == year - 1]
    actual_train_data = train_data[train_data['year'] < year - 1]
    
    X_train = actual_train_data[X_col]
    y_train = actual_train_data['y']
    X_tune = tuning_data[X_col]
    y_tune = tuning_data['y']
    
    best_HP1 = None
    best_accuracy = 0  # Initialize with 0 for accuracy comparison
    
    # Hyperparameter tuning with accuracy on top and bottom deciles
    for hp1 in HP1:
        model = Lasso(alpha=hp1)
        model.fit(X_train, y_train)
        predictions = model.predict(X_tune)
        
        # Identify top and bottom deciles
        decile_thresholds = np.percentile(predictions, [10, 90])
        top_bottom_decile_mask = (predictions <= decile_thresholds[0]) | (predictions >= decile_thresholds[1])
        
        # Filter tuning dataset based on deciles
        filtered_y_tune = y_tune[top_bottom_decile_mask]
        filtered_predictions = predictions[top_bottom_decile_mask]
        
        # Assuming binary classification, convert continuous predictions to binary
        # This conversion logic might need adjustment based on your specific use case
        binary_predictions = (filtered_predictions >= 0.5).astype(int)
        
        # Calculate accuracy for filtered predictions
        decile_accuracy = accuracy_score(filtered_y_tune, binary_predictions)
        
        if decile_accuracy > best_accuracy:
            best_accuracy = decile_accuracy
            best_HP1 = hp1
    
    # Retrain with best hyperparameters on the entire training data
    model = Lasso(alpha=best_HP1)
    model.fit(X_train, y_train)
    
    # Predict for the next year and update df
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    if not X_next_year.empty:
        next_year_predictions = model.predict(X_next_year)
        df.loc[df['year'] == year, model_name] = next_year_predictions
    
    end_time = time.time()  # End timing
    iteration_time = end_time - start_time  # Calculate iteration time
    
    print(f"Year {year} - Best Alpha: {best_HP1}, Best Decile Accuracy: {round(best_accuracy, 4)}, Time: {iteration_time:.2f} seconds")

end_time2 = time.time()  # End timing
print(f"Total time: {end_time2 - start_time2:.2f} seconds")

Year 1976 - Best Alpha: 0.01, Best Decile Accuracy: 0.5396, Time: 0.42 seconds
Year 1977 - Best Alpha: 0.01, Best Decile Accuracy: 0.9996, Time: 0.75 seconds
Year 1978 - Best Alpha: 0.01, Best Decile Accuracy: 0.9988, Time: 0.97 seconds
Year 1979 - Best Alpha: 0.01, Best Decile Accuracy: 1.0, Time: 0.79 seconds
Year 1980 - Best Alpha: 0.01, Best Decile Accuracy: 0.9966, Time: 0.71 seconds
Year 1981 - Best Alpha: 0.01, Best Decile Accuracy: 0.9851, Time: 0.74 seconds
Year 1982 - Best Alpha: 0.01, Best Decile Accuracy: 0.9703, Time: 0.83 seconds
Year 1983 - Best Alpha: 0.01, Best Decile Accuracy: 0.9844, Time: 0.89 seconds
Year 1984 - Best Alpha: 0.01, Best Decile Accuracy: 0.9959, Time: 0.84 seconds
Year 1985 - Best Alpha: 0.01, Best Decile Accuracy: 0.9987, Time: 1.02 seconds
Year 1986 - Best Alpha: 0.01, Best Decile Accuracy: 0.9995, Time: 1.21 seconds
Year 1987 - Best Alpha: 0.01, Best Decile Accuracy: 1.0, Time: 1.01 seconds
Year 1988 - Best Alpha: 0.01, Best Decile Accuracy: 1.0, T

In [10]:
#################################
# DECISION TREE REGRESSION
#################################

model_name = 'DT_reg_roll5'  # Name for storing Decision Tree regression predictions

# Predefined set of values for hyperparameter tuning (max_depth in this case)
HP1 = [None, 5, 10, 15, 20, 25]  # Possible values for max_depth

# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

# Define the start year for modeling based on having at least 7 years of data
start_modeling_year = df['year'].min() + 6


for year in range(start_modeling_year, df['year'].max() + 1):
    start_time = time.time()  # Start timing
    
    # Determine the start year of the training window based on the current year
    train_start_year = max(year - rolling_window, df['year'].min())  # Ensure it does not go below the earliest year
    
    # Select the training data based on the calculated start year
    train_data = df[(df['year'] >= train_start_year) & (df['year'] < year)]
    
    # Split training data into actual training and tuning sets
    # Use the last year of the training data for tuning
    tuning_data = train_data[train_data['year'] == year - 1]
    actual_train_data = train_data[train_data['year'] < year - 1]
    
    X_train = actual_train_data[X_col]
    y_train = actual_train_data['y']
    
    X_tune = tuning_data[X_col]
    y_tune = tuning_data['y']
    
    best_HP1 = None
    best_accuracy = -1  # Initialize with infinity
    
    # Hyperparameter tuning
    for hp1 in HP1:
        model = DecisionTreeRegressor(max_depth=hp1)  # Decision Tree Regressor
        model.fit(X_train, y_train)
        predictions = model.predict(X_tune)
        
        # Identify top and bottom deciles
        decile_thresholds = np.percentile(predictions, [10, 90])
        top_bottom_decile_mask = (predictions <= decile_thresholds[0]) | (predictions >= decile_thresholds[1])
        
        # Filter tuning dataset based on deciles
        filtered_y_tune = y_tune[top_bottom_decile_mask]
        filtered_predictions = predictions[top_bottom_decile_mask]
        
        # Assuming binary classification, convert continuous predictions to binary
        # This conversion logic might need adjustment based on your specific use case
        binary_predictions = (filtered_predictions >= 0.5).astype(int)
        
        # Calculate accuracy for filtered predictions
        decile_accuracy = accuracy_score(filtered_y_tune, binary_predictions)
        
        if decile_accuracy > best_accuracy:
            best_accuracy = decile_accuracy
            best_HP1 = hp1
    
    # Retrain on the entire training window (excluding tuning year) with the best max_depth value
    model = DecisionTreeRegressor(max_depth=best_HP1)
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    
    if not X_next_year.empty:
        next_year_predictions = model.predict(X_next_year)  # Predict continuous values
        df.loc[df['year'] == year, model_name] = next_year_predictions
    
    end_time = time.time()  # End timing
    iteration_time = end_time - start_time  # Calculate iteration time
    
    print(f"Year {year} - Best max_depth: {best_HP1}, Best ACC: {round(best_accuracy,4)}, Time: {iteration_time:.2f} seconds")


Year 1976 - Best max_depth: 5, Best ACC: 1.0, Time: 6.07 seconds
Year 1977 - Best max_depth: 5, Best ACC: 1.0, Time: 9.12 seconds
Year 1978 - Best max_depth: 5, Best ACC: 1.0, Time: 10.51 seconds
Year 1979 - Best max_depth: 5, Best ACC: 1.0, Time: 10.80 seconds
Year 1980 - Best max_depth: 5, Best ACC: 1.0, Time: 10.85 seconds
Year 1981 - Best max_depth: 5, Best ACC: 1.0, Time: 10.91 seconds
Year 1982 - Best max_depth: 5, Best ACC: 1.0, Time: 10.36 seconds
Year 1983 - Best max_depth: 5, Best ACC: 1.0, Time: 10.34 seconds
Year 1984 - Best max_depth: 5, Best ACC: 1.0, Time: 9.75 seconds
Year 1985 - Best max_depth: 5, Best ACC: 1.0, Time: 11.45 seconds
Year 1986 - Best max_depth: 5, Best ACC: 1.0, Time: 14.37 seconds
Year 1987 - Best max_depth: 5, Best ACC: 1.0, Time: 16.84 seconds
Year 1988 - Best max_depth: 5, Best ACC: 1.0, Time: 19.90 seconds
Year 1989 - Best max_depth: 5, Best ACC: 1.0, Time: 20.75 seconds
Year 1990 - Best max_depth: 5, Best ACC: 1.0, Time: 21.40 seconds
Year 1991 - B

In [11]:
#################################
# RANDOM FOREST REGRESSION
#################################
start_time2 = time.time()  # Start timing

model_name = 'RF_reg_roll5'  # Name for storing Random Forest regression predictions

# Predefined set of values for hyperparameter tuning
HP1 = [10, 50, 100, 200]  # Possible values for n_estimators
HP2 = [5, 10, 15, None]  # Possible values for max_depth

# Update the column name for storing Random Forest regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

# Define the start year for modeling based on having at least 7 years of data
start_modeling_year = df['year'].min() + 6


for year in range(start_modeling_year, df['year'].max() + 1):
    start_time = time.time()  # Start timing
    
    # Determine the start year of the training window based on the current year
    train_start_year = max(year - rolling_window, df['year'].min())  # Ensure it does not go below the earliest year
    
    # Select the training data based on the calculated start year
    train_data = df[(df['year'] >= train_start_year) & (df['year'] < year)]
    
    # Split training data into actual training and tuning sets
    # Use the last year of the training data for tuning
    tuning_data = train_data[train_data['year'] == year - 1]
    actual_train_data = train_data[train_data['year'] < year - 1]
    
    X_train = actual_train_data[X_col]
    y_train = actual_train_data['y']
    
    X_tune = tuning_data[X_col]
    y_tune = tuning_data['y']
    
    best_HP1 = None
    best_HP2 = None
    best_accuracy = -1
    
    # Hyperparameter tuning
    for hp1 in HP1:
        for hp2 in HP2:
            model = RandomForestRegressor(n_estimators=hp1, max_depth=hp2, random_state=42, n_jobs = -1)
            model.fit(X_train, y_train)
            predictions = model.predict(X_tune)  # Predict continuous values
            
            # Identify top and bottom deciles
            decile_thresholds = np.percentile(predictions, [10, 90])
            top_bottom_decile_mask = (predictions <= decile_thresholds[0]) | (predictions >= decile_thresholds[1])

            # Filter tuning dataset based on deciles
            filtered_y_tune = y_tune[top_bottom_decile_mask]
            filtered_predictions = predictions[top_bottom_decile_mask]

            # Assuming binary classification, convert continuous predictions to binary
            # This conversion logic might need adjustment based on your specific use case
            binary_predictions = (filtered_predictions >= 0.5).astype(int)

            # Calculate accuracy for filtered predictions
            decile_accuracy = accuracy_score(filtered_y_tune, binary_predictions)

            if decile_accuracy > best_accuracy:
                best_accuracy = decile_accuracy
                best_HP1 = hp1
                best_HP2 = hp2
    
    # Retrain on the entire training window (excluding tuning year) with the best hyperparameters
    model = RandomForestRegressor(n_estimators=best_HP1, max_depth=best_HP2, random_state=42, n_jobs = -1)
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    
    if not X_next_year.empty:
        next_year_predictions = model.predict(X_next_year)  # Predict continuous values
        df.loc[df['year'] == year, model_name] = next_year_predictions
    
    end_time = time.time()  # End timing
    iteration_time = end_time - start_time  # Calculate iteration time
    
    print(f"Year {year} - Best n_estimators: {best_HP1}, Best max_depth: {best_HP2}, Best ACC: {round(best_accuracy,2)}, Time: {iteration_time:.2f} seconds")

end_time2 = time.time()
print(f"Total time: {end_time2 - start_time2:.2f} seconds")

Year 1976 - Best n_estimators: 10, Best max_depth: 5, Best ACC: 1.0, Time: 286.59 seconds
Year 1977 - Best n_estimators: 10, Best max_depth: 5, Best ACC: 1.0, Time: 428.33 seconds
Year 1978 - Best n_estimators: 10, Best max_depth: 5, Best ACC: 1.0, Time: 490.24 seconds
Year 1979 - Best n_estimators: 10, Best max_depth: 5, Best ACC: 1.0, Time: 498.36 seconds
Year 1980 - Best n_estimators: 10, Best max_depth: 5, Best ACC: 1.0, Time: 503.63 seconds
Year 1981 - Best n_estimators: 10, Best max_depth: 5, Best ACC: 1.0, Time: 504.71 seconds
Year 1982 - Best n_estimators: 10, Best max_depth: 5, Best ACC: 1.0, Time: 474.35 seconds
Year 1983 - Best n_estimators: 10, Best max_depth: 5, Best ACC: 1.0, Time: 467.95 seconds
Year 1984 - Best n_estimators: 10, Best max_depth: 5, Best ACC: 1.0, Time: 453.91 seconds
Year 1985 - Best n_estimators: 10, Best max_depth: 5, Best ACC: 1.0, Time: 530.67 seconds
Year 1986 - Best n_estimators: 10, Best max_depth: 5, Best ACC: 1.0, Time: 651.14 seconds
Year 1987 

## Forming Portfolios, Value-weighted portfolio returns

In [12]:
df.head()

Unnamed: 0,PERMNO,date,y,RET,LME,ME,bull_D,bear_D,bull_W,bear_W,bull_M,bear_M,SPREAD_L,LTURNOVER,IVOL,IVOL2,IVOL3,STR,LTR,IMOM,MOM,PCTHIGH,MVOL,MVOL2,MVOL3,LMKT,LMKT2,LMKT3,MMOM,MIMOM,MLTR,release_L,LAT,DPI2A_L,CTO_L,D2A_L,NOA_L,OL_L,PCM_L,FC2Y_L,INVEST_L,RNA_L,S2E_L,PROF_L,PM_L,ATO_L,ROA_L,FCF_L,A2ME_L,B2ME_L,S2P_L,ROE_L,LEV_L,TQ_L,year,ols_default,ridge_reg_roll5,lasso_reg_roll5,DT_reg_roll5,RF_reg_roll5
0,31464,1970-04-30,0,-0.102941,41412.0,37149.0,0.0,0.0,0.0,0.166667,0.0,0.416667,0.003421,6e-06,4.6e-05,3.4e-05,4.7e-05,0.039638,0.731272,0.61217,0.670081,0.413984,0.012852,0.012583,0.01042,0.564331,0.723312,0.385732,0.408401,0.496787,0.647683,1.0,0.000119,0.006682,0.000433,0.000149,0.000386,0.00062,0.907359,0.105362,0.000386,0.261588,0.217478,0.304686,0.95635,0.008189,0.785605,0.141009,0.001061,0.797682,0.010881,0.522705,0.000224,1.6e-05,1970,,,,,
1,31464,1970-05-31,0,-0.065574,37149.0,34713.0,0.0,0.0,0.0,0.0,0.0,0.5,0.004576,6e-06,1.3e-05,4.6e-05,3.4e-05,0.035635,0.722584,0.625955,0.672735,0.400651,0.017869,0.012852,0.012583,0.309299,0.564331,0.723312,0.385646,0.535558,0.630483,0.0,0.000119,0.006682,0.000433,0.000149,0.000386,0.00062,0.907359,0.105362,0.000386,0.261588,0.217478,0.304686,0.95635,0.008189,0.785605,0.141009,0.001061,0.797682,0.010881,0.522707,0.00025,1.4e-05,1970,,,,,
2,31464,1970-06-30,0,-0.14386,34713.0,29232.0,0.0,0.0,0.0,0.25,0.0,0.583333,0.00653,8e-06,6.2e-05,1.3e-05,4.6e-05,0.037131,0.721443,0.622612,0.670797,0.455396,0.127382,0.017869,0.012852,0.414013,0.309299,0.564331,0.279599,0.494936,0.666119,0.0,0.000119,0.006682,0.000433,0.000149,0.000386,0.00062,0.907359,0.105362,0.000386,0.261588,0.217478,0.304686,0.95635,0.008189,0.785605,0.141009,0.001061,0.797682,0.010881,0.522708,0.000267,1.3e-05,1970,,,,,
3,31464,1970-07-31,0,-0.125,29232.0,25578.0,0.0,0.083333,0.0,0.0,0.0,0.666667,0.004847,4e-06,2e-05,6.2e-05,1.3e-05,0.033998,0.698329,0.624735,0.678001,0.447986,0.035887,0.127382,0.017869,0.444076,0.414013,0.309299,0.282368,0.548597,0.589858,1.0,0.000119,0.006682,0.000424,0.000149,0.000386,0.00062,0.907347,0.105364,0.000386,0.261588,0.217477,0.304686,0.956351,0.008181,0.785544,0.140986,0.001061,0.797682,0.010711,0.522684,0.000316,1.1e-05,1970,,,,,
4,31464,1970-08-31,1,0.047619,25578.0,26796.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003877,5e-06,4.7e-05,2e-05,6.2e-05,0.034753,0.687449,0.638067,0.679698,0.391849,0.023573,0.035887,0.127382,0.766624,0.444076,0.414013,0.294738,0.536452,0.498298,0.0,0.000119,0.006682,0.000424,0.000149,0.000386,0.00062,0.907347,0.105364,0.000386,0.261588,0.217477,0.304686,0.956351,0.008181,0.785544,0.140986,0.001061,0.797682,0.010711,0.522684,0.000361,1e-05,1970,,,,,


In [13]:
prediction_cols
# prediction_cols = ['logit_default','OLS_default','logit_roll6','DT_reg_roll']

['ols_default',
 'ridge_reg_roll5',
 'lasso_reg_roll5',
 'DT_reg_roll5',
 'RF_reg_roll5']

In [14]:
portfolio = df[['date', 'RET', 'ME', 'y'] + prediction_cols].copy()
portfolio['date'] = pd.to_datetime(portfolio['date'])

# drop rows with missing values
portfolio.dropna(inplace=True)

portfolio.head()

Unnamed: 0,date,RET,ME,y,ols_default,ridge_reg_roll5,lasso_reg_roll5,DT_reg_roll5,RF_reg_roll5
80751,1976-01-31,0.147541,99391.25,1,0.541998,0.540911,0.434048,0.846724,0.835919
80752,1976-01-31,0.266187,87802.0,1,0.657648,0.749225,0.434048,0.954098,0.937994
80753,1976-01-31,0.124031,113172.5,1,0.652779,0.710956,0.447411,1.0,1.0
80754,1976-01-31,0.319444,12801.25,1,1.560148,1.787639,0.474138,1.0,1.0
80755,1976-01-31,0.0,8228.5,0,0.383483,0.394166,0.434048,0.0,0.0


In [15]:
portfolio.tail()

Unnamed: 0,date,RET,ME,y,ols_default,ridge_reg_roll5,lasso_reg_roll5,DT_reg_roll5,RF_reg_roll5
1795158,2022-07-31,-0.211268,360376.8,0,0.456078,0.683612,0.497137,0.0,0.0
1795159,2022-07-31,0.020794,956286.0,1,0.816904,0.921402,0.665507,1.0,1.0
1795160,2022-07-31,0.066548,921952.8,1,0.700782,0.761772,0.521968,0.993929,0.991815
1795161,2022-07-31,0.087452,121602.8,1,0.9006,0.914907,0.521968,0.993929,0.988957
1795162,2022-07-31,0.323765,931110600.0,1,0.814605,0.87609,0.521968,0.993929,0.993056


In [16]:
# Initialize an empty DataFrame to store value-weighted returns for each model
vwreturns = pd.DataFrame(portfolio['date'].unique(), columns=['date'])  # Ensures all dates are included

for pred_col in prediction_cols:
    # Calculate deciles for this prediction
    decile_col = f'decile_{pred_col}'
    portfolio[decile_col] = portfolio.groupby(['date'])[pred_col].transform(lambda x: pd.qcut(x, 10, labels=False, duplicates='drop'))
    
    # Determine position based on deciles
    position_col = f'position_{pred_col}'
    portfolio[position_col] = np.where(portfolio[decile_col] == 9, 1, np.where(portfolio[decile_col] == 0, -1, 0))
    
    # Calculate the value-weighted return for this prediction
    vwret_col = f'vwreturn_{pred_col}'
    vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
    
    # Merge the temporary value-weighted returns with the main vwreturns DataFrame
    vwreturns = vwreturns.merge(vwreturns_temp, on='date', how='left')

# Ensure the 'date' column is the first column and is sorted
vwreturns = vwreturns.sort_values('date').reset_index(drop=True)


  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)


In [17]:
vwreturns.head()

Unnamed: 0,date,vwreturn_ols_default,vwreturn_ridge_reg_roll5,vwreturn_lasso_reg_roll5,vwreturn_DT_reg_roll5,vwreturn_RF_reg_roll5
0,1976-01-31,0.030443,0.032248,-0.085951,-0.001281,-0.001266
1,1976-02-29,0.015261,0.014575,0.00884,-0.009918,-0.009918
2,1976-03-31,0.010497,0.010523,0.005309,0.005947,0.00893
3,1976-04-30,0.010684,0.006683,0.002828,0.026945,0.026956
4,1976-05-31,0.020038,0.019607,0.005792,0.025408,0.025408


### Compare to market data

In [18]:
#market = pd.read_csv('FF3_clean.csv')
market = pd.read_csv('/kaggle/input/sign-prediction-datasets/FF3_clean.csv')

In [19]:
market.head()

Unnamed: 0,date,Mkt-RF,SMB,HML,RF
0,1926-07-31,2.96,-2.56,-2.43,0.22
1,1926-08-31,2.64,-1.17,3.82,0.25
2,1926-09-30,0.36,-1.4,0.13,0.23
3,1926-10-31,-3.24,-0.09,0.7,0.32
4,1926-11-30,2.53,-0.1,-0.51,0.31


In [20]:
# create a new 'Mkt' which is a sum of Mkt-RF and RF
market['Mkt'] = market['Mkt-RF'] + market['RF']

# divide all columns by 100 except 'date'
market.iloc[:, 1:] = market.iloc[:, 1:] / 100

#set the 'date' column to datetime format
market['date'] = pd.to_datetime(market['date'])

# merge the market data (only date and Mkt columns) with the vwreturns DataFrame
vwreturns = vwreturns.merge(market[['date', 'Mkt']], on='date', how='left')

# transform all columns (except 'date') to a log: log(x+1) and save the result as lvwreturns
lvwreturns = vwreturns.copy()
lvwreturns.iloc[:, 1:] = np.log(vwreturns.iloc[:, 1:] + 1)

In [21]:
vwreturns.head()

Unnamed: 0,date,vwreturn_ols_default,vwreturn_ridge_reg_roll5,vwreturn_lasso_reg_roll5,vwreturn_DT_reg_roll5,vwreturn_RF_reg_roll5,Mkt
0,1976-01-31,0.030443,0.032248,-0.085951,-0.001281,-0.001266,0.1263
1,1976-02-29,0.015261,0.014575,0.00884,-0.009918,-0.009918,0.0066
2,1976-03-31,0.010497,0.010523,0.005309,0.005947,0.00893,0.0272
3,1976-04-30,0.010684,0.006683,0.002828,0.026945,0.026956,-0.0107
4,1976-05-31,0.020038,0.019607,0.005792,0.025408,0.025408,-0.0097


In [22]:
lvwreturns.head()

Unnamed: 0,date,vwreturn_ols_default,vwreturn_ridge_reg_roll5,vwreturn_lasso_reg_roll5,vwreturn_DT_reg_roll5,vwreturn_RF_reg_roll5,Mkt
0,1976-01-31,0.029989,0.031739,-0.089872,-0.001282,-0.001266,0.118938
1,1976-02-29,0.015146,0.014469,0.008801,-0.009967,-0.009967,0.006578
2,1976-03-31,0.010442,0.010468,0.005295,0.00593,0.008891,0.026837
3,1976-04-30,0.010628,0.006661,0.002824,0.026588,0.026599,-0.010758
4,1976-05-31,0.01984,0.019417,0.005775,0.025091,0.025091,-0.009747


In [23]:
lvwreturns.describe()

Unnamed: 0,date,vwreturn_ols_default,vwreturn_ridge_reg_roll5,vwreturn_lasso_reg_roll5,vwreturn_DT_reg_roll5,vwreturn_RF_reg_roll5,Mkt
count,559,559.0,559.0,559.0,559.0,559.0,559.0
mean,1999-05-01 04:22:45.295169920,0.016571,0.016952,0.00461,0.021938,0.023459,0.009303
min,1976-01-31 00:00:00,-0.020252,-0.031658,-0.089872,-0.054603,-0.050184,-0.2567
25%,1987-09-15 00:00:00,0.010928,0.011345,0.002202,0.007739,0.009082,-0.015672
50%,1999-04-30 00:00:00,0.014409,0.014991,0.003609,0.015368,0.016978,0.013804
75%,2010-12-15 12:00:00,0.019833,0.02055,0.006425,0.029013,0.030096,0.038162
max,2022-07-31 00:00:00,0.077408,0.077671,0.031222,0.207532,0.207532,0.127953
std,,0.00866,0.008958,0.005997,0.023857,0.02267,0.045169


In [24]:
# save the lvwreturns and portfolio DataFrame to a parquet file into 'outputs' folder

# for reproducibility and visualization purposes
lvwreturns.to_parquet('fin_lvwreturns_reg1.parquet')
portfolio.to_parquet('fin_portfolio_reg1.parquet')

# save vwreturns DataFrame to a .dta file into 'outputs' folder
#vwreturns.to_stata('outputs/vwreturns.dta') # for backtasting in R - we need normal returns, not log returns
