# Framework for predictions and portfolio forming

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import time

# import the parquet library
import pyarrow.parquet as pq

# import model libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, accuracy_score

In [2]:
# load 'basemodel.parquet'
#df = pd.read_parquet('basemodel.parquet')
df = pd.read_parquet('/kaggle/input/sign-prediction-datasets/basemodel.parquet')
prediction_cols = []
df.head()

Unnamed: 0,PERMNO,date,RET,ME,bull_D,bear_D,bull_W,bear_W,bull_M,bear_M,LMKT,IVOL,y
0,10000,1986-02-28,-0.257143,11960.0,0.0,0.0,0.0,1.0,0,0,0.0121,0.004058,0
1,10000,1986-03-31,0.365385,16330.0,0.0,0.0,0.0,0.0,0,0,0.0766,0.000973,1
2,10000,1986-04-30,-0.098592,15172.0,0.0,1.0,0.0,0.0,0,0,0.0548,0.001993,0
3,10000,1986-05-31,-0.222656,11793.87834,0.0,0.0,0.0,0.0,0,1,-0.0079,0.000163,0
4,10000,1986-06-30,-0.005025,11734.59375,0.0,0.0,0.0,0.0,0,2,0.0511,0.001569,0


In [3]:
# select the columns to be used for prediction
X_col = ['bull_D', 'bear_D', 'bull_W', 'bear_W', 'bull_M', 'bear_M', 'LMKT', 'IVOL']

In [4]:
# Convert 'date' to datetime format (if not already done) and sort the DataFrame
df['date'] = pd.to_datetime(df['date'])
df.sort_values(by='date', inplace=True)
df.reset_index(drop=True, inplace=True)

# Create a 'year' column based on the 'date' column
df['year'] = df['date'].dt.year

# Paper Replication - OLS and Logit, Expanding Window - No Hyperparameters
- They start with out of sample forecasting in 1932
- models will be named model_default

### Linear Regression (Pooled OLS)

In [5]:
#################################
# OLS, default, exp window
#################################

model_name = 'ols_default'


# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

for year in range(df['year'].min() + 6, df['year'].max() + 1):
    start_time = time.time()  # Start timing
    
    # Define the training data up until this year
    train_data = df[df['year'] < year]
    
    X_train = train_data[X_col]
    y_train = train_data['y']
    
    # Train the Linear Regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    
    if not X_next_year.empty:
        next_year_predictions = model.predict(X_next_year)
        df.loc[df['year'] == year, model_name] = next_year_predictions
    
    end_time = time.time()  # End timing
    iteration_time = end_time - start_time  # Calculate iteration time
    
    print(f"Year {year} - Time: {iteration_time:.2f} seconds")

Year 1932 - Time: 0.06 seconds
Year 1933 - Time: 0.05 seconds
Year 1934 - Time: 0.05 seconds
Year 1935 - Time: 0.05 seconds
Year 1936 - Time: 0.07 seconds
Year 1937 - Time: 0.06 seconds
Year 1938 - Time: 0.06 seconds
Year 1939 - Time: 0.06 seconds
Year 1940 - Time: 0.07 seconds
Year 1941 - Time: 0.07 seconds
Year 1942 - Time: 0.07 seconds
Year 1943 - Time: 0.08 seconds
Year 1944 - Time: 0.08 seconds
Year 1945 - Time: 0.07 seconds
Year 1946 - Time: 0.08 seconds
Year 1947 - Time: 0.08 seconds
Year 1948 - Time: 0.11 seconds
Year 1949 - Time: 0.11 seconds
Year 1950 - Time: 0.09 seconds
Year 1951 - Time: 0.14 seconds
Year 1952 - Time: 0.10 seconds
Year 1953 - Time: 0.10 seconds
Year 1954 - Time: 0.13 seconds
Year 1955 - Time: 0.11 seconds
Year 1956 - Time: 0.12 seconds
Year 1957 - Time: 0.13 seconds
Year 1958 - Time: 0.14 seconds
Year 1959 - Time: 0.16 seconds
Year 1960 - Time: 0.14 seconds
Year 1961 - Time: 0.16 seconds
Year 1962 - Time: 0.15 seconds
Year 1963 - Time: 0.15 seconds
Year 196

In [6]:
df.tail()

Unnamed: 0,PERMNO,date,RET,ME,bull_D,bear_D,bull_W,bear_W,bull_M,bear_M,LMKT,IVOL,y,year,ols_default
3627748,14436,2022-07-31,-0.106939,3731571.0,2.0,0.0,0.0,0.0,0,0,-0.0837,0.001066,0,2022,0.626592
3627749,14435,2022-07-31,0.020969,5066.263,0.0,0.0,0.0,0.0,0,0,-0.0837,0.003461,1,2022,0.526688
3627750,14434,2022-07-31,0.184785,1280287.0,0.0,0.0,3.0,0.0,0,0,-0.0837,0.000616,1,2022,0.799235
3627751,14431,2022-07-31,0.108881,34757.11,0.0,1.0,0.0,0.0,0,0,-0.0837,0.004084,1,2022,0.493767
3627752,93436,2022-07-31,0.323765,931110600.0,2.0,0.0,1.0,0.0,0,0,-0.0837,0.000625,1,2022,0.717418


# My Experiments

## Machine Learning - Hyperparameter Tuning included in the process
- models to be named 'model_clas/reg_exp/roll'

### Expanding Window Estimation

##### MSE as evaluation metric

In [7]:
###################################
# RIDGE CLASSIFICATION MODEL  - MSE
###################################

model_name = 'ridge_clas_exp_MSE'  # Define the model name for Ridge

# Update the column name for storing Ridge classification predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

# Define the regularization strengths to test
HP1 = [0.01, 0.1, 1, 10, 100]  # Regularization strength C

for year in range(df['year'].min() + 7, df['year'].max() + 1):
    start_time = time.time()  # Start timing
    
    # Define training and tuning datasets
    train_data = df[df['year'] < year]
    tuning_data = train_data[train_data['year'] == year - 1]
    train_data = train_data[train_data['year'] < year - 1]
    
    X_train = train_data[X_col]
    y_train = train_data['y']
    
    X_tune = tuning_data[X_col]
    y_tune = tuning_data['y']
    
    best_HP1 = None
    best_mse = float('inf')  # Initialize with infinity
    
    # Hyperparameter tuning
    for hp1 in HP1:
        model = LogisticRegression(C=hp1, max_iter=1000, penalty='l2')  # Ridge
        model.fit(X_train, y_train)
        probabilities = model.predict_proba(X_tune)[:, 1]  # Get probabilities of the positive class
        mse = mean_squared_error(y_tune, probabilities)  # Calculate MSE
        
        if mse < best_mse:  # Lower MSE is better
            best_mse = mse
            best_HP1 = hp1
    
    # Retrain on the entire training dataset up to the current year
    X_full_train = df[df['year'] < year][X_col]
    y_full_train = df[df['year'] < year]['y']
    model = LogisticRegression(C=best_HP1, max_iter=1000, penalty='l2')  # Ridge
    model.fit(X_full_train, y_full_train)
    
    # Predict probabilities for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    if not X_next_year.empty:
        next_year_probabilities = model.predict_proba(X_next_year)[:, 1]  # Probability of the positive class
        df.loc[df['year'] == year, model_name] = next_year_probabilities
    
    end_time = time.time()  # End timing
    iteration_time = end_time - start_time  # Calculate iteration time
    
    print(f"Year {year} - Best C: {best_HP1}, Best MSE: {round(best_mse,4)}, Time: {iteration_time:.2f} seconds")


Year 1933 - Best C: 0.1, Best MSE: 0.0988, Time: 0.59 seconds
Year 1934 - Best C: 100, Best MSE: 0.0854, Time: 0.65 seconds
Year 1935 - Best C: 100, Best MSE: 0.101, Time: 0.79 seconds
Year 1936 - Best C: 0.1, Best MSE: 0.1083, Time: 0.84 seconds
Year 1937 - Best C: 100, Best MSE: 0.0996, Time: 0.82 seconds
Year 1938 - Best C: 100, Best MSE: 0.0783, Time: 1.04 seconds
Year 1939 - Best C: 100, Best MSE: 0.0948, Time: 1.21 seconds
Year 1940 - Best C: 100, Best MSE: 0.0879, Time: 1.41 seconds
Year 1941 - Best C: 100, Best MSE: 0.106, Time: 1.38 seconds
Year 1942 - Best C: 100, Best MSE: 0.1095, Time: 1.54 seconds
Year 1943 - Best C: 0.1, Best MSE: 0.1078, Time: 1.59 seconds
Year 1944 - Best C: 100, Best MSE: 0.0938, Time: 1.54 seconds
Year 1945 - Best C: 100, Best MSE: 0.107, Time: 1.84 seconds
Year 1946 - Best C: 100, Best MSE: 0.0807, Time: 1.89 seconds
Year 1947 - Best C: 100, Best MSE: 0.0834, Time: 2.07 seconds
Year 1948 - Best C: 100, Best MSE: 0.103, Time: 2.18 seconds
Year 1949 - 

In [8]:
###################################
# DT CLASSIFICATION MODEL  - MSE
###################################

model_name = 'DT_clas_exp_MSE'  # Define the model name for Ridge

# Update the column name for storing Ridge classification predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

HP1 = [5, 10, 15, 20, 25, None]  # Max depth of the tree

for year in range(df['year'].min() + 7, df['year'].max() + 1):
    start_time = time.time()  # Start timing
    
    # Define training and tuning datasets
    train_data = df[df['year'] < year]
    tuning_data = train_data[train_data['year'] == year - 1]
    train_data = train_data[train_data['year'] < year - 1]
    
    X_train = train_data[X_col]
    y_train = train_data['y']
    
    X_tune = tuning_data[X_col]
    y_tune = tuning_data['y']
    
    best_HP1 = None
    best_mse = float('inf')  # Initialize with infinity
    
    # Hyperparameter tuning
    for hp1 in HP1:
        model = DecisionTreeClassifier(max_depth=hp1)
        model.fit(X_train, y_train)
        probabilities = model.predict_proba(X_tune)[:, 1]  # Get probabilities of the positive class
        mse = mean_squared_error(y_tune, probabilities)  # Calculate MSE
        
        if mse < best_mse:  # Lower MSE is better
            best_mse = mse
            best_HP1 = hp1
    
    # Retrain on the entire training dataset up to the current year
    X_full_train = df[df['year'] < year][X_col]
    y_full_train = df[df['year'] < year]['y']
    model = DecisionTreeClassifier(max_depth=best_HP1)
    model.fit(X_full_train, y_full_train)
    
    # Predict probabilities for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    if not X_next_year.empty:
        next_year_probabilities = model.predict_proba(X_next_year)[:, 1]  # Probability of the positive class
        df.loc[df['year'] == year, model_name] = next_year_probabilities
    
    end_time = time.time()  # End timing
    iteration_time = end_time - start_time  # Calculate iteration time
    
    print(f"Year {year} - Best Max Depth: {best_HP1}, Best MSE: {round(best_mse,4)}, Time: {iteration_time:.2f} seconds")

Year 1933 - Best Max Depth: 10, Best MSE: 0.0955, Time: 0.59 seconds
Year 1934 - Best Max Depth: 10, Best MSE: 0.0928, Time: 0.68 seconds
Year 1935 - Best Max Depth: 10, Best MSE: 0.108, Time: 0.80 seconds
Year 1936 - Best Max Depth: 10, Best MSE: 0.1102, Time: 0.98 seconds
Year 1937 - Best Max Depth: 10, Best MSE: 0.0984, Time: 1.11 seconds
Year 1938 - Best Max Depth: 10, Best MSE: 0.0788, Time: 1.25 seconds
Year 1939 - Best Max Depth: 10, Best MSE: 0.085, Time: 1.41 seconds
Year 1940 - Best Max Depth: 5, Best MSE: 0.0868, Time: 1.47 seconds
Year 1941 - Best Max Depth: 10, Best MSE: 0.1144, Time: 1.70 seconds
Year 1942 - Best Max Depth: 10, Best MSE: 0.1177, Time: 1.88 seconds
Year 1943 - Best Max Depth: 10, Best MSE: 0.1106, Time: 2.10 seconds
Year 1944 - Best Max Depth: 5, Best MSE: 0.0965, Time: 2.18 seconds
Year 1945 - Best Max Depth: 10, Best MSE: 0.1117, Time: 2.42 seconds
Year 1946 - Best Max Depth: 10, Best MSE: 0.0806, Time: 2.56 seconds
Year 1947 - Best Max Depth: 10, Best M

### First expanding, then rolling
start predicting for 1932, expand the window until you reach X years, then roll it

#### MSE Evaluation

##### 5 years

In [9]:
rolling_window = 5

In [10]:
############################################
# RIDGE CLASSIFICATION MODEL - MSE
############################################

model_name = 'ridge_clas_roll5_MSE'  # Name of the new column for storing predictions

# Predefined set of C values for hyperparameter tuning
HP1 = [0.01, 0.1, 1, 10, 100] # C

# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

# Define the start year for modeling based on having at least 7 years of data
start_modeling_year = df['year'].min() + 7

for year in range(start_modeling_year, df['year'].max() + 1):
    start_time = time.time()  # Start timing
    
    # Determine the start year of the training window based on the current year
    train_start_year = max(year - rolling_window, df['year'].min())  # Ensure it does not go below the earliest year
    
    # Select the training data based on the calculated start year
    train_data = df[(df['year'] >= train_start_year) & (df['year'] < year)]
    
    # Split training data into actual training and tuning sets
    # Use the last year of the training data for tuning
    tuning_data = train_data[train_data['year'] == year - 1]
    actual_train_data = train_data[train_data['year'] < year - 1]
    
    X_train = actual_train_data[X_col]
    y_train = actual_train_data['y']
    
    X_tune = tuning_data[X_col]
    y_tune = tuning_data['y']
       
    best_HP1 = None
    best_mse = float('inf')  # Initialize with infinity
    
    # Hyperparameter tuning
    for hp1 in HP1:
        model = LogisticRegression(C=hp1, max_iter=1000, penalty='l2')  # Ridge
        model.fit(X_train, y_train)
        probabilities = model.predict_proba(X_tune)[:, 1]  # Get probabilities of the positive class
        mse = mean_squared_error(y_tune, probabilities)  # Calculate MSE
        
        if mse < best_mse:  # Lower MSE is better
            best_mse = mse
            best_HP1 = hp1
    
    
    # Retrain on the entire training window (excluding tuning year) with the best C value
    model = LogisticRegression(C=best_HP1, max_iter=1000, penalty='l2')
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    
    if not X_next_year.empty:
        next_year_probabilities = model.predict_proba(X_next_year)[:, 1]  # Probability of the positive class
        df.loc[df['year'] == year, model_name] = next_year_probabilities
    
    end_time = time.time()  # End timing
    iteration_time = end_time - start_time  # Calculate iteration time
    
    print(f"Year {year} - Best C: {best_HP1}, Best MSE: {round(best_mse,4)}, Time: {iteration_time:.2f} seconds")

Year 1933 - Best C: 0.1, Best MSE: 0.0985, Time: 0.45 seconds
Year 1934 - Best C: 100, Best MSE: 0.0866, Time: 0.44 seconds
Year 1935 - Best C: 100, Best MSE: 0.1018, Time: 0.44 seconds
Year 1936 - Best C: 0.1, Best MSE: 0.1087, Time: 0.41 seconds
Year 1937 - Best C: 100, Best MSE: 0.1009, Time: 0.43 seconds
Year 1938 - Best C: 1, Best MSE: 0.0789, Time: 0.50 seconds
Year 1939 - Best C: 100, Best MSE: 0.0875, Time: 0.63 seconds
Year 1940 - Best C: 100, Best MSE: 0.0788, Time: 0.73 seconds
Year 1941 - Best C: 100, Best MSE: 0.1071, Time: 0.57 seconds
Year 1942 - Best C: 0.1, Best MSE: 0.1098, Time: 0.93 seconds
Year 1943 - Best C: 0.1, Best MSE: 0.1077, Time: 0.57 seconds
Year 1944 - Best C: 100, Best MSE: 0.0899, Time: 0.72 seconds
Year 1945 - Best C: 100, Best MSE: 0.1057, Time: 0.69 seconds
Year 1946 - Best C: 100, Best MSE: 0.0781, Time: 0.67 seconds
Year 1947 - Best C: 100, Best MSE: 0.0825, Time: 0.66 seconds
Year 1948 - Best C: 1, Best MSE: 0.1029, Time: 0.62 seconds
Year 1949 - 

In [11]:
############################################
# DECISION TREE CLASSIFICATION MODEL
############################################

model_name = 'DT_class_roll5_MSE'


# Predefined set of max_depth values for hyperparameter tuning
HP1 = [3, 5, 10, 15, 25, None] # max_depth

# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

start_modeling_year = df['year'].min() + 7

for year in range(start_modeling_year, df['year'].max() + 1):
    start_time = time.time()  # Timing each iteration
    
    train_start_year = max(year - rolling_window, df['year'].min())
    train_data = df[(df['year'] >= train_start_year) & (df['year'] < year)]
    
    tuning_data = train_data[train_data['year'] == year - 1]
    actual_train_data = train_data[train_data['year'] < year - 1]
    
    X_train = actual_train_data[X_col]
    y_train = actual_train_data['y']
    
    X_tune = tuning_data[X_col]
    y_tune = tuning_data['y']
    
    best_HP1 = None
    best_mse = float('inf')
    
    # Hyperparameter tuning for max_depth
    for hp1 in HP1:
        model = DecisionTreeClassifier(max_depth=hp1)
        model.fit(X_train, y_train)
        probabilities = model.predict_proba(X_tune)[:, 1]
        mse = mean_squared_error(y_tune, probabilities)  # Calculate MSE
        
        if mse < best_mse:  # Lower MSE is better
            best_mse = mse
            best_HP1 = hp1
    
    # Retrain on the entire training window with the best max_depth value
    model = DecisionTreeClassifier(max_depth=best_HP1)
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    
    if not X_next_year.empty:
        next_year_predictions = model.predict_proba(X_next_year)[:, 1]
        df.loc[df['year'] == year, model_name] = next_year_predictions
    
    end_time = time.time()
    iteration_time = end_time - start_time
    
    print(f"Year {year} - Best Max Depth: {best_HP1}, Best MSE: {round(best_mse,4)}, Time: {iteration_time:.2f} seconds")


Year 1933 - Best Max Depth: 10, Best MSE: 0.102, Time: 0.38 seconds
Year 1934 - Best Max Depth: 10, Best MSE: 0.0859, Time: 0.39 seconds
Year 1935 - Best Max Depth: 5, Best MSE: 0.1044, Time: 0.38 seconds
Year 1936 - Best Max Depth: 5, Best MSE: 0.1394, Time: 0.39 seconds
Year 1937 - Best Max Depth: 10, Best MSE: 0.1047, Time: 0.41 seconds
Year 1938 - Best Max Depth: 10, Best MSE: 0.0823, Time: 0.43 seconds
Year 1939 - Best Max Depth: 10, Best MSE: 0.098, Time: 0.42 seconds
Year 1940 - Best Max Depth: 5, Best MSE: 0.0988, Time: 0.41 seconds
Year 1941 - Best Max Depth: 10, Best MSE: 0.1255, Time: 0.46 seconds
Year 1942 - Best Max Depth: 5, Best MSE: 0.1274, Time: 0.44 seconds
Year 1943 - Best Max Depth: 5, Best MSE: 0.1321, Time: 0.45 seconds
Year 1944 - Best Max Depth: 5, Best MSE: 0.0975, Time: 0.46 seconds
Year 1945 - Best Max Depth: 10, Best MSE: 0.1192, Time: 0.47 seconds
Year 1946 - Best Max Depth: 5, Best MSE: 0.0856, Time: 0.43 seconds
Year 1947 - Best Max Depth: 5, Best MSE: 0.

##### 10 years

In [12]:
rolling_window = 10

In [13]:
############################################
# RIDGE CLASSIFICATION MODEL - MSE
############################################

model_name = 'ridge_clas_roll10_MSE'  # Name of the new column for storing predictions

# Predefined set of C values for hyperparameter tuning
HP1 = [0.01, 0.1, 1, 10, 100] # C

# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

# Define the start year for modeling based on having at least 7 years of data
start_modeling_year = df['year'].min() + 7

for year in range(start_modeling_year, df['year'].max() + 1):
    start_time = time.time()  # Start timing
    
    # Determine the start year of the training window based on the current year
    train_start_year = max(year - rolling_window, df['year'].min())  # Ensure it does not go below the earliest year
    
    # Select the training data based on the calculated start year
    train_data = df[(df['year'] >= train_start_year) & (df['year'] < year)]
    
    # Split training data into actual training and tuning sets
    # Use the last year of the training data for tuning
    tuning_data = train_data[train_data['year'] == year - 1]
    actual_train_data = train_data[train_data['year'] < year - 1]
    
    X_train = actual_train_data[X_col]
    y_train = actual_train_data['y']
    
    X_tune = tuning_data[X_col]
    y_tune = tuning_data['y']
       
    best_HP1 = None
    best_mse = float('inf')  # Initialize with infinity
    
    # Hyperparameter tuning
    for hp1 in HP1:
        model = LogisticRegression(C=hp1, max_iter=1000, penalty='l2')  # Ridge
        model.fit(X_train, y_train)
        probabilities = model.predict_proba(X_tune)[:, 1]  # Get probabilities of the positive class
        mse = mean_squared_error(y_tune, probabilities)  # Calculate MSE
        
        if mse < best_mse:  # Lower MSE is better
            best_mse = mse
            best_HP1 = hp1
    
    
    # Retrain on the entire training window (excluding tuning year) with the best C value
    model = LogisticRegression(C=best_HP1, max_iter=1000, penalty='l2')
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    
    if not X_next_year.empty:
        next_year_probabilities = model.predict_proba(X_next_year)[:, 1]  # Probability of the positive class
        df.loc[df['year'] == year, model_name] = next_year_probabilities
    
    end_time = time.time()  # End timing
    iteration_time = end_time - start_time  # Calculate iteration time
    
    print(f"Year {year} - Best C: {best_HP1}, Best MSE: {round(best_mse,4)}, Time: {iteration_time:.2f} seconds")

Year 1933 - Best C: 0.1, Best MSE: 0.0988, Time: 0.57 seconds
Year 1934 - Best C: 100, Best MSE: 0.0854, Time: 0.64 seconds
Year 1935 - Best C: 100, Best MSE: 0.101, Time: 0.73 seconds
Year 1936 - Best C: 0.1, Best MSE: 0.1083, Time: 0.77 seconds
Year 1937 - Best C: 100, Best MSE: 0.0997, Time: 1.13 seconds
Year 1938 - Best C: 100, Best MSE: 0.0782, Time: 1.15 seconds
Year 1939 - Best C: 100, Best MSE: 0.0971, Time: 0.88 seconds
Year 1940 - Best C: 100, Best MSE: 0.0891, Time: 0.94 seconds
Year 1941 - Best C: 100, Best MSE: 0.106, Time: 0.99 seconds
Year 1942 - Best C: 100, Best MSE: 0.1095, Time: 1.09 seconds
Year 1943 - Best C: 0.1, Best MSE: 0.1076, Time: 1.03 seconds
Year 1944 - Best C: 100, Best MSE: 0.0907, Time: 1.22 seconds
Year 1945 - Best C: 100, Best MSE: 0.1042, Time: 1.41 seconds
Year 1946 - Best C: 100, Best MSE: 0.0766, Time: 1.22 seconds
Year 1947 - Best C: 100, Best MSE: 0.0814, Time: 1.33 seconds
Year 1948 - Best C: 1, Best MSE: 0.1024, Time: 1.36 seconds
Year 1949 - 

In [14]:
############################################
# DECISION TREE CLASSIFICATION MODEL
############################################

model_name = 'DT_class_roll10_MSE'


# Predefined set of max_depth values for hyperparameter tuning
HP1 = [3, 5, 10, 15, 25, None] # max_depth

# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

start_modeling_year = df['year'].min() + 7

for year in range(start_modeling_year, df['year'].max() + 1):
    start_time = time.time()  # Timing each iteration
    
    train_start_year = max(year - rolling_window, df['year'].min())
    train_data = df[(df['year'] >= train_start_year) & (df['year'] < year)]
    
    tuning_data = train_data[train_data['year'] == year - 1]
    actual_train_data = train_data[train_data['year'] < year - 1]
    
    X_train = actual_train_data[X_col]
    y_train = actual_train_data['y']
    
    X_tune = tuning_data[X_col]
    y_tune = tuning_data['y']
    
    best_HP1 = None
    best_mse = float('inf')
    
    # Hyperparameter tuning for max_depth
    for hp1 in HP1:
        model = DecisionTreeClassifier(max_depth=hp1)
        model.fit(X_train, y_train)
        probabilities = model.predict_proba(X_tune)[:, 1]
        mse = mean_squared_error(y_tune, probabilities)  # Calculate MSE
        
        if mse < best_mse:  # Lower MSE is better
            best_mse = mse
            best_HP1 = hp1
    
    # Retrain on the entire training window with the best max_depth value
    model = DecisionTreeClassifier(max_depth=best_HP1)
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    
    if not X_next_year.empty:
        next_year_predictions = model.predict_proba(X_next_year)[:, 1]
        df.loc[df['year'] == year, model_name] = next_year_predictions
    
    end_time = time.time()
    iteration_time = end_time - start_time
    
    print(f"Year {year} - Best Max Depth: {best_HP1}, Best MSE: {round(best_mse,4)}, Time: {iteration_time:.2f} seconds")


Year 1933 - Best Max Depth: 10, Best MSE: 0.0949, Time: 0.52 seconds
Year 1934 - Best Max Depth: 10, Best MSE: 0.0934, Time: 0.61 seconds
Year 1935 - Best Max Depth: 10, Best MSE: 0.108, Time: 0.71 seconds
Year 1936 - Best Max Depth: 10, Best MSE: 0.11, Time: 0.83 seconds
Year 1937 - Best Max Depth: 10, Best MSE: 0.1001, Time: 0.95 seconds
Year 1938 - Best Max Depth: 5, Best MSE: 0.0806, Time: 0.91 seconds
Year 1939 - Best Max Depth: 10, Best MSE: 0.0961, Time: 0.99 seconds
Year 1940 - Best Max Depth: 5, Best MSE: 0.087, Time: 0.96 seconds
Year 1941 - Best Max Depth: 10, Best MSE: 0.1168, Time: 1.03 seconds
Year 1942 - Best Max Depth: 5, Best MSE: 0.1227, Time: 0.99 seconds
Year 1943 - Best Max Depth: 5, Best MSE: 0.1165, Time: 0.98 seconds
Year 1944 - Best Max Depth: 5, Best MSE: 0.0973, Time: 1.00 seconds
Year 1945 - Best Max Depth: 15, Best MSE: 0.1153, Time: 1.12 seconds
Year 1946 - Best Max Depth: 10, Best MSE: 0.0817, Time: 1.07 seconds
Year 1947 - Best Max Depth: 10, Best MSE: 0

##### 20 years

In [15]:
rolling_window = 20

In [16]:
############################################
# RIDGE CLASSIFICATION MODEL - MSE
############################################

model_name = 'ridge_clas_roll20_MSE'  # Name of the new column for storing predictions

# Predefined set of C values for hyperparameter tuning
HP1 = [0.01, 0.1, 1, 10, 100] # C

# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

# Define the start year for modeling based on having at least 7 years of data
start_modeling_year = df['year'].min() + 7

for year in range(start_modeling_year, df['year'].max() + 1):
    start_time = time.time()  # Start timing
    
    # Determine the start year of the training window based on the current year
    train_start_year = max(year - rolling_window, df['year'].min())  # Ensure it does not go below the earliest year
    
    # Select the training data based on the calculated start year
    train_data = df[(df['year'] >= train_start_year) & (df['year'] < year)]
    
    # Split training data into actual training and tuning sets
    # Use the last year of the training data for tuning
    tuning_data = train_data[train_data['year'] == year - 1]
    actual_train_data = train_data[train_data['year'] < year - 1]
    
    X_train = actual_train_data[X_col]
    y_train = actual_train_data['y']
    
    X_tune = tuning_data[X_col]
    y_tune = tuning_data['y']
       
    best_HP1 = None
    best_mse = float('inf')  # Initialize with infinity
    
    # Hyperparameter tuning
    for hp1 in HP1:
        model = LogisticRegression(C=hp1, max_iter=1000, penalty='l2')  # Ridge
        model.fit(X_train, y_train)
        probabilities = model.predict_proba(X_tune)[:, 1]  # Get probabilities of the positive class
        mse = mean_squared_error(y_tune, probabilities)  # Calculate MSE
        
        if mse < best_mse:  # Lower MSE is better
            best_mse = mse
            best_HP1 = hp1
    
    
    # Retrain on the entire training window (excluding tuning year) with the best C value
    model = LogisticRegression(C=best_HP1, max_iter=1000, penalty='l2')
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    
    if not X_next_year.empty:
        next_year_probabilities = model.predict_proba(X_next_year)[:, 1]  # Probability of the positive class
        df.loc[df['year'] == year, model_name] = next_year_probabilities
    
    end_time = time.time()  # End timing
    iteration_time = end_time - start_time  # Calculate iteration time
    
    print(f"Year {year} - Best C: {best_HP1}, Best MSE: {round(best_mse,4)}, Time: {iteration_time:.2f} seconds")

Year 1933 - Best C: 0.1, Best MSE: 0.0988, Time: 0.56 seconds
Year 1934 - Best C: 100, Best MSE: 0.0854, Time: 0.64 seconds
Year 1935 - Best C: 100, Best MSE: 0.101, Time: 0.75 seconds
Year 1936 - Best C: 0.1, Best MSE: 0.1083, Time: 0.85 seconds
Year 1937 - Best C: 100, Best MSE: 0.0996, Time: 0.88 seconds
Year 1938 - Best C: 100, Best MSE: 0.0783, Time: 0.97 seconds
Year 1939 - Best C: 100, Best MSE: 0.0948, Time: 1.16 seconds
Year 1940 - Best C: 100, Best MSE: 0.0879, Time: 1.34 seconds
Year 1941 - Best C: 100, Best MSE: 0.106, Time: 1.35 seconds
Year 1942 - Best C: 100, Best MSE: 0.1095, Time: 1.46 seconds
Year 1943 - Best C: 0.1, Best MSE: 0.1078, Time: 1.53 seconds
Year 1944 - Best C: 100, Best MSE: 0.0938, Time: 1.53 seconds
Year 1945 - Best C: 100, Best MSE: 0.107, Time: 2.09 seconds
Year 1946 - Best C: 100, Best MSE: 0.0807, Time: 1.80 seconds
Year 1947 - Best C: 100, Best MSE: 0.0833, Time: 2.06 seconds
Year 1948 - Best C: 100, Best MSE: 0.103, Time: 1.82 seconds
Year 1949 - 

In [17]:
############################################
# DECISION TREE CLASSIFICATION MODEL
############################################

model_name = 'DT_class_roll20_MSE'


# Predefined set of max_depth values for hyperparameter tuning
HP1 = [3, 5, 10, 15, 25, None] # max_depth

# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

start_modeling_year = df['year'].min() + 7

for year in range(start_modeling_year, df['year'].max() + 1):
    start_time = time.time()  # Timing each iteration
    
    train_start_year = max(year - rolling_window, df['year'].min())
    train_data = df[(df['year'] >= train_start_year) & (df['year'] < year)]
    
    tuning_data = train_data[train_data['year'] == year - 1]
    actual_train_data = train_data[train_data['year'] < year - 1]
    
    X_train = actual_train_data[X_col]
    y_train = actual_train_data['y']
    
    X_tune = tuning_data[X_col]
    y_tune = tuning_data['y']
    
    best_HP1 = None
    best_mse = float('inf')
    
    # Hyperparameter tuning for max_depth
    for hp1 in HP1:
        model = DecisionTreeClassifier(max_depth=hp1)
        model.fit(X_train, y_train)
        probabilities = model.predict_proba(X_tune)[:, 1]
        mse = mean_squared_error(y_tune, probabilities)  # Calculate MSE
        
        if mse < best_mse:  # Lower MSE is better
            best_mse = mse
            best_HP1 = hp1
    
    # Retrain on the entire training window with the best max_depth value
    model = DecisionTreeClassifier(max_depth=best_HP1)
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    
    if not X_next_year.empty:
        next_year_predictions = model.predict_proba(X_next_year)[:, 1]
        df.loc[df['year'] == year, model_name] = next_year_predictions
    
    end_time = time.time()
    iteration_time = end_time - start_time
    
    print(f"Year {year} - Best Max Depth: {best_HP1}, Best MSE: {round(best_mse,4)}, Time: {iteration_time:.2f} seconds")


Year 1933 - Best Max Depth: 10, Best MSE: 0.096, Time: 0.51 seconds
Year 1934 - Best Max Depth: 10, Best MSE: 0.0934, Time: 0.61 seconds
Year 1935 - Best Max Depth: 10, Best MSE: 0.108, Time: 0.70 seconds
Year 1936 - Best Max Depth: 10, Best MSE: 0.1102, Time: 0.83 seconds
Year 1937 - Best Max Depth: 10, Best MSE: 0.0984, Time: 0.95 seconds
Year 1938 - Best Max Depth: 10, Best MSE: 0.0788, Time: 1.07 seconds
Year 1939 - Best Max Depth: 10, Best MSE: 0.085, Time: 1.19 seconds
Year 1940 - Best Max Depth: 5, Best MSE: 0.0868, Time: 1.28 seconds
Year 1941 - Best Max Depth: 10, Best MSE: 0.1145, Time: 1.50 seconds
Year 1942 - Best Max Depth: 10, Best MSE: 0.1178, Time: 1.62 seconds
Year 1943 - Best Max Depth: 10, Best MSE: 0.1106, Time: 1.80 seconds
Year 1944 - Best Max Depth: 5, Best MSE: 0.0965, Time: 1.89 seconds
Year 1945 - Best Max Depth: 10, Best MSE: 0.1117, Time: 2.17 seconds
Year 1946 - Best Max Depth: 10, Best MSE: 0.0806, Time: 2.26 seconds
Year 1947 - Best Max Depth: 10, Best MS

## Forming Portfolios, Value-weighted portfolio returns

In [18]:
df.head()

Unnamed: 0,PERMNO,date,RET,ME,bull_D,bear_D,bull_W,bear_W,bull_M,bear_M,LMKT,IVOL,y,year,ols_default,ridge_clas_exp_MSE,DT_clas_exp_MSE,ridge_clas_roll5_MSE,DT_class_roll5_MSE,ridge_clas_roll10_MSE,DT_class_roll10_MSE,ridge_clas_roll20_MSE,DT_class_roll20_MSE
0,11148,1926-08-31,0.109924,76287.5,1.0,0.0,4.0,0.0,0,0,0.0318,4e-05,1,1926,,,,,,,,,
1,10874,1926-08-31,0.168142,16500.0,0.0,0.0,2.0,0.0,2,0,0.0318,0.000537,1,1926,,,,,,,,,
2,12968,1926-08-31,-0.006464,55683.0,0.0,0.0,0.0,0.0,0,0,0.0318,7e-06,0,1926,,,,,,,,,
3,12976,1926-08-31,0.156041,41650.0,1.0,0.0,0.0,0.0,0,0,0.0318,7e-05,1,1926,,,,,,,,,
4,12984,1926-08-31,0.046875,11323.0,1.0,0.0,0.0,0.0,0,0,0.0318,0.000874,1,1926,,,,,,,,,


In [19]:
prediction_cols
# prediction_cols = ['logit_default','OLS_default','logit_roll6','DT_reg_roll']

['ols_default',
 'ridge_clas_exp_MSE',
 'DT_clas_exp_MSE',
 'ridge_clas_roll5_MSE',
 'DT_class_roll5_MSE',
 'ridge_clas_roll10_MSE',
 'DT_class_roll10_MSE',
 'ridge_clas_roll20_MSE',
 'DT_class_roll20_MSE']

In [20]:
portfolio = df[['date', 'RET', 'ME', 'y'] + prediction_cols].copy()
portfolio['date'] = pd.to_datetime(portfolio['date'])

# drop rows with missing values
portfolio.dropna(inplace=True)

portfolio.head()

Unnamed: 0,date,RET,ME,y,ols_default,ridge_clas_exp_MSE,DT_clas_exp_MSE,ridge_clas_roll5_MSE,DT_class_roll5_MSE,ridge_clas_roll10_MSE,DT_class_roll10_MSE,ridge_clas_roll20_MSE,DT_class_roll20_MSE
49083,1933-01-31,0.036765,31918.875,1,0.700424,0.995678,1.0,0.993238,1.0,0.994455,1.0,0.994455,1.0
49084,1933-01-31,0.121849,7142.25,1,0.472653,0.481409,0.267922,0.464035,0.340836,0.478552,0.269823,0.478552,0.269823
49085,1933-01-31,0.015625,1560.0,1,0.775651,0.99994,1.0,0.99985,1.0,0.999904,1.0,0.999904,1.0
49086,1933-01-31,-0.084337,4275.0,0,0.050251,1.5e-05,0.0,2.6e-05,0.0,1.9e-05,0.0,1.9e-05,0.0
49087,1933-01-31,-0.0375,447581.75,0,0.369477,0.318965,0.160714,0.329626,0.354545,0.314195,0.160714,0.314195,0.160714


In [21]:
portfolio.tail()

Unnamed: 0,date,RET,ME,y,ols_default,ridge_clas_exp_MSE,DT_clas_exp_MSE,ridge_clas_roll5_MSE,DT_class_roll5_MSE,ridge_clas_roll10_MSE,DT_class_roll10_MSE,ridge_clas_roll20_MSE,DT_class_roll20_MSE
3627748,2022-07-31,-0.106939,3731571.0,0,0.626592,0.820784,0.771429,0.892252,0.757556,0.902244,0.466019,0.837965,0.449787
3627749,2022-07-31,0.020969,5066.263,1,0.526688,0.663772,0.568068,0.838531,0.757556,0.841806,0.131148,0.751108,0.449787
3627750,2022-07-31,0.184785,1280287.0,1,0.799235,0.969509,0.990749,0.984135,0.757556,0.985462,0.98789,0.976392,0.983327
3627751,2022-07-31,0.108881,34757.11,1,0.493767,0.609127,0.568068,0.787141,0.757556,0.797412,0.131148,0.694356,0.449787
3627752,2022-07-31,0.323765,931110600.0,1,0.717418,0.920358,0.96875,0.949727,0.757556,0.95562,0.466019,0.925194,0.898927


In [22]:
# Initialize an empty DataFrame to store value-weighted returns for each model
vwreturns = pd.DataFrame(portfolio['date'].unique(), columns=['date'])  # Ensures all dates are included

for pred_col in prediction_cols:
    # Calculate deciles for this prediction
    decile_col = f'decile_{pred_col}'
    portfolio[decile_col] = portfolio.groupby(['date'])[pred_col].transform(lambda x: pd.qcut(x, 10, labels=False, duplicates='drop'))
    
    # Determine position based on deciles
    position_col = f'position_{pred_col}'
    portfolio[position_col] = np.where(portfolio[decile_col] == 9, 1, np.where(portfolio[decile_col] == 0, -1, 0))
    
    # Calculate the value-weighted return for this prediction
    vwret_col = f'vwreturn_{pred_col}'
    vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
    
    # Merge the temporary value-weighted returns with the main vwreturns DataFrame
    vwreturns = vwreturns.merge(vwreturns_temp, on='date', how='left')

# Ensure the 'date' column is the first column and is sorted
vwreturns = vwreturns.sort_values('date').reset_index(drop=True)


  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['

In [23]:
vwreturns.head()

Unnamed: 0,date,vwreturn_ols_default,vwreturn_ridge_clas_exp_MSE,vwreturn_DT_clas_exp_MSE,vwreturn_ridge_clas_roll5_MSE,vwreturn_DT_class_roll5_MSE,vwreturn_ridge_clas_roll10_MSE,vwreturn_DT_class_roll10_MSE,vwreturn_ridge_clas_roll20_MSE,vwreturn_DT_class_roll20_MSE
0,1933-01-31,0.020306,0.021557,0.005294,0.021557,0.006217,0.021557,0.005146,0.021557,0.005146
1,1933-02-28,0.011009,0.009001,0.09254,0.008985,0.065318,0.009001,0.095244,0.009001,0.095244
2,1933-03-31,0.028039,0.028465,0.026435,0.028465,0.02624,0.028465,0.023226,0.028465,0.023226
3,1933-04-30,0.075182,0.05033,-0.009979,0.050056,-0.00712,0.05033,-0.018039,0.05033,-0.018039
4,1933-05-31,0.024181,0.019323,-0.246518,0.019883,-0.246518,0.019323,-0.246518,0.019323,-0.246518


### Compare to market data

In [24]:
#market = pd.read_csv('FF3_clean.csv')
market = pd.read_csv('/kaggle/input/sign-prediction-datasets/FF3_clean.csv')

In [25]:
market.head()

Unnamed: 0,date,Mkt-RF,SMB,HML,RF
0,1926-07-31,2.96,-2.56,-2.43,0.22
1,1926-08-31,2.64,-1.17,3.82,0.25
2,1926-09-30,0.36,-1.4,0.13,0.23
3,1926-10-31,-3.24,-0.09,0.7,0.32
4,1926-11-30,2.53,-0.1,-0.51,0.31


In [26]:
# create a new 'Mkt' which is a sum of Mkt-RF and RF
market['Mkt'] = market['Mkt-RF'] + market['RF']

# divide all columns by 100 except 'date'
market.iloc[:, 1:] = market.iloc[:, 1:] / 100

#set the 'date' column to datetime format
market['date'] = pd.to_datetime(market['date'])

# merge the market data (only date and Mkt columns) with the vwreturns DataFrame
vwreturns = vwreturns.merge(market[['date', 'Mkt']], on='date', how='left')

# transform all columns (except 'date') to a log: log(x+1) and save the result as lvwreturns
lvwreturns = vwreturns.copy()
lvwreturns.iloc[:, 1:] = np.log(vwreturns.iloc[:, 1:] + 1)

In [27]:
vwreturns.head()

Unnamed: 0,date,vwreturn_ols_default,vwreturn_ridge_clas_exp_MSE,vwreturn_DT_clas_exp_MSE,vwreturn_ridge_clas_roll5_MSE,vwreturn_DT_class_roll5_MSE,vwreturn_ridge_clas_roll10_MSE,vwreturn_DT_class_roll10_MSE,vwreturn_ridge_clas_roll20_MSE,vwreturn_DT_class_roll20_MSE,Mkt
0,1933-01-31,0.020306,0.021557,0.005294,0.021557,0.006217,0.021557,0.005146,0.021557,0.005146,0.0126
1,1933-02-28,0.011009,0.009001,0.09254,0.008985,0.065318,0.009001,0.095244,0.009001,0.095244,-0.1527
2,1933-03-31,0.028039,0.028465,0.026435,0.028465,0.02624,0.028465,0.023226,0.028465,0.023226,0.0333
3,1933-04-30,0.075182,0.05033,-0.009979,0.050056,-0.00712,0.05033,-0.018039,0.05033,-0.018039,0.3895
4,1933-05-31,0.024181,0.019323,-0.246518,0.019883,-0.246518,0.019323,-0.246518,0.019323,-0.246518,0.2147


In [28]:
lvwreturns.head()

Unnamed: 0,date,vwreturn_ols_default,vwreturn_ridge_clas_exp_MSE,vwreturn_DT_clas_exp_MSE,vwreturn_ridge_clas_roll5_MSE,vwreturn_DT_class_roll5_MSE,vwreturn_ridge_clas_roll10_MSE,vwreturn_DT_class_roll10_MSE,vwreturn_ridge_clas_roll20_MSE,vwreturn_DT_class_roll20_MSE,Mkt
0,1933-01-31,0.020103,0.021328,0.00528,0.021328,0.006198,0.021328,0.005133,0.021328,0.005133,0.012521
1,1933-02-28,0.010949,0.008961,0.088505,0.008945,0.063273,0.008961,0.090977,0.008961,0.090977,-0.1657
2,1933-03-31,0.027653,0.028067,0.026092,0.028067,0.025901,0.028067,0.02296,0.028067,0.02296,0.032758
3,1933-04-30,0.07249,0.049104,-0.01003,0.048844,-0.007145,0.049104,-0.018204,0.049104,-0.018204,0.328944
4,1933-05-31,0.023893,0.019139,-0.283051,0.019688,-0.283051,0.019139,-0.283051,0.019139,-0.283051,0.194497


In [29]:
lvwreturns.describe()

Unnamed: 0,date,vwreturn_ols_default,vwreturn_ridge_clas_exp_MSE,vwreturn_DT_clas_exp_MSE,vwreturn_ridge_clas_roll5_MSE,vwreturn_DT_class_roll5_MSE,vwreturn_ridge_clas_roll10_MSE,vwreturn_DT_class_roll10_MSE,vwreturn_ridge_clas_roll20_MSE,vwreturn_DT_class_roll20_MSE,Mkt
count,1074,1074.0,1074.0,1074.0,1074.0,1074.0,1074.0,1074.0,1074.0,1074.0,1074.0
mean,1977-11-05 12:12:04.022346368,0.01548,0.010901,0.012606,0.011816,0.011275,0.011211,0.012072,0.010808,0.01261,0.008968
min,1933-01-31 00:00:00,-0.019337,-0.020949,-0.283051,-0.020949,-0.283051,-0.020949,-0.283051,-0.020949,-0.283051,-0.272203
25%,1955-06-07 12:00:00,0.010207,0.004928,0.003254,0.006952,0.002774,0.006235,0.002962,0.005225,0.003107,-0.017248
50%,1977-11-15 00:00:00,0.013988,0.01025,0.007472,0.010935,0.007722,0.010582,0.007589,0.010022,0.007395,0.013064
75%,2000-03-23 06:00:00,0.018582,0.014832,0.017071,0.015351,0.018238,0.014781,0.017542,0.014518,0.017291,0.038162
max,2022-07-31 00:00:00,0.086511,0.080615,0.193999,0.081136,0.187124,0.080905,0.193999,0.080598,0.193999,0.328944
std,,0.008332,0.008244,0.020918,0.007797,0.024628,0.00764,0.023912,0.008176,0.02108,0.048186


In [30]:
## plot histograms of the value-weighted returns for each model and the market in lvwreturns
#plt.figure(figsize=(12, round(len(prediction_cols)/2) * 5 ))
#
#for i, pred_col in enumerate(prediction_cols):
#    plt.subplot(len(prediction_cols)/2 +1, 2, i+1)
#    plt.hist(lvwreturns[f'vwreturn_{pred_col}'], bins=50, color='skyblue', edgecolor='black')
#    plt.title(f'Value-Weighted Return - {pred_col}')
#    plt.xlabel('Value-Weighted Return')
#    plt.ylabel('Frequency')
#    # calculate mean, skewness and kurtosis and add their values to the plot as a text, aligning to the top right corner
#    mean = lvwreturns[f'vwreturn_{pred_col}'].mean()
#    skewness = lvwreturns[f'vwreturn_{pred_col}'].skew()
#    kurtosis = lvwreturns[f'vwreturn_{pred_col}'].kurtosis()
#
#    plt.text(0.95, 0.95, f'Mean: {mean:.4f}\nSkewness: {skewness:.4f}\nKurtosis: {kurtosis:.4f}', ha='right', va='top', transform=plt.gca().transAxes)
#
#
#
#plt.subplot(round(len(prediction_cols)/2) +1, 2, len(prediction_cols)+1)
#plt.title('Value-Weighted Return - Market')
#plt.xlabel('Value-Weighted Return')
#plt.ylabel('Frequency')
#plt.hist(lvwreturns['Mkt'], bins=50, color='skyblue', edgecolor='black')
#mean = lvwreturns['Mkt'].mean()
#skewness = lvwreturns['Mkt'].skew()
#kurtosis = lvwreturns['Mkt'].kurtosis()
#plt.text(0.95, 0.95, f'Mean: {mean:.4f}\nSkewness: {skewness:.4f}\nKurtosis: {kurtosis:.4f}', ha='right', va='top', transform=plt.gca().transAxes)
#
#plt.tight_layout()
#plt.show()
#

In [31]:
## plot cumulative sums of the value-weighted log returns
#plt.figure(figsize=(12, 6))
#plt.plot(lvwreturns['date'], lvwreturns.iloc[:, 1:].cumsum())
#plt.title('Cumulative Value-Weighted Log Returns')
#plt.xlabel('Date')
#plt.ylabel('Cumulative Value-Weighted Log Returns')
#plt.legend(prediction_cols + ['Market'])
#plt.show()


In [32]:
# save the lvwreturns and portfolio DataFrame to a parquet file into 'outputs' folder

# for reproducibility and visualization purposes
lvwreturns.to_parquet('lvwreturnsC.parquet')
portfolio.to_parquet('portfolioC.parquet')

# save vwreturns DataFrame to a .dta file into 'outputs' folder
#vwreturns.to_stata('outputs/vwreturns.dta') # for backtasting in R - we need normal returns, not log returns
