# Framework for predictions and portfolio forming

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import time

# import the parquet library
import pyarrow.parquet as pq

# import model libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, accuracy_score

In [2]:
# load 'basemodel.parquet'
#df = pd.read_parquet('basemodel.parquet')
df = pd.read_parquet('/kaggle/input/sign-prediction-datasets/basicmarket.parquet')
prediction_cols = []
df.head()

Unnamed: 0,PERMNO,date,RET,ME,bull_D,bear_D,bull_W,bear_W,bull_M,bear_M,LMKT,IVOL,y,STR,LTURNOVER,IMOM,MOM,LTR,PCTHIGH,IVOL2,IVOL3,MVOL,MVOL2,MVOL3,LMKT2,LMKT3,MMOM,MIMOM,MLTR,LSPREAD
0,10000,1987-03-31,-0.384615,973.25,0.0,0.0,0.0,1.0,0,0,0.0482,0.000612,0,0.0,0.100694,-1.089044,-2.079441,-0.297252,0.091549,0.00083,0.002413,0.545027,0.771919,0.43419,0.1289,-0.0278,0.186212,0.112764,0.45911,0.076923
1,10000,1987-04-30,-0.0625,912.44134,0.0,0.0,0.0,0.0,0,1,0.0211,0.003465,0,-0.384615,0.285384,-1.459321,-2.390877,0.014185,0.0625,0.000612,0.00083,0.672597,0.545027,0.771919,0.0482,0.1289,0.179935,-0.025601,0.498953,0.625
2,10000,1987-05-31,-0.066667,851.59375,0.0,0.0,0.0,0.0,0,2,-0.0167,0.001893,0,-0.0625,0.256358,-1.633155,-2.772587,-0.089613,0.075378,0.003465,0.000612,1.565461,0.672597,0.545027,0.0211,0.0482,0.208747,0.032263,0.488026,0.13333
3,10001,1987-03-31,0.0368,6317.625,0.0,0.0,0.0,1.0,0,0,0.0482,0.001627,1,-0.074074,0.368315,0.071156,0.140122,0.020203,0.892857,0.001556,0.001527,0.545027,0.771919,0.43419,0.1289,-0.0278,0.186212,0.112764,0.45911,0.12
4,10001,1987-04-30,-0.039216,6069.875,0.0,0.0,0.0,0.0,0,0,0.0211,0.001597,0,0.0368,0.217962,0.043187,0.038273,0.04509,0.910714,0.001627,0.001556,0.672597,0.545027,0.771919,0.0482,0.1289,0.179935,-0.025601,0.498953,0.078431


In [3]:
# select the columns to be used for prediction
X_col = ['bull_D', 'bear_D', 'bull_W', 'bear_W', 'bull_M', 'bear_M', 'LMKT', 'IVOL', # initial columns
         'STR',	'LTURNOVER', 'IMOM', 'MOM',	'LTR', 'PCTHIGH', 'IVOL2', 'IVOL3', 'LSPREAD',	# stock specific columns
         'MVOL', 'MVOL2', 'MVOL3', 'LMKT2',	'LMKT3', 'MMOM', 'MIMOM', 'MLTR'] # market specific columns

In [4]:
# scale the data for faster coeficient convergence
scaler = MinMaxScaler()

df[X_col] = scaler.fit_transform(df[X_col])

In [5]:
# Convert 'date' to datetime format (if not already done) and sort the DataFrame
df['date'] = pd.to_datetime(df['date'])
df.sort_values(by='date', inplace=True)
df.reset_index(drop=True, inplace=True)

# Create a 'year' column based on the 'date' column
df['year'] = df['date'].dt.year

# Paper Replication - OLS and Logit, Expanding Window - No Hyperparameters
- They start with out of sample forecasting in 1932
- models will be named model_default

### Linear Regression (Pooled OLS)

In [6]:
#################################
# OLS, default, exp window
#################################

model_name = 'base_ols_default'


# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

for year in range(df['year'].min() + 6, df['year'].max() + 1):
    start_time = time.time()  # Start timing
    
    # Define the training data up until this year
    train_data = df[df['year'] < year]
    
    X_train = train_data[['bull_D', 'bear_D', 'bull_W', 'bear_W', 'bull_M', 'bear_M', 'LMKT', 'IVOL']]
    y_train = train_data['y']
    
    # Train the Linear Regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[['bull_D', 'bear_D', 'bull_W', 'bear_W', 'bull_M', 'bear_M', 'LMKT', 'IVOL']]
    
    if not X_next_year.empty:
        next_year_predictions = model.predict(X_next_year)
        df.loc[df['year'] == year, model_name] = next_year_predictions
    
    end_time = time.time()  # End timing
    iteration_time = end_time - start_time  # Calculate iteration time
    
    print(f"Year {year} - Time: {iteration_time:.2f} seconds")

Year 1933 - Time: 0.09 seconds
Year 1934 - Time: 0.13 seconds
Year 1935 - Time: 0.15 seconds
Year 1936 - Time: 0.11 seconds
Year 1937 - Time: 0.12 seconds
Year 1938 - Time: 0.12 seconds
Year 1939 - Time: 0.14 seconds
Year 1940 - Time: 0.15 seconds
Year 1941 - Time: 0.16 seconds
Year 1942 - Time: 0.15 seconds
Year 1943 - Time: 0.16 seconds
Year 1944 - Time: 0.19 seconds
Year 1945 - Time: 0.18 seconds
Year 1946 - Time: 0.21 seconds
Year 1947 - Time: 0.19 seconds
Year 1948 - Time: 0.19 seconds
Year 1949 - Time: 0.21 seconds
Year 1950 - Time: 0.19 seconds
Year 1951 - Time: 0.22 seconds
Year 1952 - Time: 0.22 seconds
Year 1953 - Time: 0.19 seconds
Year 1954 - Time: 0.25 seconds
Year 1955 - Time: 0.25 seconds
Year 1956 - Time: 0.25 seconds
Year 1957 - Time: 0.27 seconds
Year 1958 - Time: 0.29 seconds
Year 1959 - Time: 0.25 seconds
Year 1960 - Time: 0.28 seconds
Year 1961 - Time: 0.28 seconds
Year 1962 - Time: 0.28 seconds
Year 1963 - Time: 0.29 seconds
Year 1964 - Time: 0.30 seconds
Year 196

In [7]:
#################################
# OLS, default, exp window
#################################

model_name = 'ols_default'


# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

for year in range(df['year'].min() + 6, df['year'].max() + 1):
    start_time = time.time()  # Start timing
    
    # Define the training data up until this year
    train_data = df[df['year'] < year]
    
    X_train = train_data[X_col]
    y_train = train_data['y']
    
    # Train the Linear Regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    
    if not X_next_year.empty:
        next_year_predictions = model.predict(X_next_year)
        df.loc[df['year'] == year, model_name] = next_year_predictions
    
    end_time = time.time()  # End timing
    iteration_time = end_time - start_time  # Calculate iteration time
    
    print(f"Year {year} - Time: {iteration_time:.2f} seconds")

Year 1933 - Time: 0.12 seconds
Year 1934 - Time: 0.17 seconds
Year 1935 - Time: 0.20 seconds
Year 1936 - Time: 0.20 seconds
Year 1937 - Time: 0.22 seconds
Year 1938 - Time: 0.23 seconds
Year 1939 - Time: 0.24 seconds
Year 1940 - Time: 0.21 seconds
Year 1941 - Time: 0.27 seconds
Year 1942 - Time: 0.26 seconds
Year 1943 - Time: 0.31 seconds
Year 1944 - Time: 0.31 seconds
Year 1945 - Time: 0.33 seconds
Year 1946 - Time: 0.32 seconds
Year 1947 - Time: 0.30 seconds
Year 1948 - Time: 0.37 seconds
Year 1949 - Time: 0.37 seconds
Year 1950 - Time: 0.38 seconds
Year 1951 - Time: 0.42 seconds
Year 1952 - Time: 0.40 seconds
Year 1953 - Time: 0.45 seconds
Year 1954 - Time: 0.45 seconds
Year 1955 - Time: 0.48 seconds
Year 1956 - Time: 0.49 seconds
Year 1957 - Time: 0.50 seconds
Year 1958 - Time: 0.51 seconds
Year 1959 - Time: 0.55 seconds
Year 1960 - Time: 0.55 seconds
Year 1961 - Time: 0.57 seconds
Year 1962 - Time: 0.59 seconds
Year 1963 - Time: 0.73 seconds
Year 1964 - Time: 0.77 seconds
Year 196

In [8]:
df.tail()

Unnamed: 0,PERMNO,date,RET,ME,bull_D,bear_D,bull_W,bear_W,bull_M,bear_M,LMKT,IVOL,y,STR,LTURNOVER,IMOM,MOM,LTR,PCTHIGH,IVOL2,IVOL3,MVOL,MVOL2,MVOL3,LMKT2,LMKT3,MMOM,MIMOM,MLTR,LSPREAD,year,base_ols_default,ols_default
3076047,20412,2022-07-31,-0.203046,41747.87,0.166667,0.0,0.083333,0.0,0.0,0.833333,0.304629,7.4e-05,0,0.030776,5.6e-05,0.575618,0.631158,0.707753,0.068333,0.000222,0.000118,0.109743,0.128462,0.073585,0.423071,0.288758,0.520342,0.538953,0.814156,0.009789,2022,-0.784252,-0.838064
3076048,16874,2022-07-31,0.135593,1110135.0,0.25,0.0,0.166667,0.0,0.0,0.0,0.304629,6e-06,1,0.038978,1.5e-05,0.607527,0.717377,0.713835,0.811097,3.4e-05,1e-05,0.109743,0.128462,0.073585,0.423071,0.288758,0.520342,0.538953,0.814156,0.00215,2022,0.859305,0.83658
3076049,20395,2022-07-31,0.07497,3728386.0,0.0,0.0,0.083333,0.0,0.166667,0.0,0.304629,5e-06,1,0.040109,5.4e-05,0.604234,0.735304,0.708705,1.0,3e-06,4e-06,0.109743,0.128462,0.073585,0.423071,0.288758,0.520342,0.538953,0.814156,0.00134,2022,0.90733,0.869929
3076050,16857,2022-07-31,-0.029348,130494.1,0.0,0.0,0.0,0.0,0.0,0.25,0.304629,7.1e-05,0,0.038915,8.1e-05,0.524726,0.645115,0.780172,0.408233,0.00013,0.000202,0.109743,0.128462,0.073585,0.423071,0.288758,0.520342,0.538953,0.814156,0.00431,2022,0.088415,0.048938
3076051,93436,2022-07-31,0.323765,931110600.0,0.166667,0.0,0.083333,0.0,0.0,0.0,0.304629,3e-05,1,0.035278,0.000218,0.639349,0.729311,0.866215,0.587806,3.3e-05,6.1e-05,0.109743,0.128462,0.073585,0.423071,0.288758,0.520342,0.538953,0.814156,0.004467,2022,0.723794,0.825924


# My Experiments

## Machine Learning - Hyperparameter Tuning included in the process
- models to be named 'model_clas/reg_exp/roll'

### First expanding, then rolling
start predicting for 1932, expand the window until you reach X years, then roll it

#### MSE Evaluation

##### 5 years

In [9]:
rolling_window = 5

In [10]:
############################################
# RIDGE CLASSIFICATION MODEL - MSE
############################################

model_name = 'ridge_clas_roll5_MSE'  # Name of the new column for storing predictions
start_time2 = time.time()  # Start timing


# Predefined set of C values for hyperparameter tuning
HP1 = [0.01, 0.1, 1, 10, 100] # C

# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

# Define the start year for modeling based on having at least 7 years of data
start_modeling_year = df['year'].min() + 6

for year in range(start_modeling_year, df['year'].max() + 1):
    start_time = time.time()  # Start timing
    
    # Determine the start year of the training window based on the current year
    train_start_year = max(year - rolling_window, df['year'].min())  # Ensure it does not go below the earliest year
    
    # Select the training data based on the calculated start year
    train_data = df[(df['year'] >= train_start_year) & (df['year'] < year)]
    
    # Split training data into actual training and tuning sets
    # Use the last year of the training data for tuning
    tuning_data = train_data[train_data['year'] == year - 1]
    actual_train_data = train_data[train_data['year'] < year - 1]
    
    X_train = actual_train_data[X_col]
    y_train = actual_train_data['y']
    
    X_tune = tuning_data[X_col]
    y_tune = tuning_data['y']
       
    best_HP1 = None
    best_mse = np.inf
    
    # Hyperparameter tuning
    for hp1 in HP1:
        model = LogisticRegression(C=hp1, max_iter=1000, penalty='l2')  # Ridge
        model.fit(X_train, y_train)
        probabilities = model.predict_proba(X_tune)[:, 1]  # Get probabilities of the positive class
        mse = mean_squared_error(y_tune, probabilities)  # Calculate MSE
        
        if mse < best_mse:  # Lower MSE is better
            best_mse = mse
            best_HP1 = hp1
    
    
    # Retrain on the entire training window (excluding tuning year) with the best C value
    model = LogisticRegression(C=best_HP1, max_iter=1000, penalty='l2')
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    
    if not X_next_year.empty:
        next_year_probabilities = model.predict_proba(X_next_year)[:, 1]  # Probability of the positive class
        df.loc[df['year'] == year, model_name] = next_year_probabilities
    
    end_time = time.time()  # End timing
    iteration_time = end_time - start_time  # Calculate iteration time
    
    print(f"Year {year} - Best C: {best_HP1}, Best ACC: {round(best_mse,4)}, Time: {iteration_time:.2f} seconds")


end_time2 = time.time()  # End timing
print(f"Total time: {end_time2 - start_time2:.2f} seconds")

Year 1933 - Best C: 100, Best ACC: 0.0749, Time: 7.99 seconds
Year 1934 - Best C: 100, Best ACC: 0.0505, Time: 7.68 seconds
Year 1935 - Best C: 100, Best ACC: 0.0609, Time: 8.69 seconds
Year 1936 - Best C: 100, Best ACC: 0.0709, Time: 9.38 seconds
Year 1937 - Best C: 100, Best ACC: 0.0612, Time: 9.87 seconds
Year 1938 - Best C: 100, Best ACC: 0.0532, Time: 8.12 seconds
Year 1939 - Best C: 100, Best ACC: 0.0578, Time: 8.45 seconds
Year 1940 - Best C: 100, Best ACC: 0.0628, Time: 10.44 seconds
Year 1941 - Best C: 100, Best ACC: 0.0758, Time: 11.58 seconds
Year 1942 - Best C: 100, Best ACC: 0.0717, Time: 10.36 seconds
Year 1943 - Best C: 100, Best ACC: 0.0696, Time: 13.02 seconds
Year 1944 - Best C: 100, Best ACC: 0.063, Time: 13.22 seconds
Year 1945 - Best C: 100, Best ACC: 0.0743, Time: 11.39 seconds
Year 1946 - Best C: 100, Best ACC: 0.0459, Time: 10.63 seconds
Year 1947 - Best C: 100, Best ACC: 0.0786, Time: 11.20 seconds
Year 1948 - Best C: 100, Best ACC: 0.0793, Time: 14.05 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1981 - Best C: 100, Best ACC: 0.0531, Time: 48.79 seconds
Year 1982 - Best C: 100, Best ACC: 0.0432, Time: 39.53 seconds
Year 1983 - Best C: 100, Best ACC: 0.0455, Time: 41.62 seconds
Year 1984 - Best C: 100, Best ACC: 0.0518, Time: 41.08 seconds
Year 1985 - Best C: 100, Best ACC: 0.0625, Time: 63.39 seconds
Year 1986 - Best C: 100, Best ACC: 0.0629, Time: 72.30 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1987 - Best C: 100, Best ACC: 0.0602, Time: 89.31 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1988 - Best C: 100, Best ACC: 0.0714, Time: 113.05 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1989 - Best C: 100, Best ACC: 0.0843, Time: 126.17 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1990 - Best C: 100, Best ACC: 0.0716, Time: 124.49 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1991 - Best C: 100, Best ACC: 0.0691, Time: 135.00 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1992 - Best C: 100, Best ACC: 0.0711, Time: 136.56 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1993 - Best C: 100, Best ACC: 0.0691, Time: 141.44 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1994 - Best C: 100, Best ACC: 0.0655, Time: 124.43 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1995 - Best C: 100, Best ACC: 0.0661, Time: 121.10 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1996 - Best C: 100, Best ACC: 0.0603, Time: 130.20 seconds
Year 1997 - Best C: 100, Best ACC: 0.0539, Time: 130.36 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1998 - Best C: 100, Best ACC: 0.0443, Time: 124.60 seconds
Year 1999 - Best C: 100, Best ACC: 0.0383, Time: 141.03 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2000 - Best C: 100, Best ACC: 0.0395, Time: 143.55 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2001 - Best C: 100, Best ACC: 0.0341, Time: 154.41 seconds
Year 2002 - Best C: 100, Best ACC: 0.0295, Time: 144.61 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2003 - Best C: 100, Best ACC: 0.0291, Time: 157.46 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2004 - Best C: 100, Best ACC: 0.0336, Time: 137.29 seconds
Year 2005 - Best C: 100, Best ACC: 0.0314, Time: 131.91 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2006 - Best C: 100, Best ACC: 0.0295, Time: 115.84 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2007 - Best C: 100, Best ACC: 0.0301, Time: 111.07 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2008 - Best C: 100, Best ACC: 0.0271, Time: 110.56 seconds
Year 2009 - Best C: 100, Best ACC: 0.0291, Time: 84.45 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2010 - Best C: 100, Best ACC: 0.0391, Time: 98.73 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2011 - Best C: 100, Best ACC: 0.0236, Time: 95.57 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2012 - Best C: 100, Best ACC: 0.0274, Time: 89.28 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2013 - Best C: 100, Best ACC: 0.0346, Time: 84.49 seconds
Year 2014 - Best C: 100, Best ACC: 0.03, Time: 82.38 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2015 - Best C: 100, Best ACC: 0.027, Time: 78.60 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2016 - Best C: 100, Best ACC: 0.0288, Time: 75.76 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2017 - Best C: 100, Best ACC: 0.0256, Time: 79.58 seconds
Year 2018 - Best C: 100, Best ACC: 0.0296, Time: 68.41 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2019 - Best C: 100, Best ACC: 0.0315, Time: 81.78 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2020 - Best C: 100, Best ACC: 0.0234, Time: 71.24 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2021 - Best C: 100, Best ACC: 0.0415, Time: 76.01 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2022 - Best C: 100, Best ACC: 0.0376, Time: 82.61 seconds
Total time: 5319.60 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
############################################
# DECISION TREE CLASSIFICATION MODEL
############################################

model_name = 'DT_class_roll5_MSE'
start_time2 = time.time()  # Start timing

# Predefined set of max_depth values for hyperparameter tuning
HP1 = [3, 5, 10, 15, 25, None] # max_depth

# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

start_modeling_year = df['year'].min() + 6

for year in range(start_modeling_year, df['year'].max() + 1):
    start_time = time.time()  # Timing each iteration
    
    train_start_year = max(year - rolling_window, df['year'].min())
    train_data = df[(df['year'] >= train_start_year) & (df['year'] < year)]
    
    tuning_data = train_data[train_data['year'] == year - 1]
    actual_train_data = train_data[train_data['year'] < year - 1]
    
    X_train = actual_train_data[X_col]
    y_train = actual_train_data['y']
    
    X_tune = tuning_data[X_col]
    y_tune = tuning_data['y']
    
    best_HP1 = None
    best_mse = np.inf
    
    # Hyperparameter tuning for max_depth
    for hp1 in HP1:
        model = DecisionTreeClassifier(max_depth=hp1)
        model.fit(X_train, y_train)
        probabilities = model.predict_proba(X_tune)[:, 1]  # Get probabilities of the positive class
        mse = mean_squared_error(y_tune, probabilities)  # Calculate MSE
        
        if mse < best_mse:  # Lower MSE is better
            best_mse = mse
            best_HP1 = hp1
    
    # Retrain on the entire training window with the best max_depth value
    model = DecisionTreeClassifier(max_depth=best_HP1)
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    
    if not X_next_year.empty:
        next_year_predictions = model.predict_proba(X_next_year)[:, 1]
        df.loc[df['year'] == year, model_name] = next_year_predictions
    
    end_time = time.time()
    iteration_time = end_time - start_time
    
    print(f"Year {year} - Best Max Depth: {best_HP1}, Best ACC: {round(best_mse,4)}, Time: {iteration_time:.2f} seconds")

end_time2 = time.time()  # End timing
print(f"Total time: {end_time2 - start_time2:.2f} seconds")

Year 1933 - Best Max Depth: 5, Best ACC: 0.04, Time: 1.63 seconds
Year 1934 - Best Max Depth: 5, Best ACC: 0.03, Time: 1.67 seconds
Year 1935 - Best Max Depth: 5, Best ACC: 0.032, Time: 1.72 seconds
Year 1936 - Best Max Depth: 5, Best ACC: 0.0294, Time: 1.83 seconds
Year 1937 - Best Max Depth: 5, Best ACC: 0.0207, Time: 1.72 seconds
Year 1938 - Best Max Depth: 5, Best ACC: 0.0196, Time: 1.66 seconds
Year 1939 - Best Max Depth: 5, Best ACC: 0.0269, Time: 1.73 seconds
Year 1940 - Best Max Depth: 5, Best ACC: 0.03, Time: 1.78 seconds
Year 1941 - Best Max Depth: 5, Best ACC: 0.0388, Time: 1.94 seconds
Year 1942 - Best Max Depth: 5, Best ACC: 0.038, Time: 1.98 seconds
Year 1943 - Best Max Depth: 5, Best ACC: 0.0366, Time: 2.02 seconds
Year 1944 - Best Max Depth: 5, Best ACC: 0.0236, Time: 2.01 seconds
Year 1945 - Best Max Depth: 5, Best ACC: 0.0285, Time: 1.94 seconds
Year 1946 - Best Max Depth: 5, Best ACC: 0.014, Time: 2.01 seconds
Year 1947 - Best Max Depth: 5, Best ACC: 0.0168, Time: 1.

In [12]:
############################################
# RF CLASSIFICATION MODEL - ACCURACY
############################################

model_name = 'RF_class_roll5_MSE'
start_time2 = time.time()


# Predefined set of values for hyperparameter tuning
HP1 = [10, 50, 100, 200]  # Possible values for n_estimators
HP2 = [5, 10, 15, None]  # Possible values for max_depth

# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

start_modeling_year = df['year'].min() + 6

for year in range(start_modeling_year, df['year'].max() + 1):
    start_time = time.time()  # Timing each iteration
    
    train_start_year = max(year - rolling_window, df['year'].min())
    train_data = df[(df['year'] >= train_start_year) & (df['year'] < year)]
    
    tuning_data = train_data[train_data['year'] == year - 1]
    actual_train_data = train_data[train_data['year'] < year - 1]
    
    X_train = actual_train_data[X_col]
    y_train = actual_train_data['y']
    
    X_tune = tuning_data[X_col]
    y_tune = tuning_data['y']
    
    best_HP1 = None
    best_HP2 = None
    best_mse = np.inf

    # Hyperparameter tuning
    for hp1 in HP1:
        for hp2 in HP2:
            model = RandomForestClassifier(n_estimators=hp1, max_depth=hp2, random_state=42, n_jobs=-1)
            model.fit(X_train, y_train)
            predictions = model.predict_proba(X_tune)[:, 1]  # Get probabilities of the positive class
            mse = mean_squared_error(y_tune, predictions)  # Calculate MSE
            
            # Lower MSE is better, so we invert the logic used for accuracy
            if mse < best_mse:
                best_mse = mse
                best_HP1 = hp1
                best_HP2 = hp2
    
    # Retrain on the entire training window with the best max_depth value
    model = model = RandomForestClassifier(n_estimators=best_HP1, max_depth=best_HP2, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    
    if not X_next_year.empty:
        next_year_predictions = model.predict_proba(X_next_year)[:, 1]
        df.loc[df['year'] == year, model_name] = next_year_predictions
    
    end_time = time.time()
    iteration_time = end_time - start_time
    
    print(f"Year {year} - Best n_estimators: {best_HP1}, Best max_depth: {best_HP2 if best_HP2 is not None else 'None'}, Best ACC: {round(best_mse, 2)}, Time: {iteration_time:.2f} seconds")

end_time2 = time.time()
print(f"Total time: {end_time2 - start_time2:.2f} seconds")

Year 1933 - Best n_estimators: 200, Best max_depth: None, Best ACC: 0.05, Time: 33.53 seconds
Year 1934 - Best n_estimators: 200, Best max_depth: None, Best ACC: 0.04, Time: 34.74 seconds
Year 1935 - Best n_estimators: 200, Best max_depth: None, Best ACC: 0.03, Time: 36.42 seconds
Year 1936 - Best n_estimators: 100, Best max_depth: None, Best ACC: 0.04, Time: 34.70 seconds
Year 1937 - Best n_estimators: 200, Best max_depth: None, Best ACC: 0.02, Time: 36.62 seconds
Year 1938 - Best n_estimators: 200, Best max_depth: None, Best ACC: 0.02, Time: 36.67 seconds
Year 1939 - Best n_estimators: 100, Best max_depth: 15, Best ACC: 0.04, Time: 33.87 seconds
Year 1940 - Best n_estimators: 50, Best max_depth: 15, Best ACC: 0.03, Time: 33.69 seconds
Year 1941 - Best n_estimators: 200, Best max_depth: 15, Best ACC: 0.04, Time: 39.12 seconds
Year 1942 - Best n_estimators: 200, Best max_depth: None, Best ACC: 0.04, Time: 40.97 seconds
Year 1943 - Best n_estimators: 200, Best max_depth: 15, Best ACC: 0

## Forming Portfolios, Value-weighted portfolio returns

In [13]:
df.head()

Unnamed: 0,PERMNO,date,RET,ME,bull_D,bear_D,bull_W,bear_W,bull_M,bear_M,LMKT,IVOL,y,STR,LTURNOVER,IMOM,MOM,LTR,PCTHIGH,IVOL2,IVOL3,MVOL,MVOL2,MVOL3,LMKT2,LMKT3,MMOM,MIMOM,MLTR,LSPREAD,year,base_ols_default,ols_default,ridge_clas_roll5_MSE,DT_class_roll5_MSE,RF_class_roll5_MSE
0,14314,1927-08-31,-0.164557,3902.25,0.0,0.0,0.0,0.083333,0.0,0.0,0.538722,3.3e-05,0,0.040267,3e-06,0.551246,0.668133,0.709597,0.443203,3.6e-05,9.7e-05,0.00106,0.008129,0.001942,0.397061,0.511976,0.645637,0.529397,0.644251,0.002245,1927,,,,,
1,12730,1927-08-31,-0.069853,88107.25,0.0,0.416667,0.0,0.0,0.0,0.0,0.538722,5e-06,0,0.043893,1.9e-05,0.611572,0.736577,0.69712,0.824047,5e-06,4e-06,0.00106,0.008129,0.001942,0.397061,0.511976,0.645637,0.529397,0.644251,0.001956,1927,,,,,
2,11594,1927-08-31,0.14939,14137.5,0.0,0.083333,0.0,0.0,0.25,0.0,0.538722,7.7e-05,1,0.04547,2e-06,0.572206,0.709391,0.708355,0.869882,5e-06,1.8e-05,0.00106,0.008129,0.001942,0.397061,0.511976,0.645637,0.529397,0.644251,0.005677,1927,,,,,
3,75471,1927-08-31,0.216216,1575.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.538722,9e-05,1,0.040866,3e-06,0.5708,0.709576,0.684379,0.86031,3.8e-05,0.000123,0.00106,0.008129,0.001942,0.397061,0.511976,0.645637,0.529397,0.644251,0.003594,1927,,,,,
4,10786,1927-08-31,0.169811,111600.0,0.0,0.166667,0.583333,0.0,0.083333,0.0,0.538722,1.3e-05,1,0.045594,5.6e-05,0.596852,0.722659,0.703022,1.0,3e-06,6e-06,0.00106,0.008129,0.001942,0.397061,0.511976,0.645637,0.529397,0.644251,0.003555,1927,,,,,


In [14]:
prediction_cols
# prediction_cols = ['logit_default','OLS_default','logit_roll6','DT_reg_roll']

['base_ols_default',
 'ols_default',
 'ridge_clas_roll5_MSE',
 'DT_class_roll5_MSE',
 'RF_class_roll5_MSE']

In [15]:
portfolio = df[['date', 'RET', 'ME', 'y'] + prediction_cols].copy()
portfolio['date'] = pd.to_datetime(portfolio['date'])

# drop rows with missing values
portfolio.dropna(inplace=True)

portfolio.head()

Unnamed: 0,date,RET,ME,y,base_ols_default,ols_default,ridge_clas_roll5_MSE,DT_class_roll5_MSE,RF_class_roll5_MSE
39215,1933-01-31,0.058824,456.75,1,0.275076,0.58993,0.5548031,0.898343,0.63
39216,1933-01-31,0.427451,18036.0,1,0.756186,0.945905,1.0,1.0,0.91
39217,1933-01-31,-0.006993,1975.125,0,0.050649,0.231157,8.458596e-08,0.0,0.035
39218,1933-01-31,0.166667,241.5,1,0.704957,0.862059,0.9997302,1.0,0.895
39219,1933-01-31,-0.058824,3304.0,0,-0.222943,-0.008792,5.928324e-14,0.0,0.03


In [16]:
portfolio.tail()

Unnamed: 0,date,RET,ME,y,base_ols_default,ols_default,ridge_clas_roll5_MSE,DT_class_roll5_MSE,RF_class_roll5_MSE
3076047,2022-07-31,-0.203046,41747.87,0,-0.784252,-0.838064,5.857704e-73,0.0,0.05
3076048,2022-07-31,0.135593,1110135.0,1,0.859305,0.83658,0.9690855,0.992545,0.975
3076049,2022-07-31,0.07497,3728386.0,1,0.90733,0.869929,1.0,1.0,0.985
3076050,2022-07-31,-0.029348,130494.1,0,0.088415,0.048938,1.302841e-22,0.0,0.02
3076051,2022-07-31,0.323765,931110600.0,1,0.723794,0.825924,0.9967314,0.992545,0.96


In [17]:
# Initialize an empty DataFrame to store value-weighted returns for each model
vwreturns = pd.DataFrame(portfolio['date'].unique(), columns=['date'])  # Ensures all dates are included

for pred_col in prediction_cols:
    # Calculate deciles for this prediction
    decile_col = f'decile_{pred_col}'
    portfolio[decile_col] = portfolio.groupby(['date'])[pred_col].transform(lambda x: pd.qcut(x, 10, labels=False, duplicates='drop'))
    
    # Determine position based on deciles
    position_col = f'position_{pred_col}'
    portfolio[position_col] = np.where(portfolio[decile_col] == 9, 1, np.where(portfolio[decile_col] == 0, -1, 0))
    
    # Calculate the value-weighted return for this prediction
    vwret_col = f'vwreturn_{pred_col}'
    vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
    
    # Merge the temporary value-weighted returns with the main vwreturns DataFrame
    vwreturns = vwreturns.merge(vwreturns_temp, on='date', how='left')

# Ensure the 'date' column is the first column and is sorted
vwreturns = vwreturns.sort_values('date').reset_index(drop=True)


  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)


In [18]:
vwreturns.head()

Unnamed: 0,date,vwreturn_base_ols_default,vwreturn_ols_default,vwreturn_ridge_clas_roll5_MSE,vwreturn_DT_class_roll5_MSE,vwreturn_RF_class_roll5_MSE
0,1933-01-31,0.02035,0.018723,0.024002,0.021885,0.023204
1,1933-02-28,0.010923,0.011374,0.009797,0.14527,0.00997
2,1933-03-31,0.028053,0.019686,0.020832,0.027125,0.026779
3,1933-04-30,0.075213,0.103667,0.104251,-0.000179,0.011065
4,1933-05-31,0.024194,0.02214,0.02265,-0.004281,0.010737


### Compare to market data

In [19]:
#market = pd.read_csv('FF3_clean.csv')
market = pd.read_csv('/kaggle/input/sign-prediction-datasets/FF3_clean.csv')

In [20]:
market.head()

Unnamed: 0,date,Mkt-RF,SMB,HML,RF
0,1926-07-31,2.96,-2.56,-2.43,0.22
1,1926-08-31,2.64,-1.17,3.82,0.25
2,1926-09-30,0.36,-1.4,0.13,0.23
3,1926-10-31,-3.24,-0.09,0.7,0.32
4,1926-11-30,2.53,-0.1,-0.51,0.31


In [21]:
# create a new 'Mkt' which is a sum of Mkt-RF and RF
market['Mkt'] = market['Mkt-RF'] + market['RF']

# divide all columns by 100 except 'date'
market.iloc[:, 1:] = market.iloc[:, 1:] / 100

#set the 'date' column to datetime format
market['date'] = pd.to_datetime(market['date'])

# merge the market data (only date and Mkt columns) with the vwreturns DataFrame
vwreturns = vwreturns.merge(market[['date', 'Mkt']], on='date', how='left')

# transform all columns (except 'date') to a log: log(x+1) and save the result as lvwreturns
lvwreturns = vwreturns.copy()
lvwreturns.iloc[:, 1:] = np.log(vwreturns.iloc[:, 1:] + 1)

In [22]:
vwreturns.head()

Unnamed: 0,date,vwreturn_base_ols_default,vwreturn_ols_default,vwreturn_ridge_clas_roll5_MSE,vwreturn_DT_class_roll5_MSE,vwreturn_RF_class_roll5_MSE,Mkt
0,1933-01-31,0.02035,0.018723,0.024002,0.021885,0.023204,0.0126
1,1933-02-28,0.010923,0.011374,0.009797,0.14527,0.00997,-0.1527
2,1933-03-31,0.028053,0.019686,0.020832,0.027125,0.026779,0.0333
3,1933-04-30,0.075213,0.103667,0.104251,-0.000179,0.011065,0.3895
4,1933-05-31,0.024194,0.02214,0.02265,-0.004281,0.010737,0.2147


In [23]:
lvwreturns.head()

Unnamed: 0,date,vwreturn_base_ols_default,vwreturn_ols_default,vwreturn_ridge_clas_roll5_MSE,vwreturn_DT_class_roll5_MSE,vwreturn_RF_class_roll5_MSE,Mkt
0,1933-01-31,0.020146,0.01855,0.023718,0.021649,0.022939,0.012521
1,1933-02-28,0.010864,0.011309,0.009749,0.13564,0.009921,-0.1657
2,1933-03-31,0.027667,0.019495,0.020618,0.026763,0.026427,0.032758
3,1933-04-30,0.072519,0.098638,0.099167,-0.000179,0.011004,0.328944
4,1933-05-31,0.023906,0.021899,0.022398,-0.00429,0.01068,0.194497


In [24]:
lvwreturns.describe()

Unnamed: 0,date,vwreturn_base_ols_default,vwreturn_ols_default,vwreturn_ridge_clas_roll5_MSE,vwreturn_DT_class_roll5_MSE,vwreturn_RF_class_roll5_MSE,Mkt
count,1072,1072.0,1072.0,1072.0,1072.0,1072.0,1072.0
mean,1977-11-17 20:54:37.611940288,0.014942,0.014632,0.008362,0.017494,0.011965,0.009039
min,1933-01-31 00:00:00,-0.011114,-0.046196,-0.042875,-0.212917,-0.00716,-0.272203
25%,1955-05-23 06:00:00,0.009826,0.009861,0.003199,0.00361,0.006648,-0.017248
50%,1977-12-15 12:00:00,0.013586,0.013339,0.007465,0.01238,0.009968,0.013262
75%,2000-04-07 12:00:00,0.018027,0.01758,0.011767,0.026507,0.015334,0.038235
max,2022-07-31 00:00:00,0.078491,0.098638,0.099167,0.207777,0.076129,0.328944
std,,0.007929,0.007952,0.007029,0.027592,0.008354,0.048197


In [25]:
# save the lvwreturns and portfolio DataFrame to a parquet file into 'outputs' folder

# for reproducibility and visualization purposes
lvwreturns.to_parquet('market_lvwreturns_class1.parquet')
portfolio.to_parquet('market_portfolio_class1.parquet')

# save vwreturns DataFrame to a .dta file into 'outputs' folder
#vwreturns.to_stata('outputs/vwreturns.dta') # for backtasting in R - we need normal returns, not log returns
