# Framework for predictions and portfolio forming

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import time

# import the parquet library
import pyarrow.parquet as pq

# import model libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, accuracy_score

In [2]:
# load 'basemodel.parquet'
#df = pd.read_parquet('basemodel.parquet')
df= pd.read_parquet('/kaggle/input/sign-prediction-datasets/clean_financials.parquet')
prediction_cols = []
df.head()

Unnamed: 0,PERMNO,date,y,RET,LME,ME,bull_D,bear_D,bull_W,bear_W,bull_M,bear_M,SPREAD_L,LTURNOVER,IVOL,IVOL2,IVOL3,STR,LTR,IMOM,MOM,PCTHIGH,MVOL,MVOL2,MVOL3,LMKT,LMKT2,LMKT3,MMOM,MIMOM,MLTR,release_L,LAT,DPI2A_L,CTO_L,D2A_L,NOA_L,OL_L,PCM_L,FC2Y_L,INVEST_L,RNA_L,S2E_L,PROF_L,PM_L,ATO_L,ROA_L,FCF_L,A2ME_L,B2ME_L,S2P_L,ROE_L,LEV_L,TQ_L
0,10000,1987-03-31,0,-0.384615,1581.53125,973.25,0.0,0.0,0.0,1.0,0,0,0.076923,0.100694,0.000612,0.00083,0.002413,0.0,-0.297252,-1.089044,-2.079441,0.091549,0.545027,0.771919,0.43419,0.0482,0.1289,-0.0278,0.186212,0.112764,0.45911,0.0,2.115,0.0,0.055319,0.037352,0.0,0.236407,-1.495726,1.777778,0.0,0.016564,0.279904,0.064593,0.230769,0.071779,-0.061939,-0.024586,0.001067,0.000211,5.9e-05,-8.3e-05,3.7e-05,748.571277
1,10000,1987-04-30,0,-0.0625,973.25,912.44134,0.0,0.0,0.0,0.0,0,1,0.625,0.285384,0.003465,0.000612,0.00083,-0.384615,0.014185,-1.459321,-2.390877,0.0625,0.672597,0.545027,0.771919,0.0211,0.0482,0.1289,0.179935,-0.025601,0.498953,0.0,2.115,0.0,0.055319,0.037352,0.0,0.236407,-1.495726,1.777778,0.0,0.016564,0.279904,0.064593,0.230769,0.071779,-0.061939,-0.024586,0.001067,0.000211,5.9e-05,-0.000135,6e-05,460.967849
2,10000,1987-05-31,0,-0.066667,912.44134,851.59375,0.0,0.0,0.0,0.0,0,2,0.13333,0.256358,0.001893,0.003465,0.000612,-0.0625,-0.089613,-1.633155,-2.772587,0.075378,1.565461,0.672597,0.545027,-0.0167,0.0211,0.0482,0.208747,0.032263,0.488026,1.0,2.115,-0.037825,0.074232,0.028309,-0.382033,0.293803,-1.101911,0.343949,-38.20331,-0.36864,-1.121429,2.535714,-2.261146,0.163032,-0.108274,-0.146901,0.00066,-7.1e-05,7.9e-05,-0.000251,1.8e-05,699.225968
3,10000,1987-06-30,0,0.0,851.59375,851.59375,0.0,0.0,0.0,0.0,0,0,0.071451,0.20935,0.00019,0.001893,0.003465,-0.066667,-0.341485,-1.323014,-2.585254,0.070707,0.955605,1.565461,0.672597,0.0049,-0.0167,0.0211,0.142069,-0.002095,0.591159,0.0,2.115,-0.037825,0.074232,0.028309,-0.382033,0.293803,-1.101911,0.343949,-38.20331,-0.36864,-1.121429,2.535714,-2.261146,0.163032,-0.108274,-0.146901,0.00066,-7.1e-05,7.9e-05,-0.000269,1.9e-05,652.670811
4,10005,1987-03-31,0,0.0,795.11688,795.11688,0.0,0.0,0.0,0.0,0,0,0.090936,0.016213,0.000532,0.021169,1.3e-05,0.1,-0.405465,-1.163152,-0.470004,0.578954,0.545027,0.771919,0.43419,0.0482,0.1289,-0.0278,0.186212,0.112764,0.45911,0.0,1.954,-0.001485,0.023762,0.015865,-0.032673,0.044012,0.645833,1.4375,-3.267327,-0.423313,0.025263,-0.036316,-1.4375,0.294479,-0.033168,-0.018424,0.004506,0.004381,0.000111,-8.4e-05,4.7e-05,406.945179


In [3]:
# select the columns to be used for prediction
X_col = ['bull_D', 'bear_D', 'bull_W', 'bear_W', 'bull_M', 'bear_M', 'LMKT', 'IVOL', # initial columns
         'STR',	'LTURNOVER', 'IMOM', 'MOM',	'LTR', 'PCTHIGH', 'IVOL2', 'IVOL3', 'SPREAD_L',	# stock specific columns
         'MVOL', 'MVOL2', 'MVOL3', 'LMKT2',	'LMKT3', 'MMOM', 'MIMOM', 'MLTR', # market specific columns
         'LAT',	'DPI2A_L',	'CTO_L', 'D2A_L', 'NOA_L', 'OL_L',	'PCM_L', 'FC2Y_L',	
         'INVEST_L', 'RNA_L', 'S2E_L', 'PROF_L', 'PM_L', 'ATO_L', 'ROA_L', 	
         'FCF_L', 'A2ME_L',	'B2ME_L', 'S2P_L', 'ROE_L',	'LEV_L', 'TQ_L'] 

In [4]:
# scale the data for faster coeficient convergence
scaler = MinMaxScaler()

df[X_col] = scaler.fit_transform(df[X_col])

In [5]:
# Convert 'date' to datetime format (if not already done) and sort the DataFrame
df['date'] = pd.to_datetime(df['date'])
df.sort_values(by='date', inplace=True)
df.reset_index(drop=True, inplace=True)

# Create a 'year' column based on the 'date' column
df['year'] = df['date'].dt.year

# Paper Replication - OLS and Logit, Expanding Window - No Hyperparameters
- They start with out of sample forecasting in 1932
- models will be named model_default

### Linear Regression (Pooled OLS)

In [6]:
#################################
# OLS, default, exp window
#################################

model_name = 'base_ols_default'


# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

for year in range(df['year'].min() + 6, df['year'].max() + 1):
    start_time = time.time()  # Start timing
    
    # Define the training data up until this year
    train_data = df[df['year'] < year]
    
    X_train = train_data[['bull_D', 'bear_D', 'bull_W', 'bear_W', 'bull_M', 'bear_M', 'LMKT', 'IVOL']]
    y_train = train_data['y']
    
    # Train the Linear Regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[['bull_D', 'bear_D', 'bull_W', 'bear_W', 'bull_M', 'bear_M', 'LMKT', 'IVOL']]
    
    if not X_next_year.empty:
        next_year_predictions = model.predict(X_next_year)
        df.loc[df['year'] == year, model_name] = next_year_predictions
    
    end_time = time.time()  # End timing
    iteration_time = end_time - start_time  # Calculate iteration time
    
    print(f"Year {year} - Time: {iteration_time:.2f} seconds")

Year 1976 - Time: 0.13 seconds
Year 1977 - Time: 0.17 seconds
Year 1978 - Time: 0.17 seconds
Year 1979 - Time: 0.19 seconds
Year 1980 - Time: 0.20 seconds
Year 1981 - Time: 0.21 seconds
Year 1982 - Time: 0.23 seconds
Year 1983 - Time: 0.26 seconds
Year 1984 - Time: 0.30 seconds
Year 1985 - Time: 0.30 seconds
Year 1986 - Time: 0.31 seconds
Year 1987 - Time: 0.34 seconds
Year 1988 - Time: 0.38 seconds
Year 1989 - Time: 0.40 seconds
Year 1990 - Time: 0.44 seconds
Year 1991 - Time: 0.46 seconds
Year 1992 - Time: 0.51 seconds
Year 1993 - Time: 0.52 seconds
Year 1994 - Time: 0.54 seconds
Year 1995 - Time: 0.60 seconds
Year 1996 - Time: 0.62 seconds
Year 1997 - Time: 0.64 seconds
Year 1998 - Time: 0.70 seconds
Year 1999 - Time: 0.71 seconds
Year 2000 - Time: 0.74 seconds
Year 2001 - Time: 0.84 seconds
Year 2002 - Time: 0.92 seconds
Year 2003 - Time: 0.89 seconds
Year 2004 - Time: 0.92 seconds
Year 2005 - Time: 0.97 seconds
Year 2006 - Time: 0.99 seconds
Year 2007 - Time: 1.02 seconds
Year 200

In [7]:
#################################
# OLS, default, exp window
#################################

model_name = 'ols_default'


# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

for year in range(df['year'].min() + 6, df['year'].max() + 1):
    start_time = time.time()  # Start timing
    
    # Define the training data up until this year
    train_data = df[df['year'] < year]
    
    X_train = train_data[X_col]
    y_train = train_data['y']
    
    # Train the Linear Regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    
    if not X_next_year.empty:
        next_year_predictions = model.predict(X_next_year)
        df.loc[df['year'] == year, model_name] = next_year_predictions
    
    end_time = time.time()  # End timing
    iteration_time = end_time - start_time  # Calculate iteration time
    
    print(f"Year {year} - Time: {iteration_time:.2f} seconds")

Year 1976 - Time: 0.34 seconds
Year 1977 - Time: 0.46 seconds
Year 1978 - Time: 0.54 seconds
Year 1979 - Time: 0.68 seconds
Year 1980 - Time: 0.72 seconds
Year 1981 - Time: 0.80 seconds
Year 1982 - Time: 0.96 seconds
Year 1983 - Time: 0.95 seconds
Year 1984 - Time: 1.09 seconds
Year 1985 - Time: 1.21 seconds
Year 1986 - Time: 1.31 seconds
Year 1987 - Time: 1.50 seconds
Year 1988 - Time: 1.95 seconds
Year 1989 - Time: 1.97 seconds
Year 1990 - Time: 1.94 seconds
Year 1991 - Time: 2.07 seconds
Year 1992 - Time: 2.19 seconds
Year 1993 - Time: 2.37 seconds
Year 1994 - Time: 2.79 seconds
Year 1995 - Time: 2.82 seconds
Year 1996 - Time: 3.41 seconds
Year 1997 - Time: 3.41 seconds
Year 1998 - Time: 3.59 seconds
Year 1999 - Time: 4.33 seconds
Year 2000 - Time: 4.46 seconds
Year 2001 - Time: 4.55 seconds
Year 2002 - Time: 4.82 seconds
Year 2003 - Time: 5.00 seconds
Year 2004 - Time: 5.20 seconds
Year 2005 - Time: 5.38 seconds
Year 2006 - Time: 5.55 seconds
Year 2007 - Time: 5.58 seconds
Year 200

In [8]:
df.tail()

Unnamed: 0,PERMNO,date,y,RET,LME,ME,bull_D,bear_D,bull_W,bear_W,bull_M,bear_M,SPREAD_L,LTURNOVER,IVOL,IVOL2,IVOL3,STR,LTR,IMOM,MOM,PCTHIGH,MVOL,MVOL2,MVOL3,LMKT,LMKT2,LMKT3,MMOM,MIMOM,MLTR,release_L,LAT,DPI2A_L,CTO_L,D2A_L,NOA_L,OL_L,PCM_L,FC2Y_L,INVEST_L,RNA_L,S2E_L,PROF_L,PM_L,ATO_L,ROA_L,FCF_L,A2ME_L,B2ME_L,S2P_L,ROE_L,LEV_L,TQ_L,year,base_ols_default,ols_default
1795158,14523,2022-07-31,0,-0.211268,421020.1,360376.8,0.0,0.0,0.0,0.0,0.0,0.0,0.010376,0.000127,0.000547,8.1e-05,1.8e-05,0.046569,0.752262,0.626562,0.667402,0.596191,0.108507,0.127252,0.072299,0.363567,0.568917,0.336051,0.428937,0.674246,0.832911,1.0,7.8e-05,0.007624,0.000401,9e-05,0.000694,0.000181,0.907457,0.105361,0.000694,0.261469,0.21747,0.304657,0.956337,0.008175,0.784626,0.140765,0.000122,0.797185,0.009079,0.522669,5e-06,0.000137,2022,0.528958,0.456078
1795159,84413,2022-07-31,1,0.020794,936742.6,956286.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.004925,6.8e-05,5.2e-05,0.000111,4e-05,0.042502,0.708189,0.669056,0.712299,0.767522,0.108507,0.127252,0.072299,0.363567,0.568917,0.336051,0.428937,0.674246,0.832911,1.0,0.000262,0.006655,0.0004,8.5e-05,0.000376,0.000203,0.907473,0.105362,0.000376,0.261528,0.217475,0.304657,0.956344,0.008166,0.78535,0.140878,9.4e-05,0.797068,0.009113,0.522682,3e-05,0.000168,2022,0.819094,0.816904
1795160,14526,2022-07-31,1,0.066548,868997.5,921952.8,0.166667,0.0,0.083333,0.0,0.0,0.0,0.004318,9.4e-05,4.2e-05,1.3e-05,1.9e-05,0.037465,0.704435,0.67181,0.72526,0.814332,0.108507,0.127252,0.072299,0.363567,0.568917,0.336051,0.428937,0.674246,0.832911,0.0,0.001048,0.006563,0.000466,0.000117,0.000394,0.00024,0.907461,0.105344,0.000394,0.261578,0.217481,0.304673,0.956347,0.008336,0.78569,0.141023,0.000383,0.797248,0.009861,0.522707,0.00015,3.7e-05,2022,0.71127,0.700782
1795161,90548,2022-07-31,1,0.087452,113813.2,121602.8,0.416667,0.0,0.083333,0.0,0.0,0.0,0.00575,2.5e-05,3.8e-05,6.6e-05,1.5e-05,0.034937,0.710461,0.697203,0.745026,0.673998,0.108507,0.127252,0.072299,0.363567,0.568917,0.336051,0.428937,0.674246,0.832911,1.0,0.000372,0.006682,0.000408,0.000135,0.000382,0.000187,0.907466,0.105348,0.000382,0.261564,0.217474,0.304668,0.956347,0.00827,0.785608,0.141003,0.001059,0.797898,0.010453,0.522709,0.000321,1.4e-05,2022,0.849967,0.9006
1795162,93436,2022-07-31,1,0.323765,701030200.0,931110600.0,0.166667,0.0,0.083333,0.0,0.0,0.0,0.004688,0.000218,3.8e-05,4.1e-05,7.7e-05,0.035278,0.866215,0.692366,0.729311,0.587806,0.108507,0.127252,0.072299,0.363567,0.568917,0.336051,0.428937,0.674246,0.832911,1.0,0.077968,0.00679,0.000415,0.000123,0.0004,0.000184,0.907462,0.105342,0.0004,0.261577,0.217473,0.304671,0.956348,0.00823,0.785736,0.141042,3.2e-05,0.797041,0.00904,0.522691,6e-06,0.000397,2022,0.711275,0.814605


# My Experiments

## Machine Learning - Hyperparameter Tuning included in the process
- models to be named 'model_clas/reg_exp/roll'

### First expanding, then rolling
start predicting for 1932, expand the window until you reach X years, then roll it

#### MSE Evaluation

##### 5 years

In [9]:
rolling_window = 5

In [10]:
############################################
# RIDGE CLASSIFICATION MODEL - MSE
############################################

model_name = 'ridge_clas_roll5_MSE'  # Name of the new column for storing predictions
start_time2 = time.time()  # Start timing


# Predefined set of C values for hyperparameter tuning
HP1 = [0.01, 0.1, 1, 10, 100] # C

# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

# Define the start year for modeling based on having at least 7 years of data
start_modeling_year = df['year'].min() + 6

for year in range(start_modeling_year, df['year'].max() + 1):
    start_time = time.time()  # Start timing
    
    # Determine the start year of the training window based on the current year
    train_start_year = max(year - rolling_window, df['year'].min())  # Ensure it does not go below the earliest year
    
    # Select the training data based on the calculated start year
    train_data = df[(df['year'] >= train_start_year) & (df['year'] < year)]
    
    # Split training data into actual training and tuning sets
    # Use the last year of the training data for tuning
    tuning_data = train_data[train_data['year'] == year - 1]
    actual_train_data = train_data[train_data['year'] < year - 1]
    
    X_train = actual_train_data[X_col]
    y_train = actual_train_data['y']
    
    X_tune = tuning_data[X_col]
    y_tune = tuning_data['y']
       
    best_HP1 = None
    best_mse = np.inf # Initialize with infinity
    
    # Hyperparameter tuning
    for hp1 in HP1:
        model = LogisticRegression(C=hp1, max_iter=1000, penalty='l2')  # Ridge
        model.fit(X_train, y_train)
        probabilities = model.predict_proba(X_tune)[:, 1]  # Get probabilities of the positive class
        mse = mean_squared_error(y_tune, probabilities)  # Calculate MSE
        
        if mse < best_mse:  # Lower MSE is better
            best_mse = mse
            best_HP1 = hp1
    
    
    # Retrain on the entire training window (excluding tuning year) with the best C value
    model = LogisticRegression(C=best_HP1, max_iter=1000, penalty='l2')
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    
    if not X_next_year.empty:
        next_year_probabilities = model.predict_proba(X_next_year)[:, 1]  # Probability of the positive class
        df.loc[df['year'] == year, model_name] = next_year_probabilities
    
    end_time = time.time()  # End timing
    iteration_time = end_time - start_time  # Calculate iteration time
    
    print(f"Year {year} - Best C: {best_HP1}, Best ACC: {round(best_mse,4)}, Time: {iteration_time:.2f} seconds")


end_time2 = time.time()  # End timing
print(f"Total time: {end_time2 - start_time2:.2f} seconds")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1976 - Best C: 100, Best ACC: 0.0676, Time: 40.40 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1977 - Best C: 100, Best ACC: 0.0567, Time: 55.69 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1978 - Best C: 100, Best ACC: 0.0615, Time: 61.53 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1979 - Best C: 100, Best ACC: 0.0487, Time: 61.97 seconds
Year 1980 - Best C: 100, Best ACC: 0.0497, Time: 54.69 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1981 - Best C: 100, Best ACC: 0.051, Time: 62.29 seconds
Year 1982 - Best C: 100, Best ACC: 0.0438, Time: 51.80 seconds
Year 1983 - Best C: 100, Best ACC: 0.0484, Time: 54.98 seconds
Year 1984 - Best C: 100, Best ACC: 0.0511, Time: 55.04 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1985 - Best C: 100, Best ACC: 0.0622, Time: 62.83 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1986 - Best C: 100, Best ACC: 0.0648, Time: 73.09 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1987 - Best C: 100, Best ACC: 0.063, Time: 100.72 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1988 - Best C: 100, Best ACC: 0.0778, Time: 101.71 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1989 - Best C: 100, Best ACC: 0.087, Time: 118.59 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1990 - Best C: 100, Best ACC: 0.0715, Time: 123.24 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1991 - Best C: 100, Best ACC: 0.0688, Time: 127.22 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1992 - Best C: 100, Best ACC: 0.0711, Time: 130.29 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1993 - Best C: 100, Best ACC: 0.0692, Time: 130.79 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1994 - Best C: 100, Best ACC: 0.0665, Time: 120.29 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1995 - Best C: 100, Best ACC: 0.0651, Time: 123.82 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1996 - Best C: 100, Best ACC: 0.0599, Time: 115.24 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1997 - Best C: 100, Best ACC: 0.0529, Time: 129.08 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1998 - Best C: 100, Best ACC: 0.0449, Time: 129.13 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 1999 - Best C: 100, Best ACC: 0.0393, Time: 135.49 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2000 - Best C: 100, Best ACC: 0.0399, Time: 140.98 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2001 - Best C: 100, Best ACC: 0.0339, Time: 139.38 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2002 - Best C: 100, Best ACC: 0.0286, Time: 160.17 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2003 - Best C: 100, Best ACC: 0.0287, Time: 144.24 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2004 - Best C: 100, Best ACC: 0.0345, Time: 135.30 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2005 - Best C: 100, Best ACC: 0.0323, Time: 135.54 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2006 - Best C: 100, Best ACC: 0.0291, Time: 128.80 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2007 - Best C: 100, Best ACC: 0.0285, Time: 118.13 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2008 - Best C: 100, Best ACC: 0.0276, Time: 106.37 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2009 - Best C: 100, Best ACC: 0.0323, Time: 99.39 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2010 - Best C: 100, Best ACC: 0.0399, Time: 91.96 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2011 - Best C: 100, Best ACC: 0.0253, Time: 103.64 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2012 - Best C: 100, Best ACC: 0.0279, Time: 99.75 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2013 - Best C: 100, Best ACC: 0.0353, Time: 98.99 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2014 - Best C: 100, Best ACC: 0.031, Time: 96.22 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2015 - Best C: 100, Best ACC: 0.0265, Time: 79.33 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2016 - Best C: 100, Best ACC: 0.0289, Time: 79.93 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2017 - Best C: 100, Best ACC: 0.0274, Time: 77.08 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2018 - Best C: 100, Best ACC: 0.031, Time: 76.30 seconds
Year 2019 - Best C: 100, Best ACC: 0.0312, Time: 70.07 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2020 - Best C: 100, Best ACC: 0.0238, Time: 72.01 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2021 - Best C: 100, Best ACC: 0.0418, Time: 84.40 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Year 2022 - Best C: 100, Best ACC: 0.0406, Time: 86.43 seconds
Total time: 4644.36 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
############################################
# DECISION TREE CLASSIFICATION MODEL
############################################

model_name = 'DT_class_roll5_MSE'
start_time2 = time.time()  # Start timing

# Predefined set of max_depth values for hyperparameter tuning
HP1 = [3, 5, 10, 15, 25, None] # max_depth

# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

start_modeling_year = df['year'].min() + 6

for year in range(start_modeling_year, df['year'].max() + 1):
    start_time = time.time()  # Timing each iteration
    
    train_start_year = max(year - rolling_window, df['year'].min())
    train_data = df[(df['year'] >= train_start_year) & (df['year'] < year)]
    
    tuning_data = train_data[train_data['year'] == year - 1]
    actual_train_data = train_data[train_data['year'] < year - 1]
    
    X_train = actual_train_data[X_col]
    y_train = actual_train_data['y']
    
    X_tune = tuning_data[X_col]
    y_tune = tuning_data['y']
    
    best_HP1 = None
    best_mse = np.inf
    
    # Hyperparameter tuning for max_depth
    for hp1 in HP1:
        model = DecisionTreeClassifier(max_depth=hp1)
        model.fit(X_train, y_train)
        probabilities = model.predict_proba(X_tune)[:, 1]  # Get probabilities of the positive class
        mse = mean_squared_error(y_tune, probabilities)  # Calculate MSE
        
        if mse < best_mse:  # Lower MSE is better
            best_mse = mse
            best_HP1 = hp1
    
    # Retrain on the entire training window with the best max_depth value
    model = DecisionTreeClassifier(max_depth=best_HP1)
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    
    if not X_next_year.empty:
        next_year_predictions = model.predict_proba(X_next_year)[:, 1]
        df.loc[df['year'] == year, model_name] = next_year_predictions
    
    end_time = time.time()
    iteration_time = end_time - start_time
    
    print(f"Year {year} - Best Max Depth: {best_HP1}, Best ACC: {round(best_mse,4)}, Time: {iteration_time:.2f} seconds")

end_time2 = time.time()  # End timing
print(f"Total time: {end_time2 - start_time2:.2f} seconds")

Year 1976 - Best Max Depth: 5, Best ACC: 0.0374, Time: 7.97 seconds
Year 1977 - Best Max Depth: 5, Best ACC: 0.033, Time: 12.02 seconds
Year 1978 - Best Max Depth: 5, Best ACC: 0.0347, Time: 13.40 seconds
Year 1979 - Best Max Depth: 10, Best ACC: 0.0281, Time: 14.19 seconds
Year 1980 - Best Max Depth: 5, Best ACC: 0.0257, Time: 13.52 seconds
Year 1981 - Best Max Depth: 5, Best ACC: 0.0215, Time: 13.48 seconds
Year 1982 - Best Max Depth: 5, Best ACC: 0.0238, Time: 12.90 seconds
Year 1983 - Best Max Depth: 5, Best ACC: 0.0302, Time: 12.71 seconds
Year 1984 - Best Max Depth: 5, Best ACC: 0.0327, Time: 12.06 seconds
Year 1985 - Best Max Depth: 10, Best ACC: 0.0422, Time: 14.87 seconds
Year 1986 - Best Max Depth: 10, Best ACC: 0.043, Time: 18.68 seconds
Year 1987 - Best Max Depth: 10, Best ACC: 0.0406, Time: 22.30 seconds
Year 1988 - Best Max Depth: 10, Best ACC: 0.0428, Time: 26.21 seconds
Year 1989 - Best Max Depth: 10, Best ACC: 0.0499, Time: 28.40 seconds
Year 1990 - Best Max Depth: 10,

In [12]:
############################################
# RF CLASSIFICATION MODEL
############################################

model_name = 'RF_class_roll5_MSE'
start_time2 = time.time()


# Predefined set of values for hyperparameter tuning
HP1 = [10, 50, 100, 200]  # Possible values for n_estimators
HP2 = [5, 10, 15, None]  # Possible values for max_depth

# Update the column name for storing Decision Tree regression predictions
df[model_name] = np.nan

# Ensure the new column is in the prediction_cols list
if model_name not in prediction_cols:
    prediction_cols.append(model_name)

start_modeling_year = df['year'].min() + 6

for year in range(start_modeling_year, df['year'].max() + 1):
    start_time = time.time()  # Timing each iteration
    
    train_start_year = max(year - rolling_window, df['year'].min())
    train_data = df[(df['year'] >= train_start_year) & (df['year'] < year)]
    
    tuning_data = train_data[train_data['year'] == year - 1]
    actual_train_data = train_data[train_data['year'] < year - 1]
    
    X_train = actual_train_data[X_col]
    y_train = actual_train_data['y']
    
    X_tune = tuning_data[X_col]
    y_tune = tuning_data['y']
    
    best_HP1 = None
    best_HP2 = None
    best_mse = np.inf

    # Hyperparameter tuning
    for hp1 in HP1:
        for hp2 in HP2:
            model = RandomForestClassifier(n_estimators=hp1, max_depth=hp2, random_state=42, n_jobs=-1)
            model.fit(X_train, y_train)
            predictions = model.predict_proba(X_tune)[:, 1]  # Get probabilities of the positive class
            mse = mean_squared_error(y_tune, predictions)  # Calculate MSE
            
            # Lower MSE is better, so we invert the logic used for accuracy
            if mse < best_mse:
                best_mse = mse
                best_HP1 = hp1
                best_HP2 = hp2
    
    # Retrain on the entire training window with the best max_depth value
    model = model = RandomForestClassifier(n_estimators=best_HP1, max_depth=best_HP2, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    
    # Predict for the next year
    next_year_data = df[df['year'] == year]
    X_next_year = next_year_data[X_col]
    
    if not X_next_year.empty:
        next_year_predictions = model.predict_proba(X_next_year)[:, 1]
        df.loc[df['year'] == year, model_name] = next_year_predictions
    
    end_time = time.time()
    iteration_time = end_time - start_time
    
    print(f"Year {year} - Best n_estimators: {best_HP1}, Best max_depth: {best_HP2 if best_HP2 is not None else 'None'}, Best ACC: {round(best_mse, 2)}, Time: {iteration_time:.2f} seconds")

end_time2 = time.time()
print(f"Total time: {end_time2 - start_time2:.2f} seconds")

Year 1976 - Best n_estimators: 200, Best max_depth: None, Best ACC: 0.06, Time: 113.43 seconds
Year 1977 - Best n_estimators: 200, Best max_depth: None, Best ACC: 0.04, Time: 156.64 seconds
Year 1978 - Best n_estimators: 200, Best max_depth: None, Best ACC: 0.04, Time: 185.41 seconds
Year 1979 - Best n_estimators: 200, Best max_depth: None, Best ACC: 0.03, Time: 197.16 seconds
Year 1980 - Best n_estimators: 200, Best max_depth: None, Best ACC: 0.03, Time: 190.27 seconds
Year 1981 - Best n_estimators: 50, Best max_depth: None, Best ACC: 0.03, Time: 171.29 seconds
Year 1982 - Best n_estimators: 200, Best max_depth: None, Best ACC: 0.03, Time: 184.45 seconds
Year 1983 - Best n_estimators: 200, Best max_depth: None, Best ACC: 0.04, Time: 175.47 seconds
Year 1984 - Best n_estimators: 200, Best max_depth: None, Best ACC: 0.04, Time: 174.30 seconds
Year 1985 - Best n_estimators: 200, Best max_depth: None, Best ACC: 0.05, Time: 206.89 seconds
Year 1986 - Best n_estimators: 100, Best max_depth:

## Forming Portfolios, Value-weighted portfolio returns

In [13]:
df.head()

Unnamed: 0,PERMNO,date,y,RET,LME,ME,bull_D,bear_D,bull_W,bear_W,bull_M,bear_M,SPREAD_L,LTURNOVER,IVOL,IVOL2,IVOL3,STR,LTR,IMOM,MOM,PCTHIGH,MVOL,MVOL2,MVOL3,LMKT,LMKT2,LMKT3,MMOM,MIMOM,MLTR,release_L,LAT,DPI2A_L,CTO_L,D2A_L,NOA_L,OL_L,PCM_L,FC2Y_L,INVEST_L,RNA_L,S2E_L,PROF_L,PM_L,ATO_L,ROA_L,FCF_L,A2ME_L,B2ME_L,S2P_L,ROE_L,LEV_L,TQ_L,year,base_ols_default,ols_default,ridge_clas_roll5_MSE,DT_class_roll5_MSE,RF_class_roll5_MSE
0,31464,1970-04-30,0,-0.102941,41412.0,37149.0,0.0,0.0,0.0,0.166667,0.0,0.416667,0.003421,6e-06,4.6e-05,3.4e-05,4.7e-05,0.039638,0.731272,0.61217,0.670081,0.413984,0.012852,0.012583,0.01042,0.564331,0.723312,0.385732,0.408401,0.496787,0.647683,1.0,0.000119,0.006682,0.000433,0.000149,0.000386,0.00062,0.907359,0.105362,0.000386,0.261588,0.217478,0.304686,0.95635,0.008189,0.785605,0.141009,0.001061,0.797682,0.010881,0.522705,0.000224,1.6e-05,1970,,,,,
1,31464,1970-05-31,0,-0.065574,37149.0,34713.0,0.0,0.0,0.0,0.0,0.0,0.5,0.004576,6e-06,1.3e-05,4.6e-05,3.4e-05,0.035635,0.722584,0.625955,0.672735,0.400651,0.017869,0.012852,0.012583,0.309299,0.564331,0.723312,0.385646,0.535558,0.630483,0.0,0.000119,0.006682,0.000433,0.000149,0.000386,0.00062,0.907359,0.105362,0.000386,0.261588,0.217478,0.304686,0.95635,0.008189,0.785605,0.141009,0.001061,0.797682,0.010881,0.522707,0.00025,1.4e-05,1970,,,,,
2,31464,1970-06-30,0,-0.14386,34713.0,29232.0,0.0,0.0,0.0,0.25,0.0,0.583333,0.00653,8e-06,6.2e-05,1.3e-05,4.6e-05,0.037131,0.721443,0.622612,0.670797,0.455396,0.127382,0.017869,0.012852,0.414013,0.309299,0.564331,0.279599,0.494936,0.666119,0.0,0.000119,0.006682,0.000433,0.000149,0.000386,0.00062,0.907359,0.105362,0.000386,0.261588,0.217478,0.304686,0.95635,0.008189,0.785605,0.141009,0.001061,0.797682,0.010881,0.522708,0.000267,1.3e-05,1970,,,,,
3,31464,1970-07-31,0,-0.125,29232.0,25578.0,0.0,0.083333,0.0,0.0,0.0,0.666667,0.004847,4e-06,2e-05,6.2e-05,1.3e-05,0.033998,0.698329,0.624735,0.678001,0.447986,0.035887,0.127382,0.017869,0.444076,0.414013,0.309299,0.282368,0.548597,0.589858,1.0,0.000119,0.006682,0.000424,0.000149,0.000386,0.00062,0.907347,0.105364,0.000386,0.261588,0.217477,0.304686,0.956351,0.008181,0.785544,0.140986,0.001061,0.797682,0.010711,0.522684,0.000316,1.1e-05,1970,,,,,
4,31464,1970-08-31,1,0.047619,25578.0,26796.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003877,5e-06,4.7e-05,2e-05,6.2e-05,0.034753,0.687449,0.638067,0.679698,0.391849,0.023573,0.035887,0.127382,0.766624,0.444076,0.414013,0.294738,0.536452,0.498298,0.0,0.000119,0.006682,0.000424,0.000149,0.000386,0.00062,0.907347,0.105364,0.000386,0.261588,0.217477,0.304686,0.956351,0.008181,0.785544,0.140986,0.001061,0.797682,0.010711,0.522684,0.000361,1e-05,1970,,,,,


In [14]:
prediction_cols
# prediction_cols = ['logit_default','OLS_default','logit_roll6','DT_reg_roll']

['base_ols_default',
 'ols_default',
 'ridge_clas_roll5_MSE',
 'DT_class_roll5_MSE',
 'RF_class_roll5_MSE']

In [15]:
portfolio = df[['date', 'RET', 'ME', 'y'] + prediction_cols].copy()
portfolio['date'] = pd.to_datetime(portfolio['date'])

# drop rows with missing values
portfolio.dropna(inplace=True)

portfolio.head()

Unnamed: 0,date,RET,ME,y,base_ols_default,ols_default,ridge_clas_roll5_MSE,DT_class_roll5_MSE,RF_class_roll5_MSE
80751,1976-01-31,0.147541,99391.25,1,0.429566,0.541998,0.874848,0.846724,0.615
80752,1976-01-31,0.266187,87802.0,1,0.65125,0.657648,0.966976,0.954098,0.825
80753,1976-01-31,0.124031,113172.5,1,0.608244,0.652779,0.999998,1.0,0.945
80754,1976-01-31,0.319444,12801.25,1,1.688311,1.560148,1.0,1.0,0.915
80755,1976-01-31,0.0,8228.5,0,0.429613,0.383483,0.18269,0.0,0.14


In [16]:
portfolio.tail()

Unnamed: 0,date,RET,ME,y,base_ols_default,ols_default,ridge_clas_roll5_MSE,DT_class_roll5_MSE,RF_class_roll5_MSE
1795158,2022-07-31,-0.211268,360376.8,0,0.528958,0.456078,0.042556,0.0,0.23
1795159,2022-07-31,0.020794,956286.0,1,0.819094,0.816904,1.0,1.0,0.92
1795160,2022-07-31,0.066548,921952.8,1,0.71127,0.700782,0.968239,0.993929,0.93
1795161,2022-07-31,0.087452,121602.8,1,0.849967,0.9006,0.997182,0.993929,0.92
1795162,2022-07-31,0.323765,931110600.0,1,0.711275,0.814605,0.995536,0.993929,0.89


In [17]:
# Initialize an empty DataFrame to store value-weighted returns for each model
vwreturns = pd.DataFrame(portfolio['date'].unique(), columns=['date'])  # Ensures all dates are included

for pred_col in prediction_cols:
    # Calculate deciles for this prediction
    decile_col = f'decile_{pred_col}'
    portfolio[decile_col] = portfolio.groupby(['date'])[pred_col].transform(lambda x: pd.qcut(x, 10, labels=False, duplicates='drop'))
    
    # Determine position based on deciles
    position_col = f'position_{pred_col}'
    portfolio[position_col] = np.where(portfolio[decile_col] == 9, 1, np.where(portfolio[decile_col] == 0, -1, 0))
    
    # Calculate the value-weighted return for this prediction
    vwret_col = f'vwreturn_{pred_col}'
    vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
    
    # Merge the temporary value-weighted returns with the main vwreturns DataFrame
    vwreturns = vwreturns.merge(vwreturns_temp, on='date', how='left')

# Ensure the 'date' column is the first column and is sorted
vwreturns = vwreturns.sort_values('date').reset_index(drop=True)


  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)
  vwreturns_temp = portfolio.groupby('date').apply(lambda x: np.sum(x['RET'] * x['ME'] * x[position_col]) / np.sum(x['ME'])).reset_index(name=vwret_col)


In [18]:
vwreturns.head()

Unnamed: 0,date,vwreturn_base_ols_default,vwreturn_ols_default,vwreturn_ridge_clas_roll5_MSE,vwreturn_DT_class_roll5_MSE,vwreturn_RF_class_roll5_MSE
0,1976-01-31,0.039535,0.030443,0.022083,-0.001281,0.023344
1,1976-02-29,0.022362,0.015261,0.013882,-0.009918,0.019715
2,1976-03-31,0.011892,0.010497,0.009977,0.005947,0.013456
3,1976-04-30,0.007972,0.010684,0.007117,0.026945,0.010274
4,1976-05-31,0.019215,0.020038,0.016704,0.025408,0.016797


### Compare to market data

In [19]:
#market = pd.read_csv('FF3_clean.csv')
market = pd.read_csv('/kaggle/input/sign-prediction-datasets/FF3_clean.csv')

In [20]:
market.head()

Unnamed: 0,date,Mkt-RF,SMB,HML,RF
0,1926-07-31,2.96,-2.56,-2.43,0.22
1,1926-08-31,2.64,-1.17,3.82,0.25
2,1926-09-30,0.36,-1.4,0.13,0.23
3,1926-10-31,-3.24,-0.09,0.7,0.32
4,1926-11-30,2.53,-0.1,-0.51,0.31


In [21]:
# create a new 'Mkt' which is a sum of Mkt-RF and RF
market['Mkt'] = market['Mkt-RF'] + market['RF']

# divide all columns by 100 except 'date'
market.iloc[:, 1:] = market.iloc[:, 1:] / 100

#set the 'date' column to datetime format
market['date'] = pd.to_datetime(market['date'])

# merge the market data (only date and Mkt columns) with the vwreturns DataFrame
vwreturns = vwreturns.merge(market[['date', 'Mkt']], on='date', how='left')

# transform all columns (except 'date') to a log: log(x+1) and save the result as lvwreturns
lvwreturns = vwreturns.copy()
lvwreturns.iloc[:, 1:] = np.log(vwreturns.iloc[:, 1:] + 1)

In [22]:
vwreturns.head()

Unnamed: 0,date,vwreturn_base_ols_default,vwreturn_ols_default,vwreturn_ridge_clas_roll5_MSE,vwreturn_DT_class_roll5_MSE,vwreturn_RF_class_roll5_MSE,Mkt
0,1976-01-31,0.039535,0.030443,0.022083,-0.001281,0.023344,0.1263
1,1976-02-29,0.022362,0.015261,0.013882,-0.009918,0.019715,0.0066
2,1976-03-31,0.011892,0.010497,0.009977,0.005947,0.013456,0.0272
3,1976-04-30,0.007972,0.010684,0.007117,0.026945,0.010274,-0.0107
4,1976-05-31,0.019215,0.020038,0.016704,0.025408,0.016797,-0.0097


In [23]:
lvwreturns.head()

Unnamed: 0,date,vwreturn_base_ols_default,vwreturn_ols_default,vwreturn_ridge_clas_roll5_MSE,vwreturn_DT_class_roll5_MSE,vwreturn_RF_class_roll5_MSE,Mkt
0,1976-01-31,0.038773,0.029989,0.021843,-0.001282,0.023075,0.118938
1,1976-02-29,0.022116,0.015146,0.013786,-0.009967,0.019523,0.006578
2,1976-03-31,0.011821,0.010442,0.009928,0.00593,0.013366,0.026837
3,1976-04-30,0.00794,0.010628,0.007092,0.026588,0.010222,-0.010758
4,1976-05-31,0.019032,0.01984,0.016566,0.025091,0.016657,-0.009747


In [24]:
lvwreturns.describe()

Unnamed: 0,date,vwreturn_base_ols_default,vwreturn_ols_default,vwreturn_ridge_clas_roll5_MSE,vwreturn_DT_class_roll5_MSE,vwreturn_RF_class_roll5_MSE,Mkt
count,559,559.0,559.0,559.0,559.0,559.0,559.0
mean,1999-05-01 04:22:45.295169920,0.016911,0.016571,0.008193,0.022117,0.014239,0.009303
min,1976-01-31 00:00:00,-0.028077,-0.020252,-0.024542,-0.054603,0.001003,-0.2567
25%,1987-09-15 00:00:00,0.011065,0.010928,0.002257,0.008078,0.009434,-0.015672
50%,1999-04-30 00:00:00,0.015014,0.014409,0.006101,0.015474,0.012861,0.013804
75%,2010-12-15 12:00:00,0.02036,0.019833,0.01208,0.029988,0.017341,0.038162
max,2022-07-31 00:00:00,0.07791,0.077408,0.039721,0.205754,0.066362,0.127953
std,,0.009251,0.00866,0.007458,0.023861,0.00749,0.045169


In [25]:
# save the lvwreturns and portfolio DataFrame to a parquet file into 'outputs' folder

# for reproducibility and visualization purposes
lvwreturns.to_parquet('fin_lvwreturns_class1.parquet')
portfolio.to_parquet('fin_portfolio_class1.parquet')

# save vwreturns DataFrame to a .dta file into 'outputs' folder
#vwreturns.to_stata('outputs/vwreturns.dta') # for backtasting in R - we need normal returns, not log returns
