**Lecture 5: Return Prediction with Random Forest**

In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write files under /content/drive/MyDrive.
# This cell only works inside Google Colab (it imports google.colab).
from google.colab import drive
drive.mount("/content/drive")

**Data Description**

Input: /content/drive/MyDrive/MAF data/Features_seven_signals.csv.zip
Created by: Lec5_Create Standardized Features.ipynb

**Standardized Features**

* marketcap: market cap

* investment: 12-month increase in total assets

* accruals: ib - oancf

* b2m: ceq/marketcap

* ret_2_12: momentum (stock returns from month t-12 to t-2)

* CashFlow2TA: oancf / total assets (AT)

* new_issue: Stocks issued over the previous 12 months










In [None]:
# Import necessary libraries

import sys
import os
import platform
import random
from math import sqrt, floor, ceil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
from multiprocessing import Process
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import plot_tree

from datetime import datetime

In [None]:
# Load the standardized features together with RET from Google Drive
dat = pd.read_csv("/content/drive/MyDrive/MAF data/Features_seven_signals.csv.zip")

# Calendar fields derived from "date"
dat["date"] = pd.to_datetime(dat["date"])                 # parse "date" into datetime values
dat = dat.assign(
    yr=dat["date"].dt.year,                               # calendar year
    month=dat["date"].dt.month,                           # calendar month (1-12)
)
dat = dat.sort_values(by="date")                          # chronological order

# Dense rank of the date: every distinct date gets the next consecutive number (1.0, 2.0, ...).
# The rolling-window cell uses month_num to select training and test months.
dat["month_num"] = dat["date"].rank(method="dense")

In [None]:

# Keep 1990 onward: 'oancf' has limited availability before 1990.
dat = dat.loc[dat["yr"] >= 1990].copy()

# Target is 'adj_ret' = 'RET' - 'mean_ret', where 'mean_ret' is the average RET of all
# stocks on the same date. (Question for the reader: why predict adj_ret and not RET?)
dat["mean_ret"] = dat.groupby("date")["RET"].transform("mean")   # same-date cross-sectional mean
dat["adj_ret"] = dat["RET"].sub(dat["mean_ret"])                 # return in excess of that mean

# Show the pre-processed frame
print("***********************************************************")
print("Pre-processed Dataframe containing all signals")
print("***********************************************************")
dat

In [None]:
# Number of distinct PERMNO values in the sample (a missing value, if any, counts once)
dat["PERMNO"].nunique(dropna=False)

In [None]:
# List the column names of the pre-processed frame (the feature names used later must appear here)
dat.columns

In [None]:
## Random Forest Regressor helper: fits one forest on a training window and predicts the test month.
## The rolling-window cell below calls this function once per window.
# see https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html for details

_USE_NOTEBOOK_GLOBAL = object()   # sentinel: "take this hyperparameter from the notebook-level variable"

def random_forest_rolling(train_X, train_y, test_X, test_y, *,
                          n_estimators=_USE_NOTEBOOK_GLOBAL,
                          max_features=_USE_NOTEBOOK_GLOBAL,
                          max_depth=_USE_NOTEBOOK_GLOBAL,
                          random_state=0):
  """Fit a RandomForestRegressor on the training data and predict the test data.

  Parameters
  ----------
  train_X, train_y : features and target used to fit the forest.
  test_X, test_y   : features to predict, and the observed target used for scoring.
  n_estimators     : number of trees. If omitted, the notebook variable `trials` is used.
  max_features     : number of features considered at each split. If omitted, the
                     notebook variable `num_features` is used.
  max_depth        : maximum tree depth. If omitted, the notebook variable
                     `max_tree_depth` is used.
  random_state     : seed controlling both the bootstrap sampling and the feature
                     sampling at each node; an integer makes the fit reproducible.

  Returns
  -------
  (prediction, R2_score) : the predicted values for test_X, and the R2 score of
  those predictions against test_y as computed by `regressor.score`.

  Raises
  ------
  NameError if a hyperparameter is omitted and the corresponding notebook
  variable has not been defined yet.
  """
  # Fall back to the notebook-level variables so existing four-argument calls keep working.
  # A private sentinel (not None) is used because None is itself a meaningful value for
  # max_features and max_depth in scikit-learn.
  if n_estimators is _USE_NOTEBOOK_GLOBAL:
    n_estimators = trials
  if max_features is _USE_NOTEBOOK_GLOBAL:
    max_features = num_features
  if max_depth is _USE_NOTEBOOK_GLOBAL:
    max_depth = max_tree_depth

  # Fit the forest on the training window
  regressor = RandomForestRegressor(n_estimators=n_estimators,
                                    max_features=max_features,
                                    max_depth=max_depth,
                                    random_state=random_state)
  regressor.fit(train_X, train_y)

  # Out-of-sample predictions and their R2 against the observed values
  prediction = regressor.predict(test_X)
  R2_score = regressor.score(test_X, test_y)
  return prediction, R2_score

In [None]:
## Random Forest: rolling-window training and one-month-ahead prediction
# For each window, fit a forest on `rolling_window` consecutive months and predict the month
# right after the window. Predictions for all test months are collected in `predicted_ret_df`.

# ---- Inputs ---------------------------------------------------------------------------------
y_column_name = 'adj_ret'                                   # target
features = ['marketcap_pct_rank', 'investment_pct_rank', 'accruals_pct_rank', 'b2m_pct_rank',
            'ret_2_12_pct_rank', 'CashFlow2TA_pct_rank', 'new_issue_pct_rank']
#features = ['b2m_pct_rank',  'CashFlow2TA_pct_rank']

# ---- Hyperparameters (experiment with other values) -------------------------------------------
# These are notebook-level variables: random_forest_rolling reads trials, num_features and
# max_tree_depth, and the output file name below embeds trials, rolling_window, max_tree_depth.
rolling_window = 60                                         # number of months in each training window
trials = 100                                                # number of trees in each forest
max_tree_depth = 3                                          # maximum tree depth
num_features = int(np.sqrt(len(features)))                  # features sampled at each split; must be <= len(features).
                                                            # sqrt(len(features)) is the usual recommendation
                                                            # (7 features -> 2). Try other values, e.g. 3.

# ---- Window bookkeeping ---------------------------------------------------------------------
# month_num is a dense rank of the date, so consecutive months differ by 1. Anchor the windows
# on the first month actually present in `dat` instead of assuming month_num starts at 1.
first_month = int(dat["month_num"].min())
last_month = int(dat["month_num"].max())
last_start_month = last_month - first_month + 1 - rolling_window   # number of windows; the last one tests on last_month

output_columns = ["PERMNO", "yr", "month", "predicted_adj_ret", "actual_adj_ret", "RET"]
monthly_predictions = []                                    # one frame per test month; concatenated once after the loop
chunk_months = 50                                           # checkpoint frequency, in test months

start_time = datetime.now()
print('start_time  =', start_time)
for t in range(0, last_start_month):                        # t is the index of the rolling window
#for t in range(1,2):
  print('t =', t, 'time for loop = ',   datetime.now() -  start_time)

  # Window t trains on exactly `rolling_window` months and tests on the following month
  train_month_start = first_month + t
  train_month_end = train_month_start + rolling_window - 1
  test_month = train_month_end + 1

  # Training and test data (Series.between is inclusive at both ends)
  train_dat = dat[dat['month_num'].between(train_month_start, train_month_end)]
  test_dat = dat[dat['month_num'] == test_month].copy()
  train_X = train_dat[features]
  test_X = test_dat[features]
  train_y = train_dat[y_column_name]
  test_y = test_dat[y_column_name]

  # Fit on the window and predict the test month; the R2 score is returned but not used here
  prediction, r2_score = random_forest_rolling(train_X, train_y, test_X, test_y)

  # Attach the predictions to the test month and keep only the columns needed downstream
  test_dat["predicted_adj_ret"] = prediction
  test_dat = test_dat.rename(columns={"adj_ret": "actual_adj_ret"})[output_columns]
  monthly_predictions.append(test_dat)

  #______________________________________________________________________________________________________________________
  # Checkpoint: every `chunk_months` windows, save the predictions of the previous `chunk_months`
  # test months (windows t-50 .. t-1). Useful if a full run takes too long and gets interrupted.
  # Slicing the list of monthly frames selects whole months; nothing is written at t = 0
  # because there are no earlier windows yet.
  if t > 0 and t % chunk_months == 0:
    chunk_df = pd.concat(monthly_predictions[t - chunk_months:t], axis=0)
    chunk_df.to_csv(f'/content/drive/MyDrive/MAF data/rolling_rf_pred_returns_{t}.csv', index=False)
  #______________________________________________________________________________________________________________________

# Consolidate all test months at once (an empty frame with the right columns if no window fits)
if monthly_predictions:
  predicted_ret_df = pd.concat(monthly_predictions, axis=0)
else:
  predicted_ret_df = pd.DataFrame(columns=output_columns)

# Full output; the file name records the hyperparameters of this run.
# NOTE: the row index is written too (no index=False), so it comes back as an extra first column when read.
predicted_ret_df.to_csv(f"/content/drive/MyDrive/MAF data/rolling_rf_pred_returns_{trials}__{rolling_window}_{max_tree_depth}.csv")

print('over')

In [None]:
# Display the consolidated out-of-sample predictions produced by the rolling-window cell
predicted_ret_df

In [None]:
# Reload predictions saved by an earlier run of the rolling-window cell, so the cells below
# can be run without refitting the forests.
# These three values are part of the file name and must match the run that wrote the file.
rolling_window = 60
trials = 100                                                # number of trees in each forest
max_tree_depth = 3
predicted_ret_df = pd.read_csv(f"/content/drive/MyDrive/MAF data/rolling_rf_pred_returns_{trials}__{rolling_window}_{max_tree_depth}.csv")

# The rolling-window cell writes the file with its row index, which read_csv returns as an
# unnamed first column ('Unnamed: 0'). Drop it so it does not leak into later tables and
# exports; errors="ignore" keeps this working for files written without the index.
predicted_ret_df = predicted_ret_df.drop(columns="Unnamed: 0", errors="ignore")


In [None]:
# Decile ranks
# Several stocks can share the same predicted return in a month. Cutting the raw predictions
# into deciles would then give portfolios with different numbers of stocks, so the cut is
# applied to a tie-free ordering instead.
month_keys = ['yr', 'month']

def _to_decile(ranks):
  """Split one month's ranks into 10 equal-sized bins labelled 0 (lowest) to 9 (highest)."""
  return pd.qcut(ranks, 10, labels=False)

# Step 1: order the predictions within each (yr, month). With method='first', tied predictions
# receive distinct ranks in the order they appear in the frame (the earlier row gets the lower rank).
predicted_ret_df['rank_order'] = (
    predicted_ret_df.groupby(month_keys)['predicted_adj_ret'].rank(method='first')
)

# Step 2: cut the within-month ranks into deciles
predicted_ret_df['rank'] = (
    predicted_ret_df.groupby(month_keys)['rank_order'].transform(_to_decile)
)

# Replace the index with a fresh 0..n-1 range
predicted_ret_df = predicted_ret_df.reset_index(drop=True)

## Print Output : Predicted_Ret_Df
print("****************************************************************************")
print("DataFrame with Adjusted Predicted Returns, Deciles(rank) ")
print("****************************************************************************")
predicted_ret_df

In [None]:
# Monthly mean portfolio returns
meanret = predicted_ret_df.groupby(['yr','month', 'rank'])['RET'].mean().to_frame()      # average RET of each decile in each month
meanret = meanret.unstack(level = -1).copy()                                             # one row per (yr, month), one column per decile
meanret[('RET', 'diff')] = meanret[('RET', 9)] -  meanret[('RET', 0)]                    # long-short return: top decile (9) minus bottom decile (0)

nmon = len(meanret)                                                                      # number of months in the wide table
meanret = meanret.stack(level = -1,future_stack= True).copy()                            # back to long form, indexed by (yr, month, rank)

# Time-series statistics of each portfolio's monthly return
ret_by_rank = meanret.groupby('rank')['RET']
global_mean = ret_by_rank.agg(["mean", "std"])                                           # mean and sample standard deviation (pandas std uses ddof=1)

# t-statistic of the mean: mean / (std / sqrt(n)). Because std is the sample standard deviation
# (ddof=1), the matching factor is sqrt(n), not sqrt(n - 1). n is counted per portfolio so that
# months with a missing return for that portfolio are not included.
global_mean['t-stat'] = np.sqrt(ret_by_rank.count()) * global_mean['mean'] / global_mean['std']
global_mean

In [None]:
# Save the ranked predictions to Drive as a zip-compressed CSV
predicted_ret_df.to_csv(
    "/content/drive/MyDrive/MAF data/dummy.csv.zip",
    compression="zip",
)

In [None]:
# Display the long-form table of monthly mean returns per decile, including the 'diff' (9 minus 0) rows
meanret