**Lecture: XGBoost**

In [None]:
# Mount Google Drive so that the data files under /content/drive are reachable from this notebook
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
!pip install xgbtune

In [None]:
# Import necessary libraries
import sys
import os
import platform
import random
from math import sqrt, floor, ceil
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
from multiprocessing import Process
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.tree import plot_tree
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from xgbtune import tune_xgb_model

**Data Description**

Input: /content/drive/MyDrive/MAF data/Features_seven_signals.csv.zip
Created by: Lec5_Create Standardized Features.ipynb

**Standardized Features**

* marketcap: market cap

* investment: 12-month increase in total assets

* accruals: ib - oancf

* b2m: ceq/marketcap

* ret_2_12: momentum (stock returns from month t-12 to t-2)

* CashFlow2TA: oancf / total assets (AT)

* new_issue: stocks issued over the previous 12 months










In [None]:
# Load the standardized-signal panel from the mounted Drive and display it
features_path = "/content/drive/MyDrive/MAF data/Features_seven_signals.csv.zip"
dat = pd.read_csv(features_path)
dat


In [None]:
# Number of distinct PERMNO identifiers in the panel
n_permno = dat["PERMNO"].nunique()
n_permno

In [None]:
# Data Pre-processing

# Date-time Manipulations
dat["date"] = pd.to_datetime(dat["date"])                 # Parse "date" so the .dt accessors and date arithmetic work
dat["yr"] = dat["date"].dt.year                           # Calendar year of the observation
dat["month"] = dat["date"].dt.month                       # Calendar month of the observation
dat = dat.sort_values(by="date")                          # Chronological order (returns a new frame instead of mutating in place)
# Dense rank of the date: 1 for the earliest distinct date, 2 for the next, ...
# NOTE(review): this equals a month counter only if every month has exactly one distinct date - confirm against the input file.
dat["month_num"] = dat["date"].rank(method="dense")

# Calculating Mean & Adjusted Returns
# Cross-sectional mean of RET per date, computed with the vectorised group mean rather than
# groupby.apply with a Python lambda (slow, and deprecated behaviour on recent pandas).
grouped = dat.groupby("date")["RET"].mean().to_frame("mean")
dat["mean_ret"] = dat["date"].map(grouped["mean"])        # Broadcast the per-date mean back to every row
dat["adj_ret"] = dat["RET"] - dat["mean_ret"]             # Target: return in excess of the same-date average return

# Reordering Columns (also drops every column that is not listed)
dat = dat[['date','PERMNO', 'marketcap_pct_rank','new_issue_pct_rank',
           'investment_pct_rank', 'accruals_pct_rank', 'b2m_pct_rank',
           'ret_2_12_pct_rank','CashFlow2TA_pct_rank', 'RET', 'mean_ret', 'adj_ret', 'yr',
           'month', 'month_num']]

# Printing Output
print("***********************************************************")
print("Pre-processed Dataframe containing all signals")
print("***********************************************************")
dat

In [None]:
# Separate the columns of `dat` into identifiers/targets and candidate input signals
y_column_name = 'adj_ret'                                                                             # Target variable: adjusted return
row_key_column_names = ['yyyymm', 'PERMNO', 'month', 'yr', 'month_num', 'RET', 'mean_ret','date']     # Identifier / return columns that must not be used as signals
non_feature_columns = set(row_key_column_names) | {y_column_name}                                     # Everything that is excluded from the signal list
feature_column_names = [col for col in dat.columns.values if col not in non_feature_columns]          # Remaining columns, in their original order
banner = "************************************************************************************************************"
print(banner)
print("list of Input signals/features")
print(feature_column_names)
print(banner)

See the [XGBoost parameter documentation](https://xgboost.readthedocs.io/en/stable/parameter.html) for additional details.

In [None]:
## Defining XGBoost Regressor function with output features. The next block of codes call this function

def xgboost_rolling(train_X, train_y, test_X, test_y, params=None) :
  """Fit an XGBoost regressor on one training window and predict the test set.

  Parameters
  ----------
  train_X, train_y : features and target used to fit the model.
  test_X, test_y   : features to predict and the observed target used for scoring.
  params           : optional dict of XGBRegressor keyword arguments. Entries override
                     the defaults below key by key; None (the default) keeps the
                     original fixed configuration.

  Returns
  -------
  (prediction, R2_score, regressor_main) : the predictions for test_X, the value of
  regressor_main.score(test_X, test_y), and the fitted regressor itself.
  """
  # Optional hyper-parameter search with xgbtune (left disabled, as in the original):
  #   params = {'eval_metric': 'rmse'}
  #   params, round_count = tune_xgb_model(params, train_X, train_y)
  #   print(params)
  #   print(round_count)
  default_params = {'eval_metric': 'rmse', 'max_depth': 4, 'min_child_weight': 1, 'gamma': 0.2, 'subsample': 1.0, 'colsample_bytree': 0.8, 'alpha': 0, 'lambda': 1, 'learning_rate' : 0.3, 'seed': 0}
  # Build a fresh dict on every call so neither the defaults nor the caller's dict is mutated
  model_params = dict(default_params) if params is None else {**default_params, **params}

  regressor_main = xgb.XGBRegressor(**model_params)

  regressor_main.fit(train_X, train_y)
  prediction = regressor_main.predict(test_X)            # Predicted target for every test row
  R2_score = regressor_main.score(test_X, test_y)        # Score of the predictions against the observed test target
  return prediction, R2_score, regressor_main


In [None]:
## XGBoost: rolling-window training with one-month-ahead prediction

rolling_window = 60                                                      # Offset between the first and the last training month
start_month_limit = int(dat["month_num"].max()) - rolling_window         # Upper bound (exclusive) for the window start
y_column_name = 'adj_ret'                                                # Adjusted return is the target variable
reg_factors = feature_column_names                                       # Input signals
monthly_predictions = []                                                 # One frame per test month; concatenated once after the loop

for t in range(1,start_month_limit):                                     # Iterating over the rolling windows
  if t % 10 == 0:                                                        # Progress indicator every 10 windows
    print(t)
  # Window bounds. Both ends are inclusive, so each fit uses month_num t .. t + rolling_window
  # (rolling_window + 1 distinct month numbers).
  train_month_start = t
  train_month_end = train_month_start + rolling_window

  # Extracting Training and Test Data
  train_dat = dat[dat['month_num'].between(train_month_start, train_month_end)]   # Rows inside the training window
  test_dat = dat[dat['month_num'] == train_month_end + 1].copy()                  # Rows of the month right after the window
  train_X = train_dat[reg_factors]                                                # Training signals
  test_X = test_dat[reg_factors]                                                  # Test signals
  train_y = train_dat[y_column_name]                                              # Training target
  test_y = test_dat[y_column_name]                                                # Test target
  output_df = xgboost_rolling(train_X, train_y, test_X, test_y)                   # (prediction, R2 score, fitted model) for this window

  ## Predicted Returns
  test_dat["predicted_adj_ret"] = output_df[0]                                    # Predictions are in the same row order as test_dat
  test_dat = test_dat.rename(columns = {"adj_ret" : "actual_adj_ret"})
  monthly_predictions.append(
      test_dat[["PERMNO","yr","month","predicted_adj_ret", "actual_adj_ret", "RET"]]   # Keeping only relevant columns
  )

# Single concatenation instead of growing the frame inside the loop (which re-copies all rows every iteration)
predicted_ret_df = pd.concat(monthly_predictions, axis =0) if monthly_predictions else pd.DataFrame()

# Output saved to "rolling_xgb_pred_returns.csv"; the row index is written as the first, unnamed column
predicted_ret_df.to_csv("/content/drive/MyDrive/MAF data/rolling_xgb_pred_returns.csv")

print("****************************************************************************")
print("Dataframe with predicted returns")
print("****************************************************************************")
predicted_ret_df


In [None]:
# @title Plot Feature Importance
# The model inspected here is the third element of `output_df`, i.e. the regressor
# returned by the most recent call to xgboost_rolling.
model = output_df[2]

# Importance scores and the matching signal names, ordered from least to most important
importances = model.feature_importances_
ascending_order = np.argsort(importances)
ordered_names = train_X.columns[ascending_order]
ordered_importances = importances[ascending_order]

# Horizontal bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(ordered_names, ordered_importances)
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature Names')
ax.set_title('XGBoost Model Signal Importance ')
plt.show()

In [None]:
# @title Extracting Tree Details
# Extract tree details as a DataFrame. Column guide:
#
#   "Tree"    : index of the tree in the ensemble. XGBoost is an ensemble of multiple trees,
#               so this indicates which tree the node belongs to.
#   "Node"    : integer id of the node within its tree; 0 is the root node.
#   "ID"      : unique node id of the form "<tree>-<node>", e.g. "0-1", "0-2" for the children of the first root.
#   "Feature" : the feature used for splitting at this node.
#   "Split"   : the threshold value used for the split.
#   "Yes"     : ID of the child node taken when the split condition (feature < split threshold) is true.
#   "No"      : ID of the child node taken when the split condition is false (feature >= split threshold).
#   "Missing" : ID of the child node taken when the feature value is missing (NaN).
#   "Gain"    : improvement in the objective function (loss reduction) achieved by splitting at this node.
#               For leaf rows this column holds the leaf's prediction value instead.
#   "Cover"   : sum of the second-order gradients (hessians) of the training rows that reach this node;
#               with squared-error loss and unit weights this equals the number of rows.

tree_df = model.get_booster().trees_to_dataframe()
# "Category" is not produced by every xgboost release; drop it only when present instead of failing with KeyError
tree_df = tree_df.drop(columns="Category", errors="ignore")
tree_df[tree_df["Tree"]==0] # we can change the value based on which tree we want to see

In [None]:
# @title Plotting A Sample XGBoost Tree
from xgboost import to_graphviz
import graphviz
from IPython.display import Image

# Build the graphviz object for one tree of the ensemble, laid out left-to-right
tree_index = 0                                                  # index of the tree that we want to visualize
graph = to_graphviz(model, num_trees=tree_index, rankdir='LR')

# Render to a PNG in the working directory; render() returns the path of the written file
file_path = graph.render('xgboost_tree', format='png')

# Show the rendered image as the cell output
Image(filename=file_path)

In [None]:
# Reload the predictions previously saved to "rolling_xgb_pred_returns.csv"
predicted_ret_df = pd.read_csv("/content/drive/MyDrive/MAF data/rolling_xgb_pred_returns.csv")
# If the file was written together with its row index, that index comes back as an "Unnamed: 0" column;
# drop it when present so it does not show up in later displays and counts
predicted_ret_df = predicted_ret_df.drop(columns="Unnamed: 0", errors="ignore")


In [None]:
# Decile ranks
# Several stocks can share the same predicted return, which would make decile sizes uneven
# if the predictions were binned directly. The ranks are therefore made unique first.
month_keys = ['yr', 'month']

# Step 1: within every (yr, month) group give each stock a unique rank; method='first'
#         breaks ties by order of appearance (the row that appears first gets the lower rank)
predicted_ret_df['rank_order'] = (
    predicted_ret_df.groupby(month_keys)['predicted_adj_ret'].rank(method='first')
)

# Step 2: cut the unique ranks of every group into ten equal-sized bins labelled 0..9
def _to_deciles(rank_values):
  return pd.qcut(rank_values, 10, labels=False)

predicted_ret_df['rank'] = predicted_ret_df.groupby(month_keys)['rank_order'].transform(_to_deciles)
predicted_ret_df = predicted_ret_df.reset_index(drop=True)       # Resetting Index

## Print Output : Predicted_Ret_Df
banner = "****************************************************************************"
print(banner)
print("DataFrame with Adjusted Predicted Returns, Deciles(rank) ")
print(banner)
predicted_ret_df

In [None]:
# Decile formation & Predicted Returns Calculation
# Binning the raw predictions with qcut(..., duplicates='drop') merges bins whenever many stocks share
# the same predicted return, so a month can end up with fewer than ten labels and the top/bottom
# labels (9 and 0) used for the long-short portfolio are no longer guaranteed to be true deciles.
# The predictions are therefore ranked first (ties broken by order of appearance) and the unique
# ranks are cut into ten equal-sized bins labelled 0..9.
predicted_ret_df['rank'] = (
    predicted_ret_df.groupby(['yr','month'])['predicted_adj_ret']
    .transform(lambda x: pd.qcut(x.rank(method='first'), 10, labels=False))
)
predicted_ret_df = predicted_ret_df.reset_index(drop = True)     # Resetting Index

## Print Output : Predicted_Ret_Df
print("****************************************************************************")
print("DataFrame with Adjusted Predicted Returns, Deciles(rank) ")
print("****************************************************************************")
predicted_ret_df

In [None]:
# Non-null observation count of every column for each (yr, month) group
by_month = predicted_ret_df.groupby(by=['yr', 'month'])
count = by_month.count()
count

In [None]:
# Monthly Mean Portfolio Returns
meanret = predicted_ret_df.groupby(['yr','month', 'rank'])['RET'].mean().to_frame()      # Average return of each decile in each month
meanret = meanret.unstack(level = -1).copy()                                             # One row per (yr, month), one column per decile
meanret[('RET', 'diff')] = meanret[('RET', 9)] -  meanret[('RET', 0)]                    # Long-short return: decile 9 average minus decile 0 average

nmon = len(meanret)                                                                      # Number of months in the test period
meanret = meanret.stack(level = -1,future_stack= True).copy()                            # Back to one row per (yr, month, rank)

# Overall Portfolio Returns Statistics
# Time-series mean, standard deviation and number of non-missing months for every decile and for 'diff'
portfolio_stats = meanret.groupby('rank')['RET'].agg(["mean", "std", "count"])
global_mean = portfolio_stats[["mean", "std"]].copy()
# t-statistic of the mean: mean / (std / sqrt(n)). pandas' std is the sample standard deviation (ddof=1),
# so the matching scale factor is sqrt(n), with n the number of months actually observed for that portfolio.
global_mean['t-stat'] = np.sqrt(portfolio_stats["count"]) * portfolio_stats["mean"] / portfolio_stats["std"]
global_mean