In [None]:
%load_ext watermark
%watermark

%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

import pickle

import sys
sys.path.append('../src/ch_08')

import code_ch_08 as f_ch8

In [None]:
# Load the two pickled inputs used throughout the notebook.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('../objects/X_dataset.pkl', mode='rb') as f:
    X = pickle.load(f)

with open('../objects/stacked_data.pkl', mode='rb') as f:
    stacked_data = pickle.load(f)

X

In [None]:
# Read the CSV and flatten every cell into one plain Python list
# (presumably the feature names -- confirm against the file).
path = '../raw_data/factor_char_list.csv'
features = pd.read_csv(path)
features_list = features.to_numpy().ravel().tolist()
features_list

In [None]:
# Missing values in X: NA count per column, largest first
nas = X.isna().sum().sort_values(ascending=False)
nas

In [None]:
# Count the NAs in every column of X, separately for each year.
# X does not have 'year', so we use 'year' from stacked_data (aligned on the index).
# NOTE: this adds a 'year' column to X itself; any later copy of X carries this
# helper column unless it is dropped explicitly.
X['year'] = stacked_data['year']
nas_year = X.groupby('year').apply(lambda x: x.isna().sum())
nas_year

In [None]:
# Count the NAs in every column of X, separately for each stock ticker.
# NOTE: this adds a 'ticker' column (taken from stacked_data['stock_ticker']) to X
# itself; any later copy of X carries this helper column unless it is dropped explicitly.
X['ticker'] = stacked_data['stock_ticker']
nas_stock = X.groupby('ticker').apply(lambda x: x.isna().sum())
nas_stock

In [None]:
# For each column, count the stocks (rows of nas_stock) that have at least one NA
nas_stock_count = nas_stock.gt(0).sum().sort_values(ascending=False)
nas_stock_count

In [None]:
# Number of columns for which at least one stock has missing values
nas_feature_stock_count = nas_stock_count.gt(0).sum()
nas_feature_stock_count

In [None]:
# For every (year, ticker) group, flag each column that contains at least one NA
# (the result holds booleans, not counts)
nas_feature_year_stock = X.groupby(['year', 'ticker']).apply(lambda group: group.isna().any())

# Show the flag table
print(nas_feature_year_stock)


In [None]:
# Summing the boolean flags over the 'year' index level gives, for each column,
# how many (year, ticker) groups of that year contain missing values
nas_feature_year = (
    nas_feature_year_stock
    .groupby(level='year')
    .sum()
)

# Show the per-year totals
print(nas_feature_year)

#### Remove NAs from X_dataset and run feature importance scripts

In [None]:
# Work on an independent copy so that cleaning does not modify X
X_clean = X.copy(deep=True)
X_clean.head()

In [None]:
# Rebuild X_clean from X in a single expression so this cell can be re-run safely
# (the previous in-place drop raised KeyError on a second run).
# Rows are filtered while the helper columns are still present, so the surviving
# rows are exactly those of a plain dropna() on X. The helper columns added to X
# for the NA diagnostics ('year', 'ticker') are then removed so that they are not
# treated as features by the PCA / feature-importance steps.
print("Record count BEFORE dropping NaN records: ", len(X))
X_clean = X.dropna().drop(columns=['year', 'ticker'], errors='ignore')
print("Record count AFTER dropping NaN records: ", len(X_clean))

In [None]:
# SNIPPET 8.5 COMPUTATION OF ORTHOGONAL FEATURES (Modified for Variance and Loadings)
def get_eVec(dot, varThres):
    """
    Eigen-decompose a symmetric matrix and keep its leading components.

    Arguments:
    - dot: square, symmetric DataFrame (e.g. Z'Z of standardized features). Its
      index labels become the row labels of the returned eigenvectors.
    - varThres: cumulative share of the (positive) eigenvalue total that the
      retained components must reach, e.g. 0.95.

    Returns:
    - eVal: Series of retained eigenvalues in descending order, labelled
      "PC_1", "PC_2", ...
    - eVec: DataFrame of the matching eigenvectors (rows = dot.index,
      columns = eVal.index).
    - cumVar: Series with the cumulative eigenvalue share of the retained
      components.

    Raises:
    - ValueError: if the matrix has no strictly positive eigenvalue.
    """
    # Decompose on a plain ndarray so the results are ndarrays regardless of
    # how the installed numpy/pandas versions wrap DataFrame inputs
    eVal, eVec = np.linalg.eigh(np.asarray(dot))

    # Sort eigenvalues and eigenvectors in descending order
    idx = eVal.argsort()[::-1]
    eVal, eVec = eVal[idx], eVec[:, idx]

    # Keep only positive eigenvalues: zero/negative values (numerical noise or
    # an indefinite input) would make the cumulative share non-monotonic,
    # which searchsorted below cannot handle
    positive = eVal > 0
    if not positive.any():
        raise ValueError("get_eVec: matrix has no positive eigenvalues")
    eVal, eVec = eVal[positive], eVec[:, positive]

    eVal = pd.Series(eVal, index=["PC_" + str(i + 1) for i in range(eVal.shape[0])])
    eVec = pd.DataFrame(eVec, index=dot.index, columns=eVal.index)

    # Compute cumulative share of the eigenvalue total
    cumVar = eVal.cumsum() / eVal.sum()

    # Position of the first component at which the cumulative share reaches varThres
    dim = cumVar.values.searchsorted(varThres)

    # Keep components up to and including that position (slicing clamps at the end)
    eVal, eVec = eVal.iloc[: dim + 1], eVec.iloc[:, : dim + 1]

    # Return eigenvalues, eigenvectors (loadings) and their cumulative share
    return eVal, eVec, cumVar.iloc[: dim + 1]


# Function to standardize features and compute orthogonal features (PCA)
def orthoFeats(dfX, varThres=0.95):
    """
    Standardize the columns of dfX and project them onto principal components.

    Arguments:
    - dfX: DataFrame of numeric features (rows = observations, columns = features).
    - varThres: cumulative eigenvalue share the retained components must reach.

    Returns:
    - dfP: ndarray with the standardized data projected onto the retained components.
    - eVal: Series of retained eigenvalues (of Z'Z, see below).
    - eVec: DataFrame of loadings (rows = features, columns = components).
    - cumVar: Series with the cumulative eigenvalue share of the retained components.
    """
    # Standardize the feature matrix. A constant column has zero standard
    # deviation (NaN when there is a single row); dividing by it would put
    # NaN/inf into the decomposition, so such columns are scaled by 1 instead
    # and end up as all-zero columns after centering.
    std = dfX.std()
    std = std.where(std > 0, 1.0)
    dfZ = dfX.sub(dfX.mean(), axis=1).div(std, axis=1)

    # Z'Z: with the sample standard deviation used above this is (n - 1) times
    # the correlation matrix, so eigenvalue *ratios* are unaffected by the scale
    dot = pd.DataFrame(np.dot(dfZ.T, dfZ), index=dfX.columns, columns=dfX.columns)

    # Get eigenvalues, eigenvectors (loadings) and cumulative share
    eVal, eVec, cumVar = get_eVec(dot, varThres)

    # Transform the standardized features into the retained principal components
    dfP = np.dot(dfZ, eVec)

    return dfP, eVal, eVec, cumVar


# Run the PCA on the cleaned feature matrix with a 0.95 cumulative-share threshold
X_pca, eigenvalues, loadings, cumulative_variance = orthoFeats(X_clean, varThres=0.95)

# Wrap the projected array in a DataFrame: rows follow X_clean, columns are pca_0, pca_1, ...
X_pca = pd.DataFrame(
    X_pca,
    index=X_clean.index,
    columns=[f"pca_{i}" for i in range(X_pca.shape[1])],
)

# Report eigenvalues, cumulative share and loadings, each under its own heading
for _heading, _table in (
    ("Variance Explained (Eigenvalues):", eigenvalues),
    ("\nCumulative Variance Explained:", cumulative_variance),
    ("\nLoadings (Eigenvectors):", loadings),
):
    print(_heading)
    print(_table)

In [None]:
# At this point X_pca holds the PCA-transformed features, `eigenvalues` the retained
# eigenvalues and `loadings` the weight of each original feature in each component.

# Share of each retained component, relative to the retained components only (sums to 1)
variance_explained = eigenvalues.div(eigenvalues.sum())


# Function to get top important features based on variance explained and loadings
def get_top_features(variance_explained, loadings, top_n=20):
    """
    Score every feature by its variance-weighted absolute loadings and return the best ones.

    Arguments:
    - variance_explained: Series indexed by principal component, giving the weight of each component.
    - loadings: DataFrame of eigenvectors (rows = features, columns = principal components).
    - top_n: how many of the highest-scoring features to return.

    Returns:
    - DataFrame with a single 'total_importance' column, sorted from highest to
      lowest score and truncated to top_n rows.
    """
    # |loading| scaled column-wise by the weight of the matching principal component
    weighted = loadings.abs().mul(variance_explained, axis=1)

    # One score per feature: the row total across all components
    scored = weighted.assign(total_importance=weighted.sum(axis=1))

    # Highest score first, limited to the requested number of rows
    ranking = scored[['total_importance']].sort_values(by='total_importance', ascending=False)
    return ranking.head(top_n)

# Rank the features using the retained components and keep the 20 highest scores
top_20_features = get_top_features(variance_explained, loadings, top_n=20)

# Show the ranking under its heading
print("Top 20 Important Features:", top_20_features, sep="\n")

In [None]:
# Compute the orthogonal features with the chapter-8 module implementation and
# wrap the result in a DataFrame that shares X_clean's index.
# NOTE: this rebinds X_pca, replacing any value assigned to it earlier.
X_pca = pd.DataFrame(f_ch8.orthoFeats(X_clean), index=X_clean.index)
# Label the columns pca_0, pca_1, ... by position
X_pca.columns = [f"pca_{i}" for i in range(X_pca.shape[1])]
X_pca

In [None]:
# # Calculate explained variance (if applicable)
# from sklearn.decomposition import PCA
# pca = PCA()
# pca.fit(X_clean)
# explained_variance = pca.explained_variance_ratio_

# # Display the explained variance by each principal component
# print("Explained Variance by Principal Component:")
# for i, var in enumerate(explained_variance, 1):
#     print(f"PC{i}: {var:.4f}")

# # Loadings: how much each feature contributes to each principal component
# pca_loadings = pd.DataFrame(pca.components_.T, index=X_clean.columns, columns=[f'PC{i+1}' for i in range(len(X_clean.columns))])

# print("PCA Loadings (Feature Contributions to Components):")
# print(pca_loadings)

# # Visualize the loadings with a heatmap
# import seaborn as sns
# import matplotlib.pyplot as plt

# plt.figure(figsize=(10, 6))
# sns.heatmap(pca_loadings, annot=True, cmap="coolwarm", center=0)
# plt.title("PCA Loadings Heatmap")
# plt.show()

In [None]:
# Index labels present in X but missing from X_clean, i.e. the rows removed from X_clean
dropped_indices = X.index.difference(X_clean.index)
# Assemble the three stacked_data columns side by side under the names t1 / bin / w
cont = pd.concat(
    [stacked_data['datetime'], stacked_data['target'], stacked_data['weight_attr']],
    axis=1,
    keys=['t1', 'bin', 'w'],
)
cont

In [None]:
# Remove from cont the rows whose labels were dropped from X_clean, so the two
# frames stay row-aligned. A boolean mask is used instead of DataFrame.drop so
# the cell can be re-run: labels that are already gone are simply not matched
# rather than raising KeyError.
cont = cont[~cont.index.isin(dropped_indices)]
# cont.index is later assigned onto X_clean positionally, so the lengths must agree
assert len(cont) == len(X_clean), (
    f"cont has {len(cont)} rows but X_clean has {len(X_clean)}; the frames are misaligned"
)
cont

In [None]:
# t1 = pd.Series()
# Shift the 't1' values up by one row (each row receives the next row's value),
# drop the trailing NaN created by the shift, and parse the values as datetimes.
tmp = cont['t1'].shift(-1).dropna()
tmp = pd.to_datetime(tmp)
# last date
# Append one synthetic entry: the last shifted value plus 5 days, then rolled
# forward to a business month-end.
result = tmp.iloc[-1] + pd.DateOffset(days=5) + pd.tseries.offsets.BMonthEnd(1)
tmp = pd.concat([tmp, pd.Series([result])], ignore_index=True)
# NOTE(review): `tmp` and `result` are not read again in this cell -- confirm
# whether they are still needed (the assignment below is commented out).
# t1 = tmp
# index as first business day of the following month
# t1.index = pd.to_datetime(datetime) + pd.DateOffset(days=5) - pd.tseries.offsets.BMonthBegin(1)
# Re-index cont by its own 't1' timestamps moved back 40 days and then rolled
# forward to a business month-end; the 't1' column itself is left unchanged.
cont.index = pd.to_datetime(cont['t1']) - pd.DateOffset(days=40) + pd.tseries.offsets.BMonthEnd(1)
cont

In [None]:
# Attach cont's index to X_clean as a 'datetime' column (positional, so lengths must match)
X_clean = X_clean.assign(datetime=cont.index)

In [None]:
# Promote the 'datetime' column to be X_clean's index
X_clean = X_clean.set_index('datetime')

In [None]:
# Rescale the weights so that they sum to the number of rows (mean weight of 1)
cont['w'] = cont['w'] * (cont.shape[0] / cont['w'].sum())
cont

In [None]:
# Number of rows in cont that contain at least one missing value
cont.isna().any(axis='columns').sum()

In [None]:
# Libraries
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Bagging ensemble of 1,000 base estimators, each a one-tree random forest
# without bootstrapping. max_samples=0.94 is the original author's
# "average uniqueness" value.
clf2 = BaggingClassifier(
    estimator=RandomForestClassifier(
        n_estimators=1,  # a single tree per base estimator
        criterion="entropy",  # information gain
        bootstrap=False,  # sampling is left to the bagging wrapper
        class_weight="balanced_subsample",  # prevent minority class from being ignored
    ),
    n_estimators=1_000,  # number of bagged base estimators
    max_samples=0.94,  # fraction of samples drawn per base estimator
    max_features=1.0,  # all features for bagging
)

In [None]:
methods = ['MDI', 'MDA', 'SFI']

n_estimators = 1000  # Number of trees in the random forest
cv = 10  # Number of cross-validation folds
max_samples = 1.0  # Use the entire dataset for each tree
numThreads = 1  # Adjust based on your available computational resources
pctEmbargo = 0  # No embargo for simplicity

# Keep the output of every method; imp/oob/oos alone only hold the values of
# the last method once the loop has finished.
feat_importance_results = {}

for method in methods:
    print(f"Running feature importance for {method}...")
    imp, oob, oos = f_ch8.featImportance(pd.DataFrame(X_clean), cont, n_estimators=n_estimators, cv=cv,
                                    max_samples=max_samples, numThreads=numThreads, 
                                    pctEmbargo=pctEmbargo, method=method)
    feat_importance_results[method] = {'imp': imp, 'oob': oob, 'oos': oos}

    # Plot the feature importance using the provided function.
    # NOTE(review): tag and simNum are identical for all methods -- confirm that
    # plotFeatImportance uses `method` in the output file name, otherwise the
    # plots written to pathOut overwrite each other.
    f_ch8.plotFeatImportance(pathOut='./', imp=imp, oob=oob, oos=oos, method=method, tag='test', simNum=0)