In [2]:
import numpy as np
from numpy import arange
import pandas as pd

pd.set_option('display.max_columns',999)
pd.set_option('display.max_rows',999)
pd.set_option('float.format','{:f}'.format)
pd.options.display.float_format = '{:.4f}'.format

import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import skew
import math 
from scipy import stats

from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

from sklearn import metrics
# !pip install threadpoolctl==3.1.0
from sklearn.cluster import KMeans


from datetime import datetime, timedelta


In [3]:
# Shared legend styling consumed by visualization(): the legend box is anchored
# by its centre-left point (`legpos`) at axes coordinates `loc` = (1, 0.5), and
# its text uses the `size` font size.
legpos = 'center left'
size = 'medium'
loc=(1,0.5)
%matplotlib inline

sns.set()

def visualization(df, x, y, figsize=(12,3), hue=None, scatter=False, dist=False, cust_col='Set2', title='' ,xlabel='', ylabel='', rotation_angel=90):
    """Draw a seaborn scatter plot or line plot and show it.

    Parameters
    ----------
    df : DataFrame or None
        Passed to seaborn as ``data``; may be None when ``x``/``y`` are vectors.
    x, y : str or array-like
        Column names in ``df`` or the vectors to plot.
    figsize : tuple
        Figure size handed to ``plt.figure``.
    hue : str, array-like or None
        Optional grouping variable. When given, the ``cust_col`` palette is
        applied and a legend is placed using the module-level ``legpos``,
        ``loc`` and ``size`` settings.
    scatter : bool
        True draws ``sns.scatterplot``; otherwise ``sns.lineplot``.
    dist : bool
        Currently unused; kept so existing calls remain valid.
    cust_col : str
        Seaborn palette name, only used together with ``hue``.
    title, xlabel, ylabel : str
        Figure annotations.
    rotation_angel : int
        Rotation of the x tick labels in degrees.
    """
    # `is not None` instead of `!= None`: an array-like hue would turn the
    # comparison into an element-wise array whose truth value is ambiguous.
    has_hue = hue is not None

    plot_kwargs = {'x': x, 'y': y, 'data': df}
    if has_hue:
        # The palette is only meaningful with a hue variable. Passing it
        # without one makes seaborn warn, which previously had to be silenced
        # by installing a process-wide warnings filter on every call.
        plot_kwargs.update(hue=hue, palette=cust_col)

    draw = sns.scatterplot if scatter else sns.lineplot

    plt.figure(figsize=figsize)
    ax = draw(**plot_kwargs)

    plt.title(title)
    plt.xticks(rotation=rotation_angel)
    ax.set(xlabel=xlabel, ylabel=ylabel)
    if has_hue:
        plt.legend(loc=legpos,bbox_to_anchor=loc,fontsize=size)

    plt.show()
    
def visualization_3d(df, x, y, z,figsize=(8,8), title='' ,xlabel='', ylabel='', zlabel=''):
    """Show a 3-D scatter of the columns ``x``, ``y`` and ``z`` of ``df``.

    ``title`` and the three axis labels are applied to the 3-D axes before the
    figure is shown.
    """
    canvas = plt.figure(figsize=figsize)
    axes3d = canvas.add_subplot(projection='3d')

    xs, ys, zs = df[x], df[y], df[z]
    axes3d.scatter3D(xs, ys, zs, cmap='Greens')

    axes3d.set_title(title)
    axes3d.set(xlabel=xlabel, ylabel=ylabel,zlabel=zlabel)
    plt.show()
    
def display_features(df,feature='cap',scatter=False):
    """Display the index range of ``df`` and plot its per-row non-null count.

    ``feature`` is only used as a label in the displayed text and the plot
    title. ``scatter`` selects a scatter plot instead of the default line plot.
    """
    display(feature + ' start date: '+str(df.index.min()))
    display(feature + ' end date: '+str(df.index.max()))

    # Number of non-null cells in every row, as a one-column frame named 'cnt'.
    available = df.notnull().sum(axis=1).to_frame().rename(columns={0:'cnt'})

    plot_kwargs = dict(
        df=None,
        x=available.index,
        y=available.cnt,
        title='Number of available securities - '+feature,
        rotation_angel=0,
    )
    if scatter:
        plot_kwargs['scatter'] = True
    visualization(**plot_kwargs)
    

In [None]:
def read_mktcap(file_dir='./data/MARKET_CAP.pickle', date_col='date',col_name='ticker_id'):
    """Load a pickled wide table and normalise its axes.

    The index is converted to timezone-naive datetimes and named ``date_col``;
    the column labels are cast to ``str`` and the column axis is named
    ``col_name``.

    NOTE(review): ``pd.read_pickle`` executes arbitrary code embedded in the
    file - only load pickles from a trusted source.
    """
    frame = pd.DataFrame(pd.read_pickle(file_dir))

    naive_dates = pd.to_datetime(frame.index).tz_localize(None)
    frame.index = naive_dates.rename(date_col)
    frame.columns = frame.columns.astype(str).rename(col_name)
    return frame

def read_rtn(file_dir, date_col='date',col_name='ticker_id'):
    """Load a pickled wide table whose index carries a ``'start'`` level.

    The ``'start'`` index level is dropped, the remaining index is converted
    to timezone-naive datetimes named ``date_col``, and the column labels are
    cast to ``str`` on an axis named ``col_name``.

    NOTE(review): ``pd.read_pickle`` executes arbitrary code embedded in the
    file - only load pickles from a trusted source.
    """
    frame = pd.DataFrame(pd.read_pickle(file_dir))
    frame = frame.reset_index(level='start', drop=True)

    naive_dates = pd.to_datetime(frame.index).tz_localize(None)
    frame.index = naive_dates.rename(date_col)
    frame.columns = frame.columns.astype(str).rename(col_name)
    return frame

def read_profit(file_dir,col, date_col='date',idx_col='ticker_id',val='profit'):
    """Load a pickled table into long format and add calendar keys.

    After ``reset_index`` the columns are renamed to ``col`` (which must list
    one name per resulting column). ``date_col`` is truncated to the calendar
    day, ``idx_col`` is cast to ``str``, and two string keys are appended:
    ``'quarter'`` (``'<year>-Q<quarter>'``) and ``'month'``
    (``'<year>-<month>'``, month not zero-padded). ``val`` is not used here.

    NOTE(review): ``pd.read_pickle`` executes arbitrary code embedded in the
    file - only load pickles from a trusted source.
    """
    frame = pd.DataFrame(pd.read_pickle(file_dir)).reset_index()
    frame.columns = col

    frame[date_col] = pd.to_datetime(frame[date_col].dt.date)
    frame[idx_col] = frame[idx_col].astype(str)

    stamps = frame[date_col].dt
    year_txt = stamps.year.astype(str)
    frame['quarter'] = year_txt+'-Q'+stamps.quarter.astype(str)
    frame['month'] = year_txt+'-'+stamps.month.astype(str)

    return frame

def read_sector(file_dir, col, date_col='date',idx_col='ticker_id'):
    """Load a pickled mapping table into long format.

    After ``reset_index`` the columns are renamed to ``col`` and ``idx_col``
    is cast to ``str``. ``date_col`` is not used here.

    NOTE(review): ``pd.read_pickle`` executes arbitrary code embedded in the
    file - only load pickles from a trusted source.
    """
    table = pd.DataFrame(pd.read_pickle(file_dir)).reset_index()
    table.columns = col
    table[idx_col] = table[idx_col].astype(str)
    return table

def read_macro(file_dir, col, date_col='date'):
    """Load a pickled table into long format with day-truncated dates.

    After ``reset_index`` the columns are renamed to ``col`` and ``date_col``
    is truncated to the calendar day.

    NOTE(review): ``pd.read_pickle`` executes arbitrary code embedded in the
    file - only load pickles from a trusted source.
    """
    table = pd.DataFrame(pd.read_pickle(file_dir)).reset_index()
    table.columns = col
    calendar_days = table[date_col].dt.date
    table[date_col] = pd.to_datetime(calendar_days)
    return table

def missing_values(df):
    """Display the per-column null counts and return the columns that have any.

    The counts are shown as a single-row frame; the return value is the list
    of column labels whose null count is non-zero.
    """
    null_counts = df.isnull().sum()
    display(null_counts.to_frame().T)
    return [label for label, n_missing in null_counts.items() if n_missing != 0]


In [None]:
def universe_selection(df, low_thred=100, high_thred=900):
    """Flag, per row, the columns ranked ``low_thred`` to ``high_thred`` by value.

    Each row of ``df`` is sorted in descending order and the labels at rank
    positions ``[low_thred, high_thred)`` are marked with 1 in the result; all
    other cells stay NaN.

    Missing values are removed before ranking. Previously NaN entries were
    sorted to the end of the row and could be picked up by the slice whenever
    a row held fewer than ``high_thred`` valid values, which put names without
    any value into the universe.

    Returns
    -------
    DataFrame
        Same index and columns as ``df``, holding 1 for selected cells and NaN
        elsewhere.
    """
    rst = pd.DataFrame(index=df.index, columns=df.columns)

    for row, values in df.iterrows():
        ranked = values.dropna().sort_values(ascending=False)
        univ = ranked.iloc[low_thred:high_thred].index.to_list()
        if univ:
            rst.loc[row, univ] = 1
    return rst

def whether_in_universe(df):
    """Turn a membership frame into an integer 0/1 indicator frame.

    Missing values count as 0; every other non-zero value becomes 1. The input
    is not modified.
    """
    is_member = df.fillna(0).ne(0)
    return is_member.astype('int')


In [None]:
def get_macro_features(df, start_date, clusters=range(3,6)):
    """Append in-sample KMeans regime labels to a macro table.

    Parameters
    ----------
    df : DataFrame
        Macro series indexed so that ``df.loc[start_date:]`` selects the
        sample; gaps are forward-filled.
    start_date : label
        First index label to keep.
    clusters : iterable of int
        Cluster counts to fit; each one adds a ``'macro_group<k>'`` column.

    Returns
    -------
    DataFrame
        The forward-filled sample plus one label column per cluster count.

    Notes
    -----
    Every model is fitted on the same snapshot of the macro columns.
    Previously the labels of one fit were written into the frame that the next
    fit consumed, so later clusterings were contaminated by the arbitrary
    integer labels of earlier ones.
    """
    df = df.loc[start_date:].ffill()

    # Frozen feature matrix: label columns added below must never feed back
    # into subsequent fits.
    features = df.copy()

    ### insample train
    for cluster in clusters:
        kmeans = KMeans(n_clusters=cluster, random_state=0, n_init="auto").fit(features)
        df['macro_group'+str(cluster)] = kmeans.labels_

    return df

def cluster_viz(df, n_cluster=3, col1='NONFARMPAYROLL',col2='UnemploymentRate',n1=3,n2=4):
    """Fit KMeans on ``df`` and scatter two of its columns coloured by cluster.

    ``col1``/``col2`` select the plotted columns by name, while ``n1``/``n2``
    select the centroid coordinates by position.

    NOTE(review): nothing checks that positions ``n1``/``n2`` correspond to
    ``col1``/``col2`` - callers must keep them in sync or the centroid markers
    are drawn against the wrong features.
    """
    data = df.copy()
    km = KMeans(n_clusters=n_cluster, random_state=0, n_init="auto")
    km.fit(data)
    assignments = km.predict(data)

    plt.figure(figsize=(6, 4))
    plt.scatter(data.loc[:,col1], data.loc[:,col2], c=assignments, alpha=0.5, cmap='jet')

    # Centroid coordinates are picked by column position, not by name.
    centers = km.cluster_centers_
    plt.scatter(centers[:,n1], centers[:,n2], marker='D', s=50)

    plt.title(f'{n_cluster} clusters distribution')
    plt.xlabel(col1)
    plt.ylabel(col2)
    plt.show()
    
    
def elblow_check(df):
    """Plot KMeans inertia for k = 1..5 fitted on ``df`` (elbow diagnostic)."""
    ks = range(1, 6)
    inertias = [
        KMeans(n_clusters=k, random_state=0, n_init="auto").fit(df).inertia_
        for k in ks
    ]

    # Plot ks vs inertias
    plt.figure(figsize=(6, 3))
    plt.plot(ks, inertias, '-o')
    plt.title('Elbow method looks for the best number of clusters')
    plt.xlabel('number of clusters, k')
    plt.ylabel('inertia')
    plt.xticks(ks)
    plt.show()
    
    
def merge_macro_features(rtn, macro_df,date_col = 'date',idx_col = 'ticker_id'):
    """Join next-period returns with the macro row of the same calendar month.

    Parameters
    ----------
    rtn : DataFrame
        Wide returns, index named ``date_col`` (datetime), one column per
        security.
    macro_df : DataFrame
        Macro features, index named ``date_col`` (datetime).
    date_col, idx_col : str
        Names of the date and security levels of the returned MultiIndex.

    Returns
    -------
    DataFrame
        Indexed by ``(date_col, idx_col)`` with a ``'fwd_rtn'`` column
        (``rtn`` shifted up by one row) followed by the macro columns.

    Notes
    -----
    * ``date_col`` is honoured throughout; the macro date column used to be
      dropped under the hard-coded name ``'date'``.
    * The security column is named explicitly during the melt, so the result
      no longer depends on ``rtn.columns.name`` being equal to ``idx_col``.
    * The join key is ``'<year>-<month>'``; if ``macro_df`` holds several rows
      in one month, each return row is repeated once per macro row.
    """
    def _month_key(dates):
        return dates.dt.year.astype(str)+'-'+dates.dt.month.astype(str)

    fwd_rtn = (
        rtn.shift(-1)
           .reset_index()
           .melt(id_vars=date_col, var_name=idx_col, value_name='fwd_rtn')
    )
    fwd_rtn['month'] = _month_key(fwd_rtn[date_col])

    ##### merge monthly macro features
    macro_tmp = macro_df.reset_index()
    macro_tmp['month'] = _month_key(macro_tmp[date_col])
    macro_tmp = macro_tmp.drop(columns=date_col)

    rst = fwd_rtn.merge(macro_tmp, on='month', how='left').drop(columns='month')
    return rst.set_index([date_col,idx_col])

def get_profit_features(df, col='month',date_col='date',idx_col='ticker_id',val='profit', trend_lag=3):
    """Add a cross-sectional rank and a time-series trend flag for ``val``.

    Parameters
    ----------
    df : DataFrame
        Long table with at least ``col``, ``idx_col`` and ``val``.
    col : str
        Column parsed with ``pd.to_datetime`` to build ``date_col``.
    date_col, idx_col : str
        Date and security column names.
    val : str
        Value column that is ranked and trended.
    trend_lag : int
        Number of rows, within one security, to look back for the trend
        comparison (3 rows = one quarter on monthly data).

    Returns
    -------
    DataFrame
        Copy of ``df`` sorted by ``(idx_col, date_col)`` with a fresh
        RangeIndex and two extra columns: ``'rank'`` (dense rank of ``val``
        within each date, 1 = largest) and ``'profit_trend'`` (1 when ``val``
        exceeds its value ``trend_lag`` rows earlier for the same security,
        else 0 - including when there is no earlier value).

    Notes
    -----
    * The trend now uses ``val``; it was hard-coded to the ``'profit'``
      column, so any other ``val`` ranked one column and trended another.
    * The trend is computed with a grouped ``shift`` instead of a mutating
      ``groupby.apply``, whose row order and handling of the grouping column
      differ between pandas versions.
    * ``rank`` is cast to ``int``, so a missing ``val`` still raises.
    """
    df = df.copy()
    df[date_col] = pd.to_datetime(df[col])

    # Security-major, date-minor order: the grouped shift below relies on rows
    # of one security being in chronological order.
    df = df.sort_values([idx_col, date_col]).reset_index(drop=True)

    ##### cross-sectional: rank all stocks
    df['rank'] = df.groupby(date_col)[val].rank(method='dense', ascending=False).astype(int)

    ##### ts trend - change versus `trend_lag` rows earlier
    lagged = df.groupby(idx_col)[val].shift(trend_lag)
    df['profit_trend'] = (df[val] > lagged).astype(int)
    return df

def merge_monthly_features(index_df, macro_df, profit_df,date_col = 'date',idx_col = 'ticker_id'):
    """Attach monthly profit and macro features to a daily (date, security) index.

    Parameters
    ----------
    index_df : DataFrame
        Only its MultiIndex ``(date_col, idx_col)`` is used.
    macro_df : DataFrame
        Macro features, index named ``date_col`` (datetime).
    profit_df : DataFrame
        Must contain ``'month'`` (``'<year>-<month>'``), ``idx_col``,
        ``'profit'``, ``'rank'`` and ``'profit_trend'``.
    date_col, idx_col : str
        Names of the date and security keys.

    Returns
    -------
    DataFrame
        Indexed by ``(date_col, idx_col)`` in the order of ``index_df``, with
        the three profit columns followed by the macro columns.

    Notes
    -----
    ``date_col`` and ``idx_col`` are honoured everywhere; the profit merge and
    the macro date drop previously used the literals ``'ticker_id'`` and
    ``'date'`` and broke for any other names.
    """
    def _month_key(dates):
        return dates.dt.year.astype(str)+'-'+dates.dt.month.astype(str)

    base = index_df[[]].reset_index()
    base['month'] = _month_key(base[date_col])

    ##### merge monthly profit features
    profit_cols = ['month', idx_col, 'profit', 'rank', 'profit_trend']
    rst = pd.merge(base, profit_df[profit_cols], on=['month', idx_col], how='left')

    ##### merge monthly macro features
    macro_tmp = macro_df.reset_index()
    macro_tmp['month'] = _month_key(macro_tmp[date_col])
    macro_tmp = macro_tmp.drop(columns=date_col)
    rst = pd.merge(rst, macro_tmp, on=['month'], how='left').drop(columns='month')

    return rst.set_index([date_col,idx_col])

def get_daily_rawinput(date_col = 'date',idx_col = 'ticker_id'):
    """Assemble the long-format daily feature table from notebook-level frames.

    Reads the notebook variables ``rtn``, ``mktcap``, ``sector``,
    ``macro_features`` and ``profit_features``; they must be defined before
    this function is called.

    Returns
    -------
    DataFrame
        Indexed by ``(date_col, idx_col)`` with ``fwd_rtn`` (``rtn`` shifted up
        one row), ``daily_rtn``, ``mktcap_log``, ``mktcap``, the columns of
        ``sector`` and the monthly profit / macro features.

    Notes
    -----
    * ``date_col`` / ``idx_col`` are forwarded to ``merge_monthly_features``;
      they used to be dropped, so non-default names could not work.
    * The security column is named explicitly while melting, so the wide
      frames no longer need ``columns.name == idx_col``.
    """
    def _to_long(wide, value_name):
        # Wide (date x security) -> long frame indexed by (date, security).
        return (
            wide.reset_index()
                .melt(id_vars=date_col, var_name=idx_col, value_name=value_name)
                .set_index([date_col,idx_col])
        )

    features = _to_long(rtn.shift(-1), 'fwd_rtn')
    extras = (
        _to_long(rtn, 'daily_rtn'),
        _to_long(np.log(mktcap), 'mktcap_log'),
        _to_long(mktcap, 'mktcap'),
    )
    for extra in extras:
        features = features.merge(extra,left_index=True, right_index=True, how='left')

    features = pd.merge(features.reset_index(), sector, on=idx_col, how='left').set_index([date_col,idx_col])

    monthly_features = merge_monthly_features(
        index_df=features,
        macro_df=macro_features,
        profit_df=profit_features,
        date_col=date_col,
        idx_col=idx_col,
    )
    features = features.merge(monthly_features,left_index=True, right_index=True, how='left')

    return features


def fill_missing_values(df,col_list):
    """Fill gaps in ``col_list`` with the same-day sector average.

    ``df`` must be indexed by ``('date', 'ticker_id')`` and contain a
    ``'sector'`` column. For every column in ``col_list`` a missing value is
    replaced by the mean of that column over the same ``('date', 'sector')``
    group; it stays missing when the whole group is missing.

    Returns a frame indexed by ``('date', 'ticker_id')``, sorted by index, with
    the columns ``['sector'] + col_list``.
    """
    group_keys = ['date','sector']
    long_df = df.reset_index()

    ##### sector averages, suffixed so they can sit next to the raw columns
    sector_avg = long_df.groupby(group_keys)[col_list].mean().add_suffix('_sector')

    ##### merge sector averages back onto every row
    merged = long_df.merge(sector_avg.reset_index(), how='left',on=group_keys)

    ##### fill missing values
    for col in col_list:
        merged[col] = merged[col].fillna(merged[col+'_sector'])

    keep = ['date','ticker_id','sector']+col_list
    return merged[keep].set_index(['date','ticker_id']).sort_index()

def plot_correlation_chart(df, target_factor, angel=25, plot=True):
    """Correlate every column of ``df`` with ``target_factor``.

    When ``plot`` is true the correlations are also drawn as a bar chart with
    x tick labels rotated by ``angel`` degrees. The return value is a
    single-row frame (row label ``target_factor``) with one column per other
    factor.
    """
    target_correlation = df.corr()[target_factor].drop(target_factor, errors='ignore')

    if plot:
        plt.figure(figsize=(10, 3))
        target_correlation.plot.bar()
        plt.title(f'Correlation with {target_factor}')
        plt.ylabel('Correlation Coefficient')
        plt.xlabel('Factors')
        plt.xticks(rotation=angel)
        plt.show()

    return target_correlation.to_frame().T

In [None]:
def zscore(x, window):
    """Rolling z-score: ``(x - rolling mean) / rolling std`` over ``window`` rows.

    At least 5 observations are required in a window; earlier rows are NaN.
    """
    roller = x.rolling(window=window, min_periods=5)
    return (x - roller.mean()) / roller.std()

def calculate_close_price(df):
    """Compound simple returns into a relative price path, Pn/P0."""
    growth_factors = 1 + df
    return growth_factors.cumprod()


In [None]:
def get_signals(rtn, features_all,date_col = 'date',idx_col = 'ticker_id'):
    """Build the long-format table of rolling return and market-cap signals.

    Parameters
    ----------
    rtn : DataFrame
        Wide daily returns, index named ``date_col``, one column per security.
    features_all : DataFrame
        Long table indexed by ``(date_col, idx_col)`` with a ``'mktcap_log'``
        column.
    date_col, idx_col : str
        Names of the date and security levels.

    Returns
    -------
    DataFrame
        Indexed by ``(date_col, idx_col)`` in security-major order (all dates
        of the first column of ``rtn``, then the second, ...). For each
        horizon 1w/1m/6m/1y (5/20/120/252 rows) it holds, in this order:
        ``rtn_*`` rolling mean return, ``rtn_z_*`` rolling z-score,
        ``rtn_skew_*`` rolling skew, ``up_rtn_*`` / ``down_rtn_*`` rolling mean
        of the positive / negative part of returns, ``cap_*`` rolling mean of
        log market cap and ``cap_z_*`` its rolling z-score.

    Notes
    -----
    Every panel is flattened positionally, so all panels must share the exact
    layout of ``rtn``. The market-cap panel is pivoted out of ``features_all``
    and its rows/columns come back sorted, which need not match ``rtn``; it is
    therefore re-indexed onto ``rtn`` before use. Without that step cap signals
    could be attached to the wrong security (or fail on a length mismatch).
    """
    horizons = (('1w', 5), ('1m', 20), ('6m', 120), ('1y', 252))

    close_rtn = rtn.copy()
    # Positive / negative parts of the return; missing values stay missing.
    up_rtn = close_rtn.mask(close_rtn < 0, 0)
    down_rtn = close_rtn.mask(close_rtn > 0, 0)

    cap = (
        features_all[['mktcap_log']]
        .reset_index()
        .pivot(index=date_col, columns=idx_col, values='mktcap_log')
        .reindex(index=rtn.index, columns=rtn.columns)  # align to rtn's layout
    )

    # (column prefix, function of the window length) in output column order.
    families = (
        ('rtn_', lambda w: close_rtn.rolling(window=w).mean()),
        ('rtn_z_', lambda w: zscore(x=close_rtn, window=w)),
        ('rtn_skew_', lambda w: close_rtn.rolling(window=w).skew()),
        ('up_rtn_', lambda w: up_rtn.rolling(window=w).mean()),
        ('down_rtn_', lambda w: down_rtn.rolling(window=w).mean()),
        ('cap_', lambda w: cap.rolling(window=w, min_periods=5).mean()),
        ('cap_z_', lambda w: zscore(x=cap, window=w)),
    )

    columns = {}
    for prefix, compute in families:
        for tag, window in horizons:
            # Column-major flatten == the row order produced by DataFrame.melt().
            columns[prefix + tag] = compute(window).to_numpy().ravel(order='F')
    signals = pd.DataFrame(columns)

    long_index = (
        rtn.reset_index()
           .melt(id_vars=date_col, var_name=idx_col)
           .set_index([date_col,idx_col])
           .index
    )
    signals.index = long_index

    return signals

def get_time_signals(df):
    """Return a copy of ``df`` with calendar columns derived from its 'date' level.

    Adds ``'Week_Number'`` (Monday=1 ... Friday=5, missing for any other day),
    ``'Month'`` and ``'Quarter'``.
    """
    weekday_map = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5}
    dates = df.index.get_level_values('date')

    return df.copy().assign(
        Week_Number=dates.day_name().map(weekday_map),
        Month=dates.month,
        Quarter=dates.quarter,
    )
    
    
def PCA_factors(df, limit=None):
    """Print a PCA overview, draw a scree plot and append the leading PCs.

    Parameters
    ----------
    df : DataFrame
        Numeric feature table. +/-inf are treated as missing and missing
        values are mean-imputed before the decomposition.
    limit : int or None
        Number of components shown in the scree plot (0 or negative = all).
        When None the value is requested interactively, as before. Pass an
        integer to keep the notebook runnable with "Restart & Run All".

    Returns
    -------
    DataFrame
        Copy of ``df`` with columns ``PC1`` ... ``PCk`` appended, where ``k``
        is 5, or the number of available components when fewer than 5 exist
        (previously such inputs raised inside PCA).
    """
    ##### data cleaning
    imputer = SimpleImputer(strategy='mean')
    df = df.copy()
    df_imputed = imputer.fit_transform(df.replace([np.inf, -np.inf], np.nan))

    ##### PCA factors
    pca = PCA()
    pca.fit(df_imputed)
    explained_variance = pca.explained_variance_ratio_
    n_components = len(explained_variance)
    cum_explained_variance = np.cumsum(explained_variance)
    idx = np.arange(n_components)+1
    df_explained_variance = pd.DataFrame([explained_variance, cum_explained_variance],
                                         index=['explained variance', 'cumulative'],
                                         columns=idx).T
    mean_explained_variance = df_explained_variance.iloc[:,0].mean()
    print('PCA Overview')
    print('='*40)
    print("Total: {} components".format(n_components))
    print('-'*40)
    print('Mean explained variance:', round(mean_explained_variance,3))
    print('-'*40)
    print(df_explained_variance.head(20))
    print('-'*40)

    # Only block on user input when the caller did not choose a limit.
    if limit is None:
        limit = int(input("Limit scree plot to nth component (0 for all) > "))
    limit_df = limit if limit > 0 else n_components

    ##### explaining power
    df_explained_variance_limited = df_explained_variance.iloc[:limit_df,:]
    # scree plot: bars on the left axis, cumulative line on a twin right axis
    fig, ax1 = plt.subplots(figsize=(15,6))
    ax1.set_title('Explained variance across principal components', fontsize=14)
    ax1.set_xlabel('Principal component', fontsize=12)
    ax1.set_ylabel('Explained variance', fontsize=12)
    sns.barplot(x=idx[:limit_df], y='explained variance', data=df_explained_variance_limited, palette='summer', ax=ax1)
    ax2 = ax1.twinx()
    ax2.grid(False)
    ax2.set_ylabel('Cumulative', fontsize=14)
    # Bars sit at categorical positions 0..n-1, hence the -1 shift for the line.
    sns.lineplot(x=idx[:limit_df]-1, y='cumulative', data=df_explained_variance_limited, color='#fc8d59', ax=ax2)
    ax1.axhline(mean_explained_variance, ls='--', color='#fc8d59') #plot mean
    ax1.text(-.8, mean_explained_variance+(mean_explained_variance*.05), "average", color='#fc8d59', fontsize=14) #label y axis
    max_y1 = max(df_explained_variance_limited.iloc[:,0])
    max_y2 = max(df_explained_variance_limited.iloc[:,1])
    ax1.set(ylim=(0, max_y1+max_y1*.1))
    ax2.set(ylim=(0, max_y2+max_y2*.1))
    plt.show()

    ##### PC factors
    n_keep = min(5, n_components)
    principalComponents = PCA(n_components=n_keep).fit_transform(df_imputed)
    for i in range(1, n_keep+1):
        df[f'PC{i}'] = principalComponents[:, i-1]

    return df

    

    


In [None]:
def get_alpha(df,signals,pos_thres=0.002, neg_thres=-0.002, col='fwdrtn',isfilter=True,idx_col='ticker_id'):
    """Fit an in-sample Ridge model of ``col`` and return its predictions as alpha.

    Steps: one-hot encode ``'sector'``, turn +/-inf into NaN, optionally keep
    only features whose correlation with ``col`` is above ``pos_thres`` or
    below ``neg_thres``, fit ``Ridge(alpha=0.51)`` on rows with complete data,
    then predict for every row whose features are complete.

    Returns
    -------
    (DataFrame, DataFrame)
        The coefficients sorted by absolute size (column ``'Coefficient'``),
        and the predictions laid out on ``signals.index`` and pivoted to
        ``'date'`` rows by ``idx_col`` columns.
    """
    import warnings
    from scipy.linalg import LinAlgWarning
    warnings.filterwarnings(action='ignore', category=LinAlgWarning, module='sklearn')

    # Design table: original columns plus integer sector dummies.
    sector_dummy = pd.get_dummies(data=df['sector'],prefix='sector').astype(int)
    panel = pd.concat([df,sector_dummy],axis=1).drop('sector', axis=1)
    panel = panel.replace([np.inf, -np.inf], np.nan)

    # Feature selection by correlation with the target.
    correlation = panel.corr()
    if isfilter:
        target_corr = correlation[col]
        strong = (target_corr > pos_thres) | (target_corr < neg_thres)
        features = target_corr[strong].sort_values().index.to_list()
    else:
        features = correlation.index.to_list()
    features.remove(col)

    # In-sample fit on fully observed rows.
    train = panel.dropna(subset=[col]+features)
    model = Ridge(alpha=0.51)
    model.fit(train[features], train[col])

    # Coefficients ordered by absolute size, largest first.
    coef_df = pd.DataFrame(model.coef_, index=features, columns=['Coefficient'])
    sorted_coef_df = (
        coef_df.assign(Abs_Coefficient=coef_df['Coefficient'].abs())
               .sort_values(by='Abs_Coefficient', ascending=False)
               .drop('Abs_Coefficient', axis=1)
    )

    # Predict wherever all features are present, then lay out on signals.index.
    available = panel.dropna(subset=features)
    rst = pd.DataFrame(data=model.predict(available[features]),index=available.index,columns=['alpha'])
    on_signal_index = pd.DataFrame(index=signals.index).merge(rst,left_index=True,right_index=True,how='left')
    alpha = on_signal_index.reset_index().pivot(index='date',columns=idx_col,values='alpha')

    return sorted_coef_df, alpha


In [None]:
def get_betas(df):
    """One-year rolling beta of every column against the cross-sectional mean.

    For each column the beta is the rolling covariance with the row-wise mean
    of ``df`` divided by the rolling variance of that mean, over a 252-row
    window with ``min_periods=1`` (so the first row is NaN).

    Returns
    -------
    DataFrame
        Float frame with the same index and columns as ``df``.

    Notes
    -----
    The computation is vectorised: the rolling variance of the market proxy is
    evaluated once instead of once per column, and the per-column Python loop
    that filled an object-dtype frame is gone.
    """
    ### daily average across all tickers (market proxy)
    daily_avg = df.mean(axis=1)

    window = dict(window=252, min_periods=1)
    rolling_cov = df.rolling(**window).cov(daily_avg)
    rolling_var = daily_avg.rolling(**window).var()

    return rolling_cov.div(rolling_var, axis=0)



def adjust_market_impact(data,univ,dt,signal,col='momentum'):
    """Neutralise a cross-sectional signal against market beta and sector.

    Parameters
    ----------
    data : DataFrame
        Wide returns passed to ``get_betas``; ``dt`` must be one of its index
        labels.
    univ : DataFrame
        Long table whose index has a ``'date'`` level and which contains a
        ``'sector'`` column.
    dt : label
        Date of the cross-section.
    signal : Series
        Raw signal for ``dt``.
        NOTE(review): assumed to be indexed by security, matching the columns
        of ``data`` and the non-date index level of ``univ`` - confirm against
        the caller.
    col : str
        Name given to the returned series.

    Returns
    -------
    Series
        Regression residual of ``signal`` on beta and sector dummies, indexed
        like ``signal``; NaN where the signal or a regressor is missing.

    Notes
    -----
    The previous body could not run: it called ``get_betas`` with a keyword
    that function does not accept, read a ``'variable'`` column from a frame
    that only held ``'sector'``, and concatenated the full beta history
    (dates x securities) with per-security dummies.
    """
    ##### market exposure: every security's rolling beta as of dt
    beta = get_betas(data).loc[dt].astype(float).rename('beta')

    ##### sector exposure: dummy variables for the dt cross-section
    on_date = univ.index.get_level_values('date') == dt
    sector = univ.loc[on_date, 'sector'].droplevel('date')
    sector_dummies = pd.get_dummies(sector, prefix='sector').astype(float)

    X = pd.concat([beta, sector_dummies], axis=1, join='inner')

    ##### regress on securities with complete data only
    sample = pd.concat([signal.rename('__signal__'), X], axis=1, join='inner').dropna()

    adj_signal = pd.Series(np.nan, index=signal.index, name=col, dtype=float)
    if not sample.empty:
        regressors = sample[X.columns]
        fitted = LinearRegression().fit(regressors, sample['__signal__']).predict(regressors)
        adj_signal.loc[sample.index] = sample['__signal__'].to_numpy() - fitted

    # another way to adjust mkt & sector impacts can be: perform stepwise regression
    # adjust mkt impacts first
    # then adjust for sector impacts
    return adj_signal


In [None]:
def clean_backtestinput_format(df,start_date,end_date,isfwd=False):
    """Sort a back-test input by index and cut it to ``[start_date, end_date]``.

    When ``isfwd`` is true the frame is treated as forward returns: +/-inf and
    any value whose absolute size exceeds 1 are replaced by NaN.

    The caller's frame is left untouched; the cleaning used to be applied in
    place, silently overwriting the input while a separate slice was returned.
    """
    if isfwd:
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.mask(df.abs() > 1)
    return df.sort_index().loc[start_date:end_date]

def Correlation(s1, s2, method='pearson'):
    """Correlation of two arrays over the positions where both are not NaN.

    ``method`` is ``'pearson'`` or ``'spearman'``; any other value yields
    ``None``. Fewer than two usable pairs yields NaN.
    """
    usable = ~(np.isnan(s1) | np.isnan(s2))
    if usable.sum() < 2:
        return np.nan

    first, second = s1[usable], s2[usable]

    estimators = {'pearson': stats.pearsonr, 'spearman': stats.spearmanr}
    estimator = estimators.get(method)
    if estimator is None:
        return None
    return estimator(first, second)[0]

def Regression(x, y):
    """Simple OLS of ``y`` on ``x`` with an intercept.

    Parameters
    ----------
    x, y : array-like, 1-D, same length

    Returns
    -------
    (t_stat, coef, intercept)
        ``t_stat`` and ``coef`` are arrays of shape ``(1,)`` (slope t-statistic
        and slope); ``intercept`` is a float. With fewer than 3 observations
        all three are NaN. When ``x`` is constant the slope is 0, the intercept
        is the mean of ``y`` and the t-statistic is NaN.

    Notes
    -----
    The slope variance is ``sigma^2 / sum((x - mean(x))^2)`` with
    ``sigma^2 = SSR / (n - 2)``, the standard result for a model that includes
    an intercept. The former ``sigma^2 * (x'x)^-1`` used the uncentred sum of
    squares, i.e. the formula for a regression through the origin, which
    overstates the t-statistic whenever ``mean(x) != 0``.

    The fit uses the closed-form solution in NumPy rather than a
    ``LinearRegression`` model; for one regressor the estimates coincide.
    """
    x = np.asarray(x, dtype=float).reshape(-1)
    y = np.asarray(y, dtype=float).reshape(-1)
    if x.shape[0] < 3:
        return np.nan, np.nan, np.nan

    n, k = x.shape[0], 1
    x_centered = x - x.mean()
    sxx = np.dot(x_centered, x_centered)
    if sxx == 0:
        # No variation in the regressor: slope is not identified.
        return np.array([np.nan]), np.array([0.0]), y.mean()

    slope = np.dot(x_centered, y - y.mean()) / sxx
    intercept = y.mean() - slope * x.mean()
    residual = y - (intercept + slope * x)

    sigma_hat = np.sum(residual ** 2) / (n - k - 1)  # estimate of error term variance
    with np.errstate(divide='ignore', invalid='ignore'):
        # A perfect fit has zero standard error; let it evaluate to inf/nan.
        t_stat = slope / np.sqrt(sigma_hat / sxx)
    return np.array([t_stat]), np.array([slope]), intercept

class SingleFactorAnalysis:
    """Single-factor back-test: quantile portfolios, IC, regression and PnL.

    Typical use: construct, call ``Statistics()`` to compute everything, then
    ``PlotResult()`` to draw the charts and print the summary.

    All inputs are wide frames sharing the same index (dates) and columns
    (securities).
    """

    def __init__(self, forward_ret, alpha_df, tradable_df, freq):
        """
        Parameters
        ----------
        forward_ret : DataFrame
            Forward returns aligned with ``alpha_df``.
        alpha_df : DataFrame
            Factor values.
        tradable_df : DataFrame
            1 where a security is tradable on a date, 0 otherwise.
        freq : int
            Observations per year, used to annualise statistics.
        """
        self.alpha_df = alpha_df.copy()
        self.alpha_np = self.alpha_df.values

        self.tradable_df = tradable_df
        self.tradable_np = self.tradable_df.values

        # Factor values with non-tradable names zeroed out (used for weights).
        self.alpha_df_tradable = alpha_df.copy()
        self.alpha_df_tradable[self.tradable_df == 0] = 0
        self.alpha_np_tradable = self.alpha_df_tradable.values

        self.fwd_rtn = forward_ret
        self.fwd_rtn_np = self.fwd_rtn.values

        # Forward returns demeaned cross-sectionally per date.
        self.fwd_rtn_norm = self.fwd_rtn.subtract(self.fwd_rtn.mean(axis=1), axis=0)
        self.fwd_rtn_norm_np = self.fwd_rtn_norm.values

        self.dates_len = self.alpha_np.shape[0]
        self.dates_index = self.alpha_df.index
        self.sname = self.alpha_df.columns

        self.points_per_year = freq
        self.turnover = 0

        # group
        self.group_num = 5
        self.group_ptf_rtn_np = np.zeros((self.dates_len, self.group_num))
        self.group_ptf_rtn_df = None

        # IC
        self.IC_np = np.zeros(self.dates_len)
        self.IC_series = None

        # regression
        self.tstats_np = np.zeros(self.dates_len)
        self.tstats_series = None

        self.factor_ret_np = np.zeros(self.dates_len)
        self.factor_ret_series = None

        self.factor_alpha_np = np.zeros(self.dates_len)
        self.factor_alpha_series = None

        # Numeric summary statistics, filled by Statistics().
        self.performance = {}

    @staticmethod
    def _as_scalar(value):
        """Collapse a scalar or a size-1 array into a plain float."""
        return float(np.squeeze(value))

    def Statistics(self):
        """Compute group returns, IC, regression stats, weights, turnover and PnL.

        The last date is not evaluated (the loop stops one row early); its
        per-date statistics stay at 0.

        Grouping notes:
        * Only names with a factor value are ranked. NaN factor values used to
          be sorted to the end and were therefore counted as the *highest*
          factor names, polluting the top groups.
        * Ranked names are split into ``group_num`` nearly equal groups. The
          former fixed group size dropped the remainder, i.e. the very
          highest-ranked names.
        * A group without any usable forward return is recorded as NaN.
        """
        for i in range(self.dates_len-1):

            tradable_loc = self.tradable_np[i, :] == 1

            alpha_currentdate = self.alpha_np[i, tradable_loc]
            fwd_rtn_currentdate = self.fwd_rtn_np[i, tradable_loc]

            # group portfolios, column 0 = lowest factor values
            has_alpha = ~np.isnan(alpha_currentdate)
            ranked_fwd = fwd_rtn_currentdate[has_alpha]
            ind = np.argsort(alpha_currentdate[has_alpha])  # ascending

            for j, ind_this_group in enumerate(np.array_split(ind, self.group_num)):
                fwd_rtn_group = ranked_fwd[ind_this_group]
                fwd_rtn_group = fwd_rtn_group[~np.isnan(fwd_rtn_group)]
                self.group_ptf_rtn_np[i, j] = fwd_rtn_group.mean() if fwd_rtn_group.size else np.nan

            not_nan_loc = has_alpha & (~np.isnan(fwd_rtn_currentdate))

            if (~not_nan_loc).all():
                # all alpha/fwd_rtn are nan, skip today
                continue

            # IC method
            self.IC_np[i] = Correlation(alpha_currentdate[not_nan_loc], fwd_rtn_currentdate[not_nan_loc], method='spearman')

            # t stats; Regression may hand back size-1 arrays, store scalars
            tstats, factor_return, factor_alpha = Regression(alpha_currentdate[not_nan_loc], fwd_rtn_currentdate[not_nan_loc])
            self.tstats_np[i] = self._as_scalar(tstats)
            self.factor_ret_np[i] = self._as_scalar(factor_return)
            self.factor_alpha_np[i] = self._as_scalar(factor_alpha)

        # factor weights - alpha weighted pnl to make sure the daily GMV is constant.
        weight_df = self.alpha_df_tradable.div(self.alpha_df_tradable.abs().sum(axis=1), axis=0).fillna(0)

        # turnover
        self.turnover = weight_df.diff().abs().sum(axis=1)/weight_df.abs().sum(axis=1)
        self.turnover = self.turnover.replace([np.inf, -np.inf], np.nan).fillna(0)

        # factor portfolio returns
        alpha_returns_df = weight_df * self.fwd_rtn
        self.ptf_returns = alpha_returns_df.sum(axis=1)

        # Column 0 (lowest factor values) is labelled 'group <group_num>',
        # the last column (highest factor values) 'group 1'.
        self.group_ptf_rtn_df = pd.DataFrame(self.group_ptf_rtn_np,
                                             index=self.dates_index,
                                             columns=['group ' + str(i) for i in range(self.group_num, 0, -1)])

        self.IC_series = pd.Series(self.IC_np, index=self.dates_index).fillna(0)
        self.IC_cum_series = self.IC_series.cumsum()
        self.tstats_series = pd.Series(self.tstats_np, index=self.dates_index).fillna(0)
        self.factor_ret_series = pd.Series(self.factor_ret_np, index=self.dates_index).fillna(0)
        self.factor_alpha_series = pd.Series(self.factor_alpha_np, index=self.dates_index).fillna(0)

        self.performance['IC mean'] = self.IC_series.mean()
        self.performance['IC std'] = self.IC_series.std()
        self.performance['ICIR'] = self.IC_series.mean()/self.IC_series.std() * np.sqrt(self.points_per_year)
        self.performance['t-stats mean'] = self.tstats_series.mean()
        self.performance['Factor Portfolio Return'] = self.ptf_returns.mean() * self.points_per_year
        self.performance['Factor Portfolio Sharpe Ratio'] = self.ptf_returns.mean() / self.ptf_returns.std() * np.sqrt(self.points_per_year)
        self.performance['Turnover'] = self.turnover.mean()

    def PlotResult(self):
        """Draw all charts and print the summary rounded to 3 decimals.

        ``self.performance`` keeps its numeric values. It used to be replaced
        by formatted strings, which lost precision and made a second call fail
        when the strings were formatted as floats again.
        """
        self._CumulativeAlphaReturns()
        self._Drawdown()
        self._Turnover()
        self._GroupPortfolios()
        self._IC()
        self._AlphaDecay()
        plt.show()

        formatted = {key: f"{value:.3f}" for key, value in self.performance.items()}
        print(formatted)

    def _Drawdown(self):
        """Plot the percentage drawdown of the additive NAV (1 + cumulative return)."""
        nav = self.ptf_returns.cumsum() + 1
        # High-water mark, never below the starting NAV of 1.
        max_nav = nav.clip(lower=1).cummax()

        drawdown_percentage = 100*(nav - max_nav) / max_nav

        plt.figure(figsize=(12, 3))
        plt.plot(drawdown_percentage)
        plt.title('Portfolio Drawdown')

    def _Turnover(self):
        """Plot the turnover series."""
        plt.figure(figsize=(12, 3))
        plt.plot(self.turnover)
        plt.title('Portfolio Turnover')

    def _CumulativeAlphaReturns(self):
        """Record the annualised return and plot the cumulative factor PnL."""
        self.performance['AnnualReturn'] = self.ptf_returns.mean() * self.points_per_year

        plt.figure(figsize=(12, 3))
        plt.plot(self.ptf_returns.cumsum())
        plt.title('Factor Weighted Long/Short Portfolio Cumulative Return')

    def _GroupPortfolios(self):
        """Plot cumulative returns per group and the top-minus-bottom spread."""
        plt.figure(figsize=(12, 3))
        plt.plot(self.group_ptf_rtn_df.cumsum())
        plt.legend(self.group_ptf_rtn_df.columns)
        plt.title('Cumulative Return by Quantile')

        # Last column holds the highest factor values, first column the lowest.
        plt.figure(figsize=(12, 3))
        plt.plot((self.group_ptf_rtn_df.iloc[:, -1] - self.group_ptf_rtn_df.iloc[:, 0]).cumsum())
        plt.title('Top Minus Bottom Quantile Cumulative Return')

    def _IC(self):
        """Plot the per-date IC with its 5-row rolling mean, and the cumulative IC."""
        plt.figure(figsize=(12, 3))
        plt.bar(x=self.IC_series.index, height=self.IC_series.values)
        plt.plot(self.IC_series.rolling(window=5).mean())
        plt.title('IC mean')

        plt.figure(figsize=(12, 3))
        plt.plot(self.IC_cum_series.values)
        plt.title('IC Cumulative Sum')

    def _alpha_decay_curve(self, max_lag=10):
        """Mean correlation between the factor cross-section and itself ``j`` rows later.

        Returns a Series indexed by lag ``0 .. max_lag-1``. Start dates that
        cannot reach the largest lag are left as NaN and ignored by the mean;
        they used to stay at 0 and dragged every average toward zero.
        """
        alpha_decay_np = np.full((self.dates_len, max_lag), np.nan)
        decay_period = range(max_lag)

        for i in range(self.dates_len-decay_period[-1]):
            for j in decay_period:
                alpha_decay_np[i, j] = Correlation(self.alpha_np[i, :], self.alpha_np[i+j, :])

        return pd.DataFrame(alpha_decay_np,
                            index=self.dates_index,
                            columns=decay_period).mean(axis=0)

    def _AlphaDecay(self):
        """Plot the factor autocorrelation (alpha decay) curve."""
        alpha_decay_df = self._alpha_decay_curve()

        plt.figure(figsize=(12, 3))
        plt.plot(alpha_decay_df)
        plt.title('Alpha Decay')

        
        
