In [4]:
import pandas as pd
import numpy as np
from scipy import stats

# Show (up to) 999 rows / columns before pandas truncates a displayed frame.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)

# Render floats with a thousands separator and two decimals.
# A previous '{:f}' float format was set here and then immediately overwritten;
# only the effective setting is kept, using the canonical option name.
pd.set_option('display.float_format', '{:,.2f}'.format)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Legend placement shared by the plotting helpers in this notebook:
# `visualization` passes these to plt.legend as loc / fontsize / bbox_to_anchor.
legpos = 'center left'
size = 'medium'
# Anchor point handed to bbox_to_anchor (axes-fraction coordinates).
loc=(1,0.5)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Apply seaborn's default theme to all subsequent matplotlib figures.
sns.set()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


import statsmodels.api as sm


In [None]:
def visualization(df, x, y, figsize=(12,3), hue=None, scatter=False, cust_col='Set2', title='' ,xlabel='', ylabel='', rotation_angel=90):
    """Draw one seaborn scatter plot or line plot of ``y`` against ``x`` and show it.

    Parameters
    ----------
    df : DataFrame holding the plotted columns.
    x, y : column names (or vectors) forwarded to seaborn.
    figsize : size of the matplotlib figure created for the plot.
    hue : optional grouping variable. When given, the series are coloured with
        ``cust_col`` and a legend is placed using the notebook-level
        ``legpos`` / ``loc`` / ``size`` settings.
    scatter : draw ``sns.scatterplot`` when true, ``sns.lineplot`` otherwise.
    cust_col : palette name, only used together with ``hue``.
    title, xlabel, ylabel : text labels for the axes.
    rotation_angel : rotation (degrees) applied to the x tick labels.

    Returns
    -------
    None. The figure is displayed via ``plt.show()``.
    """
    plt.figure(figsize=figsize)

    # A palette is meaningless without a hue mapping, so both are passed together.
    # `is None` (rather than `!= None`) keeps this working for array-like hue.
    colour_kwargs = {} if hue is None else {'hue': hue, 'palette': cust_col}

    if scatter:
        # scatterplot draws no confidence interval, so no `ci` argument is passed.
        ax = sns.scatterplot(x=x, y=y, data=df, **colour_kwargs)
    else:
        # ci=None turns off the confidence band seaborn would otherwise draw.
        ax = sns.lineplot(x=x, y=y, data=df, ci=None, **colour_kwargs)

    plt.title(title)
    plt.xticks(rotation=rotation_angel)
    if hue is not None:
        plt.legend(loc=legpos, bbox_to_anchor=loc, fontsize=size)
    ax.set(xlabel=xlabel, ylabel=ylabel)

    plt.show()

In [None]:
def get_regression_stats(df,y,x):
    """Regress ``df[y]`` on ``df[x]`` plus an intercept with statsmodels OLS.

    Returns
    -------
    (summary, fitted) where ``fitted`` is the statsmodels results object and
    ``summary`` is a dict with:
      * ``alpha_mean`` / ``alpha_vol`` - mean and standard deviation of the
        regression residuals (not the fitted intercept),
      * ``beta_coef`` / ``beta_pvalue`` / ``beta_tstats`` - slope estimate on
        ``x`` with its p-value and t-statistic,
      * ``r_square`` - R-squared of the fit.
    """
    design = sm.add_constant(df[x])
    fitted = sm.OLS(df[y], design).fit()
    residuals = fitted.resid

    summary = {
        'alpha_mean': residuals.mean(),
        'alpha_vol': residuals.std(),
        'beta_coef': fitted.params[x],
        'beta_pvalue': fitted.pvalues[x],
        'beta_tstats': fitted.tvalues[x],
        'r_square': fitted.rsquared,
    }
    return summary, fitted

def get_rolling_regression_stats(df,y,x,window):
    """Run ``get_regression_stats`` over every ``window``-row slice of ``df``.

    Parameters
    ----------
    df : DataFrame with a ``date`` column plus the ``y`` and ``x`` columns.
    y, x : dependent / explanatory column names.
    window : number of consecutive rows per regression.

    Returns
    -------
    DataFrame with one row per input row (same order, index reset) and columns
    ``date, r_square, beta_coef, beta_tstats, beta_pvalue, alpha_mean,
    alpha_vol``. The statistics of a window are stored on the row of the
    window's last observation; rows that never end a full window (the first
    ``window - 1`` rows) keep their initial value of 0.
    """
    data = df.reset_index(drop=True)
    cols = ['date','r_square','beta_coef','beta_tstats','beta_pvalue','alpha_mean','alpha_vol']
    rolling = pd.DataFrame(np.zeros(shape=(len(data), len(cols))), columns=cols)
    rolling['date'] = data['date']

    # `+ 1` so that the final window - the one ending on the last row - is
    # evaluated too; without it the most recent observation was never used.
    for start in range(len(data) - window + 1):
        window_df = data.iloc[start:start + window]
        window_stats, _ = get_regression_stats(window_df, y, x)

        # The index was reset above, so the window's last row label is positional.
        last_row = start + window - 1
        for key, value in window_stats.items():
            rolling.loc[last_row, key] = value

    return rolling


In [3]:
def get_strat_rtn(df):
    """Return a copy of ``df`` with a strategy return column ``rtn``.

    ``rtn`` equals ``r_close`` on rows where ``r_open`` is strictly positive and
    ``-r_close`` on every other row. The input frame is left untouched.
    """
    result = df.copy()
    open_is_up = result['r_open'] > 0
    close_rtn = result['r_close']
    result.loc[:, 'rtn'] = np.where(open_is_up, close_rtn, -close_rtn)
    return result
    

In [None]:
def backtesting_metrics(df, rtn, index, periods_per_year=252):
    """Summarise a per-period return column as annualised performance metrics.

    Parameters
    ----------
    df : DataFrame containing the return column.
    rtn : name of the column holding one return per period.
    index : label used for the single row of the result.
    periods_per_year : number of return periods in a year (252 by default).

    Returns
    -------
    One-row DataFrame labelled ``index`` with columns
    ``rtn`` (mean return x periods_per_year),
    ``std`` (sample std x sqrt(periods_per_year)),
    ``Sharpe`` (``rtn / std``, no risk-free rate) and
    ``Successrate`` (percentage of rows with a strictly positive return).

    Notes
    -----
    The mean already averages over the sample, so annualising must not divide
    by ``len(df)`` a second time; doing so made every metric shrink as the
    sample grew. A zero or undefined std yields inf/NaN for ``Sharpe`` instead
    of raising.
    """
    returns = df[rtn]
    n_obs = len(returns)

    annualized_rtn = returns.mean() * periods_per_year
    annualized_std = returns.std() * periods_per_year ** 0.5
    sharpe = annualized_rtn / annualized_std
    successrate = 100 * ((returns > 0).sum() / n_obs)

    metric = pd.DataFrame(columns=['rtn','std','Sharpe','Successrate'])
    metric.loc[index,:] = [annualized_rtn, annualized_std, sharpe, successrate]
    return metric
    

In [None]:
def data_coverage_check(df,indiceslist,timeused=['1000','1530','close'],qualitycheckcol = ['PctMissing','FirstAvailableDate','PctMissingUpd']):
    """Report how complete the price history of each index is.

    ``df`` is expected to carry two-level columns selectable as
    ``(timeused, index)`` and to be indexed by date. For every entry of
    ``indiceslist`` the function builds a flat frame of the ``timeused`` prices
    plus an ``open`` column (previous row's ``close``) and records:

      * ``PctMissing`` - percentage of rows lacking at least one ``timeused`` price,
      * ``FirstAvailableDate`` - first date on which all ``timeused`` prices exist,
      * ``PctMissingUpd`` - from that date on, percentage missing in the
        least-populated column reported by ``describe()``.

    Indices without a single complete row are announced with ``print`` and left
    out of the returned frame, which is indexed by index name.
    """
    prices = df.copy()
    report = pd.DataFrame(columns=qualitycheckcol)

    for ticker in indiceslist:
        # Flatten this index's slice of the price panel into plain columns.
        panel = pd.DataFrame(data=prices.loc[:, (timeused, ticker)].values, columns=timeused)
        panel.loc[:, 'date'] = prices.index
        panel.loc[:, 'open'] = panel.close.shift(1)
        panel = panel[['date', 'open'] + timeused]

        # Rows on which every requested timestamp has a price.
        fully_observed = panel[timeused].notnull().all(1)
        complete = panel.loc[fully_observed].reset_index(drop=True)
        n_complete = len(complete)

        if n_complete == 0:
            print(ticker + " don't have available price data that we need. To be excluded from our analysis.")
            continue

        pct_missing = 100 * (1 - n_complete / len(panel))

        # Re-measure coverage from the first fully observed date onwards.
        first_date = complete.date[0]
        since_first = panel.loc[panel.date >= first_date]
        min_count = since_first.describe().loc['count', :].min()
        pct_missing_upd = 100 * (1 - min_count / len(since_first))

        report.loc[ticker, :] = [round(pct_missing, 2), first_date, round(pct_missing_upd, 2)]

    return report

    

In [2]:
def get_rtn_data(df):
    """Return a copy of ``df`` extended with two simple-return columns.

    * ``r_open``  - return from the ``open`` price to the ``1000`` price.
    * ``r_close`` - return from the ``1530`` price to the ``close`` price.
    """
    out = df.copy()
    open_px, morning_px = out['open'], out['1000']
    afternoon_px, close_px = out['1530'], out['close']

    out.loc[:, 'r_open'] = (morning_px - open_px) / open_px
    out.loc[:, 'r_close'] = (close_px - afternoon_px) / afternoon_px
    return out
    

In [None]:
def price_outlier_analysis(df, index,timeused=['1000','1530','close']):
    """Build the flat price/return table for a single index.

    Selects the ``(timeused, index)`` columns of ``df``, adds ``date`` (taken
    from ``df``'s index) and ``open`` (the previous row's ``close``), orders the
    columns as ``date, open, *timeused`` and appends the ``r_open`` /
    ``r_close`` columns produced by ``get_rtn_data``.
    """
    source = df.copy()

    prices = pd.DataFrame(data=source.loc[:, (timeused, index)].values, columns=timeused)
    prices.loc[:, 'date'] = source.index
    # The previous row's close stands in for the opening price.
    prices.loc[:, 'open'] = prices.close.shift(1)

    ordered = prices[['date', 'open'] + timeused]
    return get_rtn_data(ordered)

In [None]:
def ETF_backtest(df, index,col=['corr','reg_beta','reg_beta_tstats','r_squared','rtn','rtn_std','Sharpe','SuccessRate'], exclude_dates=('2016-11-07',)):
    """Regress close returns on open returns for one index and backtest the signal.

    Parameters
    ----------
    df : price panel accepted by ``price_outlier_analysis``.
    index : name of the index / ETF to analyse; also the row label of the result.
    col : column names of the returned frame (eight entries expected: four
        regression figures followed by the four ``backtesting_metrics`` values).
    exclude_dates : dates dropped from the sample before any analysis. The
        default keeps the single date that used to be hard-coded here
        (NOTE(review): the reason for excluding it is not visible in this file).

    Returns
    -------
    One-row DataFrame labelled ``index`` holding the ``r_open``/``r_close``
    correlation, the OLS slope with its t-statistic, R-squared, and the metrics
    of the strategy return computed by ``get_strat_rtn``.
    """
    rst = pd.DataFrame(columns=col)

    ETF = price_outlier_analysis(df=df, index=index).dropna()
    # Same `!=` comparison as before, applied once per excluded date.
    for excluded in exclude_dates:
        ETF = ETF.loc[ETF.date != excluded]

    ##### regression analysis
    corr = ETF[['r_open','r_close']].corr().loc['r_open','r_close']
    insample_stats, _ = get_regression_stats(ETF,'r_close','r_open')
    beta = insample_stats['beta_coef']
    beta_tstats = insample_stats['beta_tstats']
    r_square = insample_stats['r_square']

    ##### strategy analysis
    ETF = get_strat_rtn(ETF)
    backtest_metric = list(backtesting_metrics(ETF,'rtn',index).loc[index,:])

    rst.loc[index,:] = [corr, beta, beta_tstats, r_square] + backtest_metric
    return rst
    

In [None]:
def get_pca_variance_explained(df, index_list, component=False):
    """Fit a full PCA on ``df`` and report the variance explained per component.

    Parameters
    ----------
    df : 2-D data (rows = observations, columns = the series in ``index_list``).
    index_list : names of the columns of ``df``; used to label the loadings.
    component : when true, return the raw loadings of the first principal
        component instead of the labelled loading table.

    Returns
    -------
    (df_explained_variance, loadings)
      * ``df_explained_variance`` - indexed 1..n_components with columns
        ``'variance explained'`` and ``'cumulative variance explained'``.
      * ``loadings`` - ``pca.components_[0]`` if ``component`` is true, otherwise
        a DataFrame with one row per component (``PrincipleComponent1`` ...)
        and one column per entry of ``index_list``.
    """
    pca = PCA()
    pca.fit(df)

    explained_variance = pca.explained_variance_ratio_
    n_components = len(explained_variance)

    df_explained_variance = pd.DataFrame(
        {'variance explained': explained_variance,
         'cumulative variance explained': np.cumsum(explained_variance)},
        index=np.arange(n_components) + 1,
    )

    if component:
        return (df_explained_variance, pca.components_[0])

    # PCA keeps min(n_samples, n_features) components, so the labels follow the
    # fitted component count rather than len(index_list); the old labelling
    # raised a shape error whenever there were fewer rows than columns.
    component_names = ['PrincipleComponent' + str(i) for i in range(1, n_components + 1)]
    eigenvector = pd.DataFrame(index=index_list, data=pca.components_.T,
                               columns=component_names).T
    return (df_explained_variance, eigenvector)

    
def viz_pca_variance(df):
    """Plot per-component and cumulative explained variance on twin axes.

    Parameters
    ----------
    df : DataFrame with columns ``'variance explained'`` and
        ``'cumulative variance explained'``, one row per principal component
        (the first element returned by ``get_pca_variance_explained``).

    Bars (left axis) show the variance explained by each component; the line
    (right axis) shows the cumulative share. The figure is displayed and
    nothing is returned.
    """
    fig, ax1 = plt.subplots(figsize=(15,6))
    labels = np.arange(len(df)) + 1

    # Use the frame that was passed in; the previous body read an undefined
    # global (`df_explained_variance`) here.
    sns.barplot(x=labels, y='variance explained', data=df, palette='Set2', ax=ax1)
    # Labels are set after plotting because seaborn overwrites axis labels.
    ax1.set_title('Variance explained across principal components', fontsize=14)
    ax1.set_xlabel('Principal Component', fontsize=12)
    ax1.set_ylabel('Variance Explained', fontsize=12)

    ax2 = ax1.twinx()
    ax2.grid(False)
    # Bars sit at categorical positions 0..n-1, so the line is drawn at the
    # same positions to stay aligned with its bar.
    sns.lineplot(x=np.arange(len(df)), y=df['cumulative variance explained'].values, ax=ax2)
    ax2.set_ylabel('Cumulative variance explained', fontsize=12)

    plt.show()
    
def get_rolling_PCA(df,window, index_list, col=['variance_explained','variance_explained_2']):
    """Run a PCA on every ``window``-row slice of ``df``.

    Parameters
    ----------
    df : DataFrame of observations whose columns correspond to ``index_list``.
    window : number of consecutive rows per PCA fit.
    index_list : list of column names, used to label the first-component loadings.
    col : names for the first two result columns (cumulative variance explained
        by the first one and the first two components).

    Returns
    -------
    Float DataFrame indexed by the last index label of each window, with the
    two ``col`` columns followed by one column per entry of ``index_list``
    holding the loadings of the first principal component.
    """
    rtn = pd.DataFrame(columns = col+index_list)

    # `+ 1` so that the final window - the one ending on the last row - is
    # evaluated too; without it the most recent observation was never used.
    for start in range(len(df) - window + 1):
        df_temp = df.iloc[start:start + window]
        currentdate = df_temp.index[-1]

        variance, vector = get_pca_variance_explained(df=df_temp, index_list=index_list, component=True)
        # The variance table is labelled 1..n_components, hence label lookups.
        cumulative = variance['cumulative variance explained']
        rtn.loc[currentdate,:] = [cumulative.loc[1], cumulative.loc[2]] + list(vector)

    return rtn.astype('float')

def get_sscore(df,x,y):
    """Return the z-scored residuals of an OLS fit of ``df[y]`` on ``df[x]``.

    The regression includes an intercept; the residuals are standardised with
    ``scipy.stats.zscore`` using its default arguments.
    """
    design = sm.add_constant(df[x])
    residuals = sm.OLS(df[y], design).fit().resid
    return stats.zscore(residuals)

def get_rolling_sscore(df, index_list, x='factor1'):
    """Replace each index column by its per-quarter s-score.

    Parameters
    ----------
    df : DataFrame with a ``quarter`` column, the factor column ``x`` and one
        column per entry of ``index_list``.
    index_list : columns to convert; each is regressed on ``x`` within a quarter.
    x : name of the explanatory (factor) column.

    Returns
    -------
    DataFrame with the rows of ``df`` grouped quarter by quarter (in order of
    first appearance) where every ``index_list`` column holds the value
    returned by ``get_sscore`` for that quarter. An empty DataFrame is returned
    when ``df`` contains no quarters. ``df`` itself is not modified.
    """
    per_quarter = []

    for quarter in df.quarter.unique():
        # One private copy per quarter is enough to keep `df` untouched.
        block = df.loc[df.quarter == quarter].copy()
        for index in index_list:
            block.loc[:, index] = get_sscore(df=block, x=x, y=index)
        per_quarter.append(block)

    if not per_quarter:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(per_quarter)

def index_tradingrtn(index, sscore, rtn, buyopen=1.25, sellopen=-1.25, buyclose=0.1,sellclose=0.1):
    """Simulate s-score threshold trades for one index and list the trade returns.

    Parameters
    ----------
    index : column name of the traded index in both ``sscore`` and ``rtn``.
    sscore : DataFrame with a ``date`` column and an ``index`` column of scores,
        processed in row order.
    rtn : DataFrame with ``date``, ``index`` and ``'factor1'`` return columns.
    buyopen : a long position is opened when the score is >= this value.
    sellopen : a short position is opened when the score is <= this value.
    buyclose : an open long is closed when ``abs(score)`` falls below this value.
    sellclose : an open short is closed when ``abs(score)`` falls below this value.

    Returns
    -------
    list with one entry per *closed* trade: the compounded ``index - factor1``
    return over the rows of ``rtn`` dated from entry to exit (both inclusive),
    negated for short trades. A position still open at the end is ignored.
    At most one position is held at a time.
    """
    def _spread_return(start, end):
        # Compounded (index - factor1) return between two dates, inclusive.
        held = (rtn.date >= start) & (rtn.date <= end)
        growth = (1 + rtn.loc[held, index] - rtn.loc[held, 'factor1']).cumprod()
        return growth.values[-1] - 1

    position = 0        # +1 long, -1 short, 0 flat
    entry_date = None
    index_rtn = []

    for date, score in zip(sscore.date, sscore[index]):
        if position == 1:
            if abs(score) < buyclose:
                ##### add trade rtn into ETF return list - sell position
                index_rtn.append(_spread_return(entry_date, date))
                position = 0
        elif position == -1:
            # `abs`, not the undefined `abd` this branch used to call.
            if abs(score) < sellclose:
                ##### add trade rtn into ETF return list - short position
                index_rtn.append(-1 * _spread_return(entry_date, date))
                position = 0
        elif score >= buyopen:
            ##### long position entry - save position and corresponding date
            position, entry_date = 1, date
        elif score <= sellopen:
            ##### short position entry - save position and corresponding date
            # Previously the signal was appended and popped straight away and
            # the entry date was never stored, so shorts could never open.
            position, entry_date = -1, date

    return index_rtn

def annualized_sharpe(rtn,days=252):
    """Scale the mean/std ratio of ``rtn`` by ``days / len(rtn)``, rounded to 2 dp.

    ``np.std`` is used with its default (population) normalisation.
    NOTE(review): the ratio is scaled linearly by ``days / len(rtn)`` rather
    than by a square-root-of-time factor - confirm this is the intended
    annualisation for the returns passed in.
    """
    mean_over_vol = np.mean(rtn) / np.std(rtn)
    scaled = days * mean_over_vol / len(rtn)
    return round(scaled, 2)

In [None]:
# std_scaler = StandardScaler()
# data = std_scaler.fit_transform(data)