In [None]:
import numpy as np
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta


In [None]:
def addmonth(start_date, n_of_month, dateformat):
    '''
    Shift a date string by a whole number of calendar months.

    start_date : str, parsed with `dateformat`
    n_of_month : number of months handed to relativedelta(months=...)
    dateformat : strptime/strftime pattern used for both parsing and output

    Returns the shifted date as a string in the same `dateformat`.
    NOTE(review): day-of-month overflow (e.g. Jan 31 + 1 month) is resolved by
    dateutil's relativedelta - check its documentation if that matters here.
    '''
    parsed = datetime.datetime.strptime(start_date, dateformat)
    shifted = parsed + relativedelta(months=n_of_month)
    return shifted.strftime(dateformat)

In [None]:
def selectMonth(start_date, n_of_month, dataframe):
    '''
    Return the rows of `dataframe` whose index falls in [start_date, end).

    `end` is the first day of the month reached by adding `n_of_month` months
    to `start_date`. Bounds are compared against the index as '%Y-%m-%d'
    strings, so the index is expected to hold labels such as "2021-01-01".

    start_date : str in '%Y-%m-%d' form, inclusive lower bound
    n_of_month : number of months to move forward for the upper bound
    dataframe  : object supporting `.index` comparison and boolean `.loc`
    '''
    fmt = '%Y-%m-%d'
    shifted = datetime.datetime.strptime(addmonth(start_date, n_of_month, fmt), fmt)
    # Snap the upper bound back to day 1 of the month it landed in
    upper = shifted.replace(day=1).strftime(fmt)

    on_or_after_start = dataframe.index >= start_date
    before_upper = dataframe.index < upper
    return dataframe.loc[on_or_after_start & before_upper]

In [None]:
def returnList(start_date,n_of_month,list_of_dateframe):
    '''
    Apply selectMonth(start_date, n_of_month, ...) to every dataframe in
    `list_of_dateframe` and stack the selections into one object with pd.concat.

    Eg: put together the selected months of daily, weekly, biweekly and
    monthly return data.
    '''
    selected = [
        selectMonth(start_date, n_of_month, frame)
        for frame in list_of_dateframe
    ]
    return pd.concat(selected)

In [None]:
def pctlList(dataframe):
    '''
    Tabulate the 50th, 25th and 75th percentile of every column.

    Returns a DataFrame with the same columns as `dataframe` and three rows
    labelled '50_percentile', '25_percentile', '75_percentile' (in that order),
    each filled with np.percentile of the corresponding column.
    '''
    table = pd.DataFrame(columns=dataframe.columns)
    for level in (50, 25, 75):
        per_column = [np.percentile(dataframe[name], level) for name in dataframe.columns]
        table.loc['%d_percentile' % level] = per_column
    return table

In [None]:
def basefunc(dataframe):
    '''
    Evaluate the basis functions on every null-free column of `dataframe`.

    Requires pctlList. Columns containing any null are dropped first; the
    25/50/75 percentiles (p25, p50, p75) are then taken per remaining column.

    For each value x the cell of the returned DataFrame holds the tuple
    (1, phi1, phi2, phi3, phi4) with
        phi1 = x
        phi2 = |x - p50|
        phi3 = 1/2 * (|x - p75| - |x - p25| - p75 - p25) + x
        phi4 = 1/2 * (|x - p75| + |x - p25| - p75 + p25)

    The result keeps the index and the surviving columns of the input.
    '''
    # Discard every column that has at least one null entry
    has_null = dataframe.isnull().any()
    clean = dataframe.drop(columns=dataframe.columns[has_null])

    quartiles = pctlList(clean)
    median = quartiles.loc['50_percentile']
    lower = quartiles.loc['25_percentile']
    upper = quartiles.loc['75_percentile']

    # Distances to the quartiles are shared by phi3 and phi4
    dist_upper = abs(clean - upper)
    dist_lower = abs(clean - lower)

    phi2 = abs(clean - median)
    phi3 = 0.5 * (dist_upper - dist_lower - upper - lower) + clean
    phi4 = 0.5 * (dist_upper + dist_lower - upper + lower)
    ones = [1] * len(clean)

    # Pack the five values row by row into one tuple per cell
    packed = pd.DataFrame(columns=clean.columns, index=clean.index)
    for name in clean.columns:
        packed[name] = list(zip(ones, clean[name], phi2[name], phi3[name], phi4[name]))

    return packed

In [None]:
def basefunc_long_term_pctl(dataframe,ETF_dataframe):
    '''
    Same tuples as basefunc - (1, phi1, phi2, phi3, phi4) per cell - but the
    25/50/75 percentiles come from `ETF_dataframe` (the long-term sample)
    instead of from `dataframe` itself.

    Columns containing any null are dropped from both inputs independently
    before anything is computed. With x a value of `dataframe` and
    p25/p50/p75 the percentiles of `ETF_dataframe`:
        phi1 = x
        phi2 = |x - p50|
        phi3 = 1/2 * (|x - p75| - |x - p25| - p75 - p25) + x
        phi4 = 1/2 * (|x - p75| + |x - p25| - p75 + p25)

    NOTE(review): the percentile rows are subtracted from `dataframe` with
    ordinary DataFrame/Series arithmetic, which matches on column labels -
    presumably both inputs are expected to share column names (e.g. 'SPY');
    confirm against the caller.
    '''
    # Drop null-bearing columns from the data being transformed ...
    data_null = dataframe.columns[dataframe.isnull().any()]
    values = dataframe.drop(columns=data_null)

    # ... and, separately, from the frame that supplies the percentiles
    etf_null = ETF_dataframe.columns[ETF_dataframe.isnull().any()]
    reference = ETF_dataframe.drop(columns=etf_null)
    pct = pctlList(reference)
    p50, p25, p75 = (pct.loc[label] for label in ('50_percentile', '25_percentile', '75_percentile'))

    gap75 = abs(values - p75)
    gap25 = abs(values - p25)

    phi2 = abs(values - p50)
    phi3 = 0.5 * (gap75 - gap25 - p75 - p25) + values
    phi4 = 0.5 * (gap75 + gap25 - p75 + p25)
    unit = [1] * len(values)

    # One tuple per cell, assembled column by column
    out = pd.DataFrame(columns=values.columns, index=values.index)
    for label in values.columns:
        out[label] = list(zip(unit, values[label], phi2[label], phi3[label], phi4[label]))

    return out

In [None]:
def dltMonthSplit(n, dataframe):
    '''
    Split `dataframe` into the rows of its n-th calendar month and the rest.

    n         : 1-based month number, counted from the calendar month of the
                first index label. float, str and bool values are rejected.
    dataframe : DataFrame or Series whose index holds '%Y-%m-%d' strings in
                ascending order (first label = earliest, last label = latest).

    Returns [removed_month, leftover]: the rows whose index lies inside the
    n-th month and all remaining rows.
    On invalid `n` an error message string is returned instead of a list.
    An empty `dataframe` raises IndexError when the first label is read.
    '''
    dateformat='%Y-%m-%d'

    # Check the type before any comparison: `'3' > 12` would raise TypeError
    # and the type message below could never be produced for strings.
    if isinstance(n,(float,str,bool)):
        return 'data type wrong for n'

    first_date=datetime.datetime.strptime(dataframe.index[0],dateformat)
    last_date=datetime.datetime.strptime(dataframe.index[-1],dateformat)

    # Count every calendar month touched by the data, both ends included.
    # ('2011-02-01', '2012-01-31') -> 12. The last month also counts when the
    # final row is not the calendar month-end (e.g. the last trading day).
    n_of_month=(last_date.year-first_date.year)*12+(last_date.month-first_date.month)+1

    if n>n_of_month or n<1:
        return 'nth month selected is out of range of the total number of months in dataframe'

    # Month boundaries as strings so they compare directly against the index
    first=first_date.replace(day=1).strftime(dateformat)
    start=addmonth(first,int(n)-1,dateformat)
    end=addmonth(start,1,dateformat)

    in_month=(dataframe.index>=start) & (dataframe.index<end)
    dltMonth_df=dataframe.loc[in_month]
    lftMonth_df=dataframe.loc[~in_month]
    return [dltMonth_df,lftMonth_df]

In [None]:
def CrossValidation(number_of_month, factor='SPY', stock='AAPL'):
    '''
    Leave-one-month-out cross validation of a linear regression.

    For every month i in 1..number_of_month the model is fitted on all rows
    except month i and the residual (prediction - actual) is computed on the
    held-out month.

    Uses names defined elsewhere in the notebook, which must exist before the
    call: LinearRegression, ETF_basefunc, stock_data and dltMonthSplit.

    number_of_month : number of months to hold out in turn
    factor          : column of ETF_basefunc used as regressors (default 'SPY')
    stock           : column of stock_data used as target (default 'AAPL')

    Returns a dict {i: residuals of month i} with keys 1..number_of_month.
    Raises ValueError if dltMonthSplit reports an invalid month.
    '''
    residuals={}
    # Keys stay 1-based to match the month numbering of dltMonthSplit
    for i in range(1,number_of_month+1):
        X_split=dltMonthSplit(i,ETF_basefunc[factor])
        y_split=dltMonthSplit(i,stock_data[stock])
        for split in (X_split,y_split):
            # dltMonthSplit signals bad input with a message string
            if isinstance(split,str):
                raise ValueError(split)
        X_dlt_data,X_lft_data=X_split
        y_dlt_data,y_lft_data=y_split

        # Fresh estimator per fold: fit() returns the estimator itself, so
        # reusing one instance would make every stored fold the same object.
        model=LinearRegression().fit(list(X_lft_data),y_lft_data)
        residuals[i]=model.predict(list(X_dlt_data))-y_dlt_data
    return residuals

In [None]:
def score(factor='SPY',stock='AAPL'): # need to change later
    '''
    Score of an ETF factor toward a stock:

        sqrt(max(sum(stock**2) / sum(residual**2) - 1, 0))

    where the residuals are the cross-validation residuals of all 12 months
    and `stock` selects the column of the notebook-level `stock_data`.

    NOTE(review): `factor` is not used yet and `stock` is not forwarded -
    the residuals always come from CrossValidation(12) with whatever series
    that function uses, so the arguments should match it for a meaningful
    result.
    '''
    residuals=CrossValidation(12) # number_of_month has to be 12 now
    sumsquare_res=0
    for month_residual in residuals.values():
        sumsquare_res+=sum(month_residual**2)

    sumsquare_stock=sum(stock_data[stock]**2)
    # np.maximum clamps at zero. The former np.max(x,0) passed 0 as the
    # `axis` argument, so no lower bound was applied before the sqrt.
    return np.sqrt(np.maximum(sumsquare_stock/sumsquare_res-1,0))

In [None]:
def pctlListNew(dataframe,pctl):
    '''
    Tabulate arbitrary percentiles of every column.

    dataframe : DataFrame whose columns are summarised
    pctl      : iterable of percentile levels passed to np.percentile,
                e.g. [1, 99] for the 1% and 99% percentiles

    Returns a DataFrame with the columns of `dataframe` and one row per level,
    labelled '<level>_percentile' in the order given.
    '''
    table = pd.DataFrame(columns=dataframe.columns)
    for level in pctl:
        per_column = [np.percentile(dataframe[name], level) for name in dataframe.columns]
        table.loc[str(level) + '_percentile'] = per_column
    return table