# Package

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import textwrap
import swifter
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

%matplotlib inline

In [None]:
# Asia mapping reference: read the "Mapping" sheet of the Asia workbook,
# drop columns that are completely empty, then index the table by the
# name of the sheet's first column (taken before the empty columns are dropped).
Asia_mapping = pd.read_excel(
    r'C:\Users\Eric.Li\OneDrive\Post result data\{0} 2007-2017.xlsx'.format('Asia'),
    sheet_name='Mapping',
)
Asia_mapping = (
    Asia_mapping
    .dropna(how="all", axis=1)
    .set_index(Asia_mapping.columns[0])
)

# Data

In [None]:
def price_date_transform(CSV_date,index=False):
    '''
    Convert a "/"-separated date string read from the price CSV files.

    CSV_date: string holding three integer fields separated by "/"
    index: when False (default) the string is read as US style MM/DD/YYYY and
        the date is returned as a "%d/%b/%Y" string (e.g. "05/Jan/2017");
        otherwise the string is read as DD/MM/YYYY and a pd.Timestamp is
        returned.

    The fields are split on "/" instead of being sliced at fixed positions,
    so zero-padded and unpadded values ("05/01/2017", "5/1/2017") are accepted
    in both modes (the DD/MM/YYYY mode used to fail on unpadded input).

    Raises ValueError when the string does not contain exactly three integer
    fields or when they do not form a valid calendar date.
    '''
    first,second,year=(int(field) for field in CSV_date.split("/"))
    # Equality (not truthiness) is kept on purpose so that any value other
    # than False/0 still selects the Timestamp branch, as before.
    if index==False:  # noqa: E712
        return pd.Timestamp(year,first,second).strftime("%d/%b/%Y")
    return pd.Timestamp(year,second,first)

In [None]:
def fundamental_date_transform(CSV_date):
    '''
    Turn a date string from the fundamental data into a pd.Timestamp.

    Two layouts are recognised:
    * the string contains "-": fixed-width yyyy-mm-dd
      (characters 0-3, 5-6 and 8 onward)
    * otherwise: "/"-separated, read as day/month/year

    NOTE(review): the earlier docstring described the slash layout as
    MM/DD/YYYY, but the code takes the first field as the day and the second
    as the month - confirm which layout the source files really use.
    '''
    if '-' in CSV_date:
        year_txt=CSV_date[:4]
        month_txt=CSV_date[5:7]
        day_txt=CSV_date[8:]
    else:
        first_slash=CSV_date.find("/")
        month_begin=CSV_date.find("/",1)+1
        last_slash=CSV_date.find("/",3)
        year_txt=CSV_date[last_slash+1:]
        month_txt=CSV_date[month_begin:last_slash]
        day_txt=CSV_date[:first_slash]
    return pd.Timestamp(int(year_txt),int(month_txt),int(day_txt))

In [50]:
def CSV_price(region,start,end,VWAP):
    '''
    Load price (or VWAP) history and index price history from the CSV store.

    region: US, Europe, Asia or Canada (substituted into the file paths)
    start, end: years. With VWAP the yearly VWAP files start..end are read;
        otherwise the combined 2007-2017 price file is read followed by one
        file per year from 2018 to end (start is not used on that path)
    VWAP: boolean, True selects the VWAP files instead of the price files

    Returns (price, abs_return, price_index, abs_return_index). The columns
    of both price frames are relabelled with price_date_transform; each
    return frame is the change between adjacent columns divided by the
    earlier column.
    '''
    vwap_template=r"C:\Users\Eric.Li\OneDrive\Post result data\{0} CSV\{0}_VWAP_{1}.csv"
    combined_template=r"C:\Users\Eric.Li\OneDrive\Post result data\{0} CSV\{0}_price_2007_2017.csv"
    yearly_template=r"C:\Users\Eric.Li\OneDrive\Post result data\{0} CSV\{0}_price_{1}.csv"
    index_template=r"C:\Users\Eric.Li\OneDrive\Post result data\{0} CSV\{0}_price_index.csv"

    def load_frame(path):
        '''
        Read one CSV, drop fully-empty rows/columns and duplicate rows, keep
        only rows whose ticker is a non-empty string and blank out the
        '#N/A ...' missing-value markers.
        '''
        raw=pd.read_csv(path).dropna(how='all',axis=0).dropna(how='all',axis=1).drop_duplicates()
        by_ticker=raw.set_index("Ticker")
        string_rows=by_ticker.loc[[label for label in by_ticker.index if type(label) is str]]
        blanked=string_rows.replace('#N/A N/A','')
        blanked=blanked.replace(' #N/A N/A ','')
        blanked=blanked.replace('#N/A Invalid Security','')
        return blanked.loc[[label for label in blanked.index if len(label)>0]]

    # work out which files to read, in the order they are concatenated
    if VWAP==True:
        paths=[vwap_template.format(region,year) for year in range(start,end+1)]
    else:
        paths=[combined_template.format(region)]
        paths.extend(yearly_template.format(region,year) for year in range(2018,end+1))

    # price dataframe
    frames=[load_frame(path) for path in paths]
    price=pd.concat(frames,axis=1,sort=True).apply(pd.to_numeric,axis=1)
    price.columns=[price_date_transform(label) for label in price.columns]

    # index price dataframe
    index_raw=pd.read_csv(index_template.format(region)).dropna(how='all',axis=0)
    price_index=index_raw.set_index("Ticker").T.replace('#N/A N/A','')
    price_index.columns=[price_date_transform(label) for label in price_index.columns]
    price_index=price_index.dropna(how="all",axis=0)

    # return data
    abs_return=price.diff(1,axis=1)/price.shift(1,axis=1)
    abs_return_index=price_index.diff(1,axis=1)/price_index.shift(1,axis=1)
    return price,abs_return,price_index,abs_return_index

In [61]:
def CSV_price_orig(region,start,end,VWAP):
    '''
    Year-by-year loader for price (or VWAP) data plus the index prices.

    region: US, Europe, Asia or Canada (substituted into the file paths)
    start, end: first and last year; one CSV is read for every year in between
    VWAP: False reads the yearly price files, any other value the yearly
        VWAP files

    Returns (price, abs_return, price_index, abs_return_index). The columns
    of both price frames are relabelled with price_date_transform; each
    return frame is the change between adjacent columns divided by the
    earlier column.
    '''
    if VWAP==False:
        yearly_template=r"C:\Users\Eric.Li\OneDrive\Post result data\{0} CSV\{0}_price_{1}.csv"
    else:
        yearly_template=r"C:\Users\Eric.Li\OneDrive\Post result data\{0} CSV\{0}_VWAP_{1}.csv"

    yearly_frames=[]
    for year in range(start,end+1):
        raw=pd.read_csv(yearly_template.format(region,year))
        raw=raw.dropna(how='all',axis=0).dropna(how='all',axis=1)
        by_ticker=raw.set_index("Ticker")
        # keep rows with a string ticker, blank the '#N/A ...' markers,
        # then discard rows whose ticker is the empty string
        string_rows=by_ticker.loc[[label for label in by_ticker.index if type(label) is str]]
        blanked=string_rows.replace('#N/A N/A','')
        blanked=blanked.replace(' #N/A N/A ','')
        blanked=blanked.replace('#N/A Invalid Security','')
        yearly_frames.append(blanked.loc[[label for label in blanked.index if len(label)>0]])

    price=pd.concat(yearly_frames,axis=1,sort=True).apply(pd.to_numeric,axis=1)

    index_raw=pd.read_csv(r"C:\Users\Eric.Li\OneDrive\Post result data\{0} CSV\{0}_price_index.csv".format(region))
    index_raw=index_raw.dropna(how='all',axis=0)
    price_index=index_raw.set_index("Ticker").T.replace('#N/A N/A','')

    # relabel the date columns of both frames
    price.columns=[price_date_transform(label) for label in price.columns]
    price_index.columns=[price_date_transform(label) for label in price_index.columns]

    abs_return=price.diff(1,axis=1)/price.shift(1,axis=1)
    abs_return_index=price_index.diff(1,axis=1)/price_index.shift(1,axis=1)
    return price,abs_return,price_index,abs_return_index

In [None]:
def CSV_EPS(region,start,end):
    '''
    Load the EPS history from the yearly CSV files.

    region: US, Europe, Asia or Canada (substituted into the file paths)
    start, end: first and last year; one "{region}_EPS_{year}" file is read
        for every year in between

    Returns the EPS dataframe: yearly frames joined side by side, values
    converted to numbers and columns relabelled with price_date_transform.
    '''
    template=r"C:\Users\Eric.Li\OneDrive\Post result data\{0} CSV\{0}_EPS_{1}.csv"

    yearly_frames=[]
    for year in range(start,end+1):
        by_ticker=pd.read_csv(template.format(region,year)).set_index("Ticker")
        # keep rows with a string ticker and blank the '#N/A N/A' markers
        string_rows=by_ticker.loc[[label for label in by_ticker.index if type(label) is str]]
        blanked=string_rows.replace('#N/A N/A','').replace(" #N/A N/A ","")
        # drop fully-empty rows/columns, then rows whose ticker is empty
        trimmed=blanked.dropna(how='all',axis=0).dropna(how='all',axis=1)
        yearly_frames.append(trimmed.loc[[label for label in trimmed.index if len(label)>0]])

    EPS=pd.concat(yearly_frames,axis=1,sort=True).apply(pd.to_numeric,axis=1)
    EPS.columns=[price_date_transform(label) for label in EPS.columns]
    return EPS

In [9]:
def CSV_fundamentals(region,price,EPS_df,revision_period,min_history,min_vol,use_cache):
    '''
    Build (or reload from cache) the post result fundamental dataframe.

    region: US, Europe, Asia or Canada; substituted into the file names
    price: price dataframe, sliced as price.loc[ticker, :date] with the
        "%d/%b/%Y" date strings produced here, so tickers are expected on the
        index and date strings on the columns
    EPS_df: EPS dataframe handed to revision_calc
    revision_period: number of days used for the "Revision_real" column; also
        part of the cache file name
    min_history: a row is kept only if at least this many prices are present
        among the last 2*min_history columns up to its report date
    min_vol: minimum "30d_vol" a row needs to be kept
    use_cache: boolean, if exactly True the last cached fundamental_df is
        read and returned instead of rebuilding it from the raw file

    Returns the dataframe indexed by (ticker, date, next date, quarter end).
    When rebuilt, the result is also written to the cache file.

    Raises FileNotFoundError when use_cache is True and the cache file is
    missing ("No such file!" is still printed first). Previously every error
    was swallowed there and the function then failed with UnboundLocalError.
    '''

    if use_cache is True:
        cache_path=r'C:\Users\Eric.Li\OneDrive\Post result data cache\{0}{1}_fundamental_df.csv'.format(region,\
                                                                                                        str(revision_period))
        try:
            data=pd.read_csv(cache_path)
        except FileNotFoundError:
            print("No such file!")
            raise
        # the first four columns of the cache hold the index levels
        new_index=pd.MultiIndex.from_tuples(list(zip(data.iloc[:,0],data.iloc[:,1],data.iloc[:,2],data.iloc[:,3])))
        data.index=new_index
        target_data=data.iloc[:,4:]
    else:
        # import the raw fundamental_df and clean up all the nonsense
        csv=pd.read_csv(r'C:\Users\Eric.Li\OneDrive\Post result data cache\{0}_raw_fundamental_df.csv'.format(region))
        data=csv.set_index("Ticker").drop_duplicates().replace('#N/A Invalid Security','').\
        replace('#N/A Requesting Data...','')
        data=data[data.index!='']
        data=data.dropna(how="all")

        # Manipulate the data to get the next earning date, quarter end date, finally generate multi-index for the dataframe
        data["date_copy"]=[fundamental_date_transform(i) for i in data["Date"].copy()]
        data["ticker_copy"]=data.index
        data=data.copy().sort_values(by=["ticker_copy","date_copy"])
        # after sorting, the following row holds the next report of the same
        # ticker (or the first report of the next ticker)
        data["next_date"]=data["date_copy"].shift(-1)
        data["ticker_copy"]=data["ticker_copy"].shift(-1)
        data["Date"]=data["date_copy"].copy().swifter.apply(lambda x: x.strftime("%d/%b/%Y") if x!='' else np.nan)
        data["Orig date"]=data["Orig date"].copy().swifter.apply(lambda x: pd.Timestamp(x).strftime("%d/%b/%Y")\
                                                          if x!='' else np.nan)
        # "Next" is only filled when the following row belongs to the same ticker
        data["Next"]=data.swifter.apply(lambda x: x["next_date"].strftime("%d/%b/%Y") \
                                        if type(x["next_date"])==pd.Timestamp and \
                                x.name==x["ticker_copy"] else np.nan,axis=1)

        # business quarter end on or after the report date
        data["end_period"]=data.swifter.apply(lambda x: pd.offsets.BQuarterEnd().rollforward(x["date_copy"])\
                                              .strftime("%d/%b/%Y"),\
                                      axis=1)

        data.index=pd.MultiIndex.from_tuples(list(zip(data.index,data["Date"],data["Next"],data["end_period"])))

        # the helper columns now live in the index (or are no longer needed)
        del data["ticker_copy"]
        del data["date_copy"]
        del data["next_date"]
        del data["end_period"]
        del data["Next"]

        for s in ["Market cap","Volume"]:
            try:
                data[s]=pd.to_numeric(data[s])
            except KeyError:
                pass

        # Add more forward look and realistic versions of earning revision
        data["Revision_real"]=data.swifter.apply(lambda x: revision_calc(x.name[0],x.name[1],EPS_df,(0,revision_period)),axis=1)
        data["Revision_20"]=data.swifter.apply(lambda x: revision_calc(x.name[0],x.name[1],EPS_df,(0,20)),axis=1)

        # take out data with zero or none revision/market cap
        data=data[(data["Market cap"]>=500)] #universe above 500mn
        # the comparison pair is False only for missing values, so this keeps
        # the rows that have a Revision_20 number
        data=data[(data["Revision_20"]>=0)|(data["Revision_20"]<0)]

        # take out cases where there is a short history
        count_history=data.swifter.apply(lambda x: price.loc[x.name[0],:x.name[1]][-2*min_history:].count() if x.name[1] in \
                                   price.columns and x.name[0] in price.index else None,axis=1)

        data=data.copy()[count_history>=min_history]

        # Add historic volatility: standard deviation of the 30 returns
        # preceding the report date column
        abs_return=price.diff(1,axis=1)/price.shift(1,axis=1)
        data["30d_vol"]=data.swifter.apply(lambda x: abs_return.loc[x.name[0],:x.name[1]][-31:-1].std() \
                                           if x.name[0] in abs_return.index\
                                   and abs_return.loc[x.name[0],:x.name[1]][-31:-1].dropna().shape[0]!=0 else None,axis=1)
        data=data[data["30d_vol"]>=min_vol]

        # Final cleaning and export the data
        target_data=data.drop_duplicates()
        target_data.to_csv(r'C:\Users\Eric.Li\OneDrive\Post result data cache\{0}{1}_fundamental_df.csv'.format(region,\
                                                                                                      str(revision_period)))
    return target_data

In [None]:
def data_enhancement(fundamental_df,price_df,index_df,EPS_df,EAR_tuple,post_return_period_list,
                     index_name,MA_period_list,return_lookback_list,range_list,use_cache,region):
    ''' 
    Enhance the existing fundamental data with more independent variables to run machine learning

    When use_cache is exactly True the previously exported "{region}_reg_data.csv"
    is read back and returned. Otherwise the columns listed below are added to
    fundamental_df IN PLACE (the caller's frame is modified) and the frame is
    written to "{region}_reg_data.csv" repeatedly as intermediate checkpoints.

    NOTE(review): return_calc_log, calc_beta, MA_deviation, MA_n_deviation and
    chart_position are defined outside this part of the file; the descriptions
    below that depend on them are taken from the original notes, not verified.
    
    *Parameters:
    fundamental_df: refined fundamental_df after CSV_fundamental function; rows are
                    indexed by tuples whose elements 0, 1 and 2 are used here as
                    ticker, date string and next earning date
    price_df: pandas dataframe, price data to calculate EAR, mom
    index_df: dataframe with market index data and exogenous variables
    EPS_df: pandas dataframe, consensus EPS data to calculate revision
    EAR_tuple: (start_reference,number of days), e.g. (0,1) means day1 return
    post_return_period_list: list of periods used to calculate post return
    index_name: dict; the keys read by the code are "index","yield_long","yield_short",
                "inflation","currency","vix","momentum_long","momentum_short"
    MA_period_list: list for periods used to calculate moving average deviation
    return_lookback_list: list for periods used to calculate past return abs
    range_list: list for periods used to calculate the chart position
    use_cache: boolean, if true then look for the exported csv
    region: hard coded region for export use
    
    *Dependent variable:
    post_return_net_{period}: relative return after EAR period, one column per period
    result_net_{period}: 1.0 if that relative return is positive
    
    *Independent variables:
    adj_quarter: Q1 to Q4, string
    Q1-Q4: dummy variable 
    Year: four digit year taken from the date string
    one dummy column per year
    Month: Jan to Dec string
    Jan to Dec: dummy variable
    Day: Early, Mid or Late, string
    Early Mid Late: day dummy
    Supersector dummy: dummy transformation from Supersector
    
    EAR_net: relative return over EAR_tuple (originally described as the 3 day return
             centered on the earning day), float
    EAR_net_dummy: 1 if EAR_net above zero
    EAR_net_last: EAR_net in the previous quarter
    EAR_net_last_dummy: 1 if EAR_net_last above zero
    
    post_revision: Revision_real with values beyond +/-90% overwritten to +/-90%
    post_revision_dummy: 1 if post_revision is positive
    pre_revision: revision_calc over (-41,40), capped at +/-90% the same way
    
    Market cap: overwritten with the natural log of the original market cap value
    stock_mom/stock_mom_short: absolute stock return for roughly the last 52 weeks and
                               last 4 weeks, up to the start of EAR period
    
    beta: market beta for 260 business days up to before the EAR period
    
    macro variables: value in yield,inflation,currency,VIX on the earning day
    commodity variables: value in gold,oil,bloomberg commodity index on the earning day
    
    momentum index: long and short,deviation from MA and past return on the end of EAR period
    market index: deviation from MA and past return on the end of EAR period
    stock technicals: deviation n from MA, and chart position on the end of EAR period
    
    '''

    if use_cache is True:
        # NOTE(review): the bare except hides every error, not only a missing
        # file; on failure the message is printed and the fundamental_df that
        # was passed in is returned unchanged.
        try:
            data=pd.read_csv(r'C:\Users\Eric.Li\OneDrive\Post result data cache\{0}_reg_data.csv'.format(region))
            # the first four columns of the export hold the index levels
            new_index=pd.MultiIndex.from_tuples(list(zip(data.iloc[:,0],data.iloc[:,1],data.iloc[:,2],data.iloc[:,3])))
            data.index=new_index
            fundamental_df=data.iloc[:,4:]
        except:
            print("No such file!")
            
    else:
        
        #Target rel index used for return calculation
        index_series=index_df.loc[index_name["index"]]
        
        #Add next earning date as a separate column
        fundamental_df["Next date"]=fundamental_df.apply(lambda x:x.name[2],axis=1)
        
        #Dependent variable
        # For every period: the post-EAR relative return, its sign as 0/1, and
        # the "_last" value, i.e. the value of the same ticker's row whose
        # "Next date" equals this row's date (the previous report).
        for i in post_return_period_list:
            fundamental_df["post_return_net_{0}".format(i)]=fundamental_df.swifter.\
            apply(lambda x:return_calc_log(x.name[0],x.name[1],price_df,index_series,(EAR_tuple[0]+EAR_tuple[1],i),'rel')\
                  ,axis=1)    
            
            fundamental_df["result_net_{0}".format(i)]=(fundamental_df["post_return_net_{0}".format(i)]>0)*1.0

            fundamental_df["post_return_net_{0}_last".format(i)]=fundamental_df.swifter.\
            apply(lambda x: fundamental_df.loc[x.name[0]].xs(x.name[1],level=1)["post_return_net_{0}".format(i)].iloc[0]\
                            if x.name[1] in list(fundamental_df.loc[x.name[0]]["Next date"]) else None,axis=1)    

            fundamental_df["result_net_{0}_last".format(i)]=(fundamental_df["post_return_net_{0}_last".format(i)]>0)*1.0

        # checkpoint export (repeated after each group of columns below)
        fundamental_df.to_csv(r'C:\Users\Eric.Li\OneDrive\Post result data cache\{0}_reg_data.csv'.format(region))
        
        def result_category(post_return):
            '''
            sub category for post result returns
            '''
            if post_return<-0.1:
                return "1"
            elif post_return<=-0.05:
                return "2"
            elif post_return<=-0.01:
                return "3"
            elif post_return<=0:
                return "4"
            elif post_return<=0.01:
                return "5"
            elif post_return<=0.05:
                return "6"
            elif post_return<=0.1:
                return "7"
            elif post_return>0.1:
                return "8"
            else:
                return None
        
        # split into 8 return category so that making 1% isn't the same as making 10%
        # NOTE(review): this reads a "post_return_net" column, but the loop
        # above only creates "post_return_net_{period}" columns. Unless the
        # incoming fundamental_df already carries "post_return_net" this line
        # presumably raises KeyError - verify which period was intended.
        fundamental_df["result_category_net"]=fundamental_df.\
        apply(lambda x:result_category(x["post_return_net"]),axis=1)
        

        def quarter(month):
            '''
            month: three letter string
            '''
            if month in ["Jan","Feb","Mar"]:
                return "Q1"
            elif month in ["Apr","May",'Jun']:
                return "Q2"
            elif month in ["Jul","Aug","Sep"]:
                return "Q3"
            else:
                return "Q4"

        # The date level (x.name[1]) is sliced by position: characters 0-1 are
        # the day, 3-5 the month abbreviation and the last four the year.
        fundamental_df["adj_quarter"]=fundamental_df.swifter.apply(lambda x:quarter(x.name[1][3:6]),axis=1)

        fundamental_df["Year"]=fundamental_df.swifter.apply(lambda x: x.name[1][-4:],axis=1)

        fundamental_df["Month"]=fundamental_df.swifter.apply(lambda x:x.name[1][3:6],axis=1)


        def day_in_month(day):
            ''' 
            day: int
            '''
            if day<=10:
                return "Early"
            elif day<=20:
                return "Mid"
            else:
                return "Late"

        fundamental_df["Day"]=fundamental_df.swifter.apply(lambda x:day_in_month(int(x.name[1][:2])),axis=1)                         

        # Dummy variable 
        # one 0/1 column per distinct value; the column is named after the value
        for i in fundamental_df["Supersector"].dropna().unique():
            fundamental_df[i]=(fundamental_df["Supersector"]==i)*1.0

        for i in fundamental_df["adj_quarter"].dropna().unique():
            fundamental_df[i]=(fundamental_df["adj_quarter"]==i)*1.0    

        for i in fundamental_df["Year"].dropna().unique():
            fundamental_df[i]=(fundamental_df["Year"]==i)*1.0  

        for i in fundamental_df["Month"].dropna().unique():
            fundamental_df[i]=(fundamental_df["Month"]==i)*1.0  

        for i in fundamental_df["Day"].dropna().unique():
            fundamental_df[i]=(fundamental_df["Day"]==i)*1.0          
        
        # EAR variable 
        fundamental_df["EAR_net"]=fundamental_df.swifter.apply(lambda x:return_calc_log(x.name[0],x.name[1],\
                                                                                                         price_df,index_series,\
                                                                                       EAR_tuple,'rel'),axis=1)
        
        fundamental_df["EAR_net_dummy"]=(fundamental_df["EAR_net"]>0)*1.0
        
        
        # EAR_net of the same ticker's previous report (row whose "Next date" is this date)
        fundamental_df["EAR_net_last"]=fundamental_df.swifter.apply(lambda x: fundamental_df.loc[x.name[0]].\
                                                                    xs(x.name[1],level=1)["EAR_net"].iloc[0]\
                        if x.name[1] in list(fundamental_df.loc[x.name[0]]["Next date"]) else None,axis=1)
        
        fundamental_df["EAR_net_last_dummy"]=(fundamental_df["EAR_net_last"]>0)*1.0
        
        fundamental_df.to_csv(r'C:\Users\Eric.Li\OneDrive\Post result data cache\{0}_reg_data.csv'.format(region))

        #adjust revision number
        fundamental_df["post_revision"]=fundamental_df["Revision_real"]
        
        # cap the revision at +/-0.9
        fundamental_df.loc[fundamental_df["post_revision"]>=0.9,"post_revision"]=0.9
        fundamental_df.loc[fundamental_df["post_revision"]<=-0.9,"post_revision"]=-0.9

        fundamental_df["post_revision_dummy"]=(fundamental_df["post_revision"]>0)*1.0

        fundamental_df["post_revision_last"]=fundamental_df.swifter.apply(lambda x: fundamental_df.loc[x.name[0]].\
                                                                          xs(x.name[1],level=1)["post_revision"].iloc[0]\
                        if x.name[1] in list(fundamental_df.loc[x.name[0]]["Next date"]) else None,axis=1)
        
        fundamental_df["post_revision_last_dummy"]=(fundamental_df["post_revision_last"]>0)*1.0        
        
        # revision over the window (-41,40) relative to the report date
        fundamental_df["pre_revision"]=fundamental_df.swifter.apply(lambda x: \
                                                                    revision_calc(x.name[0],x.name[1],EPS_df,\
                                                                                        (-41,40)),axis=1)
        
        fundamental_df.loc[fundamental_df["pre_revision"]>=0.9,"pre_revision"]=0.9
        fundamental_df.loc[fundamental_df["pre_revision"]<=-0.9,"pre_revision"]=-0.9     
        
        fundamental_df["pre_revision_dummy"]=(fundamental_df["pre_revision"]>0)*1.0


        # "Market cap" is overwritten with its natural log from here on
        fundamental_df["Market cap"]=np.log(fundamental_df["Market cap"])
        
        fundamental_df.to_csv(r'C:\Users\Eric.Li\OneDrive\Post result data cache\{0}_reg_data.csv'.format(region))
        # Stock momentum up to the start of EAR period
        fundamental_df["stock_mom"]=fundamental_df.swifter.apply(lambda x:return_calc_log(x.name[0],x.name[1],price_df,\
                                                                                          index_series,\
                                                                                       (-280+EAR_tuple[0],260),'abs'),axis=1)
        
        fundamental_df["stock_mom_short"]=fundamental_df.swifter.apply(lambda x:
                                                                 return_calc_log(x.name[0],x.name[1],price_df,\
                                                                                 index_series,(-20+EAR_tuple[0],20),'abs'),axis=1)
        
        fundamental_df.to_csv(r'C:\Users\Eric.Li\OneDrive\Post result data cache\{0}_reg_data.csv'.format(region))
        #beta
        abs_return_df=price_df.diff(1,axis=1)/price_df.shift(1,axis=1)
        
        index_return_series=index_series/index_series.shift(1)-1
        
        fundamental_df["beta"]=fundamental_df.swifter.apply(lambda x:\
                                                    calc_beta(x.name[0],x.name[1],abs_return_df,\
                                                              index_return_series,260,EAR_tuple[0]),axis=1)
        
        fundamental_df.to_csv(r'C:\Users\Eric.Li\OneDrive\Post result data cache\{0}_reg_data.csv'.format(region))
        
        # macro variables
        # each value is looked up in index_df at (series name, report date);
        # a date missing from index_df's columns presumably raises KeyError
        fundamental_df["yield_long"]=fundamental_df.swifter.\
        apply(lambda x:index_df.loc[index_name["yield_long"],x.name[1]],axis=1)
        
        fundamental_df["yield_short"]=fundamental_df.swifter.\
        apply(lambda x:index_df.loc[index_name["yield_short"],x.name[1]],axis=1)
        
        fundamental_df["yield_slope"]=fundamental_df["yield_long"]-fundamental_df["yield_short"]
        
        fundamental_df["inflation"]=fundamental_df.swifter.\
        apply(lambda x:index_df.loc[index_name["inflation"],x.name[1]],axis=1)

        fundamental_df["currency"]=fundamental_df.swifter.\
        apply(lambda x:index_df.loc[index_name["currency"],x.name[1]],axis=1)

        fundamental_df["vix"]=fundamental_df.swifter.\
        apply(lambda x:index_df.loc[index_name["vix"],x.name[1]],axis=1)
        
        # commodity    
        # the row labels of the commodity series are hard coded
        fundamental_df["gold"]=fundamental_df.swifter.\
        apply(lambda x:index_df.loc["GOLDS Comdty",x.name[1]],axis=1)        
        
        fundamental_df["oil"]=fundamental_df.swifter.\
        apply(lambda x:index_df.loc["Cl1 Comdty",x.name[1]],axis=1) 
        
        fundamental_df["commodity"]=fundamental_df.swifter.\
        apply(lambda x:index_df.loc["BWMING Index",x.name[1]],axis=1)
        
        fundamental_df.to_csv(r'C:\Users\Eric.Li\OneDrive\Post result data cache\{0}_reg_data.csv'.format(region))
        
        #momentum index
        for s in MA_period_list: #calculate the MA on the end of EAR period
            fundamental_df["momentum_long_MA_{0}".format(s)]=fundamental_df.swifter.\
            apply(lambda x:MA_deviation(index_name["momentum_long"],x.name[1],index_df,s,EAR_tuple[0]+EAR_tuple[1]-1),axis=1)

            fundamental_df["momentum_short_MA_{0}".format(s)]=fundamental_df.swifter.\
            apply(lambda x:MA_deviation(index_name["momentum_short"],x.name[1],index_df,s,EAR_tuple[0]+EAR_tuple[1]-1),axis=1)
        
        fundamental_df.to_csv(r'C:\Users\Eric.Li\OneDrive\Post result data cache\{0}_reg_data.csv'.format(region))
        
        for s in return_lookback_list:#calculate the return up to the end of EAR Period
            fundamental_df["momentum_long_return_{0}".format(s)]=fundamental_df.swifter.\
            apply(lambda x:return_calc_log(index_name["momentum_long"],\
                                       x.name[1],index_df,index_df,(-s+EAR_tuple[0]+EAR_tuple[1]-1,\
                                                                    EAR_tuple[0]+EAR_tuple[1]-1),"abs"),axis=1)
            
            fundamental_df["momentum_short_return_{0}".format(s)]=fundamental_df.swifter.\
            apply(lambda x:return_calc_log(index_name["momentum_short"],\
                                       x.name[1],index_df,index_df,(-s+EAR_tuple[0]+EAR_tuple[1]-1,\
                                                                    EAR_tuple[0]+EAR_tuple[1]-1),"abs"),axis=1)
        
        # market
        for s in MA_period_list:#calculate the MA on the end of EAR period
            fundamental_df["market_MA_{0}".format(s)]=fundamental_df.swifter.\
            apply(lambda x:MA_deviation(index_name["index"],x.name[1],index_df,s,EAR_tuple[0]+EAR_tuple[1]-1),axis=1)
        
        fundamental_df.to_csv(r'C:\Users\Eric.Li\OneDrive\Post result data cache\{0}_reg_data.csv'.format(region))
        
        for s in return_lookback_list:#calculate the return up to the end of EAR Period
            fundamental_df["market_return_{0}".format(s)]=fundamental_df.swifter.\
            apply(lambda x:return_calc_log(index_name["index"],\
                                       x.name[1],index_df,index_df,(-s+EAR_tuple[0]+EAR_tuple[1]-1,\
                                                                    EAR_tuple[0]+EAR_tuple[1]-1),"abs"),axis=1)        
            
        # stock
        # uses the "30d_vol" column, which must already be present on fundamental_df
        for s in MA_period_list:#calculate the MA on the end of EAR period
            fundamental_df["stock_MA_n_{0}".format(s)]=fundamental_df.swifter.\
            apply(lambda x:MA_n_deviation(x.name[0],x.name[1],price_df,s,EAR_tuple[0]+EAR_tuple[1]-1,x["30d_vol"]),axis=1)
        
        fundamental_df.to_csv(r'C:\Users\Eric.Li\OneDrive\Post result data cache\{0}_reg_data.csv'.format(region))
        
        
        for s in range_list:#calculate the range to the end of EAR Period
            fundamental_df["stock_chart_position_{0}".format(s)]=fundamental_df.swifter.\
            apply(lambda x:chart_position(x.name[0],x.name[1],price_df,s,EAR_tuple[0]+EAR_tuple[1]-1),axis=1) 

        fundamental_df.to_csv(r'C:\Users\Eric.Li\OneDrive\Post result data cache\{0}_reg_data.csv'.format(region))
        
        
        # Timestamp
        # parsed from the "Date" column; the column name is spelled "Date_timstamp"
        fundamental_df["Date_timstamp"]=fundamental_df.apply(lambda x:pd.Timestamp(x["Date"]),axis=1)
        fundamental_df.to_csv(r'C:\Users\Eric.Li\OneDrive\Post result data cache\{0}_reg_data.csv'.format(region))
    return fundamental_df
    

# Util function

In [1]:
def revision_calc(ticker,date,EPS_df,period_tuple):
    '''
    Calculate percentage revision from the period tuple

    ticker: row label in EPS_df
    date: "%d/%b/%Y" string or pd.Timestamp (converted to that string);
        a float (e.g. NaN) yields None
    EPS_df: EPS dataframe, tickers on the index and date strings on the columns
    period_tuple: (start,length), start is the column offset from the date,
        length is the number of columns in the window

    The start value is the last available EPS strictly before column
    day0+start, the end value is the last available EPS strictly before
    column day0+start+length; the revision is (end-start)/abs(start).
    When start is negative and the history before the date is not longer than
    abs(start), the first available EPS of the series is used as start value.

    Returns None when the date or ticker is unknown, when no EPS is available
    for the start or the end of the window, or when the start value is zero.
    A window that ends at or before the first column now also returns None;
    previously an end position of 0 raised IndexError and a negative one
    wrapped around to the end of the series.
    '''
    if isinstance(date,float):
        return None
    if isinstance(date,pd.Timestamp):
        date=date.strftime("%d/%b/%Y")

    date_series=EPS_df.columns.tolist()
    if date not in date_series or ticker not in EPS_df.index:
        return None

    eps_series=EPS_df.loc[ticker]
    day0=date_series.index(date)
    offset,length=period_tuple[0],period_tuple[1]
    start_stop=day0+offset
    end_stop=start_stop+length

    # NOTE(review): for a negative start_stop this slice counts from the end
    # of the series; kept as is because the short-history fallback below
    # relies on it to confirm that some data exists.
    if len(eps_series.iloc[:start_stop].dropna())==0:
        return None

    if offset<0 and len(eps_series.loc[:date])-1<=abs(offset):
        # short history: fall back to the first available value
        start=eps_series.dropna().iloc[0]
    else:
        start=eps_series.iloc[:start_stop].dropna().iloc[-1]

    # clamp so that a window ending before the first column is empty instead
    # of wrapping around to the far end of the series
    end_window=eps_series.iloc[:max(end_stop,0)].dropna()
    if end_window.empty:
        return None
    end=end_window.iloc[-1]

    if start==0:
        return None
    return (end-start)/abs(start)

In [2]:
def EPS_avg(ticker,date,EPS_df,period_tuple):
    '''
    Calculate average EPS for the period tuple

    ticker: row label in EPS_df
    date: "%d/%b/%Y" string or pd.Timestamp (converted to that string);
        a float yields None
    EPS_df: EPS dataframe, tickers on the index and date strings on the columns
    period_tuple: (start,length), start is the column offset from the date,
        length is the number of columns

    The averaged window runs from the column before day0+start up to and
    including column day0+start+length-1. None is returned unless both ends
    of the window hold a value, the first value is non-zero and the window
    starts after the first column; a window reaching past the end of the
    series also gives None.
    '''
    if type(date) is float:
        return None
    if type(date) is pd.Timestamp:
        date=date.strftime("%d/%b/%Y")

    column_labels=EPS_df.columns.tolist()
    if date not in column_labels or ticker not in EPS_df.index:
        return None

    eps_series=EPS_df.loc[ticker]
    day0=column_labels.index(date)

    try:
        if not day0-1>-period_tuple[0]:
            return None
        window_first=day0+period_tuple[0]-1
        window_last=window_first+period_tuple[1]
        first_value=eps_series.iloc[window_first]
        if pd.isna(first_value):
            return None
        if pd.isna(eps_series.iloc[window_last]):
            return None
        #need to have value on both end, and a non-zero first value
        if first_value==0:
            return None
        return eps_series.iloc[window_first:window_last+1].mean()
    except IndexError:#in case the end of the target window is beyond the end of the series
        return None

In [59]:
def return_calc(ticker,date,price_df,index_series,period_tuple,abs_rel):
    '''
    Calculate the percentage return of a ticker over the window described by period_tuple.

    ticker: row label of price_df
    date: float (treated as missing), pd.Timestamp (converted to "%d/%b/%Y") or a value
          looked up as-is among the price_df column labels
    price_df: price dataframe, one row per ticker, one column per date
    index_series: index levels, labelled with the same date strings; only read when
                  abs_rel is not "abs"
    period_tuple: (start,length); start is the offset from the date column and the return
                  includes that day, length is the number of days
    abs_rel: "abs" returns the stock return, anything else returns stock minus index return

    The return assumes entry at the price of the column just before the first counted day.

    Returns None when the date or ticker is unknown, when the entry column would be the
    first column or earlier, when the window runs past the available data, or when an
    entry/exit level is missing or the entry level is zero. The same checks are applied to
    the index leg, including its exit level and a date that is absent from the index.
    '''
    if isinstance(date, float):
        return None
    if isinstance(date, pd.Timestamp):
        date = date.strftime("%d/%b/%Y")

    date_series = price_df.columns.tolist()
    if date not in date_series or ticker not in price_df.index:
        return None

    price_series = price_df.loc[ticker]
    day0 = date_series.index(date)
    offset, length = period_tuple[0], period_tuple[1]

    # entry is the column before the first counted day, exit is the last counted day
    start = day0 + offset - 1
    end = start + length
    if start <= 0 or length < 1 or end >= len(price_series):
        return None

    # explicit positional access: plain [] with an integer on a label index is deprecated
    entry_price = price_series.iloc[start]
    exit_price = price_series.iloc[end]
    if pd.isna(entry_price) or pd.isna(exit_price) or entry_price == 0:
        return None

    abs_return = exit_price / entry_price - 1
    if abs_rel == "abs":
        return abs_return

    index_date_series = index_series.index.tolist()
    if date not in index_date_series:
        return None

    day0_index = index_date_series.index(date)
    index_start = day0_index + offset - 1
    index_end = index_start + length
    # a negative start would wrap around and an end past the data would shorten the window
    if index_start < 0 or index_end >= len(index_series):
        return None

    index_entry = index_series.iloc[index_start]
    index_exit = index_series.iloc[index_end]
    if pd.isna(index_entry) or pd.isna(index_exit) or index_entry == 0:
        return None

    index_return = index_exit / index_entry - 1
    return abs_return - index_return

In [60]:
def return_calc_log(ticker,date,price_df,index_series,period_tuple,abs_rel):
    '''
    Calculate the log return of a ticker over the window described by period_tuple.

    ticker: row label of price_df
    date: float (treated as missing), pd.Timestamp (converted to "%d/%b/%Y") or a value
          looked up as-is among the price_df column labels
    price_df: price dataframe, one row per ticker, one column per date
    index_series: index levels, labelled with the same date strings; only read when
                  abs_rel is not "abs"
    period_tuple: (start,length); start is the offset from the date column and the return
                  includes that day, length is the number of days
    abs_rel: "abs" returns the stock log return, anything else returns stock minus index

    The return assumes entry at the price of the column just before the first counted day.

    Returns None when the date or ticker is unknown, when the entry column would be the
    first column or earlier, when the window runs past the available data, or when an
    entry/exit level is missing or not strictly positive (its log is undefined). The same
    checks are applied to the index leg, including its exit level and a date that is
    absent from the index.
    '''
    if isinstance(date, float):
        return None
    if isinstance(date, pd.Timestamp):
        date = date.strftime("%d/%b/%Y")

    date_series = price_df.columns.tolist()
    if date not in date_series or ticker not in price_df.index:
        return None

    price_series = price_df.loc[ticker]
    day0 = date_series.index(date)
    offset, length = period_tuple[0], period_tuple[1]

    # entry is the column before the first counted day, exit is the last counted day
    start = day0 + offset - 1
    end = start + length
    if start <= 0 or length < 1 or end >= len(price_series):
        return None

    # explicit positional access: plain [] with an integer on a label index is deprecated
    entry_price = price_series.iloc[start]
    exit_price = price_series.iloc[end]
    if pd.isna(entry_price) or pd.isna(exit_price) or entry_price <= 0 or exit_price <= 0:
        return None

    abs_return = np.log(exit_price) - np.log(entry_price)
    if abs_rel == "abs":
        return abs_return

    index_date_series = index_series.index.tolist()
    if date not in index_date_series:
        return None

    day0_index = index_date_series.index(date)
    index_start = day0_index + offset - 1
    index_end = index_start + length
    # a negative start would wrap around and an end past the data would shorten the window
    if index_start < 0 or index_end >= len(index_series):
        return None

    index_entry = index_series.iloc[index_start]
    index_exit = index_series.iloc[index_end]
    if pd.isna(index_entry) or pd.isna(index_exit) or index_entry <= 0 or index_exit <= 0:
        return None

    index_return = np.log(index_exit) - np.log(index_entry)
    return abs_return - index_return

In [44]:
def revision_vol(ticker,date,EPS_df,period):
    '''
    Calculate the volatility (sample standard deviation) of EPS revisions before a date.

    ticker: row label of EPS_df
    date: float (treated as missing), pd.Timestamp (converted to "%d/%b/%Y") or a value
          looked up as-is among the column labels
    EPS_df: EPS dataframe, one row per ticker, one column per date
    period: maximum number of revisions used; if fewer are available before the date,
            all of them are used

    A revision is the change versus the previous column divided by the previous column.
    Missing revisions are dropped before the date is located, and the revision on the
    date itself is excluded from the window.

    Returns None when the ticker is unknown or the date has no revision; otherwise the
    standard deviation, which is NaN when the window holds fewer than two revisions.
    '''
    if isinstance(date, float):
        return None
    if isinstance(date, pd.Timestamp):
        date = date.strftime("%d/%b/%Y")

    if ticker not in EPS_df.index:
        return None

    # only the requested row is needed; no reason to difference the whole dataframe
    eps_series = EPS_df.loc[ticker]
    revision_series = (eps_series.diff(1) / eps_series.shift(1)).dropna()
    date_series = revision_series.index.tolist()

    if date not in date_series:
        return None

    day0 = date_series.index(date)
    # clamp at zero so a short history uses everything that is available
    window_start = max(day0 - period, 0)
    return revision_series.iloc[window_start:day0].std()

In [None]:
def quarter_transform(quarter):
    '''
    Map a raw period label to 'Q1'..'Q4' using its last two characters.

    Q4 and :A map to Q4, Q3 and C3 to Q3, Q2, C2 and S1 to Q2, Q1 and C1 to Q1.
    A float input (missing value) or any other suffix gives None.
    '''
    if type(quarter) is float:
        return None

    suffix = quarter[-2:]
    suffix_groups = (
        ('Q4', ('Q4', ':A')),
        ('Q3', ('Q3', 'C3')),
        ('Q2', ('Q2', 'C2', 'S1')),
        ('Q1', ('Q1', 'C1')),
    )
    for label, suffixes in suffix_groups:
        if suffix in suffixes:
            return label
    return None

In [55]:
def calc_beta(ticker,date,return_df,index_df,lookback,offset):
    '''
    Calculate the beta of one stock against the index over a lookback window.

    ticker: row label of return_df
    date: float (treated as missing), pd.Timestamp (converted to "%d/%b/%Y") or a value
          looked up as-is among the date labels
    return_df: stock returns, one row per ticker, one column per date
    index_df: index returns labelled with the same dates (passed straight to np.cov next
              to the stock window, so presumably a one-dimensional series - TODO confirm)
    lookback: number of observations in the window
    offset: shifts the last observation of the window relative to the date; e.g. -1
            excludes the return on the date itself

    Beta is cov(stock,index)/var(index) from np.cov over the window.

    Returns None when the date or ticker is unknown, when the history is shorter than the
    lookback (taking the offset into account, so the window can never wrap around the
    start or run past the end of the data), when the stock has no return up to the end of
    the window, or when the stock and index windows hold a different number of
    observations. If both windows contain the same number of missing values the result
    is NaN.
    '''
    if isinstance(date, float):
        return None
    if isinstance(date, pd.Timestamp):
        date = date.strftime("%d/%b/%Y")

    date_series = return_df.columns.tolist()
    index_date_series = index_df.index.tolist()

    if date not in date_series or date not in index_date_series or ticker not in return_df.index:
        return None

    return_series = return_df.loc[ticker]
    day0 = date_series.index(date)
    day0_index = index_date_series.index(date)

    # original history requirement, independent of the offset
    if day0 + 1 < lookback or day0_index + 1 < lookback:
        return None

    end = day0 + offset + 1
    start = end - lookback
    index_end = day0_index + offset + 1
    index_start = index_end - lookback

    # a negative start would make the slice wrap around to the end of the series
    if start < 0 or index_start < 0:
        return None
    if end > len(return_series) or index_end > len(index_df):
        return None

    if len(return_series.iloc[:end].dropna()) == 0:
        return None

    stock_window = return_series.iloc[start:end]
    index_window = index_df.iloc[index_start:index_end]
    if len(stock_window.dropna()) != len(index_window.dropna()):
        return None

    cov_matrix = np.cov(stock_window, index_window)
    return cov_matrix[0][1] / cov_matrix[1][1]

In [54]:
def MA_n_deviation(ticker,date,price_df,lookback,offset,vol):
    '''
    Calculate how many daily volatilities the target price sits away from its moving average.

    ticker: row label of price_df
    date: float (treated as missing), pd.Timestamp (converted to "%d/%b/%Y") or a value
          looked up as-is among the price_df column labels
    price_df: price dataframe used to calculate the moving average
    lookback: number of columns in the moving average window
    offset: int added to the date position to pick the target price; the window ends on
            the target price
    vol: volatility used as the unit of the result

    The result is (target price / window mean - 1) / vol.

    Returns None when the date or ticker is unknown, when the history is shorter than the
    lookback (taking the offset into account, so the window never wraps around the start
    or runs past the last column), when the target price is missing, or when vol is
    missing or zero.
    '''
    if isinstance(date, float):
        return None
    if isinstance(date, pd.Timestamp):
        date = date.strftime("%d/%b/%Y")

    date_series = price_df.columns.tolist()
    if date not in date_series or ticker not in price_df.index:
        return None

    price_series = price_df.loc[ticker]
    day0 = date_series.index(date)

    end = day0 + offset + 1
    start = end - lookback
    # day0+1>=lookback is the original requirement; the start/end checks add the offset
    if lookback < 1 or day0 + 1 < lookback or start < 0 or end > len(price_series):
        return None

    if pd.isna(price_series.iloc[end - 1]):
        return None
    if pd.isna(vol) or vol == 0:
        return None

    window = price_series.iloc[start:end]
    return (window.iloc[-1] / window.mean() - 1) / vol
            

In [None]:
def MA_deviation(ticker,date,price_df,lookback,offset):
    '''
    Calculate the percentage deviation of the target price from its moving average.

    ticker: row label of price_df
    date: float (treated as missing), pd.Timestamp (converted to "%d/%b/%Y") or a value
          looked up as-is among the price_df column labels
    price_df: price dataframe used to calculate the moving average
    lookback: number of columns in the moving average window
    offset: int added to the date position to pick the target price; the window ends on
            the target price

    The result is target price / window mean - 1.

    Returns None when the date or ticker is unknown, when the history is shorter than the
    lookback (taking the offset into account, so the window never wraps around the start
    or runs past the last column) or when the target price is missing.
    '''
    if isinstance(date, float):
        return None
    if isinstance(date, pd.Timestamp):
        date = date.strftime("%d/%b/%Y")

    date_series = price_df.columns.tolist()
    if date not in date_series or ticker not in price_df.index:
        return None

    price_series = price_df.loc[ticker]
    day0 = date_series.index(date)

    end = day0 + offset + 1
    start = end - lookback
    # day0+1>=lookback is the original requirement; the start/end checks add the offset
    if lookback < 1 or day0 + 1 < lookback or start < 0 or end > len(price_series):
        return None

    if pd.isna(price_series.iloc[end - 1]):
        return None

    window = price_series.iloc[start:end]
    return window.iloc[-1] / window.mean() - 1
            

In [None]:
def chart_position(ticker,date,price_df,lookback,offset):
    '''
    Calculate where the target price sits inside its recent range: 1.0 (100%) is the
    high of the window and 0.0 (0%) is the low.

    ticker: row label of price_df
    date: float (treated as missing), pd.Timestamp (converted to "%d/%b/%Y") or a value
          looked up as-is among the price_df column labels
    price_df: price dataframe used to calculate the range
    lookback: number of columns in the range window
    offset: int added to the date position to pick the target price; the window ends on
            the target price

    Returns None when the date or ticker is unknown, when the history is shorter than the
    lookback (taking the offset into account, so the window never wraps around the start
    or runs past the last column), when the target price is missing or when the window is
    flat (high equals low, so the position is undefined).
    '''
    if isinstance(date, float):
        return None
    if isinstance(date, pd.Timestamp):
        date = date.strftime("%d/%b/%Y")

    date_series = price_df.columns.tolist()
    if date not in date_series or ticker not in price_df.index:
        return None

    price_series = price_df.loc[ticker]
    day0 = date_series.index(date)

    end = day0 + offset + 1
    start = end - lookback
    # day0+1>=lookback is the original requirement; the start/end checks add the offset
    if lookback < 1 or day0 + 1 < lookback or start < 0 or end > len(price_series):
        return None

    if pd.isna(price_series.iloc[end - 1]):
        return None

    target_range = price_series.iloc[start:end]
    low = target_range.min()
    high = target_range.max()
    if high == low:
        return None

    return (target_range.iloc[-1] - low) / (high - low)
    

# Signal functions

In [None]:
def EAR_calc(ticker,date,return_df,EAR_period,vol_lookback):
    '''
    Calculate the EAR of a ticker around a reference date, expressed in units of the
    volatility observed before that date.

    ticker: row label of return_df
    date: float (treated as missing), pd.Timestamp (converted to "%d/%b/%Y") or a value
          looked up as-is among the dates that have a return for the ticker
    return_df: return dataframe, one row per ticker, one column per date
    EAR_period: number of returns compounded, starting on the date itself
    vol_lookback: the standard deviation uses up to vol_lookback+1 returns before the date

    Missing returns are dropped before the date is located. Returns None for a float date
    or a date without a return, otherwise compounded return divided by the volatility.
    '''
    if type(date) is float:
        return None
    if type(date) is pd.Timestamp:
        date = date.strftime("%d/%b/%Y")

    history = return_df.loc[ticker].dropna()
    labels = history.index.tolist()
    if date not in labels:
        return None

    anchor = labels.index(date)

    # volatility window: the returns strictly before the date, capped at vol_lookback+1
    available = len(history.iloc[:anchor])
    vol_window = history.iloc[anchor - min(available, vol_lookback + 1):anchor]

    # event window: EAR_period returns starting on the date
    event_window = history.iloc[anchor:anchor + EAR_period]
    compounded = (event_window + 1).prod() - 1

    return compounded / vol_window.std()

In [None]:
def signal_vol(signal_column,return_df,vol_lookback):
    '''
    Calculate the simple volatility (standard deviation of returns) ahead of a signal.

    signal_column: column of a signal dataframe whose name is a tuple; name[0] is the row
                   label in return_df and name[1] the date label of the signal
    return_df: return dataframe, one row per ticker, one column per date
    vol_lookback: the window holds up to vol_lookback+1 returns, ending right before the
                  signal date

    When fewer returns are available the window starts at the first column instead of
    wrapping around to the end of the series. Raises KeyError/ValueError when the ticker
    or the date is not present in return_df.
    '''
    ticker, signal_date = signal_column.name[0], signal_column.name[1]
    signal_series = return_df.loc[ticker]
    location = signal_series.index.tolist().index(signal_date)

    # clamp at zero: a negative start would be read as "counted from the end"
    window_start = max(location - vol_lookback - 1, 0)
    return signal_series.iloc[window_start:location].std()

In [None]:
def index_vol(date,index_df,vol_lookback):
    '''
    Calculate the simple volatility (standard deviation) of the index ahead of a date.

    date: label looked up in index_df.index
    index_df: index returns, one row per date
    vol_lookback: the window holds up to vol_lookback+1 rows, ending right before the date

    When fewer rows are available the window starts at the first row instead of wrapping
    around to the end of the data. Raises ValueError when the date is not in the index.
    '''
    location = index_df.index.tolist().index(date)

    # clamp at zero: a negative start would be read as "counted from the end"
    window_start = max(location - vol_lookback - 1, 0)
    return index_df.iloc[window_start:location].std()

In [None]:
def slice_universe(signal_df,start_datetime,end_datetime,old_position):
    '''
    Slice the signal_df to the rows between start_datetime and end_datetime.

    old_position: when True every signal column is kept; otherwise only the columns whose
                  entry date (second element of the column name, "%d/%b/%Y") also falls
                  between start_datetime and end_datetime are kept

    The result is aligned with every business day between the two dates; days without
    data in the slice come back as missing values.
    '''
    window = signal_df.loc[start_datetime:end_datetime]

    if old_position is not True:
        # entry date of each signal, parsed from the column label
        entry_dates = window.apply(lambda x: datetime.strptime(x.name[1], "%d/%b/%Y"), axis=0)
        entered_in_period = (entry_dates >= start_datetime) & (entry_dates <= end_datetime)
        window = window.loc[:, entered_in_period]

    # helper column carrying the business-day calendar; it is removed again after the concat
    calendar = pd.Series(1, index=pd.date_range(start_datetime, end_datetime, freq='B')).to_frame()
    aligned = pd.concat([window.drop_duplicates(), calendar], axis=1)
    return aligned.iloc[:, :-1]
    

In [None]:
def signal_filter_stop(signal_df,stop_level,return_df,vol_lookback,stop_type,index_df):
    '''
    Get the updated signal_df after applying the stop loss.

    signal_df: signal returns, one column per signal (column name is a tuple whose first
               element is the ticker and second the signal date), one row per date
    stop_level: stop distance in multiples of the signal's volatility
    return_df, vol_lookback: passed to signal_vol to get the volatility of each signal
    stop_type: 'abs' stops on the cumulative return of the signal; 'rel' stops on the
               cumulative return of the signal minus that of its index hedge
    index_df: index returns used for the hedge (only read for 'rel'). With a single
              column that column hedges every signal, otherwise the column is picked
              through Asia_mapping from the last two characters of the ticker.

    A value is blanked out once the running minimum of the normalised cumulative move, as
    of the previous row, is below -stop_level.

    Raises ValueError for any other stop_type (this used to fail with an
    UnboundLocalError on the return statement).
    '''
    if stop_type not in ('abs', 'rel'):
        raise ValueError("stop_type must be 'abs' or 'rel', got {0!r}".format(stop_type))

    vol_row = signal_df.apply(lambda column: signal_vol(column, return_df, vol_lookback), axis=0)

    if stop_type == 'abs':
        cum_nmove = ((1 + signal_df).cumprod() - 1).ffill() / vol_row
    else:
        # 1.0 wherever the signal has a non-zero return, so the hedge is only held then
        signal_count = signal_df.copy()
        signal_count[((signal_count) > 0) | ((signal_count) < 0)] = 1.0

        if index_df.shape[1] == 1:
            signal_hedge = signal_count.apply(lambda x: x.multiply(index_df.iloc[:, 0], axis=0))
        else:
            signal_hedge = signal_count.apply(
                lambda x: x.multiply(index_df[Asia_mapping.loc[x.name[0][-2:]].iloc[0]], axis=0))

        cum_nmove = ((1 + signal_df).cumprod() - (1 + signal_hedge).cumprod()).ffill() / vol_row

    # shift by one row: the stop takes effect the row after the level is breached
    stopped = cum_nmove.expanding().min().shift(1, axis=0) < -stop_level
    return signal_df[~stopped]

In [None]:
def revision_adjusted_size(reference_signal_df,lower_revision,higher_revision,size_multiple,revision_row,revision_row_reference,\
                           gross,long):
    '''
    Size each signal with a linear function of its revision; sizes are positive.

    reference_signal_df: signal returns of the reference period
    lower_revision, higher_revision: ends of the linear ramp
    size_multiple: ratio between the high size and the low size
    revision_row: revisions of the signals to be sized
    revision_row_reference: revisions of the reference signals
    gross: target gross; the sizes are rescaled so that the average compounded reference
           exposure times 100 equals it
    long: True compounds the reference returns as (1+r) and gives the high size to
          revisions whose absolute value exceeds abs(higher_revision); anything else
          compounds as (1-r) and uses abs(lower_revision) as that threshold

    Returns (size_row, new_lower_size): the size per signal and the rescaled low size.
    '''
    base_low = 0.01
    base_high = base_low * size_multiple
    is_long = long is True

    # absolute revision above which a signal simply gets the high size
    threshold = higher_revision if is_long else lower_revision

    def ramp(revisions, low_size, high_size):
        def size_of(row):
            revision = row.iloc[0]
            if np.abs(revision) <= np.abs(threshold):
                return low_size + (high_size - low_size) \
                    * (np.abs(revision - lower_revision)) / np.abs(higher_revision - lower_revision)
            return high_size
        return revisions.to_frame().copy().apply(size_of, axis=1)

    # trial run on the reference period with the unscaled sizes
    size_row_reference = ramp(revision_row_reference, base_low, base_high)
    if is_long:
        compounded_reference = (1 + reference_signal_df).cumprod()
    else:
        compounded_reference = (1 - reference_signal_df).cumprod()
    size_df_reference = compounded_reference * size_row_reference

    trial_gross = np.abs(size_df_reference.sum(axis=1).mean())
    new_lower_size = base_low / (trial_gross * 100 / gross)
    new_higher_size = base_high / (trial_gross * 100 / gross)

    size_row = ramp(revision_row, new_lower_size, new_higher_size)
    return size_row, new_lower_size

In [None]:
def sizing(signal_df,reference_signal_df,gross,fundamental_df,new_signal,return_df,risk_parity,liquidity,capital,\
           revision_adjust,long):
    '''
    Use the historical (reference) signal range to calculate the size row for the current
    signal_df range; the history acts as the benchmark for future sizing.

    revision_adjust=(mode,long revision bounds,short revision bounds,size_multiple)
      mode True: linear sizing on the revision through revision_adjusted_size, using the
                 "Revision_real" column when new_signal is True and "Revision_20"
                 otherwise. The long bounds are used when long is True, the short bounds
                 when long is False; if the bounds needed are None the result is
                 (None,None).
      mode 'constant': revision_adjust[1] is the size of every signal
      anything else: equal sizes, gross/100 divided by the average number of reference
                     signals per row

    risk_parity True divides the base size by the signal's 30-day volatility relative to
    the average reference volatility. Every size is capped at
    Volume*liquidity/capital taken from fundamental_df.

    Returns (size_row,low_size); low_size is only set in the revision mode.
    '''
    fundamental_df = fundamental_df.copy().sort_index()
    vol_reference = reference_signal_df.apply(lambda x: signal_vol(x, return_df, 30), axis=0).mean()
    vol_row = signal_df.apply(lambda x: signal_vol(x, return_df, 30), axis=0)

    def liquidity_cap(key):
        # largest size the traded volume allows for this (ticker,date) signal
        return fundamental_df.loc[key[0], key[1]]["Volume"].iloc[0] * liquidity / capital

    def capped_sizes(base_of):
        # base_of maps a signal key to its uncapped size
        if risk_parity is True:
            return signal_df.apply(lambda x: min(base_of(x.name) / (vol_row[x.name] / vol_reference),
                                                 liquidity_cap(x.name)), axis=0)
        return signal_df.apply(lambda x: min(base_of(x.name), liquidity_cap(x.name)), axis=0)

    mode = revision_adjust[0]

    if mode is True:
        revision_column = "Revision_real" if new_signal is True else "Revision_20"
        revision_row_reference = fundamental_df.loc[reference_signal_df.columns][revision_column]
        revision_row = fundamental_df.loc[signal_df.columns][revision_column]

        if long is True and revision_adjust[1] is not None:
            bounds, is_long = revision_adjust[1], True
        elif long is False and revision_adjust[2] is not None:
            bounds, is_long = revision_adjust[2], False
        else:
            return None, None

        lower_revision = bounds[0]
        higher_revision = bounds[1]
        size_multiple = revision_adjust[3]

        base_size, low_size = revision_adjusted_size(reference_signal_df, lower_revision, higher_revision,
                                                     size_multiple, revision_row, revision_row_reference,
                                                     gross, is_long)
        return capped_sizes(lambda key: base_size[key]), low_size

    if mode == 'constant':
        return capped_sizes(lambda key: revision_adjust[1]), None

    number = reference_signal_df.count(axis=1).mean()
    avg_size = gross / 100 / number
    return capped_sizes(lambda key: avg_size), None

In [None]:
def trading_analytics_date(portfolio_cache):
    '''
    Key portfolio metrics from the portfolio cache; the result feeds the plot function.

    portfolio_cache: sequence whose element 1 is the account curve and element 3 the
                     individual signal returns

    Returns (signal_count,hit_rate,payoff_ratio,ann_ret,ann_vol,ann_sharpe,max_dd), or
    seven None values when there is no signal. Annualisation uses 260 days and the daily
    profit is the plain difference of the account curve.
    '''
    trade_returns = portfolio_cache[3]
    signal_count = len(trade_returns)
    account_curve = portfolio_cache[1]

    if signal_count == 0:
        return (None,) * 7

    winners = trade_returns[trade_returns > 0]
    losers = trade_returns[trade_returns < 0]
    hit_rate = len(winners) / len(trade_returns) * 1.0
    payoff_ratio = winners.mean() / losers.mean() * -1.0

    daily_pnl = account_curve.diff(1)
    ann_vol = daily_pnl.std() * (260 ** 0.5)
    ann_ret = daily_pnl.mean() * 260  # we only make money on constant capital
    ann_sharpe = ann_ret / ann_vol

    # deepest gap between the curve and its running high
    wealth = 1 + account_curve
    max_dd = -(wealth - wealth.cummax(axis=0)).expanding().min().min()

    return signal_count, hit_rate, payoff_ratio, ann_ret, ann_vol, ann_sharpe, max_dd

In [None]:
def trading_analytics_simp(account_curve):
    '''
    Key portfolio metrics from the portfolio account curve: only sharpe and drawdown.

    account_curve: cumulative return curve of the account

    Returns (ann_sharpe,max_dd). Annualisation uses 260 days, the daily profit is the
    plain difference of the account curve and the drawdown is the deepest gap between
    1+curve and its running high, reported as a positive number.
    '''
    daily_pnl = account_curve.diff(1)
    ann_vol = daily_pnl.std() * (260 ** 0.5)
    ann_ret = daily_pnl.mean() * 260  # we only make money on constant capital
    ann_sharpe = ann_ret / ann_vol

    wealth = 1 + account_curve
    max_dd = -(wealth - wealth.cummax(axis=0)).expanding().min().min()

    return ann_sharpe, max_dd

In [None]:
def plot_signal(title,figsize,portfolio_cache):
    '''
    Plot the account curve of a signal together with its gross exposure.

    title: figure title
    figsize: size passed to plt.figure
    portfolio_cache: sequence whose element 1 is the account curve, 2 the sizes,
                     3 the individual returns, 4 the gross exposure and 5 the turnover

    The account curve is drawn on the left axis (ticks every 10%) and the gross exposure
    on the right axis (ticks every 30%). The subtitle lists the metrics from
    trading_analytics_date; when there is no signal those metrics are None and are shown
    as 'n/a' instead of failing while formatting.
    '''
    account_curve = portfolio_cache[1]
    avg_size = np.abs(portfolio_cache[2]).mean(axis=0).mean()
    gross = portfolio_cache[4]
    turnover = portfolio_cache[5]

    def percent_ticks(axis, step, label_format):
        # re-space the ticks between the first and last automatic tick, label them as %
        ticks = axis.get_yticks()
        axis.set_yticks(np.arange(ticks[0], ticks[-1], step))
        axis.set_yticklabels([label_format.format(x) for x in axis.get_yticks()])

    def show(value, render):
        # metrics are None when the cache holds no signal
        return 'n/a' if value is None else render(value)

    fig = plt.figure(figsize=figsize)
    ax1 = fig.add_subplot(1, 1, 1)
    ln1 = ax1.plot(account_curve, label='signal', color='b')
    percent_ticks(ax1, 0.1, "{:.1%}")

    ax2 = ax1.twinx()
    ln2 = ax2.plot(gross, label='gross', color='silver')
    percent_ticks(ax2, 0.3, "{:.0%}")

    count, hit, payoff, ret, vol, sharpe, max_dd = trading_analytics_date(portfolio_cache)

    summary = ('count=' + str(0 if count is None else count) +
               ',avg_size=' + "{:.1%}".format(avg_size) +
               ',hit_rate=' + show(hit, "{:.0%}".format) +
               ',payoff=' + show(payoff, lambda x: str(round(x, 1))) +
               ',return=' + show(ret, "{:.1%}".format) +
               ',vol=' + show(vol, "{:.1%}".format) +
               ',sharpe=' + show(sharpe, lambda x: str(round(x, 1))) +
               ',turnover=' + str(round(turnover, 1)) + 'x' +
               ',max_drawdown=' + show(max_dd, "{:.1%}".format))
    plt.title("\n".join(textwrap.wrap(summary)), fontsize=10)

    ax1.set_xlabel('Year')
    ax1.set_ylabel('Return')
    ax2.set_ylabel('Exposure')
    plt.suptitle(title, y=1.05, fontsize=16)
    plt.grid(linestyle='dashed')
    plt.legend(ln1 + ln2, [l.get_label() for l in ln1 + ln2], loc=2)
    ax1.axhline(y=0, color='k')

    plt.show()

# Signal class

In [7]:
def signal_base(fundamental_df,long_criteria_dict,short_criteria_dict,EAR_tuple):
    '''
    Produce the long and short signal candidates by filtering fundamental_df.

    fundamental_df - dataframe to conduct the filtering; the row name is a tuple whose
                     first element is the ticker and second the date
    long_criteria_dict / short_criteria_dict - {variable_name: (lower,upper)}; a row is
                     kept when the variable lies inside the inclusive range for every
                     entry. None means that side is not produced.
    EAR_tuple - only read for the "EAR_n" criterion:
                (price_df,index_series,start,length,abs_rel). EAR_n is the return from
                return_calc divided by the row's "30d_vol".

    Returns (long_base,short_base); a side is None when its criteria dict is None.
    '''
    def normalised_EAR(frame):
        # event return per row; tickers without prices get no value
        raw_EAR = frame.swifter.\
            apply(lambda x: return_calc(x.name[0], x.name[1], EAR_tuple[0], EAR_tuple[1],
                                        (EAR_tuple[2], EAR_tuple[3]), EAR_tuple[4])
                  if x.name[0] in EAR_tuple[0].index else None, axis=1)

        # expressed in units of the 30 day volatility
        return frame.swifter.\
            apply(lambda x: raw_EAR.loc[x.name] / x["30d_vol"]
                  if x["30d_vol"] != 0 and raw_EAR.loc[x.name] is not None else None, axis=1)

    def filtered(criteria):
        if criteria is None:
            return None

        selection = fundamental_df.copy()
        for variable, bounds in criteria.items():
            if variable == "EAR_n":
                EAR_n = normalised_EAR(selection)
                selection = selection[(EAR_n >= bounds[0]) & (EAR_n <= bounds[1])]
            else:
                selection = selection[(selection[variable] >= bounds[0]) & (selection[variable] <= bounds[1])]
        return selection

    long_base = filtered(long_criteria_dict)
    short_base = filtered(short_criteria_dict)
    return long_base, short_base

    
    
    

In [8]:
class signal(object):
    '''
    Signal class is built to initialize the signal_df and account curve from base parameters 
    '''
    def __init__(self,fundamental_df,price_df,close_return_df,entry,base,holding,start,end,\
                 old_position,new_signal,revision_adjust,early_exit):
        '''
        Define the key free parameters of the signal
        Criteria:(EAR,revision,size)
        revision_adjust:(True/False,(long lower revision,long higher revision),(short low abs revision,short high abs_revision)\
        ,size_multiple)
        Entry: 1 means enter on the earning day, 2 means enters on whatever price given on the day after earning
        '''
        self.base=base
        self.long_base=base[0]
        self.short_base=base[1]
        self.fundamental_df=fundamental_df
        self.price_df=price_df
        self.abs_return=price_df.diff(1,axis=1)/price_df.shift(1,axis=1)
        self.close_return_df=close_return_df
        #self.EAR_period=EAR_period
        self.entry=entry
#         self.long_criteria=long_criteria
#         self.short_criteria=short_criteria
        self.holding=holding
        self.start=start
        self.end=end
        self.old_position=old_position
        self.new_signal=new_signal
        self.revision_adjust=revision_adjust
        self.early_exit=early_exit
        
#     def signal_base(self):
#         '''
#         Filter the signal criteria like EAR and revision, get the target signal list
#         From the fundamental information
#         For both long and short side
#         '''
        
#         long_base=self.fundamental_df.copy()
#         short_base=self.fundamental_df.copy()
        

#         long_base["EAR"]=long_base.apply(lambda x:EAR_calc(x.name[0],x.name[1],self.close_return_df,self.EAR_period,30)\
#                                      if x.name[0] in self.close_return_df.index else None,axis=1)

#         short_base["EAR"]=short_base.apply(lambda x:EAR_calc(x.name[0],x.name[1],self.close_return_df,self.EAR_period,30)\
#                                      if x.name[0] in self.close_return_df.index else None,axis=1)


#         if self.long_criteria is None:
#             long_base=None
#         else:
            
#             if self.long_criteria[1] is None:
#                 pass
#             else:
#                 if self.new_signal is True:
#                     long_base=long_base[(long_base["Revision_real"]>self.long_criteria[1][0])\
#                                                     &(long_base["Revision_real"]<self.long_criteria[1][1])]
#                 else:
#                     long_base=long_base[(long_base["Revision_20"]>self.long_criteria[1][0])\
#                                                     &(long_base["Revision_20"]<self.long_criteria[1][1])]                    
                
#             if self.long_criteria[2] is None:
#                 pass
#             else:
#                 long_base=long_base[(long_base["Market cap"]>self.long_criteria[2][0])&\
#                                         (long_base["Market cap"]<self.long_criteria[2][1])]
            
#             if self.long_criteria[0] is None:
#                 pass
#             else:
#                 long_base=long_base[(long_base["EAR"]>self.long_criteria[0][0])&(long_base["EAR"]<self.long_criteria[0][1])]
                
                
#         if self.short_criteria is None:
#             short_base=None
#         else:
            
#             if self.short_criteria[1] is None:
#                 pass
#             else:
#                 if self.new_signal is True:
#                     short_base=short_base[(short_base["Revision_real"]>self.short_criteria[1][0])\
#                                                     &(short_base["Revision_real"]<self.short_criteria[1][1])]
#                 else:
#                     short_base=short_base[(short_base["Revision_20"]>self.short_criteria[1][0])\
#                                                     &(short_base["Revision_20"]<self.short_criteria[1][1])]                    
                
#             if self.short_criteria[2] is None:
#                 pass
#             else:
#                 short_base=short_base[(short_base["Market cap"]>self.short_criteria[2][0])&\
#                                         (short_base["Market cap"]<self.short_criteria[2][1])]
            
#             if self.short_criteria[0] is None:
#                 pass
#             else:
#                 short_base=short_base[(short_base["EAR"]>self.short_criteria[0][0])&\
#                                       (short_base["EAR"]<self.short_criteria[0][1])]
        
#         self.long_base=long_base
#         self.short_base=short_base
        
#         return long_base,short_base
    
    def signal_df_date(self):#if we hold them through next earning
        '''
        Obtain the signal_df function over the whole time period from the target signal list
        '''
        
        if self.long_base is None:
            long_df=None
            self.long_df=long_df
        
        else:
            long_base=self.long_base.copy()
            long_df=pd.DataFrame(index=self.price_df.columns)

            for s in long_base.index:
                return_series=self.abs_return.loc[s[0]]
                if s[1] in return_series.index:
                    if not np.isnan(return_series.loc[s[1]]): 
                        day0=return_series.index.tolist().index(s[1])
                        
                        if self.early_exit is True and datetime.strptime(s[1],"%d/%b/%Y").date()>=\
                        datetime.strptime(s[3],"%d/%b/%Y").date():
                            period=None
                        
                        elif self.early_exit is True and type(s[2])!=float and datetime.strptime(s[1],"%d/%b/%Y").date()<\
                        datetime.strptime(s[3],"%d/%b/%Y").date():
                            period=min(self.holding,np.busday_count(datetime.strptime(s[1],"%d/%b/%Y").date(),\
                                                                    datetime.strptime(s[2],"%d/%b/%Y").date())-self.entry,
                                      np.busday_count(datetime.strptime(s[1],"%d/%b/%Y").date(),\
                                                                    datetime.strptime(s[3],"%d/%b/%Y").date())-self.entry+1)                            
                        
                        elif self.early_exit is True and type(s[2])==float and datetime.strptime(s[1],"%d/%b/%Y").date()<\
                        datetime.strptime(s[3],"%d/%b/%Y").date():
                            period=min(self.holding,np.busday_count(datetime.strptime(s[1],"%d/%b/%Y").date(),\
                                                                    datetime.strptime(s[3],"%d/%b/%Y").date())-self.entry+1)
                            
                        elif type(s[2])==float:##basically np.nan has type float
                            period=self.holding+1
                        
                        else: ##assume that we are not holding through numbers
                            period=min(self.holding,np.busday_count(datetime.strptime(s[1],"%d/%b/%Y").date(),\
                                                                    datetime.strptime(s[2],"%d/%b/%Y").date())-self.entry)
                            
                        if period is None:
                            pass
                        else:
                            target_series=return_series.iloc[day0+self.entry-1:day0+min(period+self.entry, \
                                                                                           len(return_series[day0:]))].dropna()
                            
                            if len(target_series)==0:
                                pass
                            else:
                                target_series.iloc[0]=0.0
                                long_df[s]=target_series                        

                    else:
                        pass
                else:
                    pass
                
            long_df=long_df.reindex(datetime.strptime(i,"%d/%b/%Y") for i in long_df.index)
            long_df=long_df.sort_index()

            if self.start is not None:
                long_df=slice_universe(long_df,self.start,self.end,self.old_position)
            else:
                pass
            
            long_df=long_df.dropna(how="all",axis=1)
            long_df.columns=pd.MultiIndex.from_tuples(pd.Series(list(long_df.columns)))
            self.long_df=long_df
            
            
        
        if self.short_base is None:
            short_df=None
            self.short_df=short_df
        
        else:
            short_base=self.short_base.copy()
            short_df=pd.DataFrame(index=self.price_df.columns)

            for s in short_base.index:
                return_series=self.abs_return.loc[s[0]]
                if s[1] in return_series.index:
                    if not np.isnan(return_series.loc[s[1]]): 
                        day0=return_series.index.tolist().index(s[1])
                        
                        if self.early_exit is True and datetime.strptime(s[1],"%d/%b/%Y").date()>=\
                        datetime.strptime(s[3],"%d/%b/%Y").date():
                            period=None
                            
                        elif self.early_exit is True and type(s[2])!=float and datetime.strptime(s[1],"%d/%b/%Y").date()<\
                        datetime.strptime(s[3],"%d/%b/%Y").date():
                            period=min(self.holding,np.busday_count(datetime.strptime(s[1],"%d/%b/%Y").date(),\
                                                                    datetime.strptime(s[2],"%d/%b/%Y").date())-self.entry,
                                      np.busday_count(datetime.strptime(s[1],"%d/%b/%Y").date(),\
                                                                    datetime.strptime(s[3],"%d/%b/%Y").date())-self.entry+1)  
                            
                        elif self.early_exit is True and type(s[2])==float and datetime.strptime(s[1],"%d/%b/%Y").date()<\
                        datetime.strptime(s[3],"%d/%b/%Y").date():
                            period=min(self.holding,np.busday_count(datetime.strptime(s[1],"%d/%b/%Y").date(),\
                                                                    datetime.strptime(s[3],"%d/%b/%Y").date())-self.entry+1)
                        elif type(s[2])==float:##basically np.nan has type float
                            period=self.holding
                    
                        else: ##assume that we are not holding through numbers
                            period=min(self.holding,np.busday_count(datetime.strptime(s[1],"%d/%b/%Y").date(),\
                                                                    datetime.strptime(s[2],"%d/%b/%Y").date())-self.entry)\


                        if period is None:
                            pass
                        else:
                            target_series=return_series.iloc[day0+self.entry-1:day0+min(period+self.entry, \
                                                                                           len(return_series[day0:]))].dropna()
                            
                            if len(target_series)==0:
                                pass
                            else:
                                target_series.iloc[0]=0.0
                                short_df[s]=target_series         
                            
                    else:
                        pass
                else:
                    pass

            short_df=short_df.reindex(datetime.strptime(i,"%d/%b/%Y") for i in short_df.index)
            short_df=short_df.sort_index()

            if self.start is not None:
                short_df=slice_universe(short_df,self.start,self.end,self.old_position)
            else:
                pass  
            
            short_df=short_df.dropna(how="all",axis=1)
            short_df.columns=pd.MultiIndex.from_tuples(pd.Series(list(short_df.columns)))
            self.short_df=short_df
        
        return long_df,short_df

    def signal_account(self,stop,gross,index_df,net_level,risk_parity,liquidity,capital):
        '''
        Build the account curve with signal_df
        Assume quarterly rebalancing that's why the period list has quarter as the key
        Take extra care when building the account curve, the logic is: work out the size_df, then shift by 1 and * signal_df

        stop: None, or (long_stop,short_stop,type) handed to signal_filter_stop
        gross: (long gross,short gross) handed to sizing
        index_df: None, or a dataframe of index returns whose row labels are
                  "%d/%b/%Y" strings (datetime labels are accepted as they are).
                  The frame is copied, the caller's object is left untouched.
        net_level: 'beta', 'vol' or a number. 'beta' and 'vol' (single index only)
                   read both the long and the short size frames, so they need both sides.
        risk_parity,liquidity,capital: handed to sizing; capital is also kept on self

        Stores self.capital, self.index_df, self.alpha_df, self.portfolio_account and,
        for the sides that exist, self.long_cache / self.short_cache.
        Returns (long_cache,short_cache,alpha_cache,portfolio_cache) where the first
        three are (daily_pnl,acct_curve,size_df,ind_return) and portfolio_cache is
        (daily_pnl,acct_curve,size_df,ind_return,gross,turnover,portfolio_df).
        '''
        try:
            long_df=self.long_df.copy()
            short_df=self.short_df.copy()
        except AttributeError:
            # Signal frames not built yet, or one side is None: (re)build them
            long_df,short_df=self.signal_df_date()

        # Assign values for later use
        self.capital=capital

        if index_df is not None:
            # Work on a copy so the caller's frame keeps its labels and can be reused
            index_df=index_df.copy()
            index_df.index=[datetime.strptime(i,"%d/%b/%Y") if isinstance(i,str) else i
                            for i in index_df.index]
        self.index_df=index_df

        # Define rebalance period first: one "<year> <quarter>" label per row
        reference_df=short_df if long_df is None else long_df
        period=reference_df.apply(lambda x:str(x.name.year)+" "+str(x.name.quarter),axis=1)
        period_list=sorted(set(period))

        # Separate out long and short
        if long_df is None:
            long_cache=(None,None,None,None)
        else:
            if stop is not None:
                long_df=signal_filter_stop(long_df,stop[0],self.abs_return,30,stop[2],self.index_df)
                self.long_df=long_df
            long_cache=self._side_account(long_df,period,period_list,gross[0],True,
                                          risk_parity,liquidity,capital)
            self.long_cache=long_cache

        if short_df is None:
            short_cache=(None,None,None,None)
        else:
            if stop is not None:
                short_df=-signal_filter_stop(-short_df,stop[1],self.abs_return,30,stop[2],self.index_df)
                self.short_df=short_df
            short_cache=self._side_account(short_df,period,period_list,gross[1],False,
                                           risk_parity,liquidity,capital)
            self.short_cache=short_cache

        long_size_df=long_cache[2]
        short_size_df=short_cache[2]

        # Put alpha positions together to form the alpha part
        alpha_df=pd.concat([long_df,short_df],axis=1)
        self.alpha_df=alpha_df

        alpha_daily_pnl=pd.concat([long_cache[0],short_cache[0]],axis=1)
        alpha_acct_curve=alpha_daily_pnl.cumsum().ffill().sum(axis=1)
        alpha_size_df=pd.concat([long_size_df,short_size_df],axis=1)
        alpha_ind_return=pd.concat([long_cache[3],short_cache[3]],axis=0)

        alpha_cache=(alpha_daily_pnl,alpha_acct_curve,alpha_size_df,alpha_ind_return)

        # Index (hedge) leg
        hedge_df=None
        if self.index_df is not None:
            hedge_df=self.index_df.copy().reindex(alpha_df.index)

            if self.index_df.shape[1]==1:
                hedge_name=hedge_df.columns[0]

                if net_level=='beta':
                    long_beta=long_size_df.apply(lambda x:calc_beta(x.name[0],x.name[1],self.close_return_df,
                                                                    self.index_df.iloc[:,0],260),axis=0)
                    short_beta=short_size_df.apply(lambda x:calc_beta(x.name[0],x.name[1],self.close_return_df,
                                                                      self.index_df.iloc[:,0],260),axis=0)
                    beta_adj_alpha_size_df=pd.concat([long_size_df*long_beta,short_size_df*short_beta],axis=1)
                    index_size_df=(0-beta_adj_alpha_size_df.sum(axis=1)).to_frame(hedge_name)

                elif net_level=='vol':
                    def vol_ratio(x):
                        # Signal vol relative to index vol at the signal date when the
                        # index has that date, plain signal vol otherwise
                        signal_day=datetime.strptime(x.name[1],"%d/%b/%Y")
                        if signal_day in self.index_df.index:
                            return signal_vol(x,self.close_return_df,260)/\
                                   index_vol(signal_day,self.index_df.iloc[:,0],30)
                        return signal_vol(x,self.close_return_df,260)

                    long_vol=long_size_df.apply(vol_ratio,axis=0)
                    short_vol=short_size_df.apply(vol_ratio,axis=0)
                    self.long_vol=long_vol
                    self.short_vol=short_vol
                    vol_adj_alpha_size_df=pd.concat([long_size_df*long_vol,short_size_df*short_vol],axis=1)
                    index_size_df=(0-vol_adj_alpha_size_df.sum(axis=1)).to_frame(hedge_name)

                else:
                    index_size_df=(net_level-alpha_size_df.sum(axis=1)).to_frame(hedge_name)

            else:
                # Several indices: every position is hedged with the index that
                # Asia_mapping assigns to the last two characters of its ticker
                if net_level=='beta':
                    # NOTE(review): only the long betas are computed here, so the
                    # short columns become NaN in the product below - confirm intended
                    long_beta=long_size_df.apply(lambda x:calc_beta(x.name[0],x.name[1],self.close_return_df,
                                                                    self.index_df.loc[:,Asia_mapping.loc[x.name[0][-2:]].
                                                                                      iloc[0]],260),axis=0)
                    alpha_size_temp=(alpha_cache[2].copy()*long_beta).T
                    alpha_size_temp["index"]=alpha_size_temp.apply(lambda x:Asia_mapping.loc[x.name[0][-2:]].iloc[0],axis=1)
                    # The last row after the transpose is the helper "index" column
                    index_size_df=0-alpha_size_temp.groupby("index").apply(sum).T.iloc[:-1]

                    self.long_beta=long_beta
                else:
                    # NOTE(review): net_level=='vol' also lands here and would be
                    # subtracted as a string - only numeric levels work in this branch
                    alpha_size_temp=alpha_cache[2].copy().T
                    alpha_size_temp["index"]=alpha_size_temp.apply(lambda x:Asia_mapping.loc[x.name[0][-2:]].iloc[0],axis=1)
                    index_size_df=net_level-alpha_size_temp.groupby("index").apply(sum).T.iloc[:-1]

            # need to shift by 1 as the size is end of the day
            index_daily_pnl=index_size_df.shift(1)*hedge_df
            index_acct_curve=index_daily_pnl.cumsum()
            index_ind_return=index_acct_curve.iloc[-1]
            index_cache=(index_daily_pnl,index_acct_curve,index_size_df,index_ind_return)
        else:
            index_cache=(None,None,None,None)

        # Finally put everything together
        portfolio_df=pd.concat([alpha_df,hedge_df],axis=1)

        portfolio_size_df=pd.concat([alpha_cache[2],index_cache[2]],axis=1).sort_index()

        portfolio_daily_pnl=pd.concat([alpha_cache[0],index_cache[0]],axis=1).sort_index()

        portfolio_acct_curve=portfolio_daily_pnl.cumsum().ffill().sum(axis=1)
        portfolio_ind_return=alpha_cache[3].copy()

        portfolio_gross=np.abs(portfolio_size_df).sum(axis=1).sort_index()
        # Traded alpha size per 260 rows
        portfolio_turnover=(np.abs(alpha_size_df.fillna(0.0).diff(1)).sum().sum())/(portfolio_size_df.shape[0]/260)

        portfolio_cache=(portfolio_daily_pnl,portfolio_acct_curve,portfolio_size_df,portfolio_ind_return,portfolio_gross,
                         portfolio_turnover,portfolio_df)

        self.portfolio_account=portfolio_cache #save for later use

        return long_cache,short_cache,alpha_cache,portfolio_cache

    def _side_account(self,side_df,period,period_list,gross_level,is_long,risk_parity,liquidity,capital):
        '''
        Size one side period by period and accumulate its pnl.

        side_df is the signal frame of the side, period the per-row period label and
        period_list the sorted unique labels. is_long is passed to sizing as its last
        argument; short sizes are negated.
        Returns (daily_pnl,acct_curve,size_df,ind_return).
        '''
        def opening_size(current,reference):
            row=sizing(current,reference,gross_level,self.fundamental_df,self.new_signal,
                       self.close_return_df,risk_parity,liquidity,capital,self.revision_adjust,is_long)[0]
            return row if is_long else -row

        sub_signal={}
        sub_size_df={}
        sub_pnl={}

        for position,s in enumerate(period_list):
            current=side_df[period==s].dropna(how='all',axis=1)
            sub_signal[s]=current

            if current.shape[1]==0:
                # Nothing traded in this period
                sub_size_df[s]=current
                sub_pnl[s]=current
                continue

            if position<4:
                # The first four periods are sized against themselves
                size_row=opening_size(current,current)
            else:
                ##use last quarter's sizing as reference
                try:
                    size_row=opening_size(current,sub_signal[period_list[position-1]])
                except Exception:
                    # Fall back to the current period when the reference cannot be used
                    size_row=opening_size(current,current)

            sub_size_df[s]=(1+current).cumprod()*size_row
            # need to shift by 1 as the size is end of the day
            sub_pnl[s]=(sub_size_df[s].shift(1))*current

        daily_pnl=pd.concat(list(sub_pnl.values()),axis=0)
        cumulative_pnl=daily_pnl.cumsum().ffill()
        acct_curve=cumulative_pnl.sum(axis=1)
        size_df=pd.concat(list(sub_size_df.values()),axis=0)
        ind_return=cumulative_pnl.iloc[-1].dropna()

        return (daily_pnl,acct_curve,size_df,ind_return)
    
    def plot_account(self,title,figsize=[10,4],portfolio=None):
        '''
        Plot the account curve
        '''
        if portfolio is None:
            try:
                portfolio_cache=self.portfolio_account

            except AttributeError:
                print("Execute the signal_account first!")  
                return None
        else:
            portfolio_cache=portfolio
        
        plot_signal(title,figsize,portfolio_cache)
        
    def review(self):
        ''' 
        Export the data for review
        '''
        if self.index_df.shape[1]==1:
            long_df_adj=self.long_df.copy().sub(self.index_df.loc[self.long_df.index].iloc[:,0],axis=0)
            short_df_adj=(-self.short_df.copy()).add(self.index_df.loc[self.long_df.index].iloc[:,0],axis=0)
            
            short_review=((1+short_df_adj).cumprod()-1).ffill().iloc[-1].to_frame()
            short_review["size"]=self.short_cache[2].apply(lambda x:x.dropna().iloc[0],axis=0)*(self.capital)*1000
            
        else:
            long_df_adj=self.long_df.copy().apply(lambda x: x.sub(self.index_df.\
                                                                  loc[self.long_df.index,\
                                                                      Asia_mapping.loc[x.name[0][-2:]].iloc[0]]),axis=0)
            short_review=None
                                                  
                                                  
        
        long_review=((1+long_df_adj).cumprod()-1).ffill().iloc[-1].to_frame()
        long_review["size"]=self.long_cache[2].apply(lambda x:x.dropna().iloc[0],axis=0)*(self.capital)*1000

        return long_review,short_review
        
        
        
        

In [62]:
def combine_signal(portfolio_list,capital_list):
    '''
    Merge several portfolio caches into one, weighting each by its share of capital.

    portfolio_list: caches laid out like the portfolio cache of signal_account
                    (element 0 daily pnl, element 2 size, last element signal frame)
    capital_list: capital of each cache, same order as portfolio_list
    Returns (daily_pnl,account_curve,size_df,ind_return,gross,turnover,signal_df).
    '''
    total_capital=np.sum(capital_list)

    def weighted(slot):
        # Scale one element of every cache by capital/total and join side by side
        pieces=[]
        for position,cache in enumerate(portfolio_list):
            pieces.append(cache[slot]*capital_list[position]/total_capital)
        return pd.concat(pieces,axis=1)

    daily_pnl=weighted(0)
    signal_df=weighted(-1)
    size_df=weighted(2)

    cumulative_pnl=daily_pnl.cumsum().ffill()
    account_curve=cumulative_pnl.sum(axis=1)
    ind_return=cumulative_pnl.iloc[-1]

    gross=np.abs(size_df).sum(axis=1)
    traded=np.abs(size_df.fillna(0.0).diff(1)).sum().sum()
    turnover=traded/(size_df.shape[0]/260)

    return (daily_pnl,account_curve,size_df,ind_return,gross,turnover,signal_df)


In [63]:
def sharpe_by_year(account_curve,year_list):
    '''
    Run trading_analytics_simp on each calendar year of the account curve and
    keep element 0 of its result, keyed by year.
    '''
    dates=account_curve.index
    result_dict={}
    for year in year_list:
        # Rows dated from 1 Jan to 31 Dec of the year, both ends included
        from_start=dates>=pd.Timestamp(year,1,1)
        to_end=dates<=pd.Timestamp(year,12,31)
        year_curve=account_curve.iloc[from_start&to_end]
        result_dict[year]=trading_analytics_simp(year_curve)[0]
    return result_dict

In [64]:
def dd_by_year(account_curve,year_list):
    '''
    Run trading_analytics_simp on each calendar year of the account curve and
    keep element 1 of its result, keyed by year.
    '''
    def year_slice(year):
        # Rows dated from 1 Jan to 31 Dec of the year, both ends included
        stamps=account_curve.index
        inside=(stamps>=pd.Timestamp(year,1,1))&(stamps<=pd.Timestamp(year,12,31))
        return account_curve.iloc[inside]

    return {year:trading_analytics_simp(year_slice(year))[1] for year in year_list}

In [1]:
def quarter_pnl_df(pnl_series,split=30):
    '''
    Split the pnl of every calendar quarter into a first and a second part.

    pnl_series: cumulative pnl curve with a datetime index
    split: number of rows of each quarter that belong to the first part (default 30)

    Returns a dataframe indexed by "<year>Q<n>" labels, in order of first appearance,
    with columns "first_half" and "second_half". Each value is the last minus the
    first value of the curve inside that part. A part without rows (a quarter of
    split rows or fewer has no second part) is NaN.
    '''
    pnl=pnl_series.to_frame()
    pnl.columns=["pnl"]
    pnl["Quarter"]=[str(stamp.year)+"Q"+str((stamp.month-1)//3+1) for stamp in pnl.index]

    def change(values):
        # An empty slice has no first/last row to subtract
        if len(values)==0:
            return np.nan
        return values.iloc[-1]-values.iloc[0]

    quarter_first_dict={}
    quarter_second_dict={}
    # sort=False keeps the quarters in the order they appear in the series
    for label,block in pnl.groupby("Quarter",sort=False):
        quarter_curve=block["pnl"]
        quarter_first_dict[label]=change(quarter_curve.iloc[:split])
        quarter_second_dict[label]=change(quarter_curve.iloc[split:])

    quarter_df=pd.Series(quarter_first_dict).to_frame()
    quarter_df.columns=["first_half"]
    # Both dicts were filled in the same order, so the values line up positionally
    quarter_df["second_half"]=list(quarter_second_dict.values())

    return quarter_df

In [2]:
def Asia_split_portfolio(portfolio_cache,index_count=6):
    ''' 
    country separation analysis

    portfolio_cache: portfolio cache laid out like the one of signal_account
                     (element 0 daily pnl, element 2 size, last element signal frame)
                     whose trailing index_count columns are the hedge indices
    index_count: number of trailing hedge-index columns to set aside (default 6)

    The country of a position is the last two characters of the first element of its
    column label; Asia_mapping gives the hedge index column of each country.
    Returns a dict keyed by country, each value being
    [pnl_df,acct_curve,size_df,ind_return,gross,turnover,portfolio_df] restricted to
    that country's positions plus its hedge index. ind_return and turnover are
    computed on the positions only.
    '''
    def alpha_part(frame):
        # Everything except the trailing hedge-index columns
        return frame.iloc[:,:max(frame.shape[1]-index_count,0)].copy()

    pnl_df=alpha_part(portfolio_cache[0])
    size_df=alpha_part(portfolio_cache[2])
    portfolio_df=alpha_part(portfolio_cache[-1])

    country_row=pnl_df.apply(lambda x:x.name[0][-2:],axis=0)

    country_dict={}
    for i in country_row.unique():
        in_country=country_row==i
        hedge_name=Asia_mapping.loc[i].iloc[0]

        country_pnl=pnl_df.loc[:,in_country]
        country_size=size_df.loc[:,in_country]

        sub_pnl_df=pd.concat([country_pnl,portfolio_cache[0][hedge_name]],axis=1)
        sub_acct_curve=sub_pnl_df.cumsum().ffill().sum(axis=1)

        sub_size_df=pd.concat([country_size,portfolio_cache[2][hedge_name]],axis=1)
        sub_ind_return=country_pnl.cumsum().ffill().iloc[-1].dropna()

        sub_gross=np.abs(sub_size_df).sum(axis=1).sort_index()
        # Traded position size per 260 rows
        sub_turnover=(np.abs(country_size.fillna(0.0).diff(1)).sum().sum())/\
        (country_size.shape[0]/260)

        sub_portfolio_df=pd.concat([portfolio_df.loc[:,in_country],portfolio_cache[-1][hedge_name]],axis=1)

        country_dict[i]=[sub_pnl_df,sub_acct_curve,sub_size_df,sub_ind_return,sub_gross,sub_turnover,sub_portfolio_df]
    return country_dict