In [1]:
from common import *
import globals as top_imports

In [2]:
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Let pandas render up to 5000 rows / 5000 columns before truncating output.
pd.options.display.max_rows = 5000
pd.options.display.max_columns = 5000

In [3]:
# NOTE: unpickling can execute arbitrary code; only point these loaders at trusted files.
def get_M():
    """Load and return the pickled object stored at `the_data/given/M.pkl`."""
    return pd.read_pickle(the_data/'given/M.pkl')

def get_N():
    """Load and return the pickled object stored at `the_data/given/N.pkl`."""
    return pd.read_pickle(the_data/'given/N.pkl')

In [4]:
# M is only loaded when the `use_M` flag of the `globals` module is set;
# later cells guarded by the same flag rely on this binding.
if top_imports.use_M:
    M = get_M()

#### very important ID assignment code

In [5]:
if top_imports.use_M:
    class IdAssign:
        """Callable that hands out integer ids to hashable keys.

        * `missing` (unless it is the `NO_MISSING` sentinel) is registered with id -1.
        * Every other distinct value of `init` gets 0, 1, 2, ... in first-seen order.
        * Calling the instance with an unknown key registers it with id
          `len(self.map)` at that moment (the `missing` entry, if any, is counted).

        `name` labels the lookup table exposed through the `series` property.
        """
        class NO_MISSING(metaclass=staticclass): # sentinel for "there is no missing value"; `None` is not used because it may be a real key
            pass

        def __init__(self, init, missing, name):
            self.name = name
            self.map = {}
            has_missing = missing is not __class__.NO_MISSING
            if has_missing:
                self.map[missing] = -1
            next_id = 0
            for key in init:
                if key in self.map:
                    continue
                if key != missing:
                    self.map[key] = next_id
                    next_id += 1
            # size of `self.map` when `_series` was last built; None = never built
            self.cache = None

        def __call__(self, key):
            # `len(self.map)` is evaluated first and only stored when `key` is new
            return self.map.setdefault(key, len(self.map))

        def __len__(self):
            return len(self.map)

        @property
        def series(self):
            """Series with ids as index (named `<name>Id`) and keys as values (named `<name>`).

            Rebuilt only when the number of registered keys has changed since the last build.
            """
            if self.cache != len(self):
                # dicts iterate keys and values in the same (insertion) order, guaranteed since Python 3.7
                table = pd.Series(list(self.map), index=list(self.map.values()), name=self.name)
                table.index.name = self.name+'Id'
                self._series = table
                self.cache = len(self)
            return self._series

    # Seed one assigner per asset column from the distinct values found in M.
    # '' (codes) and 'Unknown' (names) are the missing markers and receive id -1.
    assetCodeIdAssign = IdAssign(pd.Series(M.assetCode.unique()), missing='', name='assetCode')
    assetNameIdAssign = IdAssign(pd.Series(M.assetName.unique()), missing='Unknown', name='assetName')

In [6]:
if top_imports.use_M:
    # Translate the raw asset code / name columns of M into integer id columns
    # by calling the IdAssign instances on every value.
    M['assetCodeId'] = M.assetCode.map(assetCodeIdAssign).astype(int)
    M['assetNameId'] = M.assetName.map(assetNameIdAssign).astype(int)
    if top_imports.use_N:
        # N is loaded here only when both use_M and use_N are set.
        N = get_N()
        # Calling the assigner on N's names also registers any name it has not seen yet.
        N['assetNameId'] = N.assetName.map(assetNameIdAssign) # set up assetNameIdAssign

In [7]:
if top_imports.use_N:
    def makeMultimap(fr, to):
        """Pair up `fr` and `to` element-wise and return {fr value: set of its `to` values}."""
        grouped = {}
        for key, value in zip(fr, to):
            if key not in grouped:
                grouped[key] = set()
            grouped[key].add(value)
        return grouped
    # assetNameId -> set of every assetCodeId that shares a row with it in M
    assetNameMapCodes = makeMultimap(M.assetNameId, M.assetCodeId)

    def makeSinglemap(fr, to):
        """Pair up `fr` and `to` element-wise and return {fr value: to value}.

        Raises AssertionError when one `fr` value is paired with two different `to` values.
        """
        mapping = {}
        for src, dst in zip(fr, to):
            if src not in mapping:
                mapping[src] = dst
                continue
            assert dst==mapping[src], 'a "from" element must be mapped to a unique "to" element!'
        return mapping
    # assetCodeId -> its assetNameId in M; makeSinglemap asserts that each code has exactly one name
    assetCodeMapName = makeSinglemap(M.assetCodeId, M.assetNameId)

In [None]:
if top_imports.use_N:
    # Seed the headline-tag assigner from the distinct tags in N; '' is the missing marker (id -1).
    headlineTagIdAssign = IdAssign(pd.Series(N.headlineTag.unique()), missing='', name='headlineTag')

In [None]:
# Drop the module-level reference to N; none of the visible cells below read it again.
if top_imports.use_N:
    del N;

In [None]:
# convenience functions, only for interactive use, no scripting/inside loop please because very unoptimized

if top_imports.use_M:
    # `collections.Iterable` was a deprecated alias that Python 3.10 removed;
    # the ABC has lived in `collections.abc` since Python 3.3.
    from collections.abc import Iterable

    def aId2Code(a):
        """Translate asset-code id(s) back to asset code(s).

        * object with a `.map` method (e.g. a pandas Series): returns `a.map(...)`
        * any other non-string iterable: returns `type(a)` rebuilt from the looked-up values
        * anything else: treated as a single id and looked up with `.loc`
        """
        if hasattr(a, 'map'):
            return a.map(assetCodeIdAssign.series)
        elif isinstance(a, Iterable) and not isinstance(a, str):
            assetCodeMap = assetCodeIdAssign.series.to_dict()
            return type(a)(assetCodeMap[x] for x in a)
        else:
            return assetCodeIdAssign.series.loc[a]

    if top_imports.use_N:
        def aId2Name(a):
            """Translate asset-name id(s) back to asset name(s); same input handling as `aId2Code`."""
            if hasattr(a, 'map'):
                return a.map(assetNameIdAssign.series)
            elif isinstance(a, Iterable) and not isinstance(a, str):
                assetNameMap = assetNameIdAssign.series.to_dict()
                return type(a)(assetNameMap[x] for x in a)
            else:
                return assetNameIdAssign.series.loc[a]

        def aCodeId2Name(a):
            """Translate asset-code id(s) to asset name(s).

            Each code id is first mapped to its name id through `assetCodeMapName`,
            then to the name itself; same input handling as `aId2Code`.
            """
            if hasattr(a, 'map'):
                return a.map(assetCodeMapName).map(assetNameIdAssign.series)
            elif isinstance(a, Iterable) and not isinstance(a, str):
                assetNameMap = assetNameIdAssign.series.to_dict()
                return type(a)(assetNameMap[assetCodeMapName[x]] for x in a)
            else:
                return assetNameIdAssign.series.loc[assetCodeMapName[a]]

#### some data specific helper func/structs

In [None]:
def stringify_columns(f):
    """Render the items of `f` as a string like '["a","b"]' (each item double-quoted, comma-separated)."""
    quoted_body = '","'.join(f)
    return '["' + quoted_body + '"]'

# original returns column -> short alias (insertion order is relied on when iterating)
returns_columns = {
    'returnsClosePrevRaw1': 'cc',
    'returnsOpenPrevRaw1': 'oo',
    'returnsClosePrevMktres1': 'acc',
    'returnsOpenPrevMktres1': 'aoo',
    'returnsClosePrevRaw10': 'ccTEN',
    'returnsOpenPrevRaw10': 'ooTEN',
    'returnsClosePrevMktres10': 'accTEN',
    'returnsOpenPrevMktres10': 'aooTEN',
}
# every returns alias plus the raw price/volume columns
columns_for_U = {'open', 'close', 'volume'}.union(returns_columns.values())
excluded_columns = [
    'time',
    'assetCode',
    'assetName',
    'universe',
    'returnsOpenNextMktres10',
    'quarter',
    'y',
]

def exclusion_filter(c):
    """True when column name `c` is not listed in `excluded_columns` (looked up at call time)."""
    return c not in excluded_columns

object_columns = ['assetCode', 'assetName']
enumeration_columns = ['assetCodeId', 'assetNameId']

#### hard coded constants

In [None]:
# UTC timestamp used as the lower bound when counting in-universe rows
train_start_time = pd.Timestamp('2009-01-01').tz_localize('UTC')
lookback = 60
shortterm = 21
# idk it's hard to reason about how much I _really_ need, so this should be pretty safe
longterm = 2*lookback + 250

#### setting up only the stocks that we have seen in-universe

In [None]:
if top_imports.use_M:
    # Sum of the `universe` column per assetCodeId over rows at or after train_start_time.
    inUniCount = M[M.time>=train_start_time].groupby('assetCodeId').universe.sum()
    # ids whose sum is non-zero, i.e. seen in-universe at least once
    stocksInUni = (inUniCount != 0).pipe(lambda x: x.index[x])
    # `Index.is_monotonic` was deprecated in pandas 1.5 and removed in 2.0;
    # `is_monotonic_increasing` is the same check and exists in old and new pandas.
    assert stocksInUni.is_monotonic_increasing
    xStocksInUni = set(stocksInUni) #EDITCELL
    inUniCountSeries = inUniCount.astype(float) #EDITCELL
    # NOTE(review): 1981 is a hard-coded threshold -- presumably the number of
    # timestamps in the training window, so "always" means in-universe on every one; confirm.
    stocksAlways = (inUniCount >= 1981).pipe(lambda x: x.index[x])
    xStocksAlways = set(stocksAlways)

#### set up everything

In [4]:
def set_basic_features(*,M,delete=True):
    """Add id, log-return and calendar columns to `M` in place.

    * `assetCodeId` / `assetNameId`: `assetCode` / `assetName` mapped through the
      module-level IdAssign instances (no integer cast is applied here).
    * one column per alias in `returns_columns`, holding `log1p` of the original column.
    * `dayOfYear` / `dayOfWeek`: derived from `M.time`, stored as float.

    With `delete=True` the source asset and returns columns are removed from `M`.
    Returns None.
    """
    # original author's note on skipping .astype(int): much more efficient to process everything as float
    id_sources = (('assetCode', assetCodeIdAssign), ('assetName', assetNameIdAssign))
    for raw_col, assigner in id_sources:
        M[raw_col+'Id'] = M[raw_col].map(assigner)
        if delete:
            del M[raw_col]

    for source_col, alias in returns_columns.items():
        M[alias] = np.log1p(M[source_col])
        if delete:
            del M[source_col]

    # time features
    when = M.time.dt
    M['dayOfYear'] = when.dayofyear.astype(float)
    M['dayOfWeek'] = when.dayofweek.astype(float)

if top_imports.use_M:
    # delete=False: keep the raw assetCode/assetName and original returns columns in M
    set_basic_features(M=M,delete=False)

#### P setup

In [None]:
def setup_P(*, F, P, vp=True, always=True):
    """Add target and sample-weight columns to `P` in place (returns None).

    Always written: `quarter` (year + (quarter-1)/4), `target` (y > 0),
    `upDown` (+1 / -1), `absVal` (|y|) and `flat_weight` (= absVal).
    vp=True     : also `vp{1,5,10,20}_weight` = absVal * F.vp{n} / 1e9.
    always=True : also `always_weight` = absVal where F.assetCodeId is in
                  `xStocksAlways`, 0 elsewhere.
    `F` is only read when `vp` or `always` is true.
    """
    when = P.time.dt
    P['quarter'] = when.year+(when.quarter-1)/4
    P['target'] = P.y>0
    P['upDown'] = (P.target*2-1)
    P['absVal'] = np.abs(P.y)
    P['flat_weight'] = P.absVal
    if vp:
        for horizon in (1, 5, 10, 20):
            P[f'vp{horizon}_weight'] = P.absVal*getattr(F, f'vp{horizon}')/1e9
    if always:
        P['always_weight'] = P.absVal*F.assetCodeId.isin(xStocksAlways)

#### pairs setup

In [None]:
from helper.bisect import bisect_left as binary_search

@njit
def index_unique_pairs(A, B, values, repeat):
    # Select positions i of the pairs (A[i], B[i]) such that no element is used
    # in more than `repeat` selected pairs; pairs are considered in input order.
    #
    # A, B   : equal-length sequences holding the two sides of each pair.
    # values : lookup table; `binary_search(values, x)` gives the slot used to
    #          count x.  NOTE(review): presumably must be sorted and contain
    #          every element of A and B (the caller passes a sorted union) -- confirm.
    # repeat : per-element cap; -1 disables the cap, so every position is returned.
    # Returns the list of selected positions.
    assert len(A)==len(B)
    seen = [0] * len(values)  # seen[k]: number of selected pairs that contain values[k]
    ans = []
    for i, (a, b) in enumerate(zip(A, B)):
        ia = binary_search(values, a); ib = binary_search(values, b);
        # keep the pair only while both of its elements are still under the cap
        if repeat==-1 or (seen[ia]<repeat and seen[ib]<repeat):
            seen[ia] += 1; seen[ib] += 1;
            ans.append(i)
    return ans

def make_unique_pairs(*, Dl, repeat=-1):
    """Thin the pair list `Dl` so that no element takes part in more than `repeat` kept pairs.

    `Dl[0]` and `Dl[1]` hold the two sides of each pair.  With `repeat == -1`
    (the default) `Dl` itself is returned untouched; otherwise the rows chosen
    by `index_unique_pairs` are returned via `Dl.iloc`.
    """
    if repeat != -1:
        left, right = Dl[0].values, Dl[1].values
        lookup = sorted(set(left)|set(right))
        kept = index_unique_pairs(left, right, lookup, repeat=repeat)
        return Dl.iloc[kept]
    return Dl

In [None]:
def make_CC_G_Q_from_Dl_F_P_W(*, Dl, F, P, W, lo, hi=None, repeat=-1,
                              roll_corr=list(product(['ay1','ay10','ay20'],[10,21,62,250]))):
    """Expand a list of asset pairs into per-timestamp rows for both sides of each pair.

    For every selected pair (a0, a1), the rows of `F` / `P` belonging to each asset
    are restricted to timestamps where both assets have `P.universe != 0`, and
    rolling correlations between the two assets are computed from `W`.

    Parameters
    ----------
    Dl : DataFrame with exactly the columns 0, 1, 'Corr', 'Unic', in that order
        (rows are unpacked positionally).  Columns 0 and 1 hold the asset ids.
        NOTE(review): 'Unic' is used as the number of output rows a pair
        contributes, so it presumably equals the number of timestamps on which
        both assets are in-universe -- confirm against whatever builds Dl.
    F, P : frames filtered with the same boolean masks; `F` needs `assetCodeId`,
        `P` needs `time` and `universe`.
    W : mapping such that `W[y][asset]` supports `.rolling(...).corr(...)` for
        every `y` in `roll_corr`.
        NOTE(review): it is masked with a boolean array built from the unique
        times of `P`, so presumably it has one row per unique time, in the same
        order -- confirm.
    lo, hi : (corr, unic) inclusive lower / upper bounds used to select pairs;
        `hi` defaults to (1., 9999).
    repeat : int -> thin pairs with `make_unique_pairs` and weight each pair 1.0;
        'prop' -> keep all pairs, weight = Corr / max(total Corr of either asset),
        rescaled to mean 1.  Anything else fails an assertion.
    roll_corr : sequence of (key into W, rolling window) tuples.

    Returns
    -------
    CC : DataFrame with one `__corr__{y}_{r}` column per `roll_corr` entry plus
        'bothInUniCount' and 'weight'.
    (G0, G1) : feature frames for side 0 / side 1 (columns of `F`).
    (Q0, Q1) : the matching rows of `P` for side 0 / side 1.
    """
    SEE(f'(pair) len(F.columns) = {len(F.columns)}')

    D = Dl
    allTime = pd.Series(P.time.unique())
    allTime.index = allTime

    # |begin| subset the pairs list
    if hi is None:
        hi = (1., 9999)
    D = D[(lo[0]<=D.Corr)&(D.Corr<=hi[0])&(lo[1]<=D.Unic)&(D.Unic<=hi[1])]
    if isinstance(repeat, int):
        D = D.pipe(lambda x: make_unique_pairs(Dl=x, repeat=repeat))
        w = (D.Corr.fillna(0)*0 + 1.).values
    elif repeat=='prop':
        # Total Corr of every asset over all pairs it takes part in, on either side.
        # The column *labels* have to be swapped: merely reordering the columns
        # (`D[[1,0,'Corr','Unic']]`) is undone by `pd.concat`, which aligns on
        # labels, so assets that only ever appear in column 1 would get a NaN
        # total and the mean-normalisation below would then turn every weight into NaN.
        D01 = D
        D10 = D.rename(columns={0: 1, 1: 0})
        D2 = pd.concat([D01,D10], axis=0)
        aw = D2.groupby(0).Corr.sum()
        w = (D.Corr / np.maximum(D[0].map(aw), D[1].map(aw))).values
        w /= w.mean()
    else:
        assert False
    # total number of output rows = sum of the last column ('Unic') over the selected pairs
    hgt = sum(p[-1] for p in D.itertuples())
    CC = np.zeros((hgt,2+len(roll_corr)))
    G0, G1 = np.zeros((hgt,len(F.columns))), np.zeros((hgt,len(F.columns)))
    Q0, Q1 = [], []
    # |end| subset the pairs list

    i = 0  # write offset into the preallocated CC / G0 / G1 buffers
    SEE(f'(pair) generating [{len(D)}]', end='')
    for ii, ((_,a0,a1,corr,unic), weight) in enumerate(zip(D.itertuples(),w)):
        print_progress(ii, dot=10, print=SEE)
        assert len(weight.shape)==0
        in0, in1 = (F.assetCodeId==a0)&(P.universe!=0), (F.assetCodeId==a1)&(P.universe!=0)
        A0, A1 = F[in0], F[in1]
        B0, B1 = P[in0], P[in1]
        # keep only the timestamps on which both assets are present
        inTime = set(B0.time)&set(B1.time)
        tm0, tm1 = B0.time.isin(inTime), B1.time.isin(inTime)
        A0, A1 = A0[tm0], A1[tm1]
        B0, B1 = B0[tm0], B1[tm1]

        tm = allTime.isin(inTime).values
        def iter_roll_corr():
            for y, r in roll_corr:
                yield W[y][a0].rolling(window=r, min_periods=5).corr(W[y][a1])[tm] #TODO hard coded min_periods
        # constant columns (pair row count and pair weight), shaped like one masked column of W
        ones = np.ones_like(W[next(iter(W))].iloc[:,0][tm])
        unic_ = [ones*unic]
        weight_ = [ones*weight]
        # np.stack needs a real sequence: passing an iterator has been deprecated
        # since NumPy 1.16 and is rejected by current releases.
        C = np.stack(list(chain(iter_roll_corr(),unic_,weight_)), axis=1)

        G0[i:i+unic], G1[i:i+unic] = A0.values, A1.values
        Q0.append(B0); Q1.append(B1);
        CC[i:i+unic] = C

        i += unic
    SEE()

    Fcols = F.columns
    # drop the local references to the large inputs before building the outputs
    del Dl, F, P, W; gc.collect()

    Q0, Q1 = pd.concat(Q0, axis=0), pd.concat(Q1, axis=0)
    G0, G1 = pd.DataFrame(G0, index=Q0.index, columns=Fcols), pd.DataFrame(G1, index=Q1.index, columns=Fcols)
    CC = pd.DataFrame(CC, columns=[f'__corr__{y}_{r}' for y,r in roll_corr]+['bothInUniCount','weight'])
    return CC, (G0,G1), (Q0,Q1)

In [None]:
def make_FG_PQ_from_CC_G_Q(copy_filter, diff_filter, plus_filter, corr_filter, *, CC, G, Q):
    """Assemble the pairwise feature frame FG and target frame PQ in both pair orders.

    Parameters
    ----------
    copy_filter, diff_filter, plus_filter : predicates over the column names of
        `G[0]`; selected columns are respectively copied for both sides
        (`__0__c`, `__1__c`), differenced (`__0-1__c`) and summed (`__0+1__c`).
    corr_filter : predicate over the column names of `CC`; selected columns are
        appended unchanged.
    CC : DataFrame with at least a 'weight' column, one row per pair row.
    G : (G0, G1) feature frames with identical columns.
    Q : (Q0, Q1) frames with identical columns, including `time` and `y`.

    Returns
    -------
    FG : DataFrame with the rows for order (0,1) followed by order (1,0), then
        reordered by time together with PQ.
    PQ : DataFrame with `time`, `y` (difference of the two sides' y),
        `__0__*` / `__1__*` asset columns, `pair_weight`, and the columns added
        by `setup_P(vp=False, always=False)`; sorted by `time`.
    """
    SEE('(pair) putting together...', end=' ')

    # taken before CC is narrowed by corr_filter below
    weight = CC['weight'].values

    Gcols, Qcols = G[0].columns, Q[0].columns
    assert (Gcols==G[1].columns).all() and (Qcols==Q[1].columns).all()

    cop, dif, pls = list(filter(copy_filter, Gcols)), list(filter(diff_filter, Gcols)), list(filter(plus_filter, Gcols))
    G0, G1 = G[0][cop].values, G[1][cop].values
    H01 = G[0][dif].values - G[1][dif].values
    I01 = G[0][pls].values + G[1][pls].values
    CC = CC[list(filter(corr_filter, CC.columns))]

    # swapping the pair order swaps the copied blocks and negates the differences
    F01, F10 = np.concatenate([G0,G1,H01,I01,CC.values], axis=1), np.concatenate([G1,G0,-H01,I01,CC.values], axis=1)
    FG = np.concatenate([F01, F10], axis=0)
    # `np.float` (an alias of the builtin float) was removed in NumPy 1.24;
    # `np.floating` is the abstract parent of all floating-point dtypes.
    if not np.issubdtype(FG.dtype, np.floating):
        FG = FG.astype(np.float32)

    FGcols = (['__0__'+c for c in cop] + ['__1__'+c for c in cop] + ['__0-1__'+c for c in dif]
              + ['__0+1__'+c for c in pls] + list(CC.columns))
    FG = pd.DataFrame(FG, columns=FGcols, copy=False)

    Q = Q[0].reset_index(drop=True), Q[1].reset_index(drop=True)
    assert(Q[0].time==Q[1].time).all()

    asset_cols = ['assetCode','assetCodeId','assetName','assetNameId']

    def directed(first, second):
        # target frame for the ordered pair (first, second); asset columns are copied only where present
        #TODO again we don't need universe here
        return pd.DataFrame(dict(time=first.time, y=(first.y-second.y),
            **{f'__0__{c}': first[c] for c in asset_cols if c in first},
            **{f'__1__{c}': second[c] for c in asset_cols if c in second}))

    Q01 = directed(Q[0], Q[1])
    Q10 = directed(Q[1], Q[0])

    PQ = pd.concat([Q01, Q10], axis=0, ignore_index=True)
    PQ['pair_weight'] = np.concatenate([weight, weight], axis=0)

    # sort targets by time and apply the same row permutation to the features
    PQ.sort_values('time', inplace=True)
    FG = FG.reindex(index=PQ.index, copy=False)

    FG.reset_index(drop=True, inplace=True)
    PQ.reset_index(drop=True, inplace=True)
    setup_P(F=None, P=PQ, vp=False, always=False)

    SEE('(pair) done')
    return FG, PQ