In [1]:
from common import *

In [2]:
import warnings
# Silence every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Raise the pandas display limits so frames of up to 5000 rows/columns are printed in full.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 5000)

In [3]:
def get_M():
    """Read and return the pickled object stored at `the_data/'given/M.pkl'`."""
    return pd.read_pickle(the_data/'given/M.pkl')


def get_N():
    """Read and return the pickled object stored at `the_data/'given/N.pkl'`."""
    return pd.read_pickle(the_data/'given/N.pkl')

In [4]:
# Load M up front; the asset id assigners further down are seeded from its assetCode/assetName columns.
M = get_M()

#### very important ID assignment code

In [5]:
class IdAssign:
    """Callable that hands out integer ids to hashable keys and remembers them.

    `self.map` holds key -> id.  The optional `missing` key is pinned to -1 and the
    keys from `init` receive 0, 1, 2, ... in order of first appearance.  Calling the
    instance with an unseen key registers it under `len(self.map)` (taken before the
    insertion, so the count includes the `missing` entry when there is one).
    """

    class NO_MISSING(metaclass=staticclass):
        # Sentinel meaning "there is no missing-value key".
        # `None` is not used for this because `None` might be a meaningful key.
        pass

    def __init__(self, init, missing, name):
        """
        init    -- iterable of keys to pre-register, in order
        missing -- key that is mapped to -1, or `IdAssign.NO_MISSING` for none
        name    -- name of the lookup series; its index is named name+'Id'
        """
        self.name = name
        self.map = {}
        if missing is not __class__.NO_MISSING:
            self.map[missing] = -1
        next_id = 0
        for key in init:
            if key in self.map:
                continue
            if key != missing:
                self.map[key] = next_id
                next_id += 1
        # Length of `self.map` at the time `self._series` was last built (None = never built).
        self.cache = None

    def __call__(self, key):
        """Return the id of `key`, registering it first if it has not been seen."""
        known = self.map
        if key in known:
            return known[key]
        fresh_id = len(known)
        known[key] = fresh_id
        return fresh_id

    def __len__(self):
        """Number of registered keys (the `missing` key included)."""
        return len(self.map)

    @property
    def series(self):
        """Inverse lookup as a pandas Series: index = id, value = key.

        The series is rebuilt only when the number of registered keys has changed
        since the last build.
        """
        if self.cache != len(self):
            # dicts preserve insertion order (guaranteed since Python 3.7) and keys()/values()
            # iterate in matching order, so position i of both lists is the same entry.
            self._series = pd.Series(list(self.map.keys()), index=list(self.map.values()), name=self.name)
            self._series.index.name = self.name+'Id'
            self.cache = len(self)
        return self._series

# Seed one id assigner per asset column from the distinct values found in M.
# '' (asset code) and 'Unknown' (asset name) are the missing-value keys and get id -1.
assetCodeIdAssign = IdAssign(
    pd.Series(M.assetCode.unique()), missing='', name='assetCode',
)
assetNameIdAssign = IdAssign(
    pd.Series(M.assetName.unique()), missing='Unknown', name='assetName',
)

In [6]:
# Encode M's asset columns as integer ids (both assigners were built from M's own distinct values).
M['assetCodeId'] = M.assetCode.map(assetCodeIdAssign).astype(int)
M['assetNameId'] = M.assetName.map(assetNameIdAssign).astype(int)
N = get_N()
# Mapping N's names through the assigner also registers, as a side effect, any name it has not seen yet.
N['assetNameId'] = N.assetName.map(assetNameIdAssign) # set up assetNameIdAssign

In [7]:
def makeMultimap(fr, to):
    """Pair up `fr` and `to` element-wise and group the `to` values by `fr` value.

    Returns a plain dict mapping each distinct `fr` element to the set of `to`
    elements it was paired with.  Pairing stops at the shorter input.
    """
    grouped = {}
    for source, target in zip(fr, to):
        if source in grouped:
            grouped[source].add(target)
        else:
            grouped[source] = {target}
    return grouped
# assetNameId -> set of every assetCodeId that appears together with that name in M
assetNameMapCodes = makeMultimap(M.assetNameId, M.assetCodeId)

def makeSinglemap(fr, to):
    """Pair up `fr` and `to` element-wise into a dict, requiring a unique target per source.

    Parameters:
        fr -- iterable of hashable "from" elements
        to -- iterable of "to" elements, paired positionally with `fr`
              (pairing stops at the shorter input)

    Returns:
        dict mapping each distinct `fr` element to its `to` element.

    Raises:
        AssertionError -- if some `fr` element is paired with two unequal `to` elements.
    """
    ret = {}
    for f, t in zip(fr, to):
        if f not in ret:
            ret[f] = t
        elif not (t == ret[f]):
            # Raised explicitly rather than through an `assert` statement, so the
            # uniqueness check is still enforced when Python runs with -O.
            raise AssertionError('a "from" element must be mapped to a unique "to" element!')
    return ret
# assetCodeId -> assetNameId; building it fails if any code in M appears under more than one name
assetCodeMapName = makeSinglemap(M.assetCodeId, M.assetNameId)

In [None]:
# Headline tags get their own assigner, seeded from N's distinct tags; '' is the missing-value key (id -1).
headlineTagIdAssign = IdAssign(
    pd.Series(N.headlineTag.unique()), missing='', name='headlineTag',
)

In [None]:
# Drop the notebook-level reference to N so its memory can be reclaimed.
del N;

In [None]:
# Convenience functions for interactive use only. Do not call them from scripts or inside loops: they are very unoptimized.
# The ABC lives in `collections.abc`; the old `collections.Iterable` alias was removed in Python 3.10.
from collections.abc import Iterable
def aId2Code(a):
    """Translate asset-code id(s) back to asset code(s) using `assetCodeIdAssign.series`.

    * an object with a `.map` method is mapped element-wise through the series;
    * any other non-string iterable is rebuilt as the same container type;
    * anything else is treated as a single id and looked up with `.loc`.
    """
    if hasattr(a, 'map'):
        return a.map(assetCodeIdAssign.series)
    if not isinstance(a, Iterable) or isinstance(a, str):
        return assetCodeIdAssign.series.loc[a]
    codeById = assetCodeIdAssign.series.to_dict()
    return type(a)(codeById[i] for i in a)
def aId2Name(a):
    """Translate asset-name id(s) back to asset name(s) using `assetNameIdAssign.series`.

    * an object with a `.map` method is mapped element-wise through the series;
    * any other non-string iterable is rebuilt as the same container type;
    * anything else is treated as a single id and looked up with `.loc`.
    """
    if hasattr(a, 'map'):
        return a.map(assetNameIdAssign.series)
    if not isinstance(a, Iterable) or isinstance(a, str):
        return assetNameIdAssign.series.loc[a]
    nameById = assetNameIdAssign.series.to_dict()
    return type(a)(nameById[i] for i in a)
def aCodeId2Name(a):
    """Translate asset-code id(s) to asset name(s).

    Each code id is first sent to its name id through `assetCodeMapName`, then the
    name id is resolved with `assetNameIdAssign.series`.  Input handling:

    * an object with a `.map` method is mapped element-wise (two chained maps);
    * any other non-string iterable is rebuilt as the same container type;
    * anything else is treated as a single code id.
    """
    if hasattr(a, 'map'):
        nameIds = a.map(assetCodeMapName)
        return nameIds.map(assetNameIdAssign.series)
    if not isinstance(a, Iterable) or isinstance(a, str):
        return assetNameIdAssign.series.loc[assetCodeMapName[a]]
    nameById = assetNameIdAssign.series.to_dict()
    return type(a)(nameById[assetCodeMapName[i]] for i in a)

#### Data-specific helper functions and structures

In [None]:
def stringify_columns(f):
    """Render the string items of `f` as a bracketed, double-quoted, comma-separated list.

    Example: ['a', 'b'] -> '["a","b"]'.  No escaping is applied to the items, and an
    empty input yields '[""]'.
    """
    joined = '","'.join(f)
    return '["' + joined + '"]'

# Original returns column -> short alias (entry order is significant to code iterating this dict).
returns_columns = {
    'returnsClosePrevRaw1': 'cc',
    'returnsOpenPrevRaw1': 'oo',
    'returnsClosePrevMktres1': 'acc',
    'returnsOpenPrevMktres1': 'aoo',
    'returnsClosePrevRaw10': 'ccTEN',
    'returnsOpenPrevRaw10': 'ooTEN',
    'returnsClosePrevMktres10': 'accTEN',
    'returnsOpenPrevMktres10': 'aooTEN',
}

# Every short returns alias plus the open/close/volume columns.
columns_for_U = {*returns_columns.values(), 'open', 'close', 'volume'}

# Column names rejected by `exclusion_filter`.
excluded_columns = [
    'time',
    'assetCode',
    'assetName',
    'universe',
    'returnsOpenNextMktres10',
    'quarter',
    'y',
]


def exclusion_filter(c):
    """Return True when column name `c` is not listed in `excluded_columns`."""
    return c not in excluded_columns


object_columns = ['assetCode', 'assetName']
enumeration_columns = ['assetCodeId', 'assetNameId']

#### Hard-coded constants

In [None]:
# Rows of M with time >= this UTC timestamp are the ones counted when building the in-universe stock sets.
train_start_time = pd.Timestamp('2009-01-01',tz='UTC')
# Window sizes. NOTE(review): presumably measured in trading days — confirm.
lookback = 60
shortterm = 21
# Deliberately generous: one 250 block plus two lookback windows.
longterm = 250 + lookback*2 #idk it's hard to reason about how much I _really_ need, so this should be pretty safe

#### setting up only the stocks that we have seen in-universe

In [None]:
# Per assetCodeId: sum of the `universe` column over the rows at or after train_start_time.
inUniCount = M[M.time>=train_start_time].groupby('assetCodeId').universe.sum()
# Ids whose universe count is non-zero.
stocksInUni = (inUniCount != 0).pipe(lambda x: x.index[x])
# The ids are expected to come out in ascending order.  `Index.is_monotonic` was removed in
# pandas 2.0; `is_monotonic_increasing` is the equivalent property and exists in older versions too.
assert stocksInUni.is_monotonic_increasing
xStocksInUni = set(stocksInUni) #EDITCELL
inUniCountSeries = inUniCount.astype(float) #EDITCELL
# Ids whose universe count reaches 1981.
# NOTE(review): 1981 is presumably the number of trading days in the counted period — confirm.
stocksAlways = (inUniCount >= 1981).pipe(lambda x: x.index[x])
xStocksAlways = set(stocksAlways)

#### set up everything

In [4]:
def set_basic_features(*,M,delete=True):
    """Add the basic derived columns to frame `M`, modifying it in place.

    M      -- frame holding assetCode, assetName, time and every key of `returns_columns`
    delete -- when true, each source column (asset code/name and the original returns
              columns) is removed right after its derived column has been written

    Columns written: assetCodeId, assetNameId, one log1p-transformed column per entry
    of `returns_columns` (named by its alias), dayOfYear and dayOfWeek.
    Values not yet known to the id assigners get registered by them as a side effect.
    """
    # The ids are intentionally not cast with .astype(int): per the original note it is
    # much more efficient to process everything as float.
    id_specs = (
        ('assetCode', 'assetCodeId', assetCodeIdAssign),
        ('assetName', 'assetNameId', assetNameIdAssign),
    )
    for raw_name, id_name, assigner in id_specs:
        M[id_name] = M[raw_name].map(assigner)
        if delete:
            del M[raw_name]

    for raw_name, alias in returns_columns.items():
        M[alias] = np.log1p(M[raw_name])
        if delete:
            del M[raw_name]

    # time features
    when = M.time.dt
    M['dayOfYear'] = when.dayofyear.astype(float)
    M['dayOfWeek'] = when.dayofweek.astype(float)
# Apply to the notebook-level M, keeping its original columns (delete=False).
set_basic_features(M=M,delete=False)

#### P setup

In [None]:
def setup_P(*, F, P, vp=True, always=True):
    """Add label and sample-weight columns to `P` in place.

    Reads `P.y`; with `vp` set it also reads `F.vp1`, `F.vp5`, `F.vp10`, `F.vp20`,
    and with `always` set it reads `F.assetCodeId` and the module-level `xStocksAlways`.
    NOTE(review): assumes F and P share the same index so the products align row by row — confirm.
    """
    P['target'] = P.y>0  # boolean label: True where y is positive
    P['upDown'] = (P.target*2-1)  # the same label encoded as +1 / -1
    P['absVal'] = np.abs(P.y)
    P['flat_weight'] = P.absVal  # weight = |y|
    if vp:
        # |y| scaled by the matching F.vpN column, divided by 1e9
        P['vp1_weight'] = P.absVal*F.vp1/1e9
        P['vp5_weight'] = P.absVal*F.vp5/1e9
        P['vp10_weight'] = P.absVal*F.vp10/1e9
        P['vp20_weight'] = P.absVal*F.vp20/1e9
    if always:
        # |y| for rows whose assetCodeId is in xStocksAlways; the boolean mask zeroes the rest
        P['always_weight'] = P.absVal*F.assetCodeId.isin(xStocksAlways)