In [1]:
from common import *

  return func(obj, *args, **kwargs)


In [2]:
import pandas as pd
# Expected row counts of the given training and test sets.
N_TRAIN = 623817
N_TEST = 640430

In [2]:
# Generate the HDF5 caches from the original CSV files (script mode only).
if __name__ == '__main__':
    from decimal import Decimal
    # `y` is read as text so that it can be scaled with Decimal arithmetic
    # before being converted to float.
    df = pd.read_csv('train.csv', index_col=0, dtype={'y': str})
    # Shift the training index down by N_TRAIN; the loading cell later splits
    # train/test with `dh.loc[:-1]` / `dh.loc[0:]`.
    df.index = df.index - N_TRAIN
    df.y = (df.y.map(Decimal) * 10000).astype(float)
    # Rename the last column to 'w' by replacing the column index.  Writing
    # through `df.columns.values[-1]` mutates the backing array of an Index,
    # which pandas treats as immutable, and can leave cached lookups stale.
    df.columns = list(df.columns[:-1]) + ['w']
    df.to_hdf(top_dir + '/data/given/train.hdf5', mode='w', key='df')
    dg = pd.read_csv('test.csv', index_col=0)
    dg.to_hdf(top_dir + '/data/given/test.hdf5', mode='w', key='dg')

In [None]:
# Load the cached HDF5 frames; fall back to the raw CSVs when the caches are
# missing and this is being run as a script/notebook.
try:
    df = pd.read_hdf(top_dir + 'data/given/train.hdf5')
    dg = pd.read_hdf(top_dir + 'data/given/test.hdf5')
except FileNotFoundError:
    if __name__ != '__main__':
        raise
    df = pd.read_csv(top_dir + 'data/given/train.csv', index_col=0)
    df.index = df.index - N_TRAIN
    # NOTE(review): plain float scaling here, whereas the HDF5 generation cell
    # scales via Decimal -- results may differ in the last bits.
    df.y *= 10000
    # The HDF5 generation cell stores the last training column under the name
    # 'w'; do the same here so that `dh.w` below works on this path as well.
    df.columns = list(df.columns[:-1]) + ['w']
    dg = pd.read_csv(top_dir + 'data/given/test.csv', index_col=0)
# dh: train and test rows stacked into one frame.
dh = pd.concat([df, dg])
dh.columns.name = 'Feature'
# Arithmetic and geometric row means of the five x3* columns.
dh['x3a'] = dh['x3A x3B x3C x3D x3E'.split()].mean(axis=1)
dh['x3g'] = dh['x3A x3B x3C x3D x3E'.split()].pipe(np.log).mean(axis=1).pipe(np.exp)
dh['wy'] = dh.w * dh.y
# Keep the row label and a unit counter as ordinary columns.
dh['Index'] = dh.index
dh['Count'] = 1
# Re-derive the train/test frames by label: negative labels are training
# rows, labels from 0 upwards are test rows.
df, dg = dh.loc[:-1], dh.loc[0:]

In [None]:
def make_stock_wide(long):
    '''Params - long: `Series`/`DataFrame` whose index labels are all present in the module-level `dh`
    Returns the unstacked ("wide") form: rows are `Day`, the `Stock` level is moved into the columns.
    The index of `long` is replaced temporarily and always restored before returning.'''
    saved_index = long.index
    lookup = dh.loc[saved_index]
    try:
        long.index = pd.MultiIndex.from_arrays([lookup.Day, lookup.Stock])
        # The unstacked level comes out sorted.  Column levels are deliberately
        # NOT swapped for DataFrame input (decided against, to save time/space).
        return long.unstack()
    finally:
        long.index = saved_index

# Short alias, also attached as a method on both pandas containers.
pd.DataFrame.makesw = pd.Series.makesw = makesw = make_stock_wide

def make_index_long_like(wide, ref=dh):
    '''Params - wide: per-stock `Series`, or a `DataFrame` in the wide layout produced by `makesw`,
                ref: long frame providing the `Stock` and `Index` columns (defaults to `dh`)
    Returns `wide` spread back onto row labels; returns None for any other input type.
    Raises ValueError for a `Series` with a hierarchical index (not supported yet).'''
    if isinstance(wide, pd.Series):
        if isinstance(wide.index, pd.MultiIndex):
            raise ValueError('TODO support hierarchical wide index')
        # One value per stock: look every row's Stock up in `wide`.
        return ref.Stock.map(wide).rename(wide.name)
    if not isinstance(wide, pd.DataFrame):
        return None
    flat = wide.T.unstack().T
    # Flatten the row labels of `ref` the same way so both line up by position.
    row_labels = ref.Index.makesw().T.unstack().T.rename('Index')
    flat.index = row_labels
    # Cells without a row label have no counterpart in `ref`; drop them.
    flat = flat[~flat.index.isna()]
    flat.index = flat.index.astype(np.int64)
    return flat.sort_index()

# Short alias, also attached as a method on both pandas containers.
pd.DataFrame.makeil_like = pd.Series.makeil_like = makeil_like = make_index_long_like

In [13]:
def index_n_valid_days_filter(dh, n_valid=None):
    '''Params - dh: `DataFrame` with a 'Stock' and 'y' column,
                n_valid: minimum number of non-null `y` entries per stock
    Returns a row-aligned `Series` named 'Valid', true on rows whose stock reaches `n_valid`.
    Shorthand: with a single argument, that argument is `n_valid` and the module-level `dh` is used.'''
    if n_valid is None:
        n_valid = dh
        dh = globals()['dh']

    def has_enough(y):
        return y.count() >= n_valid

    flags = dh.groupby('Stock').y.transform(has_enough)
    return flags.rename('Valid')
index_nvdf = index_n_valid_days_filter

def stock_n_valid_days_filter(dh, n_valid=None):
    '''Params - dh: `DataFrame` with a 'Stock' and 'y' column,
                n_valid: minimum number of non-null `y` entries per stock
    Returns a `Series` named 'Valid' indexed by stock, true for stocks that reach `n_valid`.
    Shorthand: with a single argument, that argument is `n_valid` and the module-level `dh` is used
    (`n_valid` now defaults to None so that this shorthand is actually callable).'''
    if n_valid is None:
        dh, n_valid = globals()['dh'], dh
    return dh.groupby('Stock').y.agg(lambda x: x.count() >= n_valid).rename('Valid')
stock_nvdf = stock_n_valid_days_filter

def day_filter(df, n_valid, stock='Stock'):
    '''Params - df: `DataFrame` with a 'Stock' and 'y' column,
                n_valid: minimum number of valid entries to keep a stock,
                stock: just 'Stock', or the `Series` containing the groupby value
    Filters `df` for only stock entries that meet minimum `n_valid` observations in the `y` variable'''
    # Group by the `stock` argument (it used to be ignored in favour of a
    # hard-coded 'Stock'); the default keeps the previous behaviour.
    return df.groupby(stock).filter(lambda x: x.y.count() >= n_valid)

# Pre-filtered frames:
#   dff - rows of `df` whose stock has at least 246 non-null y
#   dhh - rows of `dh` whose stock has at least 1 non-null y
#   dhf - rows of `dh` whose stock has at least 246 non-null y
dff, dhh, dhf = (
    day_filter(frame, min_valid)
    for frame, min_valid in ((df, 246), (dh, 1), (dh, 246))
)

In [64]:
if __name__ == '__main__':
    print(dh.index[0])
    print(set(df.Stock.unique()) ^ set(range(3023)))
    print(set(dh.index) - set(dhh.index))
# dfh: all rows with a negative label plus a copy of row 43289, relabelled
# -623818 so that it sorts to the very top.
dfh = dh.loc[list(range(dh.index[0], 0)) + [43289]]
dfh.index = list(dfh.index[:-1]) + [-623818]
dfh.sort_index(inplace=True)
# Overwrite the leading columns of that copied row by position.
dfh.iloc[0, 0] = -1
dfh.iloc[0, 1] = 1
dfh.iloc[0, 2] = 1917
# Blank out columns 4..14 of the copied row.
dfh.iloc[0, 4:15] = np.nan
# Address the bookkeeping columns by label rather than by position: at least
# 16 columns (Market, Day, Stock, the 11 x-features, y, w) precede 'x3a',
# 'x3g' and 'wy', so positions 17/18 no longer land on 'Index'/'Count'.
# NOTE(review): assumes 17/18 were meant to be 'Index' (the value equals the
# new row label) and 'Count' -- confirm.
dfh.loc[-623818, 'Index'] = -623818
dfh.loc[-623818, 'Count'] = 0

-623817
{1917}
{43289}


In [5]:
# Actual sizes taken from the loaded frames.
N_TRAIN = len(df)  # should be 623817
N_TEST = len(dg)  # should be 640430
N_STOCK = dh['Stock'].nunique()  # should be 3023

In [6]:
# Named groups of column names used to select feature subsets.
xfeats = ['x0', 'x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D', 'x3E', 'x4', 'x5', 'x6']
xofeats = ['x0', 'x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D', 'x3E', 'x4', 'x5', 'x6', 'x3a', 'x3g']
x_feats = ['x0', 'x1', 'x2', 'x3a', 'x4', 'x5', 'x6']
xxfeats = ['x0', 'x1', 'x2', 'x3a', 'x4', 'x5', 'x3g']
xafeats = ['x0', 'x1', 'x2', 'x3a', 'x4', 'x5']
xgfeats = ['x0', 'x1', 'x2', 'x3g', 'x4', 'x5']
x3feats = ['x3A', 'x3B', 'x3C', 'x3D', 'x3E']
idfeats = ['Market', 'Stock']

In [None]:
# Market of each stock (first value seen per stock).
stockmarket = dh.groupby('Stock')['Market'].first()
# Per-stock sum of y^2 * w.
stockimportance = ((dh['y'] ** 2) * dh['w']).groupby(dh['Stock']).sum()

### Matrix stuff

In [2]:
def def_W(namespace, dh=dh, include=''):
    '''Params - namespace: e.g. `globals()`,
                dh: `dh` or sub-DataFrame of it (`dh` itself is recommended),
                include: space-separated option names; 'nrm' adds `Wnrm`/`Wnrma`
    Puts the weight matrices `W`, `Wa` and the helpers `_standardize`, `_dewmean`, `_scale` into `namespace`'''
    requested = include.split()
    o = O()
    # W: weights in wide layout; Wa: the same with missing cells set to 0.
    o.W = makesw(dh.w)
    o.Wa = o.W.fillna(0)
    if 'nrm' in requested:
        # Each column divided by its own column sum.
        o.Wnrm = o.W / o.W.sum()
        o.Wnrma = o.Wnrm.fillna(0)
    # Helpers built on the `wmean`/`wvar` methods (defined outside this cell),
    # always evaluated against W.
    o._standardize = lambda X: (X - X.wmean(o.W)) / np.sqrt(X.wvar(o.W))
    o._dewmean = lambda X: X - X.wmean(o.W)
    o._scale = lambda X: X / np.sqrt(X.wvar(o.W))
    dict.update(namespace, **o)

In [None]:
def def_Y(namespace, dh=dh, include=''):
    '''Params - namespace: e.g. `globals()`; must already contain `W` (see `def_W`),
                dh: `dh` or sub-DataFrame of it,
                include: space-separated option names; 'scl' adds `Yscl`/`Wscl`, 'std' adds `Ystd`/`Ystda`/`Wstd`
    Puts the target matrices `Y`, `Ya` and the per-column statistics `Y_wmean`, `Y_wvar`, `Y_wzvar` into `namespace`'''
    include_list = include.split()
    o = O(W=namespace['W'])
    # Y: targets in wide layout; Ya: the same with missing cells set to 0.
    o.Y = dh.y.pipe(makesw)
    o.Ya = o.Y.fillna(0)
    # `wmean`/`wvar` are defined outside this cell -- presumably the weighted
    # mean/variance per column; confirm against their definitions.
    o.Y_wmean = o.Y.wmean(o.W).rename('y$wmean')
    o.Y_wvar = o.Y.wvar(o.W).rename('y$wvar')
    # Y_wvar is overwritten on purpose: missing variances are filled with the
    # column sum of Y^2.  `.chain` is defined outside this cell -- presumably
    # it lets an inplace call return the object for chaining; confirm.
    o.Y_wvar = (o.Y_wvar.fillna((o.Y ** 2).sum().chain.fillna(1, inplace=True))).rename('y$wvar.a')
    # Weighted mean of Y^2 (no centring), with missing values replaced by 1.
    o.Y_wzvar = ((o.Y ** 2 * o.W).sum() / o.W.sum()).chain.fillna(1, inplace=True).rename('y$zvar.a')
    if 'scl' in include_list:
        # Scale Y by sqrt(Y_wzvar) and multiply the weights by Y_wzvar.
        o.Yscl = o.Y / np.sqrt(o.Y_wzvar)
        o.Wscl = o.W * o.Y_wzvar
    if 'std' in include_list:
        # Centre by Y_wmean, scale by sqrt(Y_wvar), multiply the weights by Y_wvar.
        o.Ystd = (o.Y - o.Y_wmean) / np.sqrt(o.Y_wvar)
        o.Ystda = o.Ystd.fillna(0)
        o.Wstd = o.W * o.Y_wvar
    dict.update(namespace, **o)

In [None]:
def _def_X_log_stats(o, tag):
    '''Add the weighted statistics and standardized copies of `o.Xlog<tag>` to `o`.
    For tag 'z1' this sets Xlogz1_wmed, Xlogz1_wmean, Xlogz1_wvar, Xlogz1s, Xlogz1sa, Xlogz1ms, Xlogz1msa.'''
    base = 'Xlog' + tag
    Xl = getattr(o, base)
    setattr(o, base + '_wmed', Xl.wmedian(o.W.repeat_like(Xl)))
    setattr(o, base + '_wmean', Xl.wmean(o.W.repeat_like(Xl)))
    wmed = getattr(o, base + '_wmed')
    wmean = getattr(o, base + '_wmean')
    if 355 in Xl.index:
        # Where no weighted statistic exists, use the values of stock 1917 on
        # row 355.  `ss` is defined outside this cell -- presumably
        # slice(None); confirm.
        substitute = Xl.loc[355, (ss, 1917)]
        wmed[wmed.isna()] = substitute
        wmean[wmean.isna()] = substitute
    _temp = Xl.wvar(o.W.repeat_like(Xl)) + 1
    wvar = _temp.chain.replace(np.inf, 5, inplace=True).chain.fillna(5, inplace=True) #"TODO"
    setattr(o, base + '_wvar', wvar)
    # Standardize as (X - centre) / sqrt(var).  The previous form
    # `X - centre / sqrt(var)` divided only the centre because of operator
    # precedence, unlike `_standardize` in def_W and `Ystd` in def_Y.
    std = ((Xl - wmean) / np.sqrt(wvar)).colname_append('.std', level=0)
    setattr(o, base + 's', std)
    setattr(o, base + 'sa', std.fillna(0).colname_append('.a', level=0))
    mstd = ((Xl - wmed) / np.sqrt(wvar)).colname_append('.mstd', level=0)
    setattr(o, base + 'ms', mstd)
    setattr(o, base + 'msa', mstd.fillna(0).colname_append('.a', level=0))


def def_X(namespace, dh=dh, include='', feats=None):
    '''Params - namespace: e.g. `globals()`; must already contain `W` and `Wa` (see `def_W`),
                dh: `dh` or sub-DataFrame of it,
                include: space-separated option names (see below),
                feats: column names as a list or space-separated string; None selects every column starting with 'x'
    Puts the feature matrices `X`, `Xa` and the requested transforms into `namespace`.
    Options - 'log': Xlog, Xlogz0, Xiszero, Xlogz1, Xlogz3, Xlogz4
              'logs': the same, plus per-variant statistics/standardized copies for each of 'z1', 'z3', 'z4' also listed
              'wqtl': Xwqtl, Xwqtla
              'wnrm': the same, plus Xwnrm, Xwnrma'''
    include_list = include.split()
    feats = feats.split() if isinstance(feats, str) else feats
    dhfeats = dh[[c for c in dh.columns if c[0] == 'x']] if feats is None else dh[feats]
    o = O(W=namespace['W'], Wa=namespace['Wa'])
    o.X = dhfeats.pipe(makesw)
    o.Xa = o.X.fillna(0)
    if 'log' in include_list or 'logs' in include_list:
        o.Xlog = o.X.pipe(np.log).colname_append('.log', level=0)
        # log(0) = -inf: Xlogz0 masks those cells, Xiszero flags them.
        o.Xlogz0 = o.Xlog.replace(-np.inf, np.nan)
        o.Xiszero = (o.Xlog == -np.inf).astype(np.int8)
        # Variants that replace -inf by a finite stand-in: one below the
        # column minimum (z1), or the constants -93 (z3) and -194 (z4).
        o.Xlogz1 = (o.Xiszero * (o.Xlogz0.min() - 1) + o.Xlog.replace(-np.inf, 0)).colname_append('.z(1)', level=0)
        o.Xlogz3 = (o.Xiszero * -93 + o.Xlog.replace(-np.inf, 0)).colname_append('.zc(-93)', level=0)
        # Label fixed: this variant uses -194 but was tagged '.zc(-93)' like Xlogz3.
        o.Xlogz4 = (o.Xiszero * -194 + o.Xlog.replace(-np.inf, 0)).colname_append('.zc(-194)', level=0)
        if 'logs' in include_list:
            for tag in ('z1', 'z3', 'z4'):
                if tag in include_list:
                    _def_X_log_stats(o, tag)
    if 'wqtl' in include_list or 'wnrm' in include_list:
        # `wqtl` is defined outside this cell -- presumably a weighted
        # quantile transform; confirm.
        o.Xwqtl = o.X.wqtl(o.Wa)
        if 'wnrm' in include_list:
            from scipy.stats import norm
            # Adding `Xwqtl * 0` puts the ppf output back on Xwqtl's labels.
            o.Xwnrm = (o.Xwqtl.pipe(norm.ppf) + o.Xwqtl * 0).colname_append('.wnrm', level=0)
            o.Xwnrma = o.Xwnrm.fillna(0).colname_append('.a', level=0)
        o.Xwqtl = o.Xwqtl.colname_append('.wqtl', level=0)
        o.Xwqtla = o.Xwqtl.fillna(0.5).colname_append('.a', level=0)
    dict.update(namespace, **o)

### misc

In [8]:
def def_misc(namespace, dh=dh):
    '''Params - namespace: e.g. `globals()`; must already contain `Y` and `W` (see `def_Y`, `def_W`),
                dh: `dh` or sub-DataFrame of it
    Puts `stock_wmean`, `index_importance`, `stock_importance` and `stock_importance__wmean` into `namespace`'''
    # Take the matrices from `namespace`: the free names `Y`/`W` used before
    # only resolved when `namespace` happened to be the module globals.
    Y, W = namespace['Y'], namespace['W']
    o = O(Y=Y, W=W)
    stock_wmean = Y.wmean(W)
    o.stock_wmean = stock_wmean
    # Per-row and per-stock weight * y^2.
    o.index_importance = dh.w * dh.y ** 2
    o.stock_importance = (W * Y ** 2).sum()
    # The same per-stock sum after subtracting `stock_wmean` from Y.
    o.stock_importance__wmean = (W * (Y - stock_wmean) ** 2).sum()
    dict.update(namespace, **o)