In [1]:
from common import *
import globals as top_imports

In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides numpy/pandas deprecation warnings,
# so upcoming API removals will not be visible while running this notebook.
warnings.filterwarnings('ignore')

# Raise pandas' display limits so long/wide frames are rendered without truncation.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 5000)

In [None]:
def quarter(time):
    '''Encode timestamps as a fractional year at quarter resolution.

    `time` is wrapped in a `pd.Series` and must be datetime-like (it is accessed
    through `.dt`). Q1 of 2010 becomes 2010.0, Q2 2010.25, and so on.
    Returns a float32 Series.'''
    stamps = pd.Series(time)
    fractional_year = stamps.dt.year + (stamps.dt.quarter - 1) / 4
    return fractional_year.astype(np.float32)


def halfYear(time):
    '''Same encoding as `quarter`, truncated down to half-year steps.

    Returns a plain numpy array (e.g. 2010.0, 2010.5), not a Series.'''
    doubled = (quarter(time) * 2).astype(int)
    return doubled.values / 2

In [3]:
def get_M():
    '''Read and return the pickled frame stored at `the_data/given/M.pkl`.'''
    return pd.read_pickle(the_data / 'given/M.pkl')


def get_N():
    '''Read and return the pickled frame stored at `the_data/given/N.pkl`.'''
    return pd.read_pickle(the_data / 'given/N.pkl')

In [4]:
# Load M only when the run configuration (the `globals` module, imported as
# `top_imports`) asks for it; later cells guard on the same flag.
if top_imports.use_M:
    M = get_M()

In [None]:
def convert32(F):
    '''(inplace) Downcast every column of dataframe `F` to a 32-bit dtype.

    Floating-point columns become float32 and integer columns become int32;
    columns that already have the target dtype are left untouched.

    Parameters - F: pandas DataFrame whose columns are all float or integer typed
    Returns None; `F` is modified in place.
    Raises AssertionError if a column is neither float nor integer (e.g. bool, object).'''
    for c in F.columns:
        dtype = F[c].dtype
        # `np.floating` is the abstract float type; the old `np.float` alias was
        # removed in NumPy 1.24 and raises AttributeError there.
        if np.issubdtype(dtype, np.floating):
            target = np.float32
        elif np.issubdtype(dtype, np.integer):
            target = np.int32
        else:
            assert False, 'dtype other than float or int found in features'
        if dtype != target:
            F[c] = F[c].astype(target)
    # the replaced 64-bit columns can be large; release them promptly
    gc.collect()

#### very important ID assignment code

In [5]:
if top_imports.use_M:
    class IdAssign:
        '''Callable registry that gives every distinct key a stable integer ID.

        Keys taken from `init` receive 0, 1, 2, ... in order of first appearance.
        Unless `missing` is `IdAssign.NO_MISSING`, the `missing` value is pinned to -1.
        Calling the instance with an unseen key registers it with ID `len(self.map)`
        at that moment; because the `missing` entry counts towards that length, the
        first ID issued after construction skips one value when `missing` is set.
        '''
        class NO_MISSING(metaclass=staticclass): # sentinel for "no missing value"; `None` is not used because it may itself be a meaningful key
            pass

        def __init__(self, init, missing, name):
            '''Parameters - init: iterable of initial keys
                            missing: key mapped to -1, or `NO_MISSING` for none
                            name: label used for the `series` view'''
            self.name = name
            self.map = {}
            if missing is not __class__.NO_MISSING:
                self.map[missing] = -1
            next_id = 0
            for key in init:
                is_new = key not in self.map and key != missing
                if is_new:
                    self.map[key] = next_id
                    next_id += 1
            # size of `self.map` when `_series` was last built; None means never built
            self.cache = None

        def __call__(self, key):
            '''Return the ID of `key`, registering it first if it is unknown.'''
            return self.map.setdefault(key, len(self.map))

        def __len__(self):
            return len(self.map)

        @property
        def series(self):
            '''ID -> key view as a pandas Series, rebuilt only when the map has grown.'''
            size = len(self)
            if self.cache != size:
                # dicts iterate keys and values in the same (insertion) order, guaranteed since Python 3.7
                self._series = pd.Series(list(self.map), index=list(self.map.values()), name=self.name)
                self._series.index.name = self.name+'Id'
                self.cache = size
            return self._series

    # Seed the assigners from the distinct values present in M; `unique()` keeps
    # order of first appearance, so that order determines the IDs.
    # An empty assetCode is treated as the missing value (ID -1).
    assetCodeSeries = pd.Series(M.assetCode.unique())
    assetCodeIdAssign = IdAssign(assetCodeSeries, missing='', name='assetCode')
    del assetCodeSeries
    # The literal name 'Unknown' is treated as the missing value (ID -1).
    assetNameSeries = pd.Series(M.assetName.unique())
    assetNameIdAssign = IdAssign(assetNameSeries, missing='Unknown', name='assetName')
    del assetNameSeries

In [6]:
if top_imports.use_M:
    # Attach integer ID columns to M using the assigners built above.
    M['assetCodeId'] = M.assetCode.map(assetCodeIdAssign).astype(int)
    M['assetNameId'] = M.assetName.map(assetNameIdAssign).astype(int)
    if top_imports.use_N:
        N = get_N()
        # Mapping N's names through the assigner also registers any name that did
        # not occur in M, because calling an IdAssign with an unseen key adds it.
        N['assetNameId'] = N.assetName.map(assetNameIdAssign) # set up assetNameIdAssign

In [7]:
if top_imports.use_N:
    def makeMultimap(fr, to):
        '''Build a one-to-many mapping from two parallel iterables.

        Returns a dict sending each element of `fr` to the set of all `to`
        elements it was paired with.'''
        multimap = {}
        for source, target in zip(fr, to):
            if source not in multimap:
                multimap[source] = set()
            multimap[source].add(target)
        return multimap
    # assetNameId -> set of every assetCodeId that appears with it in M
    assetNameMapCodes = makeMultimap(M.assetNameId, M.assetCodeId)

    def makeSinglemap(fr, to):
        '''Build a many-to-one mapping from two parallel iterables.

        Returns a dict sending each element of `fr` to its `to` partner.
        Raises AssertionError if some `fr` element is paired with two different
        `to` elements.'''
        mapping = {}
        for source, target in zip(fr, to):
            if source not in mapping:
                mapping[source] = target
                continue
            assert target==mapping[source], 'a "from" element must be mapped to a unique "to" element!'
        return mapping
    # assetCodeId -> its single assetNameId (asserts each code has exactly one name in M)
    assetCodeMapName = makeSinglemap(M.assetCodeId, M.assetNameId)

In [None]:
if top_imports.use_N:
    # ID assigner for N's headline tags; the empty tag is the missing value (ID -1).
    headlineTagSeries = pd.Series(N.headlineTag.unique())
    headlineTagIdAssign = IdAssign(headlineTagSeries, missing='', name='headlineTag')
    del headlineTagSeries

In [None]:
if top_imports.use_N:
    # N was only needed to seed the assigners above; drop the reference to it.
    del N;

In [None]:
# Convenience functions for interactive use only: they are unoptimized, so do not call them from scripts or inside loops.

if top_imports.use_M:
    # The ABCs live in `collections.abc`; the `collections.Iterable` alias was
    # removed in Python 3.10 and importing it from `collections` fails there.
    from collections.abc import Iterable

    def aId2Code(a):
        '''Translate assetCodeId(s) back to assetCode string(s).

        Parameters - a: an object with `.map` (e.g. a Series), a non-string
                        iterable of IDs, or a single ID
        Returns the same kind of thing as `a`: mapped Series, a container of
        type `type(a)` holding the codes, or a single code.
        Raises KeyError for an unknown ID in the iterable/scalar forms.'''
        if hasattr(a, 'map'):
            return a.map(assetCodeIdAssign.series)
        elif isinstance(a, Iterable) and not isinstance(a, str):
            assetCodeMap = assetCodeIdAssign.series.to_dict()
            # rebuild the caller's container type (list, tuple, set, ...)
            return type(a)(map(lambda x: assetCodeMap[x], a))
        else:
            return assetCodeIdAssign.series.loc[a]

    if top_imports.use_N:
        def aId2Name(a):
            '''Translate assetNameId(s) back to assetName string(s).

            Accepts the same three input forms as `aId2Code` and returns the
            matching form.'''
            if hasattr(a, 'map'):
                return a.map(assetNameIdAssign.series)
            elif isinstance(a, Iterable) and not isinstance(a, str):
                assetNameMap = assetNameIdAssign.series.to_dict()
                return type(a)(map(lambda x: assetNameMap[x], a))
            else:
                return assetNameIdAssign.series.loc[a]
        def aCodeId2Name(a):
            '''Translate assetCodeId(s) to assetName string(s).

            Goes through `assetCodeMapName` (code ID -> name ID) and then the
            name assigner. Accepts the same three input forms as `aId2Code`.'''
            if hasattr(a, 'map'):
                return a.map(assetCodeMapName).map(assetNameIdAssign.series)
            elif isinstance(a, Iterable) and not isinstance(a, str):
                assetNameMap = assetNameIdAssign.series.to_dict()
                return type(a)(map(lambda x: assetNameMap[assetCodeMapName[x]], a))
            else:
                return assetNameIdAssign.series.loc[assetCodeMapName[a]]

#### some data specific helper func/structs

In [None]:
def stringify_columns(f):
    '''Render an iterable of column-name strings as a list literal, e.g. ["a","b"].'''
    joined = '","'.join(f)
    return '["' + joined + '"]'


# original return column name -> short name used for its log1p-transformed copy
returns_columns = {
    'returnsClosePrevRaw1': 'cc',
    'returnsOpenPrevRaw1': 'oo',
    'returnsClosePrevMktres1': 'acc',
    'returnsOpenPrevMktres1': 'aoo',
    'returnsClosePrevRaw10': 'ccTEN',
    'returnsOpenPrevRaw10': 'ooTEN',
    'returnsClosePrevMktres10': 'accTEN',
    'returnsOpenPrevMktres10': 'aooTEN',
}
# short return names plus the raw price/volume columns
columns_for_U = set(returns_columns.values()) | {'open', 'close', 'volume'}
excluded_columns = [
    'time',
    'assetCode',
    'assetName',
    'universe',
    'returnsOpenNextMktres10',
    'quarter',
    'y',
]


def exclusion_filter(c):
    '''True when column name `c` is not listed in `excluded_columns`.'''
    return c not in excluded_columns


object_columns = ['assetCode', 'assetName']
enumeration_columns = ['assetCodeId', 'assetNameId']

#### hard coded constants

In [None]:
# Timezone-aware (UTC) cut-off; rows of M at or after it are used when counting in-universe stocks.
train_start_time = pd.Timestamp('2009-01-01',tz='UTC')
# Window lengths. NOTE(review): units are not stated here — presumably rows/trading days; confirm with the callers.
lookback = 60
shortterm = 21
# Deliberately generous: 250 plus two lookbacks of slack, since the exact requirement is hard to reason about.
longterm = 250 + lookback*2

#### setting up only the stocks that we have seen in-universe

In [None]:
if top_imports.use_M:
    # Per-asset sum of the `universe` column over rows at or after train_start_time
    # (presumably a 0/1 in-universe flag, making this a count — TODO confirm).
    inUniCount = M[M.time>=train_start_time].groupby('assetCodeId').universe.sum()
    # assetCodeIds whose sum is non-zero, i.e. seen in-universe at least once
    stocksInUni = (inUniCount != 0).pipe(lambda x: x.index[x])
    assert stocksInUni.is_monotonic_increasing
    xStocksInUni = set(stocksInUni) #EDITCELL
    inUniCountSeries = inUniCount.astype(float) #EDITCELL
    # NOTE(review): 1981 is a magic number — presumably the number of distinct
    # dates in the training window; confirm and give it a name.
    stocksAlways = (inUniCount >= 1981).pipe(lambda x: x.index[x])
    xStocksAlways = set(stocksAlways)

#### set up everything

In [4]:
def set_basic_features(*,M,delete=True):
    '''(inplace) Add the basic ID, log-return and calendar columns to `M`.

    Adds `assetCodeId` / `assetNameId` (via the module-level ID assigners), one
    log1p-transformed column per entry of `returns_columns` under its short
    name, and `dayOfYear` / `dayOfWeek` as floats.

    Parameters - M: DataFrame with `assetCode`, `assetName`, `time` and the
                    original return columns
                 delete: when True, each source column is removed right after
                         its replacement has been added
    Returns None.'''
    # IDs are deliberately not cast to int: much more efficient to process everything as float
    id_sources = (('assetCode', assetCodeIdAssign), ('assetName', assetNameIdAssign))
    for label_col, assigner in id_sources:
        M[label_col + 'Id'] = M[label_col].map(assigner)
        if delete:
            del M[label_col]

    for orig_col, new_col in returns_columns.items():
        M[new_col] = np.log1p(M[orig_col])
        if delete:
            del M[orig_col]

    # time features
    when = M['time'].dt
    M['dayOfYear'] = when.dayofyear.astype(float)
    M['dayOfWeek'] = when.dayofweek.astype(float)

# Apply to the module-level M but keep its original columns (delete=False).
# Note this rewrites M's assetCodeId/assetNameId without the int cast used when
# those columns were first created.
if top_imports.use_M:
    set_basic_features(M=M,delete=False)

#### P setup

In [None]:
def setup_P(*, F, P, delete=True, target=True, vp=True, always=False):
    '''(inplace) Add quarter, ID, target and sample-weight columns to `P`.

    Parameters - F: feature frame aligned with `P`; only read for the `vp{i}`
                    columns (when `vp`) and `assetCodeId` (when `always`), so it
                    may be None if both are off
                 P: frame with a datetime `time` column and, when `target`, a
                    `y` or `returnsOpenNextMktres10` column
                 delete: drop label columns and temporary helper columns
                 target: add `target`, `upDown`, `absVal` and the weight columns
                 vp: add `vp{i}_weight` / `vp{i}Total_weight` for i in 1, 10, 20, 60;
                     an index whose column is missing from `F` is skipped
                 always: add `always_weight` (absVal for stocks in `xStocksAlways`, else 0)
    Returns None.'''
    when = P['time'].dt
    P['quarter'] = (when.year + (when.quarter - 1) / 4).astype(np.float32)

    # plain label columns first, then the pair-prefixed variants; codes before names
    label_cols = [prefix + base
                  for base in ('assetCode', 'assetName')
                  for prefix in ('', '__0__', '__1__')]
    for label in label_cols:
        id_col = label + 'Id'
        if id_col not in P and label in P:
            assigner = assetCodeIdAssign if 'Code' in label else assetNameIdAssign
            P[id_col] = P[label].map(assigner).astype(np.int32)
        if delete and label in P:
            del P[label]

    if not target:
        return

    if 'y' not in P:
        P.rename(columns={'returnsOpenNextMktres10':'y'}, inplace=True)
        assert 'y' in P

    P['target'] = P['y'] > 0
    P['upDown'] = (P['target'] * 2 - 1).astype(np.float32)
    P['absVal'] = np.abs(P['y'])

    # weights
    P['no_weight'] = 1.

    # temporary per-time aggregates used to normalise the flat weights
    P['flat_inUni'] = P.groupby('time')['y'].transform('count').astype(np.int32)
    P['flat_y2'] = P['y'] ** 2
    y2_sum_by_time = P.groupby('time')['flat_y2'].transform('sum')
    P['flat_l2sum'] = np.sqrt(y2_sum_by_time)
    P['flat_zstd'] = np.sqrt(y2_sum_by_time / P['flat_inUni'])

    P['flat_weight'] = P['absVal']
    P['flatTotal_weight'] = P['flat_weight'] / P['flat_l2sum']
    P['flatNorm_weight'] = P['flat_weight'] / P['flat_zstd']
    if delete:
        for helper in ('flat_inUni', 'flat_y2', 'flat_l2sum', 'flat_zstd'):
            del P[helper]

    if vp:
        for i in [1,10,20,60]:
            scaled, squared, l2sum = f'vp{i}_', f'vp{i}_y2', f'vp{i}_l2sum'
            try:
                # temporary columns
                P[scaled] = F[f'vp{i}'] / 1e9
                P[squared] = (P[scaled] * P['y']) ** 2
                P[l2sum] = np.sqrt(P.groupby('time')[squared].transform('sum'))

                P[f'vp{i}_weight'] = P['absVal'] * P[scaled]
                P[f'vp{i}Total_weight'] = P[f'vp{i}_weight'] / P[l2sum]
                if delete:
                    del P[scaled], P[squared], P[l2sum]
            except KeyError:
                # best effort: skip this index when a column lookup fails
                pass

    if always:
        P['always_weight'] = P['absVal'] * F.assetCodeId.isin(xStocksAlways)

#### pairs setup

In [None]:
# from helper.bisect import bisect_left as binary_search

# @njit
# def index_unique_pairs(A, B, values, repeat):
#     assert len(A)==len(B)
#     seen = [0] * len(values)
#     ans = []
#     for i, (a, b) in enumerate(zip(A, B)):
#         ia = binary_search(values, a); ib = binary_search(values, b);
#         if repeat==-1 or (seen[ia]<repeat and seen[ib]<repeat):
#             seen[ia] += 1; seen[ib] += 1;
#             ans.append(i)
#     return ans

# def make_unique_pairs(*, Dl, repeat=-1):
#     if repeat==-1:
#         return Dl
#     ii = index_unique_pairs(Dl[0].values, Dl[1].values, sorted(set(Dl[0].values)|set(Dl[1].values)), repeat=repeat)
#     return Dl.iloc[ii]

In [None]:
from helper.bisect import bisect_left as binary_search

@njit
def index_unique_pairs(A, B, values, repeat):
    '''Greedily select pair positions so that no element is used more than `repeat` times.

    Parameters - A, B: equal-length sequences; (A[i], B[i]) is the i-th pair
                 values: every element that occurs in A or B, located with `binary_search`
                         (so expected to be sorted; the caller passes `sorted(...)`)
                 repeat: maximum number of accepted pairs per element, or -1 to accept all
    Returns the list of accepted positions i, in input order.'''
    assert len(A)==len(B)
    # seen[j] = how many accepted pairs already contain values[j]
    seen = [0] * len(values)
    ans = []
    for i, (a, b) in enumerate(zip(A, B)):
        ia = binary_search(values, a); ib = binary_search(values, b);
        # accept only while both members are still under their quota
        if repeat==-1 or (seen[ia]<repeat and seen[ib]<repeat):
            seen[ia] += 1; seen[ib] += 1;
            ans.append(i)
    return ans

def make_unique_pairs(*, Dl, repeat=-1):
    '''Thin out a pairs frame so each asset appears in at most `repeat` pairs.

    Parameters - Dl: DataFrame whose columns `0` and `1` hold the two pair members
                 repeat: per-asset quota; -1 (default) disables thinning
    Returns `Dl` itself when `repeat` is -1, otherwise the rows kept by
    `index_unique_pairs`, in their original order.'''
    if repeat != -1:
        first, second = Dl[0].values, Dl[1].values
        members = sorted(set(first) | set(second))
        kept = index_unique_pairs(first, second, members, repeat=repeat)
        return Dl.iloc[kept]
    return Dl

def make_pairs(Dl, *, lo, hi=None, repeat=-1):
    '''Select pairs whose `Corr` and `Unic` values lie inside inclusive bounds.

    Parameters - Dl: DataFrame with pair members in columns `0` and `1` plus
                     `Corr` and `Unic` columns
                 lo: (min Corr, min Unic)
                 hi: (max Corr, max Unic); defaults to (1., 9999)
                 repeat: when an int, passed on to `make_unique_pairs`; any
                         non-int value skips that step
    Returns a 2-tuple of numpy arrays: (first members, second members).'''
    corr_lo, unic_lo = lo[0], lo[1]
    if hi is None:
        hi = (1., 9999)
    corr_hi, unic_hi = hi[0], hi[1]

    corr, unic = Dl['Corr'], Dl['Unic']
    in_bounds = (corr >= corr_lo) & (corr <= corr_hi) & (unic >= unic_lo) & (unic <= unic_hi)
    selected = Dl[in_bounds]
    if isinstance(repeat, int):
        selected = make_unique_pairs(Dl=selected, repeat=repeat)
    return (selected[0].values, selected[1].values)

In [None]:
@njit
def np_cov(x, y):
    '''Population covariance of two equal-length arrays (divides by n, not n-1).

    Hand-rolled because the numba available on the Kernel doesn't support np.cov yet.'''
    n = len(x)
    assert n==len(y)
    x_centered = x - x.mean()
    y_centered = y - y.mean()
    return (x_centered*y_centered).sum()/n

@njit
def np_corr(x, y):
    '''Correlation of two equal-length arrays: `np_cov` divided by the product of their `.std()` values.'''
    scale = x.std() * y.std()
    return np_cov(x, y) / scale

# @njit
# def my_corr(x, y):
#     assert len(x)==len(y)
#     return (np_cov(x, y) / ( np.sqrt( ((x-x.mean())**2).sum() * ((y-y.mean())**2).sum() ) / len(x) ))

@njit
def corr_window_at_index(a, cols, labels, index, window, min_periods):
    '''Correlation of many column pairs over one trailing window of `a`.

    Parameters - a: 2D np.ndarray; the window covers rows index+1-window .. index (inclusive)
                    cols: tuple of (cols[0],cols[1]) where cols[0/1] is the list of column `labels` of which to take corr of
                    labels: the "names" of the 0-n column dimensions, that `cols` references;
                            looked up with `binary_search` (so expected to be sorted)
                    index: row index which is the last row in the correlation window; negative counts from the end
                    window: int, size of window
                    min_periods: minimum number of rows where both columns are non-NaN; below it the corr stays NaN
    Return 2-tuple of float32 np.ndarrays of shape (len(cols[0]),) containing
        [0] correlation for every pair of columns in same order as `cols`
        [1] number of rows in the window where both columns of the pair are non-NaN
    NOTE(review): if either column is constant over the usable rows, `np_corr` divides
    by a zero standard deviation — confirm how callers expect that to behave.'''
    
    if index < 0:
        index += len(a)
    assert index+1 - window >= 0, '{corr_window_at_index} index goes back before window'
    
    corr = np.full((len(cols[0]),), np.nan, dtype=np.float32)
    unic = np.full((len(cols[0]),), np.nan, dtype=np.float32)
    
    for k in range(len(cols[0])):
        # translate the pair's labels into column positions of `a`
        j0, j1 = binary_search(labels, cols[0][k]), binary_search(labels, cols[1][k])
        x0, x1 = a[index+1-window:index+1, j0], a[index+1-window:index+1, j1]
        # keep only rows where both columns have a value
        notnan = ~(np.isnan(x0)|np.isnan(x1))
        
        sum_notnan = notnan.sum()
        if sum_notnan < min_periods:
            # too few common observations: report the count but leave corr as NaN
            corr[k] = np.nan
            unic[k] = sum_notnan
            continue
        else:
            x0, x1 = x0[notnan], x1[notnan]
            guy = np_corr(x0, x1)
            corr[k] = guy
            unic[k] = sum_notnan
    
    return corr, unic


@njit
def corr_windows_at_indices(a, cols, labels, indices, window, min_periods):
    '''Correlation of many column pairs, each over its own trailing window of `a`.

    Same computation as `corr_window_at_index`, except that pair k uses the window
    ending at row `indices[k]` instead of one shared row.

    Parameters - a: 2D np.ndarray; for pair k the window covers rows indices[k]+1-window .. indices[k] (inclusive)
                    cols: tuple of (cols[0],cols[1]) where cols[0/1] is the list of column `labels` of which to take corr of
                    labels: the "names" of the 0-n column dimensions, that `cols` references;
                            looked up with `binary_search` (so expected to be sorted)
                    indices: one row index per pair (same length as cols[0]), an entry being the last row
                             in that pair's correlation window; negative entries count from the end
                    window: int, size of window
                    min_periods: minimum number of rows where both columns are non-NaN; below it the corr stays NaN
    Return 2-tuple of float32 np.ndarrays of shape (len(cols[0]),) containing
        [0] correlation for every pair of columns in same order as `cols`
        [1] number of rows in the window where both columns of the pair are non-NaN'''
        
    corr = np.full((len(cols[0]),), np.nan, dtype=np.float32)
    unic = np.full((len(cols[0]),), np.nan, dtype=np.float32)
    
    for k in range(len(cols[0])):
        index = indices[k]
        
        if index < 0:
            index += len(a)
        assert index+1 - window >= 0, '{corr_window_at_indices} index goes back before window'
        
        # translate the pair's labels into column positions of `a`
        j0, j1 = binary_search(labels, cols[0][k]), binary_search(labels, cols[1][k])
        x0, x1 = a[index+1-window:index+1, j0], a[index+1-window:index+1, j1]
        # keep only rows where both columns have a value
        notnan = ~(np.isnan(x0)|np.isnan(x1))
        
        sum_notnan = notnan.sum()
        if sum_notnan < min_periods:
            # too few common observations: report the count but leave corr as NaN
            corr[k] = np.nan
            unic[k] = sum_notnan
            continue
        else:
            x0, x1 = x0[notnan], x1[notnan]
            guy = np_corr(x0, x1)
            corr[k] = guy
            unic[k] = sum_notnan
    
    return corr, unic

In [None]:
# def make_CC_G_Q_from_Dl_F_P_W(*, Dl, F, P, W, lo, hi=None, repeat=-1,
#                               roll_corr=list(product(['aoo1','aoo10'],[10,21,62,250]))):
#     SEE(f'(pair) len(F.columns) = {len(F.columns)}')
    
#     D = Dl
#     allTime = pd.Series(P.time.unique())
#     allTime.index = allTime
    
#     # |begin| subset the pairs list
#     if hi is None:
#         hi = (1., 9999)
#     D = D[(lo[0]<=D.Corr)&(D.Corr<=hi[0])&(lo[1]<=D.Unic)&(D.Unic<=hi[1])]
#     if isinstance(repeat, int):
#         D = D.pipe(lambda x: make_unique_pairs(Dl=x, repeat=repeat))
# #         w = (D.Corr.fillna(0)*0 + 1.).values # old "weight" code, weight before combining the __0__y-__1__y
#     elif repeat=='prop':
#         assert False, '"prop" weighting for pairs not implemented yet!'
#         D01 = D
#         D10 = D[[1,0,'Corr','Unic']]
#         D2 = pd.concat([D01,D10], axis=0)
#         aw = D2.groupby(0).Corr.sum()
#         w = (D.Corr / np.maximum(D[0].map(aw), D[1].map(aw))).values
#         w /= w.mean()
#     else:
#         assert False
#     hgt = sum(p[-1] for p in D.itertuples())
#     CC = np.zeros((hgt,2+len(roll_corr)))
#     G0, G1 = np.zeros((hgt,len(F.columns))), np.zeros((hgt,len(F.columns)))
#     Q0, Q1 = [], []
#     # |end| subset the pairs list
    
#     i = 0
#     SEE(f'(pair) generating [{len(D)}]', end='')
#     for ii, (_,a0,a1,corr,unic) in enumerate(D.itertuples()):
#         print_progress(ii, dot=10, print=SEE)
# #         assert len(weight.shape)==0
#         in0, in1 = (F.assetCodeId==a0)&(P.universe!=0), (F.assetCodeId==a1)&(P.universe!=0)
#         A0, A1 = F[in0], F[in1]
#         B0, B1 = P[in0], P[in1]
#         inTime = set(B0.time)&set(B1.time)
#         tm0, tm1 = B0.time.isin(inTime), B1.time.isin(inTime)
#         A0, A1 = A0[tm0], A1[tm1]
#         B0, B1 = B0[tm0], B1[tm1]
        
#         tm = allTime.isin(inTime).values
#         def iter_roll_corr():
#             for y, r in roll_corr:
#                 yield W[y][a0].rolling(window=r, min_periods=5).corr(W[y][a1])[tm] #TODO hard coded min_periods
#         unic_ = [np.ones_like(W[next(iter(W))].iloc[:,0][tm])*unic]
# #         weight_ = [np.ones_like(W[next(iter(W))].iloc[:,0][tm])*weight]
#         C = np.stack(chain(iter_roll_corr(),unic_), axis=1)
        
#         G0[i:i+unic], G1[i:i+unic] = A0.values, A1.values
#         Q0.append(B0); Q1.append(B1);
#         CC[i:i+unic] = C
        
#         i += unic
#     SEE()
    
#     Fcols = F.columns
#     del Dl, F, P, W; gc.collect()
        
#     Q0, Q1 = pd.concat(Q0, axis=0), pd.concat(Q1, axis=0)
#     G0, G1 = pd.DataFrame(G0, index=Q0.index, columns=Fcols), pd.DataFrame(G1, index=Q1.index, columns=Fcols)
#     CC = pd.DataFrame(CC, columns=[f'__corr__{y}_{r}' for y,r in roll_corr]+['bothInUniCount'])
#     for x in [CC, G0, G1, Q0, Q1]:
#         convert32(x)
#     return CC, (G0,G1), (Q0,Q1)

In [None]:
def make_CC_G_Q(pairs, *, F, P, W, labels=None, roll_corr=list(product(['aoo','aooTEN','aooTTY'],[10,21,62,125,250]))):
    '''Build aligned per-pair rows: rolling correlations (CC), features (G) and P-rows (Q).

    Parameters - pairs: 2-tuple of arrays of assetCodeIds; (pairs[0][k], pairs[1][k]) is pair k
                 F: feature frame with an `assetCodeId` column
                 P: frame with `time` and `assetCodeId` columns; in the multi-time branch
                    F is given P's `_mt` column by index alignment
                    (presumably F and P are row-aligned — TODO confirm)
                 W: mapping from series name to either a DataFrame (time index, assetCodeId
                    columns in increasing order) or a 2D np.ndarray
                 labels: column labels to use when a `W[c]` is an ndarray
                 roll_corr: (series name, window) combinations to compute; the default list is
                            never mutated here
    Returns (CC, (G0, G1), (Q0, Q1)): CC is a DataFrame with `__corr__{name}_{window}` and
    `__unic__{name}_{window}` columns; G0/G1 and Q0/Q1 hold, row for row, the F and P rows
    of the first and second member of each pair.
    Side effects: in the multi-time branch a `_mt` column is temporarily added to `P` and `F`
    (removed again before the correlations are computed), and debugging references are
    stored on the global `BUG` object.'''
    # NOTE(review): packs both IDs into one integer that is later cast to int32; this
    # presumably relies on assetCodeIds being small enough (second member < 100_000 and
    # the product fitting in int32) — confirm.
    pairId = pairs[0]*100_000+pairs[1]
    # a single distinct time takes the simpler branch that needs no time key
    single = P.time.nunique()==1
    if single:
        I0 = pd.DataFrame({'assetCodeId': pairs[0], 'pairId': pairId}, dtype=np.int32)
        I1 = pd.DataFrame({'assetCodeId': pairs[1], 'pairId': pairId}, dtype=np.int32)
        # merge on P to get the intersected guys
        Q0, Q1 = I0.merge(P, on='assetCodeId', how='inner'), I1.merge(P, on='assetCodeId', how='inner')
        # merge on intersected: keep only pairs whose both members have a row in P
        Q0 = Q1[['pairId']].merge(Q0, how='inner'); Q1 = Q0[['pairId']].merge(Q1, how='left');
        # make intersected set for merging F
        G0 = Q0[['assetCodeId']].merge(F, on='assetCodeId', how='left')
        G1 = Q1[['assetCodeId']].merge(F, on='assetCodeId', how='left')
        # corr_window_at_index(a, cols, labels, index, window, min_periods=10)
        
        # from here on `pairs` means the surviving pairs, in row order of G0/G1
        pairs = G0.assetCodeId.values.astype(np.int32), G1.assetCodeId.values.astype(np.int32)
        def iter_corr():
            # yields one (corr, unic) tuple per roll_corr entry
            for c, h in roll_corr:
                if isinstance(W[c], pd.DataFrame):
                    # locate the row of W[c] for the last time value in P
                    index = binary_search(W[c].index.values, P.time.values[-1])
                    assert W[c].columns.is_monotonic_increasing, 'W[c].columns not monotonic increasing'
                    yield corr_window_at_index(W[c].values, pairs, W[c].columns.values, index, h, min_periods=10)
                elif isinstance(W[c], np.ndarray):
                    # plain arrays are taken to end at the current time: use the last row
                    index = -1
                    yield corr_window_at_index(W[c], pairs, labels, index, h, min_periods=10)
                else:
                    assert False, 'W[c] datatype wrong'
        #BUG.guy = guy = list(iter_corr())
        CC = np.stack(list(flatten(iter_corr())), axis=1)
        #TODO inside CC, bothInUniCount i.e. unic and weight #DONE actually forget weight we will only use flat weight
    else:
        # make time list; `_mt` is an integer copy of `time` used as merge key
        P['_mt'] = P.time.astype(int)
        times = P.time.unique()
        x_times = {t: i for i, t in enumerate(times)}
        _mts = P._mt.unique()
        
        # make canonical index template: every (time, pair member) combination
        def _index(pair):
            a = np.asarray(list(product(_mts, pair)))
            b = np.asarray(list(product(_mts, pairId)))
            return {'_mt': a[:,0], 'assetCodeId': a[:,1].astype(np.int32), 'pairId': b[:,1].astype(np.int32)}
        I0, I1 = pd.DataFrame(_index(pairs[0])), pd.DataFrame(_index(pairs[1]))
        # do P first to get merge columns
        Q0, Q1 = I0.merge(P,on=['_mt','assetCodeId'],how='inner'), I1.merge(P,on=['_mt','assetCodeId'],how='inner')
        # get common rows: keep only (time, pair) combinations where both members have a row in P
        Q0 = Q1[['_mt','pairId']].merge(Q0, how='inner'); Q1 = Q0[['_mt','pairId']].merge(Q1, how='left');
        # now do F
        F['_mt'] = P._mt
        G0 = Q0[['_mt','assetCodeId']].merge(F, on=['_mt','assetCodeId'], how='left')
        G1 = Q1[['_mt','assetCodeId']].merge(F, on=['_mt','assetCodeId'], how='left')
        # the temporary merge key is removed everywhere, including the caller's F and P
        del F['_mt'], P['_mt'], Q0['_mt'], Q1['_mt'], G0['_mt'], G1['_mt']
        
        # from here on `pairs` means the surviving pairs, in row order of G0/G1
        pairs = G0.assetCodeId.values.astype(np.int32), G1.assetCodeId.values.astype(np.int32)
        def iter_corr():
            # yields one (corr, unic) tuple per roll_corr entry
            for c, h in roll_corr:
                if isinstance(W[c], pd.DataFrame):
                    # row position in W[c] for each output row's time
                    mapper = dict(zip(W[c].index, np.arange(len(W[c].index))))
                    indices = Q0.time.map(mapper).values
                    # debugging leftovers: keeps references to W and c on the global BUG object
                    BUG.W, BUG.c = W, c
                    assert W[c].columns.is_monotonic_increasing, 'W[c].columns not monotonic increasing'
                    yield corr_windows_at_indices(W[c].values, pairs, W[c].columns.values, indices, h, min_periods=10)
                elif isinstance(W[c], np.ndarray):
                    # negative offsets: the times of P are mapped onto the last len(x_times) rows of the array
                    indices = (Q0.time.map(x_times) - len(x_times)).values
                    yield corr_windows_at_indices(W[c], pairs, labels, indices, h, min_periods=10)
                else:
                    assert False, 'W[c] data type wrong'
        BUG.guy = guy = list(flatten(iter_corr()))
        CC = np.stack(guy, axis=1)
    assert len(CC)==len(G0)==len(Q0), 'lengths of CC G[0/1] Q[0/1] need to be the same'
    
    G0.reset_index(inplace=True, drop=True); G1.reset_index(inplace=True, drop=True);
    # NOTE(review): unlike G0/G1 this is called without drop=True, so Q0/Q1 gain an `index` column — confirm intended
    Q0.reset_index(inplace=True); Q1.reset_index(inplace=True);
    return (
        pd.DataFrame(CC,columns=flatten([f'__corr__{y}_{r}',f'__unic__{y}_{r}'] for y,r in roll_corr)),
        (G0, G1),
        (Q0, Q1)
    )

In [None]:
def make_FG_PQ(copy_filter, diff_filter, plus_filter, corr_filter, *, CC, G, Q):
    '''Assemble the pair-level feature frame FG and target frame PQ in both orientations.

    Every input row (pair member 0, pair member 1) yields two output rows: one as
    (0, 1) and one as (1, 0) with the difference features and the target negated.

    Parameters - copy_filter: predicate on G's column names; selected columns are copied for
                              both members as `__0__c` and `__1__c`
                 diff_filter: predicate; selected columns become `__0-1__c` = member0 - member1
                 plus_filter: predicate; selected columns become `__0+1__c` = member0 + member1
                 corr_filter: predicate on CC's column names; selected columns are appended as-is
                 CC: per-pair correlation frame, row-aligned with G and Q
                 G: (G0, G1) feature frames with identical columns
                 Q: (Q0, Q1) frames with identical columns and identical `time` per row
    Returns (FG, PQ), both sorted by time and sharing a fresh 0..n-1 index.
    PQ holds `time`, the available `__0__`/`__1__` asset code/name columns, `y` = y0 - y1
    when `y` is present, plus whatever `setup_P` adds.'''
    SEE('(pair) putting together...', end=' ')

    Gcols, Qcols = G[0].columns, Q[0].columns
    assert (Gcols==G[1].columns).all() and (Qcols==Q[1].columns).all()

    cop, dif, pls = list(filter(copy_filter, Gcols)), list(filter(diff_filter, Gcols)), list(filter(plus_filter, Gcols))
    G0, G1 = G[0][cop].values, G[1][cop].values
    H01 = G[0][dif].values.astype(np.float32) - G[1][dif].values.astype(np.float32)
    I01 = G[0][pls].values.astype(np.float32) + G[1][pls].values.astype(np.float32)
    CC = CC[list(filter(corr_filter, CC.columns))]
    corr_values = CC.values.astype(np.float32)

    # the (1,0) orientation swaps the copied blocks and flips the sign of the differences
    F01 = np.concatenate([G0,G1,H01,I01,corr_values], axis=1)
    F10 = np.concatenate([G1,G0,-H01,I01,corr_values], axis=1)
    FG = np.concatenate([F01, F10], axis=0)
    # `np.floating` is the abstract float type; the old `np.float` alias was removed
    # in NumPy 1.24 and raises AttributeError there.
    if not np.issubdtype(FG.dtype, np.floating):
        FG = FG.astype(np.float32)

    FGcols = (['__0__'+c for c in cop] + ['__1__'+c for c in cop] + ['__0-1__'+c for c in dif]
              + ['__0+1__'+c for c in pls] + list(CC.columns))
    FG = pd.DataFrame(FG, columns=FGcols, copy=False)

    Q = Q[0].reset_index(drop=True), Q[1].reset_index(drop=True)
    assert(Q[0].time==Q[1].time).all()

    id_cols = ['assetCode','assetCodeId','assetName','assetNameId']

    def _oriented(first, second):
        # one orientation of the pair frame: `first` becomes __0__, `second` becomes __1__
        frame = pd.DataFrame(dict(time=first.time,
            **{f'__0__{c}': first[c] for c in id_cols if c in first},
            **{f'__1__{c}': second[c] for c in id_cols if c in second}))
        if 'y' in first:
            assert 'y' in second
            frame['y'] = first.y - second.y
        return frame

    #TODO again we don't need universe here
    Q01 = _oriented(Q[0], Q[1])
    Q10 = _oriented(Q[1], Q[0])

    PQ = pd.concat([Q01, Q10], axis=0, ignore_index=True)

    # order by time, then apply the same row order to FG before renumbering both
    PQ.sort_values('time', inplace=True)
    FG = FG.reindex(index=PQ.index, copy=False)

    FG.reset_index(drop=True, inplace=True)
    PQ.reset_index(drop=True, inplace=True)
    setup_P(F=None, P=PQ, target=True, vp=False, always=False)

    SEE('(pair) done')
    return FG, PQ