In [None]:
from common import *
from search.imports import *

In [1]:
from collections import Iterable

In [1]:
def weight_stack_trans(x):
    """Turn values into multiplicative weights relative to their minimum.

    Every element becomes ``exp((x - min) / min)``.  The minimum of ``x`` has
    to be negative (asserted); it then maps to ``exp(0) == 1`` and every
    larger value maps to a smaller positive weight.

    ``x`` is anything offering ``.min()`` and element-wise arithmetic
    (e.g. a numpy array or a pandas Series).
    """
    lowest = x.min()
    assert lowest < 0
    offset = x - lowest
    return np.exp(offset / lowest)

## parameter search logic

In [None]:
class ParamSearch():
    mix = staticmethod(logavg)
    
    def __init__(self, specs):
        """Store ``O.mycopy(specs)`` as ``self.specs`` and expand its discrete grid.

        The setup methods write derived fields onto the stored spec object
        (``Discrete.keys`` / ``Discrete.assigns`` here, ``OneByOne.data`` later).
        """
        own_specs = O.mycopy(specs)
        self.specs = own_specs
        self.setup_discrete()
        
    def setup_discrete(self):
        """Expand the ``Discrete`` spec into flat key names and value assignments.

        Writes two fields onto ``self.specs.Discrete``:

        * ``keys``    -- the flattened keys of ``enc``;
        * ``assigns`` -- one flattened list for every element of the Cartesian
          product of the ``enc`` values.
        """
        discrete = self.specs.Discrete
        discrete.keys = list(flatten(discrete.enc.keys()))
        combos = product(*discrete.enc.values())
        discrete.assigns = [list(flatten(combo)) for combo in combos]
        
    def setup_onebyone(self, label):
        """Build ``OneByOne.data`` from the encoding stored under ``label``.

        For every parameter in ``OneByOne.enc[label]`` an ``O`` entry is created
        from its settings and completed with ``OneByOne.default``:

        * a field missing from the entry takes the default value;
        * a field holding a lambda is replaced by that lambda applied to the
          default value;
        * any other field is left as given.

        A non-iterable ``OneByOne.base`` is wrapped in a one-element list
        (``one_by_one`` indexes it).
        """
        one_by_one = self.specs.OneByOne
        encoded = one_by_one.enc[label]

        if not isinstance(one_by_one.base, Iterable):
            one_by_one.base = [one_by_one.base]

        table = {}
        one_by_one.data = table
        # dict.items(...) is called unbound on purpose, exactly as the rest of
        # this class does for spec objects.
        for name, settings in dict.items(encoded):
            entry = O(**settings)
            table[name] = entry
            for field, default in dict.items(one_by_one.default):
                if field not in entry:
                    entry[field] = default
                    continue
                current = entry[field]
                if isinstance(current, LambdaType):
                    entry[field] = current(default)
    
    def search(self):
        """Generate parameter dicts to evaluate, steered by the scores sent back.

        Exchange with the consumer, once per candidate:

        1. ``next(gen)`` yields a deep copy of the full parameter dict;
        2. the consumer answers with ``gen.send(results)`` where
           ``results['score']`` is the candidate's score -- that ``send``
           returns ``None``;
        3. the consumer calls ``next(gen)`` again for the following candidate.

        For every discrete assignment (index ``di``) the ``one_by_one``
        coroutine is run on top of it; its candidates are merged into the
        discrete parameters before being yielded.  ``Discrete.obo[di]``, when
        present, selects the one-by-one encoding to set up before that
        assignment.  ``Discrete.stop[di]``, when present, ends the whole search
        if it exceeds the best score seen so far (``self._best``).
        """
        self._best = -float('inf')
        dsc = self.specs.Discrete
        for di, assign in enumerate(dsc.assigns):
            # a threshold on the very first assignment would always stop at once
            assert 0 not in dsc.stop
            if di in dsc.stop and dsc.stop[di] > self._best:
                break
            if di in dsc.obo:
                self.setup_onebyone(dsc.obo[di])
            params = dict(zip(dsc.keys, assign))
            coroutine = self.one_by_one()
            for addon in coroutine:
                SEE.queue({k: v for k, v in dict.items(params) if k not in addon})
                SEE.queue(addon)
                params.update(addon)
                results = yield copy.deepcopy(params)
                self._best = max(results['score'], self._best)
                coroutine.send(results)
                # This bare yield is what the consumer's send() returns.  It must
                # not live inside the assert: `python -O` strips assert
                # statements, which would silently drop the yield and break the
                # send/next handshake.
                ack = yield
                assert ack is None
        
        
    def one_by_one(self):
        """Refine the ``OneByOne`` parameters one at a time, as a coroutine.

        Exchange with the consumer, once per candidate:

        1. the generator yields the parameter dict to evaluate (the same dict
           object every time, mutated in place -- copy it to keep it);
        2. the consumer ``send``s back a mapping with a ``'score'`` entry; that
           ``send`` returns ``None``;
        3. the consumer advances with a plain ``next`` for the next candidate.

        Algorithm: each parameter has a range ``[low, mid, high]`` (field ``a``
        of its entry in ``OneByOne.data``) and a start value (field ``b`` when
        present, otherwise ``mid``).  The start values are scored first.  Round
        ``i`` only runs while that combined score exceeds ``base[i]`` (the last
        element of ``base`` is reused once ``i`` runs past the list) and while
        at least one parameter has ``i < lim``.  In a round, every such
        parameter is tried on its own at ``cast(mix(low, mid))`` and at
        ``cast(mix(mid, high))``.  Afterwards each parameter either adopts the
        better candidate (if it beat the combined score) and moves its range
        around it, or keeps its value and shrinks its range to the two
        candidates.  The updated combination is then scored and becomes the
        reference for the next round.
        """
        obo, obod = self.specs.OneByOne, self.specs.OneByOne.data

        def evaluate(candidate):
            """Hand ``candidate`` to the consumer and return the score sent back."""
            score = (yield candidate)['score']
            # The bare yield is what the consumer's send() returns.  It is kept
            # outside the assert: `python -O` strips assert statements, which
            # would silently remove the yield and break the handshake.
            ack = yield
            assert ack is None
            return score

        #! main algorithm ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        # initialize loop variables
        params = {k: (v.b if 'b' in v else v.a[1]) for k, v in dict.items(obod)}
        ranges = {k: v.a for k, v in dict.items(obod)}
        scores = {k: [-np.inf, -np.inf] for k in params}
        isdone = {k: False for k in params}

        # pre loop one-off work: score the starting combination
        base_score = yield from evaluate(params)

        # loop
        for i in range(999_999_999):
            cutoff = obo.base[i] if i < len(obo.base) else obo.base[-1]
            if base_score <= cutoff:
                return

            #! try new parameter values ############################## part A of loop work

            # the two candidates of every parameter: between low/mid and between mid/high
            new_params = {k: (obod[k].cast(self.mix(v[0], v[1])),
                              obod[k].cast(self.mix(v[1], v[2]))) for k, v in dict.items(ranges)}

            # finish condition check
            isdone = {k: v or i >= obod[k].lim for k, v in dict.items(isdone)}
            if all(isdone.values()):
                break

            # try both candidates of each parameter, all other parameters unchanged
            for key in list(params):
                if i >= obod[key].lim:
                    continue
                orig = params[key]
                params[key] = new_params[key][0]
                scores[key][0] = yield from evaluate(params)
                params[key] = new_params[key][1]
                scores[key][1] = yield from evaluate(params)
                params[key] = orig

            #! start setting up values for next loop ######################## part B of loop work

            # NOTE(review): `scores` is never reset, so a parameter skipped in
            # part A (past its `lim`) is judged here on the scores of an earlier
            # round.  Left as-is; confirm whether that is intended.
            # set params to the best found and see if it betters score, updating ranges also
            for key in list(params):
                low_score, high_score = scores[key]
                if low_score > base_score and low_score >= high_score:
                    params[key] = new_params[key][0]
                    ranges[key] = [ranges[key][0], params[key], ranges[key][1]]
                elif high_score > base_score and high_score >= low_score:
                    params[key] = new_params[key][1]
                    ranges[key] = [ranges[key][1], params[key], ranges[key][2]]
                else:
                    ranges[key] = [new_params[key][0], ranges[key][1], new_params[key][1]]

            # send out new params (always, even when no parameter changed)
            base_score = yield from evaluate(params)

## Features / Samples (train/cv split) search logic

### Model searching logic

In [None]:
class ModelSearch():
    
    def __init__(self, specs, mm, *, log=None, tag=None, tags=None, verbose_eval=None):
        """Store the search configuration and wire up logging.

        specs        -- search specification object (kept by reference)
        mm           -- model manager; stored as ``self.mm``
        log          -- logger; ``log.print`` is installed as the module-level
                        ``SEE``, so it must not be left at ``None``
        tag / tags   -- at most one of the two may be given; normalised into
                        ``self.tags`` through ``frzset``
        verbose_eval -- stored as ``self.verbose_eval`` (passed to ``lgb.train``)

        When ``specs.model.weight_stack`` is set, a pickle with the stacking
        weights is loaded into ``self.e`` from a hard-coded path.
        """
        self.specs = specs
        self.log = log
        # TODO: unfortunate hack -- ParamSearch prints through the global SEE so
        # that its output matches earlier search logs, i.e. it uses a logger
        # that prints one level above self.log.
        global SEE
        SEE = log.print
        assert tag is None or tags is None
        if tags is None:
            tags = [] if tag is None else [tag]
        self.tags = frzset(tags)
        self.verbose_eval = verbose_eval
        self.mm = mm
        self._DBG_ = O()
        model = self.specs.model
        if 'weight_stack' in model and model.weight_stack:
            self.e = pd.read_pickle('/big/data/saves/e-amour-tt7.pickle')
#         self.setup_specs()
        
#     def setup_specs(self):
#         for si in range(self.mm.S.nextIndex):
#             ctor = self.mm.S.get('Ctor', i=si)
#             samps = self.specs.Samples.ctor2samps(ctor)
#             self.mm.S.save(Samps=samps, i=si)
        
#     def setup_specs(self):
#         self.specs.Samples.data = []
#         for code in self.specs.Samples.enc:
#             if code.method == 'group.2':
#                 tr, cv = set(code.data[0]), set(code.data[1])
#                 tr, cv = self.Y[code.groups].isin(tr), self.Y[code.groups].isin(cv)
#                 tr, cv = self.Y.index[tr], self.Y.index[cv]
#                 self.specs.Samples.data += [((tr, cv), (cv, tr))]
#             elif code.method == 'GroupShuffleSplit.2':
#                 tr, cv = next(GroupShuffleSplit(**code.kwargs).split(self.X, self.Y, groups=self.Y[code.groups]))
#                 tr, cv = self.Y.index[tr], self.Y.index[cv]
#                 self.specs.Samples.data += [((tr, cv), (cv, tr))]
# #             elif code.method == 'GroupKFold':
# #                 self.specs.Samples.data.append(tuple(GroupKFold(**code.kwargs)
# #                                                      .split(self.X, self.Y, groups=self.Y[code.groups])))
#             else:
#                 assert False, f'sampling method "{code.method}" not implemented'
        
    def iter_feats_samps(self, t=0):
        """Yield ``((fi, feats), (si, samps))`` for every step of the outer search.

        ``specs.Step.func`` is asked for the next step, starting from ``t - 1``
        and given the ``Results`` of ``self._best`` (``None`` while no best
        exists).  Iteration ends when it returns ``None`` or when either
        ``specs.Features.func`` or ``specs.Samples.func`` returns ``None``.

        Side effects per step: the feature and sample objects are saved through
        ``self.mm.F`` / ``self.mm.S`` (whose return values are yielded as ``fi``
        / ``si``), ``self._DBG_.fs`` keeps both objects, and ``self.X`` /
        ``self.Y`` are reloaded from ``specs.Data.func`` (their indexes must
        match).
        """
        t = t - 1
        while True:
            previous = self._best[1].Results if hasattr(self, '_best') else None
            t = self.specs.Step.func(previous, t=t)
            self.log(f't = {t}', '='*99)
            if t is None:
                return

            feats_obj = self.specs.Features.func(t=t)
            samps_obj = self.specs.Samples.func(t=t)
            if feats_obj is None or samps_obj is None:
                return
            feats_id = self.mm.F.save(**feats_obj)
            samps_id = self.mm.S.save(**samps_obj)
            self._DBG_.fs = feats_obj, samps_obj

            data = self.specs.Data.func(t=t)
            self.X, self.Y = data
            assert (self.X.index == self.Y.index).all()

            yield (feats_id, feats_obj['Feats']), (samps_id, samps_obj['Samps'])
        
    def walk(self, t=0):
        """Run the full search, yielding ``((fi, si), results)`` per evaluated candidate.

        For every (features, samples) pair from ``iter_feats_samps`` the
        training data is set up once and a fresh ``ParamSearch`` proposes
        parameter dicts.  A candidate whose ``Results`` are already stored in
        the parameter manager is not retrained; otherwise it is trained and
        saved, with its tags merged into the ones already stored.  The results
        are sent back into the parameter search before being yielded.

        ``self._best`` holds ``(params, saved_state)`` of the best-scoring
        candidate of the current (features, samples) pair.
        """
        for (fi, feats), (si, samps) in self.iter_feats_samps(t):
            # best score of this (feats, samps) pair, maxed over params
            best_score = -np.inf
            pm = self.mm.iPM(fi, si)
            self._pm = pm

            # NOTE(review): `t` is walk()'s own argument, not the step that
            # iter_feats_samps is currently at -- confirm this is intended.
            self.ho = self.specs.Holdout.func(t=t)
            if not isinstance(self.ho, list):  # NEW k-fold flow
                self.ho = [self.ho]
            self.setup_training(feats, samps, i=(fi, si))
            search = ParamSearch(self.specs.search)
            loop = search.search()
            for k, params in enumerate(loop):  # params are deepcopied out, so can safely save them as-is!
                self.log(f' k = {k}', '-'*99)
                paramsUse = dict(**self.specs.Params.data, **params)

                # num_iterations too finicky to save
                params.pop('num_iterations', None)

                pi = pm.i(params)
                results = pm.get('Results', i=pi)
                if results:
                    if results['score'] > best_score:
                        # previously best_score was never updated, so every
                        # candidate replaced self._best
                        best_score = results['score']
                        self._save = O(Results=results)
                        self._best = (dict(**params), O(**self._save))  # TODO is copying params still needed?
                else:
                    self.log('training...')
                    self.train(paramsUse)  # sets some state attributes in self: self._save
                    results = self._save.Results
                    if results['score'] > best_score:
                        best_score = results['score']
                        self._best = (dict(**params), O(**self._save))  # TODO is copying params still needed?
                    del self._save.Boosters  # , self._save.Training
                    already_tags = pm.get('Tags', i=pi)
                    if already_tags is None:
                        already_tags = frozenset()
                    pm.save(i=pi, Tags=already_tags | self.tags, **self._save)
                # feed the score back first; this send() returns the search's bare yield
                loop.send(results)
                yield (fi, si), results
                # del self._save  # TODO probably wanna uncomment this in production
            # pm.save(Params=self._best[0], **self._best[1])  # TODO forget saving the best model
            
    def run(self, t=0):
        """Drive ``walk(t)`` to exhaustion, discarding everything it yields."""
        walker = self.walk(t)
        for _step in walker:
            continue
        
    def setup_training(self, feats, samps, *, i=None):
        """Prepare the LightGBM datasets for one (features, samples) pair.

        feats -- column names to train on; ``self.X`` is narrowed to them
                 unless its columns already match exactly
        samps -- iterable of ``(train_ids, cv_ids)`` pairs, matched against the
                 index of the feature frame
        i     -- accepted but not used by this method

        Sets ``self._X`` (feature frame), ``self._s`` (boolean masks ``tr`` /
        ``cv``), ``self._L`` (``lgb.Dataset`` lists ``tr`` / ``cv``) and
        ``self.Lho`` (one dataset per entry of ``self.ho``).  Columns of dtype
        int64/int32 are declared categorical.  Sample weights are attached only
        when ``specs.model`` has a ``weight`` entry.  Finally the metric gets a
        chance to ``attach`` itself to this object.
        """
        SEE('setup_training', '%'*44)
        SEE(f'len(feats) = {len(feats)}')
        feats_dbg = self._DBG_.fs[0]
        SEE(feats_dbg['Ctor'] if 'Ctor' in feats_dbg else feats_dbg['Feats'])
        SEE(f'len(samps) = {len(samps)}')
        samps_dbg = self._DBG_.fs[1]
        SEE(samps_dbg['Ctor'] if 'Ctor' in samps_dbg else samps_dbg['Samps'])

        # drop the previous feature frame before building the next one
        if hasattr(self, '_X'):
            del self._X
        columns_match = list(self.X.columns) == list(feats)
        frame = self.X if columns_match else self.X[feats]
        self._X = frame

        masks = O()
        self._s = masks
        train_masks = tuple(frame.index.isin(pair[0]) for pair in samps)
        cv_masks = tuple(frame.index.isin(pair[1]) for pair in samps)
        masks.tr, masks.cv = train_masks, cv_masks

        dtypes = frame.dtypes
        integer_columns = dtypes[dtypes.isin([np.int64, np.int32])].index
        lgb_data_info = dict(
            feature_name=list(frame.columns),
            categorical_feature=list(integer_columns),
            free_raw_data=False,
        )

        def _weight(ds):
            """Sample weights for the rows selected by ``ds``, stacked if configured."""
            model = self.specs.model
            stacked = 'weight_stack' in model and model.weight_stack
            if not stacked:
                return self.Y[model.weight][ds]
            name = dict(flat_weight='a', vp10_weight='m')[model.weight] + model.name
            print('_weight', name)
            return self.Y[model.weight][ds] * weight_stack_trans(self.Y.time[ds].map(self.e[name]))

        def _dataset(rows, **extra):
            """Build the ``lgb.Dataset`` for the rows selected by ``rows``."""
            data = frame[rows]
            label = self.Y[self.specs.model.target][rows]
            weight = {'weight': _weight(rows)} if 'weight' in self.specs.model else {}
            return lgb.Dataset(data, label, **extra, **lgb_data_info, **weight)

        datasets = O()
        self._L = datasets
        datasets.tr = [_dataset(tr) for tr in masks.tr]
        # every cv set references the training set of its own fold
        datasets.cv = [_dataset(cv, reference=Ltr) for cv, Ltr in zip(masks.cv, datasets.tr)]
        self.Lho = [_dataset(h) for h in self.ho]  # NEW k-fold flow

        metric = self.specs.metric
        if hasattr(metric, 'attach'):
            metric.attach(self)
        # TODO implement both logloss and Kaggle metric, and stop only when both don't improve in whatever num rounds
    
    def train(self, params):
        """Train one booster per (train, cv) fold and store the outcome in ``self._save``.

        ``params`` is the LightGBM parameter dict; a copy of it is handed to
        ``lgb.train`` for every fold.  After training, the boosting round with
        the highest ``specs.agg_metric`` value over the cv results is selected.

        ``self._save`` ends up with:
          Training -- per fold, a tuple of evaluation DataFrames (tr, cv, ho0, ho1, ...)
          Boosters -- the trained boosters, one per fold
          Results  -- nbest / train / scores / score / holdout at the selected round
          Answers  -- per fold, the cv and holdout rows with a ``guess`` column

        Requires ``setup_training`` to have run (reads ``self._L``, ``self._s``,
        ``self._X``, ``self.Lho``, ``self.ho``).

        NOTE(review): ``zip(*iter_samples())`` cannot be unpacked when there are
        no folds, and ``_i`` stays ``None`` (so ``_i+1`` fails) when no round
        scores above ``-inf`` -- confirm neither can happen in practice.
        """
        def iter_samples():
            """Yield ``(booster, evaluation DataFrames)`` for every fold."""
            Lho = self.Lho
            ho_names = ['ho'+str(i) for i in range(len(self.ho))]
            for Ltr, Lcv in zip(self._L.tr, self._L.cv):
                evals_result = {}
                # A copy of `params` is passed below: per the original author, LightGBM
                # deletes 'num_iterations' from the dict it is given after training.
                # The call is wrapped in a function so that the %time magic can time it.
                def _temp():
                    self._gbm = lgb.train(dict(params), Ltr, valid_sets=[Ltr,Lcv]+Lho, valid_names=['tr','cv']+ho_names,
                        feval=self.specs.metric, evals_result=evals_result, verbose_eval=self.verbose_eval)
                # drop the previous fold's booster before training the next one
                if hasattr(self, '_gbm'):
                    del self._gbm
                # IPython magic: this line only runs under IPython/Jupyter
                %time _temp()
                bst = self._gbm
                
                # order matters: index 0 is tr, 1 is cv, 2+ are the holdout sets
                df_results = (
                    (pd.DataFrame(evals_result['tr']), pd.DataFrame(evals_result['cv'])) +
                    tuple(pd.DataFrame(evals_result['ho'+str(i)]) for i in range(len(self.ho)))
                )
                yield bst, df_results
        bsts, dfs = zip(*iter_samples())
        
        # find best nboost: the round whose aggregated cv score is highest (first one wins ties)
        _ms = self.specs.metric.score
        _i, _sco = None, -np.inf
        for i in dfs[0][1].index: # 1 for cv (0 for tr, and 2+ for ho)
            sco = self.specs.agg_metric(dfs, 1, i, _ms)
            if sco > _sco:
                _i = i
                _sco = sco
        
#         # find best nboost # use new flow with multiple transformed scores and take the best one
#         wait = self.specs.early_stopping
#         _i, _ms, _sco = None, None, -np.inf
#         ct = 0
#         for i in dfs[0][1].index: # in second layer of index, 1 for cv (0 for tr, and 2+ for ho)
#             better = False
#             for metric in dfs[0][1].columns:
#                 sco = self.specs.agg_metric(dfs, 1, i, metric)
#                 if sco > _sco:
#                     _sco = sco
#                     _ms = metric
#                     better = True
#             if better:
#                 _i = i
#                 ct = 0
#             else:
#                 ct += 1
#                 if ct > wait:
#                     _i = i-ct
#                     break
        
        # `class X(O()):` passes the class body's namespace to O through the
        # metaclass machinery -- presumably yielding an O instance whose entries
        # are the names assigned below (TODO confirm against O's constructor).
        class save(O()):
            Training = dfs
            Boosters = bsts
            class Results(O()):
                # number of boosting rounds to use (selected index + 1)
                nbest = _i+1
                # per fold: training-set metric at the selected round
                train = tuple(dft[0].loc[_i,_ms] for dft in dfs)
                # per fold: cv metric at the selected round
                scores = tuple(dft[1].loc[_i,_ms] for dft in dfs)
                # aggregated cv score, the value the parameter search maximises
                score = self.specs.agg_metric(dfs, 1, _i, _ms)
                # per fold: metric of every holdout set at the selected round
                holdout = tuple(tuple(dft[j].loc[_i,_ms] for j in range(2,len(dft))) for dft in dfs)
        
        # new addition, hook to save actual answers
        def iter_ans():
            """Yield, per fold, the cv frame followed by one frame per holdout set."""
            # only bst and cv are used from the zipped values
            for bst, tr, cv, Ltr, Lcv in zip(bsts, self._s.tr, self._s.cv, self._L.tr, self._L.cv):
                assetCodes = ['assetCodeId'] if 'assetCodeId' in self.Y else ['__0__assetCodeId','__1__assetCodeId']
                ans = []
                # NOTE(review): specs.model.weight is read unconditionally here, while
                # setup_training treats 'weight' as optional -- confirm.
                _Y = self.Y[['time']+assetCodes+[self.specs.model.target,self.specs.model.weight]][cv]
                # *2-1 presumably rescales a prediction in [0, 1] to [-1, 1] -- TODO confirm
                _Y['guess'] = bst.predict(self._X[cv], num_iteration=_i+1)*2-1
                ans.append(_Y)
                for ho in self.ho:
                    _Y = self.Y[['time']+assetCodes+[self.specs.model.target,self.specs.model.weight]][ho]
                    _Y['guess'] = bst.predict(self._X[ho], num_iteration=_i+1)*2-1
                    ans.append(_Y)
                yield ans
        save.Answers = list(iter_ans())
        # end new addition
        
        self._save = save