In [1]:
from common import *
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
import json, copy, operator

In [22]:
from types import LambdaType

In [2]:
# Run-mode flags for this notebook.
TEST = True
PROD = False

In [3]:
def I(x):
    '''Identity: hand the argument back unchanged.'''
    return x


def logavg(x, y):
    '''Average `x` and `y` on a log1p scale and map the result back with expm1.'''
    mean_of_logs = (math.log1p(x) + math.log1p(y)) / 2
    return math.expm1(mean_of_logs)


def keepSigFig(n):
    '''Return a function that rounds its argument to `n` significant figures.

    Falsy inputs (e.g. 0) are passed through untouched, since log10 is undefined for them.
    '''
    def rounder(x):
        if not x:
            return x
        magnitude = int(math.floor(math.log10(abs(x))))
        return round(x, (n - 1) - magnitude)
    return rounder

In [4]:
def frzset(x):
    '''Coerce `x` to a frozenset; an existing frozenset is returned as the same object.'''
    if isinstance(x, frozenset):
        return x
    return frozenset(x)


def pydict(x):
    '''Copy `x`: through `O.pycopy` when it is an `O` instance, through `copy.deepcopy` otherwise.'''
    if isinstance(x, O):
        return O.pycopy(x)
    return copy.deepcopy(x)

# #!#!#!#!#!#!#!#!#!#!#! Save System #!#!#!#!#!#!#!#!#!#!#!

## File System

In [5]:
class IndexFileSystem():
    '''Maps (integer index, item name) pairs onto files named `<index>.<name>` inside one directory.

    Read / write callables for each item name are registered with `assignIO` and then
    live in `self.io[name].read(i=...)` and `self.io[name].write(x, i=...)`.
    '''
    def __init__(self, directory, key):
        '''
        directory: str or Path of the backing folder; created (including parents) if missing.
        key: item name whose files determine which indices exist (see `iterIndices`).
        '''
        self.dir = directory if isinstance(directory, Path) else Path(directory)
        self.dir.mkdir(exist_ok=True, parents=True)
        assert isinstance(key, str), "key must be string"
        self.key = key
        self.io = O()

    def iterIndices(self):
        '''Yield the integer index of every `<index>.<key>` file in the directory (glob order, not sorted).'''
        for dot_params in self.dir.glob('*.' + self.key):
            yield int(dot_params.stem)

    def getFilePath(self, name, *, i):
        '''Path of the file holding item `name` for index `i`.'''
        return self.dir / (str(i) + '.' + name)

    @staticmethod
    def readWrapper(read):
        '''wraps io read operations to safely return None if file does not exist'''
        @wraps(read)
        def read_safely(*a, **k):
            try:
                return read(*a, **k)
            except FileNotFoundError:
                return None
        return read_safely

    def assignIO(self, name, *, read, write, format='custom'):
        '''Register the file-level reader / writer for item `name`.

        format='text' / 'bytes': `read` receives the file contents (str / bytes) and `write`
            must return the str / bytes to store; the file access itself is done here.
        format='custom': `read(file)` and `write(x, file=file)` do their own file access. They are
            first given a Path and, if that raises, retried with the path as a plain string.

        In every format the registered `read(i=...)` returns None when the file does not exist.
        '''
        assert format in ['bytes', 'text', 'custom'], "argument `format` must be one of 'bytes' or 'text' or 'custom'"
        if format in ['bytes', 'text']:
            self.io[name] = O(
                read = self.readWrapper( lambda *,i: read(getattr(self.getFilePath(name, i=i), 'read_'+format)()) ),
                write = lambda x,*,i: getattr(self.getFilePath(name, i=i), 'write_'+format)(write(x))
            )
        elif format == 'custom':
            def readCustom(*, i):
                file = self.getFilePath(name, i=i)
                if not file.exists():
                    return None
                try:
                    return read(file)
                except Exception:
                    # some readers only accept str paths; retry once before giving up
                    return read(str(file))
            # FIX: the value to write must be a parameter (it was missing, so `x` was undefined
            # and the call signature did not match the bytes/text writer above).
            def writeCustom(x, *, i):
                file = self.getFilePath(name, i=i)
                try:
                    write(x, file=file)
                except Exception:
                    # same str-path fallback as in readCustom
                    write(x, file=str(file))
            self.io[name] = O(read=readCustom, write=writeCustom)

class IndexDataStore():
    '''Index-addressed store: one in-memory list per item name, persisted through an `IndexFileSystem`.

    One item is designated the key (`assignKey`); `self.tbl` maps `keyFunc(key value)` to the
    integer index shared by all items. Instances are built with `IndexDataStore.from_specs`.
    '''
    def __init__(self, file, _factory_=False):
        '''
        file: the `IndexFileSystem` used for all reads and writes.
        _factory_: must be truthy; direct construction is refused by the assert below.
        '''
        if not _factory_:
            assert False, "Cannot instantiate IndexDataStore normally. Please use factory static method."
        self.file = file
        self.op = O()       # item name -> ops object built by `assignOperations` (load / save / keep)
        self.lists = O()    # item name -> list with the kept in-memory value for each index
        self.key = None     # name of the key item, set by `assignKey`
        # placeholder until `assignKey`: returns a fresh object() per call, so no lookup can ever hit
        self.keyFunc = lambda keyVal, *, client=False: object()
        self.tbl = {}       # keyFunc(key value) -> index
        self.nextIndex = 0  # index the next new key will receive

    def load(self):
        '''Scan the directory, size every list to max index + 1, run each item's load op for every
        existing index, then rebuild `self.tbl` from the loaded key values.'''
        indices = sorted(self.file.iterIndices())
        # NOTE(review): `n` is never read afterwards
        n = self.nextIndex = (max(indices) if indices else -1) + 1
        for name, _ in dict.items(self.lists):
            self.lists[name] = [None] * self.nextIndex

        for i in indices:
            for name in self.lists:
                if self.op[name].load:
                    self.op[name].load(i=i)

        if self.key is not None:
            self.assignKey(self.key, self.keyFunc)

    def i(self, key=None, *, dry=False, client=True, keep=True, save=True, **kw):
        '''Return the index of `key`, allocating a new index when it is unknown.

        key: the key value, positionally, or as a single keyword named after the key item.
        dry: when True an unknown key returns None instead of being allocated.
        client: forwarded to `keyFunc` (and to the keep op) to say the value comes from the client.
        save / keep: on allocation, write the key through its save op (which also keeps it when
            `keep` is set); with save off, only keep it in memory.
        '''
        if len(kw) > 0:
            assert len(kw)==1, 'only keyword arg allowed to method `i` is the key name'
            assert self.key in kw, 'only keyword arg allowed to method `i` is the key name'
            assert key is None, 'argument `key` already given as keyword using the key name'
            key = kw[self.key]
        else:
            # NOTE(review): truthiness test, so falsy keys (e.g. an empty container) are rejected too
            assert key, '`key` must be given by position if keyword with corresponding key name is not given'

        keyVal = self.keyFunc(key, client=client)

        if keyVal not in self.tbl and not dry:
            # grow every item list by one slot so they all stay aligned with the new index
            for name in self.lists:
                assert len(self.lists[name]) == self.nextIndex, "idk internal error"
                self.lists[name].append(None)
            self.tbl[keyVal] = self.nextIndex
            if save and self.op[self.key].save:
                self.op[self.key].save(key, i=self.nextIndex, keep=keep)
            elif keep and self.op[self.key].keep:
                self.op[self.key].keep(key, i=self.nextIndex, client=client)
            self.nextIndex += 1
        elif keyVal not in self.tbl and dry:
            return None

        return self.tbl[keyVal]

    def save(self, keep=True, **kwargs):
        '''Save the given items under one index and return that index.

        Exactly one of `i=<index>` or `<key name>=<key value>` must be given; every other keyword
        is an item name mapped to the value to save. A key value is resolved (and allocated /
        saved if new) through `self.i`. Raises AssertionError for item names without ops.

        `client` is always True'''
        assert ('i' in kwargs) ^ (self.key in kwargs), "call to `save` must include exactly one of i= or the key name ="
        if 'i' in kwargs:
            i = kwargs['i']
            del kwargs['i']
        else:
            i = self.i(kwargs[self.key])

        for name in kwargs:
            if name == self.key: # already saved in `self.i(save=True)` above
                continue
            if name not in self.op:
                raise AssertionError(f"given save item '{name}' does not have data store ops initialized")
            if self.op[name].save:
                self.op[name].save(kwargs[name], i=i, keep=keep)

        return i

    def get(self, name, **kwargs):
        '''Return item `name` for the index given as `i=` or for the key given as `<key name>=`.

        Returns None for an unknown key. When nothing is kept in memory for that slot, the
        item's load op (if any) is used to read it from disk.
        '''
        assert len(kwargs) == 1 and ('i' in kwargs or self.key in kwargs), "must give assigned 'key' or i"
        if 'i' in kwargs:
            i = kwargs['i']
            del kwargs['i']
        else:
            i = self.i(kwargs[self.key], dry=True)
        if i is None:
            return None
        got = self.lists[name][i]
        if got is None and self.op[name].load:
            got = self.op[name].load(i=i)
        return got

    ##################### INSTANCE BUILDING METHODS ##################### :
    def assignKey(self, name, func):
        '''Make item `name` the key, install `func` as `keyFunc`, and rebuild `self.tbl` from the
        currently kept key values (None slots are skipped).'''
        assert name in self.lists, "key name must first be assigned ops"
        self.key = name
        self.keyFunc = func
        self.tbl = {self.keyFunc(x): i for i,x in enumerate(self.lists[self.key]) if x is not None}

    def assignOperations(self, name, *, load=I, save=I, keep=I, keepSaved=None, keepClient=None):
        '''Build and register the load / save / keep operations for item `name`.

        load: value read from file -> object returned to the caller.
        save: client object -> object handed to the file writer.
        keep / keepSaved / keepClient: converters producing the object kept in memory, used for
            a loaded value, an already-`save`d value, and a raw client value respectively;
            `keepClient` defaults to `keep`.
        A falsy converter disables the corresponding operation (the attribute is set to False).
        '''
        if keepClient is None:
            keepClient = keep

        # NOTE / TODO?: the method names below clash with the parameters above. Inside the method
        # bodies `load`, `save` and `keep` resolve to those parameters, because names bound in a
        # class body are not visible from functions nested in it. The code relies on that rule.
        # The methods take no `self`; presumably `O()` as a base makes them plain callables — TODO confirm.
        class the(O()):
            def load(*, i, keep=True):
                if not load:
                    return None
                read = self.file.io[name].read(i=i)
                x = load(read) if read is not None else None
                if keep and self.op[name].keep:
                    self.lists[name][i] = self.op[name].keep(x, i=i)
                return x

            def save(x, *, i, keep=True):
                if not save:
                    return
                saved_x = save(x)
                self.file.io[name].write(saved_x, i=i)
                if keep and self.op[name].keep:
                    # first try to keep the saved form; if that raises TypeError or yields
                    # nothing, fall back to keeping the client's original object
                    try:
                        self.lists[name][i] = self.op[name].keep(saved_x, i=i, saved=True)
                        assert self.lists[name][i] is not None
                    except (TypeError, AssertionError):
                        self.lists[name][i] = self.op[name].keep(x, i=i, client=True)

            def keep(x, *, i, saved=False, client=False):
                if x is None:
                    return None
                assert not (saved and client), "only one of `saved` and `client` can be specified"
                if not any([keep, keepSaved, keepClient]):
                    return None
                # pick the converter matching the origin of `x`
                kept = None
                if saved and keepSaved:
                    kept = keepSaved(x)
                elif client and keepClient:
                    kept = keepClient(x)
                elif not saved and not client and keep:
                    kept = keep(x)
                if kept is not None:
                    self.lists[name][i] = kept
                return kept

        # disabled operations are replaced by False so callers can test them with a plain `if`
        if not load:
            the.load = False
        if not save:
            the.save = False
        if not any([keep, keepSaved, keepClient]):
            the.keep = False

        self.op[name] = the
        if name not in self.lists:
            self.lists[name] = [None] * self.nextIndex
    
    
@staticmethod
def __IndexDataStore__from_specs(specs, **kwargs):
    '''Factory: build an `IndexDataStore` (plus its `IndexFileSystem`) from a specs object and load it.

    specs: object providing `dir`, `key`, `keyFunc`, optionally `keyFuncClient`, and `op`
        (item name -> per-item settings).
    kwargs: written into `specs` before anything else (note: this modifies the object passed in).
    '''
    io_fields = {'read', 'write', 'format'}
    op_fields = {'load', 'save', 'keep', 'keepSaved', 'keepClient'}

    for override in kwargs:
        specs[override] = kwargs[override]

    files = IndexFileSystem(specs.dir, specs.key)
    store = IndexDataStore(files, _factory_=True)

    def pick(item_spec, wanted):
        # keep only the settings meant for one of the two consumers
        return {field: value for field, value in dict.items(item_spec) if field in wanted}

    for name, item_spec in dict.items(specs.op):
        files.assignIO(name, **pick(item_spec, io_fields))
        store.assignOperations(name, **pick(item_spec, op_fields))

    # client-supplied keys may need their own normalisation; default to the internal one
    if 'keyFuncClient' in specs:
        client_key_func = specs.keyFuncClient
    else:
        client_key_func = specs.keyFunc

    def keyFunc(keyVal, *, client=False):
        if client:
            return client_key_func(keyVal)
        return specs.keyFunc(keyVal)

    store.assignKey(specs.key, keyFunc)
    store.load()
    return store
IndexDataStore.from_specs = __IndexDataStore__from_specs


def json_default(o):
    '''`default=` hook for `json.dumps`: convert NumPy values the encoder cannot serialise itself.

    Handles every NumPy integer, floating and boolean scalar type (previously only `np.int64`)
    and `np.ndarray` (converted with `tolist()`).

    Raises:
        TypeError: for any other object, as the `json` `default` protocol requires.
    '''
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(f'Object of type {type(o).__name__} is not JSON serializable')
    
# Decorators for per-item spec objects: each one fills in the `format`, `read` and `write`
# settings that `IndexFileSystem.assignIO` consumes, then returns the same object.
class __IndexDataStore__SpecsHelper(metaclass=staticclass):
    def json(op):
        '''Store the item as JSON text; values json cannot encode go through `json_default`.'''
        op.format = 'text'
        op.read = json.loads
        op.write = lambda x: json.dumps(x, default=json_default)
        return op
    def pickle(op):
        '''Store the item as pickle bytes.'''
        # NOTE(review): pickle.loads executes arbitrary code; only read files this project wrote itself
        op.format = 'bytes'
        op.read = pickle.loads
        op.write = pickle.dumps
        return op
IndexDataStore.SpecsHelper = __IndexDataStore__SpecsHelper
IDSSH = IndexDataStore.SpecsHelper  # short alias used by the spec classes below

## Model Manager

In [6]:
class ModelManager():
    '''Directory-backed registry of feature sets, sample splits and per-pair parameter stores.

    Layout under `directory`:
        .features/           IndexDataStore of feature sets            -> self.F
        .samples/            IndexDataStore of sample splits           -> self.S
        models/<fi>/<si>/    IndexDataStore for one (features, samples) pair -> self.pms[fi, si]
    '''
    features_dir = '.features'
    samples_dir = '.samples'
    models_dir = 'models'

    def __init__(self, directory, f_specs=None, s_specs=None, p_specs=None): #GLOBAL FDS SDS PDS
        '''
        directory: str or Path of the root folder (created if missing; parents are not created).
        f_specs / s_specs / p_specs: specs for the features, samples and parameter stores;
            each defaults to the module-level FDS / SDS / PDS.
        '''
        global FDS, SDS, PDS
        if f_specs is None:
            f_specs = FDS
        if s_specs is None:
            s_specs = SDS
        if p_specs is None:
            p_specs = PDS
        self.p_specs = p_specs
        self.dir = directory if isinstance(directory, Path) else Path(directory)
        self.dir.mkdir(exist_ok=True) # no parents because this is user facing and enabling parents could get gnarly
        self.F = IndexDataStore.from_specs(f_specs, dir=self.dir/self.features_dir)
        self.S = IndexDataStore.from_specs(s_specs, dir=self.dir/self.samples_dir)
        (self.dir/self.models_dir).mkdir(exist_ok=True)
        self.load()

    def load(self):
        '''(Re)build `self.pms` with one parameter store per (fi, si) folder found on disk.'''
        self.pms = {}
        for fi, si in self.iterIndices():
            self.make_params_manager(i=(fi, si))

    def iterIndices(self):
        '''Yield (fi, si) for every `models/<fi>/<si>` folder whose names are both integers.

        Entries with non-integer names are ignored; integer-named entries must be directories
        and must not exceed the indices known to `self.F` / `self.S`.
        '''
        for fdir in (self.dir/self.models_dir).iterdir():
            try:
                fi = int(fdir.name)
            except ValueError:
                continue
            assert fdir.is_dir(), "folder in models folder whose name is just a number must be a folder"
            assert fi < self.F.nextIndex, "features folder found with greater index than what has been labelled"
            for sdir in fdir.iterdir():
                try:
                    si = int(sdir.name)
                except ValueError:
                    continue
                assert sdir.is_dir(), "folder in models folder 1 layer down whose name is just a number must be a folder"
                assert si < self.S.nextIndex, "samples folder found with greater index than what has been labelled"
                yield (fi, si)

    def make_params_manager(self, features=None, samples=None, i=None):
        '''Create, register in `self.pms`, and return the parameter store of one pair.

        Give either both `features` and `samples` (resolved to indices through the stores)
        or `i=(fi, si)`, never both forms. Raises AssertionError otherwise.
        '''
        # FIX: this used to be `assert (condition, message)`, a non-empty tuple that is always
        # true, so the argument check never ran.
        assert (
            not ((features is None) ^ (samples is None)) and
            ((features is not None and samples is not None) ^ (i is not None))
        ), "invalid arguments to `make_param_manager`"
        if i is None:
            i = self.F.i(features), self.S.i(samples)
        fi, si = i
        assert 0 <= fi < self.F.nextIndex and 0 <= si < self.S.nextIndex, "bad arguments to `make_params_manager`"
        pm = IndexDataStore.from_specs(self.p_specs, dir=self.dir/self.models_dir/str(fi)/str(si))
        self.pms[fi, si] = pm
        return pm

    # FIX: `self` was missing, which made this method impossible to call on an instance.
    def i(self, features, samples): # purely convenience public interface
        '''Return the index pair (fi, si) of the given features and samples.'''
        return self.F.i(features), self.S.i(samples)

    def PM(self, features, samples, get_i=False):
        '''Parameter store of (features, samples); with `get_i`, return `(store, (fi, si))`.'''
        fi, si = self.F.i(features), self.S.i(samples)
        if get_i:
            return self.iPM(fi, si), (fi, si)
        else:
            return self.iPM(fi, si)

    def iPM(self, fi, si):
        '''Parameter store of the index pair (fi, si), created on first use.'''
        if (fi, si) not in self.pms:
            self.make_params_manager(i=(fi, si))
        return self.pms[fi, si]

## specs for managers

In [7]:
# Placeholder LightGBM dataset (two 88-row integer columns). It is only passed as `train_set`
# when Boosters are rebuilt from their saved model strings (see `PDS.op.Boosters.load`).
lgbNullDataset = lgb.Dataset(pd.DataFrame({'_a_': np.arange(88), '_b_': np.arange(88)}))

### Param search $\textbf{SAVING}$ specs

In [8]:
class PDS(O()):
    '''data transformation code inside `op` object:

    client --save(+write)--> disk; client --keep--> memory; disk --load(+read)--> memory

    read = text/bytes stream -> as-is object read from file
    write = object to save as is to file -> text/bytes stream
    load = as-is object read from file -> object to be loaded in memory
    save = raw object given by client -> object to save as-is to file
    keep = as-is object read from file -> object to keep in memory
    keepClient = raw object given by client -> object to keep in memory
    '''

    # the item that identifies an entry, and how a params dict becomes a hashable lookup key
    key = 'Params'
    keyFunc = lambda params: tuple(sorted(dict.items(params)))

    # one nested class per stored item; the IDSSH decorator adds its read / write / format settings
    class op(O()):
        @IDSSH.json
        class Params(O()):
            '''just a dict of the parameter value assignments'''
            keepClient = dict

        @IDSSH.json
        class Results(O()):
            '''should be a dict-like of various things, most importantly including "score"'''
            load = lambda x: O(**x)
            save = pydict
            keepClient = lambda x: O.mycopy(x) if isinstance(x, O) else O(**x)

        @IDSSH.pickle
        class Training(O()):
            '''tuple (aligning with samples training/cv split tuple) of LightGBM training eval DataFrames'''
            keep = False  # not kept in memory

        @IDSSH.pickle
        class Boosters(O()):
            '''the actual lgb.Booster model. well, a tuple of them, one for each cv set'''
            # stored as a tuple of model strings and rebuilt into Booster objects on load
            #read = lambda file: lgb.Booster(model_file=file)
            load = lambda x: tuple(lgb.Booster(train_set=lgbNullDataset).model_from_string(s, verbose=False) for s in x)
            #write = lambda x, file: x.save_model(file)
            save = lambda x: tuple(b.model_to_string() for b in x)
            keep = False  # not kept in memory

        @IDSSH.json
        class Tags(O()):
            '''just a list of #hashtags lol jk'''
            # a frozenset in memory, written to disk as a sorted list
            load = frzset
            save = sorted
            keep = load

### feature selection / days sampling $\textbf{SAVING}$ specs

In [9]:
# Specs for the feature-set store: entries are keyed by the frozenset of feature names.
class FDS(O()):
    key = 'Feats'
    keyFunc = lambda features: features # internally already converted to immutable
    keyFuncClient = frzset  # client values may be any iterable, so they are frozen first
    class op(O()):
        '''container of feature name strings'''
        @IDSSH.json
        class Feats(O()):
            # a frozenset in memory, written to disk as a sorted list
            load = frozenset
            save = sorted
            keepClient = frozenset
        @IDSSH.json
        class Ctor(O()):
            '''should be just a string, the query string of the FFF object'''
    
# Specs for the sample-split store: entries are keyed by a frozenset of
# (frozenset(train), frozenset(cv)) pairs.
class SDS(O()):
    key = 'Samps'
    keyFunc = lambda samples: samples # internally already converted immutable
    keyFuncClient = lambda samples: frzset((frzset(tr), frzset(cv)) for tr, cv in samples)
    class op(O()):
        @IDSSH.json
        class Samps(O()):
            '''tuple (num samples) of 2-tuples of tr/cv containers of the canonical index values for the split'''
            # nested frozensets in memory, written to disk as sorted nested lists
            load = lambda samples: frzset((frzset(tr), frzset(cv)) for tr, cv in samples)
            save = lambda samples: sorted([sorted(tr), sorted(cv)] for tr, cv in samples)
            keepClient = load
        @IDSSH.json
        class Ctor(O()):
            '''should be container = {tr, cv}, where tr and cv are containers of the "quarter" value in the resp. data set'''
            #'''should be O specs object'''
            load = lambda ctor: frzset(frzset(s) for s in ctor)
            save = lambda ctor: sorted(sorted(s) for s in ctor)
            keepClient = load

# ~!~!~!~!~!~! Model (features/samples/parameters) Search ~!~!~!~!~!~!~!

## parameter search logic

In [10]:
class ParamSearch():
    '''Hyper-parameter search generator: a grid over the `Discrete` assignments, each refined by a
    coordinate-wise range-narrowing search over the `OneByOne` parameters.

    Consumer protocol for `search()` (see `ModelSearch.walk`): iterate to receive a params dict,
    then call `send(score)` exactly once before asking for the next one; that `send` returns None.
    '''
    mix = staticmethod(logavg)  # how two range ends are combined into a new trial value

    def __init__(self, specs):
        '''specs: object with `Discrete` and `OneByOne` sections; a copy (`O.mycopy`) is stored.'''
        self.specs = O.mycopy(specs)
        self.setup_specs()

    def setup_specs(self):
        '''Expand the discrete grid and fill in defaults for the one-by-one parameters.'''
        dsc = self.specs.Discrete
        # keys may be tuples of names tuned together; flatten them and their value tuples alike
        dsc.keys = list(flatten( dsc.enc.keys() ))
        dsc.assigns = [list(flatten(x)) for x in product(* dsc.enc.values() )]
        obo = self.specs.OneByOne
        for k,v in dict.items(obo.data):
            for i,x in dict.items(obo.default):
                if i not in v:
                    v[i] = x
                elif isinstance(v[i], LambdaType):
                    # a plain Python function here is a modifier applied to the default value
                    v[i] = v[i](x)

    def search(self):
        '''Generator over full params dicts (deep-copied); expects each score to be sent back.'''
        dsc, obo = self.specs.Discrete, self.specs.OneByOne
        for assign in dsc.assigns:
            params = dict(zip(dsc.keys, assign))
            coroutine = self.one_by_one()
            for addon in coroutine:
                params.update(addon)
                # relay: hand params out, forward the score received to the inner generator,
                # then yield once more (None) so the consumer's `send` has something to return
                coroutine.send((yield copy.deepcopy(params))); assert (yield) is None


    def one_by_one(self):
        '''Inner generator: yields the one-by-one params dict to try and receives its score.

        Every `yield params` is followed by a bare `yield` that answers the caller's `send`.
        The same dict object is yielded each time and is modified in place between yields.
        '''
        obod = self.specs.OneByOne.data

        #! main algorithm ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        # initialize loop variables
        params = {k: (v.b if 'b' in v else v.a[1]) for k,v in dict.items(obod)}  # start at backup value, else at mid
        ranges = {k: v.a for k,v in dict.items(obod)}                            # [min, mid, max] per parameter
        scores = {k: [-np.inf, -np.inf] for k in params}                         # scores of the (lower, upper) trials
        isdone = {k: False for k in params}

        # pre loop one-off work
        base_score = yield params; assert (yield) is None

        # loop
        for i in range(999_999_999):
            #! try new parameter values ############################## part A of loop work

            # initialize local loop variable
            # NOTE(review): `new_scores` is never read; `scores` above carries over between iterations
            new_scores = {k: [-np.inf, -np.inf] for k in params}
            # two candidates per parameter: mix(min, mid) and mix(mid, max), each passed through `cast`
            new_params = {k: ( obod[k].cast(self.mix(v[0], v[1])),
                               obod[k].cast(self.mix(v[1], v[2])) ) for k,v in dict.items(ranges)}

            # finish condition check
            isdone = {k: v or i>=obod[k].lim for k,v in dict.items(isdone)}
            if all(isdone.values()):
                break

            # try new parameter values for all parameters
            for key in list(params):
                if i >= obod[key].lim:
                    continue
                # vary one parameter at a time, restoring it afterwards
                orig = params[key]
                params[key] = new_params[key][0]
                scores[key][0] = yield params; assert (yield) is None
                params[key] = new_params[key][1]
                scores[key][1] = yield params; assert (yield) is None
                params[key] = orig

            #! start setting up values for next loop ######################## part B of loop work

            # set params to the best found and see if it betters score, updating ranges also
            #CODE num_nochange = 0
            for key in list(params):
                if scores[key][0] > base_score and scores[key][0] >= scores[key][1]:
                    # lower candidate wins: the range becomes [min, candidate, mid]
                    params[key] = new_params[key][0]
                    ranges[key] = [ranges[key][0], params[key], ranges[key][1]]
                elif scores[key][1] > base_score and scores[key][1] >= scores[key][0]:
                    # upper candidate wins: the range becomes [mid, candidate, max]
                    params[key] = new_params[key][1]
                    ranges[key] = [ranges[key][1], params[key], ranges[key][2]]
                else:
                    # neither beats the base score: keep mid and shrink the range to the two candidates
                    ranges[key] = [new_params[key][0], ranges[key][1], new_params[key][1]]
                    #num_nochange += 1

            # send out new params
            #CODE if num_nochange < len(params):
            base_score = yield params; assert (yield) is None

## Features / Samples(train/cv split) search logic

#### the Kaggle metric

In [11]:
class KaggleMetric():
    '''Custom evaluation callable returning the triple ('kaggle', score, True).

    The score is mean / std of the per-time-group sums of `(2*pred - 1) * value`, plus the
    dataset's running counter `i`, which grows by `incr` on every call.
    '''
    def __init__(self, incr=0):
        self.incr = incr

    def attach(self, ms):
        '''Give every train / cv dataset of `ms` the attributes `__call__` reads:
        `timeFactor` (factorized `ms.Y.time`), `value` (`upDown1 * absVal1`) and `i` (set to 0).'''
        datasets, masks = ms._L, ms._s
        for tr_set, cv_set, tr_rows, cv_rows in zip(datasets.tr, datasets.cv, masks.tr, masks.cv):
            for dataset, rows in ((tr_set, tr_rows), (cv_set, cv_rows)):
                signed_value = ms.Y.upDown1 * ms.Y.absVal1
                dataset.timeFactor = ms.Y.time[rows].factorize()[0]
                dataset.value = signed_value[rows]
                dataset.i = 0

    def __call__(self, preds, valid_data):
        '''Evaluate `preds` against the data attached to `valid_data`.'''
        time_groups = valid_data.timeFactor
        row_values = valid_data.value

        # map predictions from [0, 1] onto [-1, 1] before weighting them by the signed values
        signed_preds = preds * 2 - 1
        per_row = signed_preds * row_values

        # `per_row` is a pandas Series here (because `value` is one), hence `.groupby`
        per_time = per_row.groupby(time_groups).sum()
        ratio = per_time.mean() / per_time.std()

        valid_data.i += self.incr
        return 'kaggle', ratio + valid_data.i, True

### Model searching logic

In [12]:
from sklearn.model_selection import GroupShuffleSplit, GroupKFold
        
class ModelSearch():
    '''Walks over (features, samples) pairs and, for each pair, runs a `ParamSearch`, training a
    LightGBM model per parameter set unless its results are already stored in the model manager.
    '''

    def __init__(self, specs, mm, *, X, Y, log=None, tag=None, tags=None):
        '''
        specs: search specs (see `LMS`): `model`, `metric`, `search`, `Features`, `Samples`, `Params`.
        mm: the `ModelManager` used for persistence.
        X, Y: feature frame and target / metadata frame; their indices must be identical.
        log: callable for progress messages; defaults to the global `LOG`.
        tag / tags: a single tag or an iterable of tags (not both) stored with every new result.
        '''
        self.specs = specs
        global LOG
        self.log = log if log is not None else LOG
        assert tag is None or tags is None
        if tags is None:
            # FIX: with neither argument given this used to create the tag set {None}
            tags = [] if tag is None else [tag]
        self.tags = frzset(tags)
        self.mm = mm
        assert (X.index == Y.index).all()
        self.X = X
        self.Y = Y

#     def setup_specs(self):
#         self.specs.Samples.data = []
#         for code in self.specs.Samples.enc:
#             if code.method == 'group.2':
#                 tr, cv = set(code.data[0]), set(code.data[1])
#                 tr, cv = self.Y[code.groups].isin(tr), self.Y[code.groups].isin(cv)
#                 tr, cv = self.Y.index[tr], self.Y.index[cv]
#                 self.specs.Samples.data += [((tr, cv), (cv, tr))]
#             elif code.method == 'GroupShuffleSplit.2':
#                 tr, cv = next(GroupShuffleSplit(**code.kwargs).split(self.X, self.Y, groups=self.Y[code.groups]))
#                 tr, cv = self.Y.index[tr], self.Y.index[cv]
#                 self.specs.Samples.data += [((tr, cv), (cv, tr))]
# #             elif code.method == 'GroupKFold':
# #                 self.specs.Samples.data.append(tuple(GroupKFold(**code.kwargs)
# #                                                      .split(self.X, self.Y, groups=self.Y[code.groups])))
#             else:
#                 assert False, f'sampling method "{code.method}" not implemented'

    def iter_feats_samps(self):
        '''Yield `((fi, feats), (si, samps))` for t = 0, 1, ... until either spec function returns
        None; each pair is saved to the model manager first.'''
        for t in range(99999999):
            featsObj = self.specs.Features.func(t)
            sampsObj = self.specs.Samples.func(t)
            if featsObj is None or sampsObj is None:
                break
            fi = self.mm.F.save(**featsObj)
            si = self.mm.S.save(**sampsObj)
            yield (fi, featsObj['Feats']), (si, sampsObj['Samps'])

    def walk(self):
        '''Generator yielding `((fi, si), results)` for every evaluated parameter set.

        Already stored results are reused instead of retraining. After the parameter search of a
        pair is exhausted, the best parameter set is saved again together with everything kept for
        it (Training and Boosters are only kept for freshly trained models).
        '''
        for (fi, feats), (si, samps) in self.iter_feats_samps():
            best_score = -np.inf
            self._best = None  # never carry the previous pair's best over into this store
            pm = self.mm.iPM(fi, si)
            self._pm = pm

            self.setup_training(feats, samps, i=(fi,si))
            search = ParamSearch(self.specs.search)
            loop = search.search()
            for params in loop: # params are deepcopied out, so can safely save them as-is!
                paramsUse = dict(**self.specs.Params.data, **params)
                pi = pm.i(params)
                results = pm.get('Results', i=pi)
                if results:
                    self._save = O(Results=results)
                    if results['score'] > best_score:
                        # FIX: best_score was never updated, so every result counted as "best"
                        best_score = results['score']
                        self._best = (dict(**params), O(**self._save)) #TODO I still copy params here to be safe (needed)?
                else:
                    self.log('training...')
                    self.train(paramsUse) # sets some state attributes in self: self._save
                    results = self._save.Results
                    if results['score'] > best_score:
                        best_score = results['score']
                        self._best = (dict(**params), O(**self._save)) #TODO I still copy params here to be safe (needed)?
                    # the bulky artifacts are only persisted for the best model (via self._best below)
                    del self._save.Training, self._save.Boosters
                    already_tags = pm.get('Tags', i=pi)
                    already_tags = already_tags if already_tags is not None else frozenset()
                    pm.save(i=pi, Tags=already_tags|self.tags, **self._save)
                # ParamSearch protocol: report the score before asking for the next params
                loop.send(results['score'])
                yield (fi, si), results
                #del self._save #TODO probably wanna uncomment this in production
            if self._best is not None:
                pm.save(Params=self._best[0], **self._best[1])

    def run(self):
        '''Exhaust `walk()`, discarding what it yields.'''
        for _ in self.walk():
            pass

    def setup_training(self, feats, samps, *, i=None):
        '''Build the LightGBM train / cv datasets for one (features, samples) pair.

        Sets `self._s` (boolean row masks `tr` / `cv`, one per split) and `self._L` (matching
        `lgb.Dataset` lists), then lets the metric attach its extra data if it has `attach`.
        '''
        SEE('='*90, 'setup_training')
        SEE(feats)
        SEE(samps)
#         samps = self.mm.S.get('Samps', i=i[1]) # this `samps` is a tuple of tuple of frozensets
        _X = self.X[feats]
        #_dummy = pd.Series(range(len(_X)), index=_X.index)
        _s = self._s = O()
        _s.tr, _s.cv = tuple(_X.index.isin(s[0]) for s in samps), tuple(_X.index.isin(s[1]) for s in samps)
        lgb_data_info = dict(
            feature_name = list(_X.columns),
            # integer-typed columns are treated as categorical
            categorical_feature = list(_X.dtypes[_X.dtypes.isin([np.int64,np.int32])].index),
            free_raw_data = False,
        )
        # FIX: targets and weights come from self.Y (index-aligned with X by the constructor)
        # instead of the notebook global `P`.
        Y = self.Y
        _L = self._L = O()
        _L.tr = [lgb.Dataset(_X[tr], Y[self.specs.model.target][tr], **lgb_data_info,
                            **({'weight': Y[self.specs.model.weight][tr]} if 'weight' in self.specs.model else {}))
                for tr in _s.tr]
        _L.cv = [lgb.Dataset(_X[cv], Y[self.specs.model.target][cv], reference=Ltr, **lgb_data_info,
                            **({'weight': Y[self.specs.model.weight][cv]} if 'weight' in self.specs.model else {}))
                for cv, Ltr in zip(_s.cv, _L.tr)]
        m = self.specs.metric
        if hasattr(m, 'attach'):
            m.attach(self)
        #TODO implement both logloss and Kaggle metric, and stop only when both don't improve in whatever num rounds

    def train(self, params):
        '''Train one booster per split with `params` and store the outcome in `self._save`
        (`Training` eval frames, `Boosters`, and the `Results` summary).

        `params` must contain 'early_stopping_round'; the summary reads the evaluation row that
        many rounds before the last one.
        '''
        def iter_samples():
            for Ltr, Lcv in zip(self._L.tr, self._L.cv):
                evals_result = {}
                # LightGBM removes 'num_iterations' from the dict it is given, so pass it a copy
                bst = lgb.train(dict(params), Ltr, valid_sets=[Ltr, Lcv], valid_names=['tr', 'cv'],
                          feval=self.specs.metric, evals_result=evals_result, verbose_eval=10)
                df_results = (pd.DataFrame(evals_result['tr']), pd.DataFrame(evals_result['cv']))
                yield bst, df_results
        bsts, dfs = zip(*iter_samples())

        class save(O()):
            Training = dfs
            Boosters = bsts
            class Results(O()):
                train = tuple(dft[0].iloc[-1-params['early_stopping_round'], 0] for dft in dfs)
                scores = tuple(dft[1].iloc[-1-params['early_stopping_round'], 0] for dft in dfs)
                score = sum(scores) / len(scores)
                nboost = tuple(len(dft[1])-params['early_stopping_round'] for dft in dfs)
        self._save = save

## specs for search logic

### params searching specs

In [13]:
# Parameter-search specs consumed by `ParamSearch`.
class LPS(O()):
    # grid part: every combination of the value lists in `enc` is tried
    class Discrete(O()):
        #TODO add hook for stopping short if Results['score'] is not good enough `stop` = <discrete index> -> <score thresh>
        stop = {3: .5, 7: .6}  # not read by ParamSearch yet (see the TODO above)
        #TODO add hook for changing OneByOne options; may have to break setup_specs into two methods
        # a tuple key names parameters that are assigned together from tuples of values
        enc = {
            'learning_rate': [.05],
            ('max_depth','num_leaves'):
                [(8,1<<8),(10,1<<10),(12,1<<12),
                 (6,1<<6),(-1,1<<10),(10,1<<8),(-1,1<<14),
                 (10,1<<6),(12,1<<10),(8,1<<6),(-1,1<<12),(12,1<<8)]
            #('max','num'): #10
            #    [(6,6),(9,7),(9,9),(12,8),(12,10),(12,12),(-1,8),(-1,10),(-1,12),(-1,14)],
        }

    # refinement part: parameters tuned one at a time by range narrowing
    class OneByOne(O()):
        # human-readable description of the fields used in `data` entries
        class info(O()):
            a = "main [a]rray data of 3 values, [min mid max]"
            b =  "[b]ack up value, i.e. default value if array doesn't give better results"
            cast = "function to apply to values before using to cast to the right dtype"
            lim = "maximum number of iterations of searching in this hyperparameter"
        # values used for fields an entry omits; a function given in an entry is applied to the default
        class default(O()):
            cast = keepSigFig(2)
            lim = 0
        data = { #TODO implement 1 sided search e.g. len(a)==2
            'min_data_in_leaf': O(a=[1,60,375], cast=round, lim=lambda lim: lim+1),
            'min_sum_hessian_in_leaf': O(a=[0,50,200], b=1e-3, lim=lambda lim: lim+1),
            'lambda_l1': O(a=[0,.02,.2], b=0),
            'lambda_l2': O(a=[0,.02,.2], b=0),
        }

### model search specs

load the data

In [15]:
#F,P = pd.read_pickle('/big/data/saves/train_8fixedsince.32.pkl')

done loading the data

In [16]:
from features import gen_features
from sklearn.model_selection import GroupShuffleSplit

# Boolean mask over the rows of `P`: True where quarter >= 2015.5. `LMS.Samples.func` only draws
# its train/cv splits from the rows where this is False (presumably `ho` = hold-out — confirm).
ho = P.quarter>=2015.5
ho.name = None


class LMS(O()):
    # column names inside the target / metadata frame
    model = O(
        time = 'time',
        value = 'y',
        target = 'target',
        weight = 'weight1',
    )

    metric = KaggleMetric()
    search = LPS


    class Features(metaclass=staticclass):
        '''features selection groups'''
        num_feats = 10
        num_samps = 1

        def func(i):
            '''Feature spec for step `i`, or None once `num_feats * num_samps` steps are used up.'''
            num_feats, num_samps = __class__.num_feats, __class__.num_samps
            if i >= num_feats * num_samps:
                return None
            f, q = gen_features(i // num_samps)
            return O(Feats=f, Ctor=q)


    class Samples(metaclass=staticclass):
        '''sample learning/cv split'''
        # O(method='GroupShuffleSplit.2', kwargs=dict(n_splits=<(5)many>, test_size=.5, random_state=44), groups='quarter'),
        def func(i):
            '''Sample spec for step `i`: the quarters of the non-`ho` rows are split into two halves
            with a generator seeded by `i`, giving the two splits (tr, cv) and (cv, tr).'''
            group = P.quarter[~ho]
            h = frozenset(group)
            # FIX: random.sample no longer accepts sets (TypeError on Python >= 3.11). Older
            # versions converted a set with tuple() internally, so this draws the same sample.
            g = frozenset(random.Random(i).sample(tuple(h), len(h)//2))
            tr, cv = group.isin(g), group.isin(h-g)
            tr, cv = [F.index[~ho][trcv] for trcv in [tr,cv]]
            return O(Samps=((tr, cv),(cv,tr)), Ctor=(g,h-g))


    class Params(O()):
        '''parameters constant settings'''
        data = dict(
            objective = 'binary',
            num_iterations = 100000,
            early_stopping_round = 50,
            metric = 'None',
            seed = 44,
            bagging_seed = 45,
            feature_fraction_seed = 46,
        )

# LIVE TESTING WALK ONLY

In [17]:
#SEE = lambda*a,**k: print('>>>', *a, **k)
def SEE(*a, **k):
    '''Silent debug hook: accepts any arguments, does nothing, returns None.

    Swap in the commented-out print variant above to trace calls.
    '''
    return None

In [18]:
# Model manager rooted at a scratch directory (hard-coded absolute path).
mm = ModelManager('/big/data/search/fake')
# Search driver built from the LMS specs; F is passed as X and P as Y.
ms = ModelSearch(specs=LMS, mm=mm, X=F, Y=P)
# `walk` is advanced manually with next() in the step cell below.
walk = ms.walk()

In [19]:
# Manual step counter; the step cell below increments it before each next(walk).
t = -1

In [20]:
%%time
t += 1
# Advance the search by one step; each step yields ((fi, si), res).
# (fi, si are presumably the feature-group and sample-split indices -- TODO confirm.)
(fi, si), res = next(walk)
print(t, (fi, si), res)
# Look up the stored 'Feats' and 'Ctor' entries for index fi through the manager.
feats, fenc = mm.F.get('Feats', i=fi), mm.F.get('Ctor', i=fi)
print(len(feats), '\n', fenc)

training...
Training until validation scores don't improve for 50 rounds.
[10]	tr's kaggle: 0.478446	cv's kaggle: 0.241584
[20]	tr's kaggle: 0.521434	cv's kaggle: 0.248347
[30]	tr's kaggle: 0.559698	cv's kaggle: 0.253076
[40]	tr's kaggle: 0.58351	cv's kaggle: 0.261061
[50]	tr's kaggle: 0.607135	cv's kaggle: 0.266912
[60]	tr's kaggle: 0.635107	cv's kaggle: 0.26899
[70]	tr's kaggle: 0.662768	cv's kaggle: 0.272346
[80]	tr's kaggle: 0.678271	cv's kaggle: 0.275649
[90]	tr's kaggle: 0.688869	cv's kaggle: 0.275518
[100]	tr's kaggle: 0.704552	cv's kaggle: 0.274296
[110]	tr's kaggle: 0.722254	cv's kaggle: 0.276545
[120]	tr's kaggle: 0.748133	cv's kaggle: 0.276875
[130]	tr's kaggle: 0.760739	cv's kaggle: 0.276409
[140]	tr's kaggle: 0.773453	cv's kaggle: 0.276916
[150]	tr's kaggle: 0.785183	cv's kaggle: 0.277278
[160]	tr's kaggle: 0.793402	cv's kaggle: 0.279065
[170]	tr's kaggle: 0.80646	cv's kaggle: 0.279397
[180]	tr's kaggle: 0.825104	cv's kaggle: 0.278993
[190]	tr's kaggle: 0.829454	cv's kaggl

# ================= ACTUAL PRODUCTION RUN ====================

# ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, Live Run Testing (still, just testing) ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,

### making test variables

In [104]:
from scipy.stats import norm
# Short alias for the standard normal percent-point function (inverse CDF).
ppf = norm.ppf
# Machine epsilon for Python floats; make_new_vars adds it to `x % .99` values so ppf() never receives exactly 0.
eps = np.finfo(float).eps

In [105]:
def make_new_vars(n=1000, seed=None):
    '''Build a small synthetic (F, P) pair for exercising the model search.

    Parameters
    ----------
    n : int, default 1000
        Number of rows to generate.
    seed : int or None, default None
        When given, all draws come from a private ``np.random.RandomState(seed)``
        so the result is repeatable. When None the global ``np.random`` state is
        used, exactly as before.

    Returns
    -------
    (F, P) : tuple of pandas.DataFrame
        F has columns hax, time, assetCodeId, quarter and the synthetic features
        alex, bob, carol, dean, edgar (all but alex may contain NaN).
        P has columns hax, time, assetCodeId, quarter, y and universe
        (1.0 where the first NaN-injected row is present, 0.0 where it was knocked out).

    Relies on the module-level names `ppf` and `eps`.
    '''
    rng = np.random if seed is None else np.random.RandomState(seed)
    ##################RANDOMIZE#######################
    # Redraw until no sample is exactly 0 (-np.log(u[0]) below would be infinite).
    # An explicit loop is used instead of assert/except so the check survives `python -O`.
    u = rng.uniform(size=(5, n))
    while not (u != 0).all():
        u = rng.uniform(size=(5, n))
    uu = u.copy()
    # Knock out about 1/252, 1/173, 1/81 and 1/27 of the entries of rows 0..3.
    for row, period in enumerate((252, 173, 81, 27)):
        uu[row][rng.randint(period, size=n) == 0] = np.nan
    #################ENDRANDOM######################
    F = pd.DataFrame({'hax': np.arange(n)})
    F['time'] = F.hax // 10
    F['assetCodeId'] = F.hax % 10
    F['quarter'] = F.hax // 100
    # P starts as a copy of the key columns, taken before the features are added to F.
    P = F.copy()
    # `% .99 + eps` keeps every ppf() argument strictly inside (0, 1).
    F['alex'] = ppf(u[0]*5%.99+eps)*ppf(u[1]*7%.99+eps)
    F['bob'] = ppf(.5+.49*uu[2]*np.sin(F.assetCodeId.values**10))*np.exp(uu[3])
    F['carol'] = -np.log(u[0]) + 4 * np.sin(1/uu[1])
    F['dean'] = u[1] + 2 * uu[2] - np.exp(-uu[3])
    F['edgar'] = ppf( (uu[2]**2-2*u[1]+(u[0]-u[3])**3-.5*u[1]**2) % .99 + eps )
    # The label uses the NaN-free draws only.
    P['y'] = (ppf((u[0]+u[1])/2) + ppf((u[2]+u[3])/2) + .2*ppf(u[4])) * 1e-1
    P['universe'] = (~np.isnan(uu[0])).astype(float)
    return F, P

In [106]:
#F, P = make_new_vars()
#Path('lighttest.pkl').write_bytes(pickle.dumps((F, P)))
# Reload the frozen (F, P) pair; the two commented lines above regenerate and save it.
# NOTE(review): pickle.loads can execute arbitrary code -- only load a trusted 'lighttest.pkl'.
F, P = pickle.loads(Path('lighttest.pkl').read_bytes())

In [107]:
# Derived label/weight columns, added to P in place.
# Binary target: True where the raw value y is positive.
P['target'] = P['y'].gt(0)
# Signed direction: +1 for True targets, -1 for False.
P['upDown'] = P['target'] * 2 - 1
# Direction zeroed out for rows outside the universe.
P['upDown1'] = P['upDown'] * P['universe'].astype(int)
# Magnitude of y, and the same magnitude masked by universe.
P['absVal'] = P['y'].abs()
P['absVal1'] = P['absVal'] * P['universe']
# Sample weight is the magnitude; weight1 is the universe-masked version.
P['weight'] = P['absVal']#.qtl()
P['weight1'] = P['weight'] * P['universe']

In [108]:
# Quick look at the first rows of the synthetic feature frame.
F.head()

Unnamed: 0,hax,time,assetCodeId,quarter,alex,bob,carol,dean,edgar
0,0,0,0,0,-0.012198,0.0,3.655924,1.3362,0.601672
1,1,0,1,0,-0.033782,1.139956,8.008799,1.244428,0.349115
2,2,0,2,0,0.034335,-0.333275,2.013295,1.549419,1.363992
3,3,0,3,0,-2.131567,-0.049046,5.325198,0.450859,0.940187
4,4,0,4,0,1.636082,0.267445,4.698875,0.670703,-0.449314


In [109]:
# Quick look at the first rows of the label frame, including the derived columns.
P.head()

Unnamed: 0,hax,time,assetCodeId,quarter,y,universe,target,upDown,upDown1,absVal,absVal1,weight,weight1
0,0,0,0,0,0.076738,1.0,True,1,1,0.076738,0.076738,0.076738,0.076738
1,1,0,1,0,0.04821,1.0,True,1,1,0.04821,0.04821,0.04821,0.04821
2,2,0,2,0,0.089397,1.0,True,1,1,0.089397,0.089397,0.089397,0.089397
3,3,0,3,0,-0.028319,1.0,False,-1,-1,0.028319,0.028319,0.028319,0.028319
4,4,0,4,0,-0.103184,1.0,False,-1,-1,0.103184,0.103184,0.103184,0.103184


In [110]:
class LPS(O()):
    # Hyper-parameter search space for the light test run; attached to the specs as `LMS.search`.
    # NOTE(review): how the search consumes these entries is defined outside this cell.
    class Discrete(O()):
        # Explicit candidate values per parameter. A tuple key groups parameters
        # whose candidates are given together as tuples.
        enc = {
            'learning_rate': [.05],
            # num_leaves is 2**max_depth for depths 3 and 6; depth -1 is paired with 512 leaves.
            ('max_depth','num_leaves'):
                [(3,1<<3),(6,1<<6),(-1,1<<9)]
        }
        
    class OneByOne(O()):
        # Parameters described by a 3-value array each; `info` documents the fields used in `data`.
        class info(O()):
            a = "main [a]rray data of 3 values, [min mid max]"
            b =  "[b]ack up value, i.e. default value if array doesn't give better results"
            cast = "function to apply to values before using to cast to the right dtype"
            lim = "maximum number of iterations of searching in this hyperparameter"
        # Presumably the fallbacks for entries in `data` that omit `cast` / `lim` -- TODO confirm.
        class default(O()):
            # keepSigFig(2) rounds a value to two significant figures.
            cast = keepSigFig(2)
            lim = 0
        data = {
            'min_data_in_leaf': O(a=[1,6,37], cast=round, lim=1),
            'min_sum_hessian_in_leaf': O(a=[0,5,20], b=0),
            'lambda_l1': O(a=[0,.02,.2], b=0),
            'lambda_l2': O(a=[0,.02,.2], b=0),
        }


class LMS(O()):
    # Model-search specification for the synthetic light test; handed to
    # `ModelSearch(specs=LMS, ...)`.
    # NOTE(review): how `O` / `ModelSearch` interpret each attribute is defined
    # outside this cell -- the comments here describe only what the code shows.

    # Names of the columns playing each role (presumably columns of `P` -- TODO confirm).
    model = O(
        time = 'time',
        value = 'y',
        target = 'target',
        weight = 'weight1',
    )

    metric = KaggleMetric()
    search = LPS


    class Features(metaclass=staticclass):
        '''features selection groups'''
        num_feats = 3
        num_samps = 3
        # One list of feature column names per feature group.
        data = [
            ['alex', 'bob', 'carol'],
            ['bob', 'dean', 'edgar'],
            ['alex', 'carol', 'edgar']
        ]
        def func(i):
            '''Return O(Feats=<column names>) for index `i`, or None once `i` reaches num_feats * num_samps.'''
            num_feats, num_samps = __class__.num_feats, __class__.num_samps
            if i >= num_feats * num_samps:
                return None
            # Consecutive blocks of `num_samps` indices share the same feature group.
            return O(Feats=__class__.data[i // num_samps])

    # Hold-out mask stored as a class attribute: True for quarters 8 and later.
    # NOTE(review): under normal Python scoping, functions nested in a class body
    # cannot see class-body names, so the `ho` used inside `Samples.func` below is
    # the module-level `ho` at call time, not this attribute -- confirm they agree.
    ho = P.quarter >= 8

    class Samples(metaclass=staticclass):
        '''sample learning/cv split'''
#         enc = [
#             O(method='group.2', groups='quarter', data=[
#                 [0, 1, 2, 3, 4],
#                 [5, 6, 7, 8, 9]
#             ]),
#         ]
        # O(method='GroupShuffleSplit.2', kwargs=dict(n_splits=<(5)many>, test_size=.5, random_state=44), groups='quarter'),
        def func(i):
            '''Split the non-held-out quarters into two halves, seeded by `i`.

            Returns O(Samps=((tr, cv), (cv, tr)), Ctor=(g, h-g)) where `g` is the
            randomly chosen half of the distinct quarters, `tr`/`cv` are the index
            labels of rows whose quarter falls in `g` / in the remainder, and
            `Samps` lists the split in both directions.
            '''
            group = P.quarter[~ho]
            h = frzset(group)
            # random.sample() requires a sequence: passing a set was deprecated in
            # Python 3.9 and raises TypeError from 3.11. Sorting also makes the draw
            # independent of set iteration order.
            # NOTE: on interpreters that still accepted a set, the groups chosen for
            # a given `i` may differ from what the unsorted call produced.
            g = frzset(random.Random(i).sample(sorted(h), len(h)//2))
            tr, cv = group.isin(g), group.isin(h-g)
            tr, cv = [F.index[~ho][trcv] for trcv in [tr,cv]]
            return O(Samps=((tr, cv),(cv,tr)), Ctor=(g,h-g))

    class Params(O()):
        '''parameters constant settings'''
        data = dict(
            objective = 'binary',
            num_iterations = 100000,
            early_stopping_round = 50,
            metric = 'None',
            seed = 44,
            bagging_seed = 45,
            feature_fraction_seed = 46,
        )

In [115]:
# Model manager rooted at the test scratch directory (hard-coded absolute path).
mm = ModelManager('/big/data/search/test')
# Search driver built from the test LMS specs; F is passed as X and P as Y.
ms = ModelSearch(specs=LMS, mm=mm, X=F, Y=P)
# `walk` is advanced manually with next() in the step cell below.
walk = ms.walk()

In [116]:
# Manual step counter; the step cell below increments it before each next(walk).
ii = -1

In [122]:
ii += 1
# next(walk) yields a tuple; it is prefixed with the manual step counter for display.
print((ii,) + next(walk))

training...
(5, 0, <>(train=(3.0641656542681162, 2.8859059922554424), scores=(1.884929332514137, 1.598141190320219), score=1.741535261417178, nboost=(104, 22)))


# -.-.-.-.-.-.-.-.-.-.-.-.-.- old scratch experiments / testing -.-.-.-.-.-.-.-.-.-.-.-.-.-.-

In [105]:
# PURELY TESTING
# Scratch experiment: which names are reachable from functions, lambdas and
# nested classes defined inside a class body.
class HI(O()):
    class DERP():
        val = -99
        dir = Path('.')
        # Plain function in the class body; it can be called further down this
        # same body because the name is already bound in DERP's namespace.
        def yo(x):
            def y(a):
                return x(a) + 1000
            return y
        my = O(what=yo(lambda a: a ** 2))
        # `__class__` inside these lambdas refers to DERP and is read when the
        # lambda is called, so later changes to DERP.val / DERP.dir are visible.
        hm = O(func=lambda x: x * __class__.val, goto=lambda x: __class__.dir/x)
        class WHAT(O()):
            dude = lambda x: x + 10
        class YES(O()):
            # NOTE(review): under normal Python scoping a nested class body cannot
            # see DERP's names, so `WHAT` here has to come from module globals
            # (otherwise NameError) -- confirm.
            dude = WHAT.dude
# Exercise the definitions above.
# Expected values assume `O` hands back the stored callables unchanged -- TODO confirm.
HI.DERP.my.what(5)       # 5 ** 2 + 1000
HI.DERP.hm.func(5)       # 5 * DERP.val
# Rebind the attribute, then check that the lambda picks up the new value.
HI.DERP.dir = Path('what')
HI.DERP.hm.goto('.git')
HI.DERP.WHAT.dude(0)     # 0 + 10

# ANOTHER PURE TESTING
# Scratch experiment: closures captured by functions defined in a class body
# that is itself nested inside a function.
def dothe(self):
    '''Set `self.val`, then attach a nested `Derp` class to `self` as `self.derp`.'''
    self.val = 4
    top = 100
    class Derp(O()):
        def func(x):
            # `self` is dothe's argument, captured by closure.
            return x + self.val
        def top(y):
            # `top` here is dothe's local (100), not this function: names bound in
            # a class body are not visible from inside functions defined in it.
            return y * top
    self.derp = Derp
# `Stop` is defined elsewhere; here it only needs to accept attribute assignment.
item = Stop()
dothe(item)
# Multiplies by dothe's local `top` (100) -- assuming `O` exposes `top` as a plain callable.
item.derp.top(3)

# Scratch experiment: referring to a sibling nested class by bare name from
# another nested class body.
class Yo(O()):
    class What(O()):
        derp = 5
    class Another(O()):
        # Raises NameError (see the cell output): a class body resolves names in
        # its own namespace, then globals/builtins. Yo's namespace, where `What`
        # lives, is not searched.
        yep = What.derp
# Not reached when the class statement above raises.
Yo()

NameError: name 'What' is not defined