In [1]:
from common import *
from pathlib import Path
import json, copy

In [2]:
I = lambda x: x
logavg = lambda x,y: math.expm1((math.log1p(x)+math.log1p(y))/2)
keepSigFig = lambda n: lambda x: round(x, -int(math.floor(math.log10(abs(x)))) + (n - 1)) if x else x

# #!#!#!#!#!#!#!#!#!#!#! Save System #!#!#!#!#!#!#!#!#!#!#!

### File System

In [3]:
class IndexFileSystem():
    def __init__(self, directory, key):
        self.dir = directory if isinstance(directory, Path) else Path(directory)
        self.dir.mkdir(exist_ok=True, parents=True)
        assert isinstance(key, str), "key must be string"
        self.key = key
        self.io = O()

    def iterIndices(self):
        for dot_params in self.dir.glob('*.' + self.key):
            yield int(dot_params.stem)

    def getFilePath(self, *, i):
        return self.dir / (str(i) + '.' + name)

    @staticmethod
    def readWrapper(read):
        '''wraps io read operations to safely return None if file does not exist'''
        @wraps(read)
        def read_safely(*a, **k):
            try:
                return read(*a, **k)
            except FileNotFoundError:
                return None
        return read_safely

    def assignIO(self, name, *, read, write, format='custom'):
        assert format in ['bytes', 'text', 'custom'], "argument `format` must be one of 'bytes' or 'text'"
        if format in ['bytes', 'text']:
            self.io[name] = O(
                read = readWrapper( lambda *,i: read(getattr(self.getFilePath(name, i=i), 'read_'+format)()) ),
                write = lambda x,*,i: getattr(self.getFilePath(name, i=i), 'write_'+format)(write(x))
            )
        elif format == 'custom':
            def readCustom(*, i):
                file = self.getFilePath(name, i=i)
                if not file.exists():
                    return None
                try:
                    return read(file)
                except Exception:
                    return read(str(file))
            def writeCustom(*, i):
                file = self.getFilePath(name, i=i)
                try:
                    write(x, file=file)
                except Exception:
                    write(x, file=str(file))
            self.io[name] = O(read=readCustom, write=writeCustom)

class IndexDataStore():    
    def __init__(self, file, _factory_=False):
        if not _factory_:
            assert False, "Cannot instantiate IndexDataStore normally. Please use factory static method."
        self.file = file
        self.op = O()
        self.lists = O()
        self.key = None
        self.keyFunc = lambda keyVal, *, client=False: object()
        self.tbl = {}
        self.nextIndex = 0
        
    def load(self):
        indices = sorted(self.file.iterIndices())
        n = self.nextIndex = max(indices) + 1
        for name, _ in dict.items(self.lists):
            self.lists[name] = [None] * self.nextIndex
        
        for i in indices:
            for name in self.lists:
                if self.op[name].load:
                    self.op[name].load(i)
                
        if self.key is not None:
            self.assignKey(self.key, self.keyFunc)
            
    def i(self, key, client=True, dry=False):
        keyVal = self.keyFunc(key, client=client)
        
        if keyVal not in self.tbl and not dry:
            for name in self.lists:
                assert len(self.lists[name]) == self.nextIndex, "Internal error"
                self.lists[name].append(None)
            self.tbl[keyVal] = self.nextIndex
            self.nextIndex += 1
        elif keyVal not in self.tbl and dry:
            return None
        
        return self.tbl[keyVal]
            
    def save(self, keep=True, **kwargs):
        assert ('i' in kwargs) ^ (self.key in kwargs), "call to `save` must include exactly one of i= or the key name ="
        if 'i' in kwargs:
            i = kwargs['i']
            del kwargs['i']
        else:
            i = self.i(kwargs[self.key])
        
        for name in kwargs:
            if name not in self.op:
                raise AssertionError(f"given save item '{name}' does not have data store ops initialized")
            if self.op[name].save:
                self.op[name].save(kwargs[name], i=i, keep=keep)
            
    def get(self, name, **kwargs):
        assert len(kwargs) == 1 and ('i' in kwargs or self.key in kwargs), "must give assigned 'key' or i"
        if 'i' in kwargs:
            i = kwargs['i']
            del kwargs['i']
        else:
            i = self.i(kwargs[self.key], dry=True)
        if i is None:
            return None
        return self.lists[name][i]
        
    ##################### INSTANCE BUILDING METHODS ##################### : 
    def assignKey(self, name, func):
        assert name is not None, "name to be used as key cannot be None"
        self.key = name
        self.keyFunc = func
        self.tbl = {self.keyFunc(x): i for i,x in enumerate(self.lists[self.key]) if x is not None}
        
    def assignOperations(self, name, *, load=I, save=I, keep=I, keepSave=None, keepClient=None):
        if keepClient = None:
            keepClient = keep
            
        class the(O()):
            def load(*, i, keep=True):
                x = load(self.file.io[name].read(i=i)) if load else None
                if self.op[name].keep:
                    self.lists[name][i] = self.op[name].keep(x)
                return x
            
            def save(x, *, i, keep=True):
                saved_x = save(x)
                self.file.io[name].write(saved_x) if save else None
                if self.op[name].keep:
                    try:
                        self.lists[name][i] = self.op[name].keep(saved_x, save=True)
                    except (TypeError, AssertionError):
                        self.lists[name][i] = self.op[name].keep(x, client=True)
                
            def keep(x, *, i, save=False, client=False):
                assert not (save and client), "only one of `save` and `client` can be specified"
                kept = None
                if save:
                    assert keepSave, "`keepSave` must have been specifically given to use the `save=True` flag in `keep`"
                    kept = keepSave(x)
                elif client:
                    kept = keepClient(x)
                else
                    kept = keep(x)
                if kept is not None:
                    self.lists[name][i] = kept
                return kept
            
        if not load:
            the.load = False
        if not save:
            the.save = False
        if not keep:
            the.keep = False
            
        self.op[name] = the
        if keep and name not in self.lists:
            self.lists[name] = [None] * self.nextIndex
    
    
@staticmethod
def __IndexDataStore__from_specs(specs, **kwargs):
    for k, v in kwargs.items():
        specs[k] = v
    f = IndexFileSystem(specs.dir, specs.key)
    d = IndexDataStore(f, _factory_=True)
    keyFuncClient = specs.keyFuncClient if 'keyFuncClient' in specs else specs.keyFunc
    keyFunc = lambda keyVal, *, client=False: specs.keyFuncClient(keyVal) if client else specs.keyFunc(keyVal)
    d.assignKey(specs.key, keyFunc)
    _readwriteformat = {'read', 'write', 'format'}
    _loadsavekeep = {'load', 'save', 'keep', 'keepSave', 'keepClient'}
    for name, val in dict.items(specs):
        f.assignIO(name, **{a: b for a,b in dict.items(val) if a in _readwriteformat})
        d.assignOperations(name, **{a: b for a,b in dict.items(val) if a in _loadsavekeep})
    d.load()
IndexDataStore.from_specs = __IndexDataStore__from_specs


class __IndexDataStore__SpecsHelper(metaclass=staticclass):
    def json(op):
        op.format = 'text'
        op.read = json.loads
        op.write = json.dumps
        return op
    def pickle(op):
        op.format = 'bytes'
        op.read = pickle.loads
        op.write = pickle.dumps
        return op
IndexDataStore.SpecsHelper = __IndexDataStore__SpecsHelper
IDSSH = IndexDataStore.SpecsHelper

SyntaxError: invalid syntax (<ipython-input-3-0839842a4234>, line 124)

### Model Manager

In [None]:
class ModelManager():
    features_dir = '.features'
    samples_dir = '.samples'
    models_dir = 'models'
    
    def __init__(self, directory):
        self.dir = directory if isinstance(directory, Path) else Path(directory)
        self.dir.mkdir(exist_ok=True)
        self.F = IndexDataStore.from_specs(FDS, dir=self.dir/self.features_dir)
        self.S = IndexDataStore.from_specs(SDS, dir=self.dir/self.samples_dir)
        (self.dir/self.models_dir).mkdir(exist_ok=True)
        self.load()
        
    def load(self):
        self.pms = {}
        for fi, si in self.iterIndices():
            self.make_params_manager(i=(fi, si))
        
    def iterIndices(self):
        for fdir in (self.dir/self.models_dir).iterdir():
            try:
                fi = int(fdir.name)
            except ValueError:
                continue
            assert fdir.is_dir(), "folder in models folder whose name is just a number must be a folder"
            assert fi < self.F.nextIndex, "features folder found with greater index than I have labelled"
            for sdir in fdir.iterdir():
                try:
                    si = int(sdir.name)
                except ValueError:
                    continue
                assert sdir.is_dir(), "folder in models folder 1 layer down whose name is just a number must be a folder"
                assert si < self.S.nextIndex, "samples folder found with greater index than I have labelled"
                yield (fi, si)
        
    def make_params_manager(self, features=None, samples=None, i=None):
        assert features is not None and samples is not None or i is not None, "invalid arguments to `make_param_manager`"
        if i is None:
            fi, si = self.F.i(features), self.S.i(samples)
        fi, si = i if i is not None else (fi, si)
        pm = IndexDataStore.from_specs(PDS, dir=self.dir/self.models_dir/str(fi)/str(si))
        self.pms[fi, si] = pm
        return pm
    
    def i(features, samples): # purely convenience public interface
        return self.F.i(features), self.S.i(samples)
        
    def PM(self, features, samples):
        fi, si = self.F.i(features), self.S.i(samples)
        if (fi, si) not in self.pms:
            self.make_params_manager(i=(fi, si))
        return self.pms[fi, si]
        
    def iPM(self, fi, si):
        if (fi, si) not in self.pms:
            self.make_params_manager(i=(fi, si))
        return self.pms[fi, si]

### SPECS for managers

In [None]:
lgbNullDataset = lgb.Dataset(pd.DataFrame({'_a_': np.arange(100), '_b_': np.arange(100)}))

In [None]:
class PDS(O()):
    '''data transformation code inside `op` object:

    client --save(+write)--> disk; client --keep--> memory; disk --load(+read)--> memory

    read = text/bytes stream -> as-is object read from file
    write = object to save as is to file -> text/bytes stream
    load = as-is object read from file -> object to be loaded in memory
    save = raw object given by client -> object to save as-is to file
    keep = as-is object read from file -> object to keep in memory
    keepClient = raw object given by client -> object to keep in memory
    '''
    

    key = 'Params'
    keyFunc = lambda params: tuple(sorted(dict.items(params)))

    class op(O()):
        @IDSSH.json
        class Params(O()):
            '''just a dict of the parameter value assignments'''
            keepClient = dict
            
        @IDSSH.pickle
        class Results(O()):
            '''should be a dict-like of various things, most importantly including "score"'''
            pass
            
        @IDSSH.pickle
        class Training(O()):
            '''tuple (aligning with samples training/cv split tuple) of LightGBM training eval DataFrames'''
            keep = False
            
        @IDSSH.pickle
        class Booster(O()):
            '''the actual lgb.Booster model. well, a tuple of them, one for each cv set'''
            #read = lambda file: lgb.Booster(model_file=file)
            read = lambda x: tuple(lgb.Booster(train_set=lgbNullDataset).model_from_string(s, verbose=False) for s in x)
            #write = lambda x, file: x.save_model(file)
            write = lambda x: tuple(b.model_to_string() for b in x)
            keep = False

In [None]:
class FDS(O()):
    key = 'Feats'
    keyFunc = lambda features: features
    keyFuncClient = lambda features: frozenset(features)
    class op(O()):
        @IDSSH.json
        class Feats(O()):
            load = frozenset
            save = sorted
            keep = load
    
class SDS(O()):
    key = 'Samps'
    keyFunc = lambda samples: frozenset(frozenset(a) for a in samples)
    class op(O()):
        @IDSSH.json
        class Samps(O()):
            load = tuple
            save = lambda samples: [sorted(a) for a in samples]
            keepSave = tuple
            keepClient = lambda samples: tuple(sorted(a) for a in samples)
        @IDSSH.json
        class Ctor(O()):
            keepClient = lambda code: copy.deepcopy(dict(**code))

# ~!~!~!~!~!~! Model (features/samples/parameters) Searching ~!~!~!~!~!~!~!

### parameter search logic

In [4]:
class ParamSearch():
    mix = staticmethod(logavg)
    
    def __init__(self, specs):
        self.specs = O.mycopy(specs)
        self.setup_specs()
        
    def setup_specs(self):
        dsc = self.specs.Discrete
        dsc.keys = list(flatten( dsc.enc.keys() ))
        dsc.assigns = [list(flatten(x)) for x in product(* dsc.enc.values() )]
        obo = self.specs.OneByOne
        for k,v in dict.items(obo.data):
            for i,x in dict.items(obo.default):
                if i not in v:
                    v[i] = x
    
    def search(self):
        dsc, obo = self.specs.Discrete, self.specs.OneByOne
        for assign in dsc.assigns:
            params = dict(zip(dsc.keys, assign))
            coroutine = self.one_by_one()
            for addon in coroutine:
                params.update(addon)
                coroutine.send((yield copy.deepcopy(params))); assert (yield) == None
        
        
    def one_by_one(self):
        obod = self.specs.OneByOne.data
        
        #! main algorithm ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        
        # initialize loop variables
        params = {k: (v.b if 'b' in v else v.a[1]) for k,v in dict.items(obod)}
        ranges = {k: v.a for k,v in dict.items(obod)}
        scores = {k: [-np.inf, -np.inf] for k in params}
        isdone = {k: False for k in params}
        
        # pre loop one-off work
        base_score = yield params; assert (yield) == None
        
        # loop
        for i in range(9999999999):
            #! try new parameter values ############################## part A of loop work
            
            # initialize local loop variable
            new_scores = {k: [-np.inf, -np.inf] for k in params}
            new_params = {k: ( obod[k].cast(self.mix(v[0], v[1])),
                               obod[k].cast(self.mix(v[1], v[2])) ) for k,v in dict.items(ranges)}
            
            # finish condition check
            isdone = {k: v or i>=obod[k].lim for k,v in dict.items(isdone)}
            if all(isdone.values()):
                break
                
            # try new parameter values for all parameters
            for key in list(params):
                if i >= obod[key].lim:
                    continue
                orig = params[key]
                params[key] = new_params[key][0]
                scores[key][0] = yield params; assert (yield) == None
                params[key] = new_params[key][1]
                scores[key][1] = yield params; assert (yield) == None
                params[key] = orig
            
            #! start setting up values for next loop ######################## part B of loop work
            
            # set params to the best found and see if it betters score, updating ranges also
            #num_nochange = 0
            for key in list(params):
                if scores[key][0] > base_score and scores[key][0] >= scores[key][1]:
                    params[key] = new_params[key][0]
                    ranges[key] = [ranges[key][0], params[key], ranges[key][1]]
                elif scores[key][1] > base_score and scores[key][1] >= scores[key][0]:
                    params[key] = new_params[key][1]
                    ranges[key] = [ranges[key][1], params[key], ranges[key][2]]
                else:
                    ranges[key] = [new_params[key][0], ranges[key][1], new_params[key][1]]
                    #num_nochange += 1
                
            # send out new params
            #if num_nochange < len(params):
            base_score = yield params; assert (yield) == None

### specs for Search logic

In [5]:
class LPS(O()):
    class Discrete(O()):
        enc = {
            ('max_depth','num_leaves'): #10
                [(6,1<<6),(9,1<<7),(9,1<<9),(12,1<<8),(12,1<<10),(12,1<<12),(-1,1<<8),(-1,1<<10),(-1,1<<12),(-1,1<<14)],
            #('max','num'): #10
            #    [(6,6),(9,7),(9,9),(12,8),(12,10),(12,12),(-1,8),(-1,10),(-1,12),(-1,14)],
        }
        
    class OneByOne(O()):
        class info(O()):
            a = "main [a]rray data of 3 values, [min mid max]"
            b =  "[b]ack up value, i.e. default value if array doesn't give better results"
            cast = "function to apply to values before using to cast to the right dtype"
            lim = "maximum number of iterations of searching in this hyperparameter"
        class default(O()):
            cast = keepSigFig(2)
            lim = 2
        data = {
            'min_data_in_leaf': O(a=[1,60,375], cast=round),
            'min_sum_hessian_in_leaf': O(a=[0,50,200]),
            'lambda_l1': O(a=[0,.02,.2], b=0, lim=1),
            'lambda_l2': O(a=[0,.02,.2], b=0, lim=1),
        }

### Features / Samples(train/cv split) search

In [None]:
def KaggleMetric():
    def __init__(self, incr=0):
        self.incr = incr
    
    def attach(self, ms):
        L, s = ms._L, ms._s
        for Ltr, Lcv, tr, cv in zip(L.tr, L.cv, s.tr, s.cv):
            Ltr.timeFactor = ms.Y.time[tr].factorize()[0]
            Lcv.timeFactor = ms.Y.time[cv].factorize()[0]
            Ltr.value = (ms.Y.upDown1*ms.Y.absVal1)[tr]
            Lcv.value = (ms.Y.upDown1*ms.Y.absVal1)[cv]
            Ltr.i = 0
            Lcv.i = 0
    
    def __call__(self, preds, valid_data):
        df_time = valid_data.timeFactor
        #labels = valid_data.get_label()
        values = valid_data.value
        #assert len(labels) == len(df_time)

        preds = preds*2-1
        #labels = labels*2-1
        x_t = preds * values

        # Here we take advantage of the fact that `labels` (used to calculate `x_t`)
        # is a pd.Series and call `group_by`
        x_t_sum = x_t.groupby(df_time).sum()
        score = x_t_sum.mean() / x_t_sum.std()

        valid_data.i += self.incr
        return 'kaggle', score+valid_data.i, True

In [5]:
class LMS(O()):
    model = O(
        time = 'time',
        value = 'y',
        target = 'target',
        weight = 'weight',
    )
    
    metrics = [KaggleMetric()]
    search = LPS
    
    class Features(O()):
        '''features selection groups'''
        data = [
            ['f1','f2','f3'],
            ['f4','f5','f6'],
        ]
    
    class Samples(O()):
        '''sample learning/cv split'''
        enc = [
            O(method='GroupShuffleSplit.2', kwargs=dict(n_splits=1, test_size=.5, random_state=44), groups='quarter'),
        ]
        
    class Params(O()):
        '''parameters constant settings'''
        data = dict(
            objective = 'binary',
            num_iterations = 10000,
            early_stopping_round = 50,
            learning_rate = .05,
            seed = 44,
            bagging_seed = 45,
            feature_fraction_seed = 46,
        )
        

from sklearn.model_selection import GroupShuffleSplit, GroupKFold
        
class ModelSearch():
    
    def __init__(self, specs, mm, *, X, Y):
        self.specs = specs
        self.mm = mm
        self.setup_specs()
        self.X = X
        self.Y = Y
        
    def setup_specs(self):
        self.specs.Samples.data = []
        for code in self.specs.Samples.enc:
            if code.method == 'GroupShuffleSplit.2':
                tr, cv = next(GroupShuffleSplit(**code.kwargs).split(self.X, self.Y, groups=Y[code.groups]))
                self.specs.Samples.data += [((tr, cv), (cv, tr))]
            elif code.method == 'GroupKFold':
                self.specs.Samples.data.append(list(GroupKFold(**code.kwargs).split(self.X, self.Y, groups=Y[code.groups])))
            else:
                assert False, f'sampling method "{code.method}" not implemented'
        
    def run(self):
        for feats, (samps, sampsEnc) in product(self.specs.Features.data,
                                                zip(self.specs.Samples.data, self.specs.Samples.enc)):
            self.setup_training(feats, samps)
            pm = self.mm.PM(feats, samps)
            self.mm.S.save(Samps=samps, Ctor=sampsEnc)
            search = ParamSearch(self.specs.search)
            loop = search.search()
            for params in loop:
                params = dict(**self.specs.Params.data, **params)
                self.train(params) # sets some state attributes in self: self._save
                pm.save(**self._save)
                loop.send(self._save.Results['score'])
                del self._save
                
    def setup_training(self, feats, samps):
        self._feats, self._samps = feats, samps
        _X = self.X[feats]
        _dummy = pd.Series(range(len(_X)),index=_X.index)
        _s = O()
        _s.tr, _s.cv = tuple(_dummy.isin(s[0]) for s in samps), tuple(_dummy.isin(s[1]) for s in samps)
        lgb_data_info = dict(
            feature_name = list(_X.columns),
            categorical_feature = list(_X.dtypes[_X.dtypes.isin([np.int64,np.int32])].index),
            free_raw_data = False,
        )
        _L = self._L = O()
        _L.tr = [lgb.Dataset(_X[tr], P[self.specs.model.target][tr], **lgb_data_info,
                            **({'weight': self.specs.model.weight} if 'weight' in self.specs.model else {}))
                for tr in _s.tr]
        _L.cv = [lgb.Dataset(_X[cv], P[self.specs.model.target][cv], reference=Ltr, **lgb_data_info,
                            **({'weight': self.specs.model.weight} if 'weight' in self.specs.model else {}))
                for cv, Ltr in zip(_s.cv, _L.tr)]
        for m in self.specs.metrics:
            if hasattr(m, 'attach'):
                m.attach(self)
        #TODO implement both logloss and Kaggle metric, and stop only when both don't improve in whatever num rounds
        
                
    def train(self, params):
        def iter_samples():
            evals_result = {}
            bst = lgb.train(params, Ltr, valid_sets=[Ltr,Lcv], valid_names=['tr','cv'],
                      feval=lgb_kaggle_metric, evals_result=evals_result, verbose_eval=False) #TODO lgb_kaggle_metric
            df_results = (pd.DataFrame(evals_result['tr']), pd.DataFrame(evals_results['cv']))
            yield bst, df_results
        bsts, dfs = list(zip(**iter_samples()))
        #TODO make self._save

# Live Run Testing

# -.-.-.-.-.-.-.-.-.-.-.-.-.- old shit / testing -.-.-.-.-.-.-.-.-.-.-.-.-.-.-

In [105]:
# PUREL TESTING
class HI(O()):
    class DERP():
        val = -99
        dir = Path('.')
        def yo(x):
            def y(a):
                return x(a) + 1000
            return y
        my = O(what=yo(lambda a: a ** 2))
        hm = O(func=lambda x: x * __class__.val, goto=lambda x: __class__.dir/x)
        class WHAT(O()):
            dude = lambda x: x + 10
        class YES(O()):
            dude = WHAT.dude
HI.DERP.my.what(5)
HI.DERP.hm.func(5)
HI.DERP.dir = Path('what')
HI.DERP.hm.goto('.git')
HI.DERP.WHAT.dude(0)

# ANOTHER PURE TESTING
def dothe(self):
    self.val = 4
    top = 100
    class Derp(O()):
        def func(x):
            return x + self.val
        def top(y):
            return y * top
    self.derp = Derp
item = Stop()
dothe(item)
item.derp.top(3)

class Yo(O()):
    class What(O()):
        derp = 5
    class Another(O()):
        yep = What.derp
Yo()

NameError: name 'What' is not defined