In [1]:
from LiveAMP import *
import miceforest as mf
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer, KBinsDiscretizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import f1_score
from sklearn import set_config
set_config(transform_output="pandas")

def feature_importance_df(self, dataset=0, iteration=None, normalize=True):
    """Return the kernel's feature importances as a labelled DataFrame.

    Rows are the predictor variables and columns are the imputed (target)
    variables; both are named through ``_get_var_name_from_scalar`` after
    their integer indices have been sorted.

    Args:
        dataset: forwarded to ``get_feature_importance``.
        iteration: forwarded to ``get_feature_importance``.
        normalize: when true, rescale every column so that it sums to 100.
    """
    def names(indices):
        # Sorted integer indices -> variable names.
        return [self._get_var_name_from_scalar(int(idx)) for idx in np.sort(indices)]

    targets = names(self.imputation_order)
    predictors = names(self.predictor_vars)
    raw = self.get_feature_importance(dataset, iteration)
    importance = pd.DataFrame(raw, index=targets, columns=predictors).T
    if not normalize:
        return importance
    return importance / importance.sum() * 100
# Monkey-patch: attach the helper as a method so it can be called on any
# miceforest ImputationKernel instance.
mf.ImputationKernel.feature_importance_df = feature_importance_df

def inspect(self, **kwargs):
    """Show the kernel's diagnostic plots and its feature-importance table.

    Draws the imputed-distribution plot and the mean-convergence plot (each
    followed by ``plt.show()``), then builds the feature-importance frame with
    ``feature_importance_df(**kwargs)``, displays it and returns it.
    """
    plots = (
        ('plot_imputed_distributions', {'wspace': 0.3, 'hspace': 0.3}),
        ('plot_mean_convergence', {'wspace': 0.3, 'hspace': 0.4}),
    )
    for method_name, spacing in plots:
        getattr(self, method_name)(**spacing)
        plt.show()
    importance = self.feature_importance_df(**kwargs)
    importance.disp(100)
    return importance


# Monkey-patch: make ``inspect`` available on every ImputationKernel instance.
mf.ImputationKernel.inspect = inspect


@dataclasses.dataclass
class AMP(MyBaseClass):
    """Configuration plus cached intermediate results for one prediction run.

    The fields below are the constructor arguments; ``__post_init__`` fills in
    defaults, merges a previously dumped instance and expands the grids into
    ``params_list``.
    """
    # Cycle day of the run: part of the result pickle path and passed on to TERM.
    cycle_day: int
    # Term codes to load; ``infer_term`` is appended in ``__post_init__``.
    term_codes: typing.List
    # Term excluded when multipliers are computed and reported separately in ``analyze``.
    infer_term: int
    # Course codes to model; '_total' is always prepended in ``__post_init__``.
    crse: typing.List
    # Column names that become the index of ``X`` in ``preprocess``.
    attr: typing.List
    # column -> argument(s) handed to ``X.impute`` in ``preprocess``.
    fill: typing.Dict = None
    # column -> candidate transformer(s); expanded into ``trf_list``.
    trf_grid: typing.Dict = None
    # imputation setting -> candidate value(s); expanded into ``imp_list``.
    imp_grid: typing.Dict = None
    # stage name -> bool; a true flag discards the cached result of that stage.
    overwrite: typing.Dict = None
    # stage name -> bool; forwarded to TERM (its effect is not visible in this file).
    show: typing.Dict = None
    # When true, ``predict`` calls ``imp.inspect()`` after fitting.
    inspect: bool = False

    def dump(self):
        """Write this instance to ``self.rslt``, overwriting any existing file.

        Returns whatever ``write`` returns.
        """
        destination = self.rslt
        return write(destination, self, overwrite=True)

    def __post_init__(self):
        """Finish construction: resolve flags, merge cached results, expand the grids.

        In order, this
          1. points ``self.rslt`` at the result pickle for this ``cycle_day``;
          2. fills ``overwrite`` and ``show`` with ``False`` for every stage the
             caller did not mention;
          3. cascades ``overwrite`` so that recomputing a stage also recomputes
             every stage derived from it;
          4. merges the attributes of a previously dumped instance, with values
             already set on this instance taking precedence;
          5. deletes every attribute flagged for overwrite and creates empty
             dicts for missing containers;
          6. normalises ``term_codes`` / ``crse`` and builds ``trf_list``,
             ``imp_list`` and ``params_list``.

        Item access (``self[x]``, ``k in self``, ``del self[k]``) is presumably
        provided by ``MyBaseClass``.

        Returns:
            ``self``. The dataclass-generated ``__init__`` ignores the value; it
            is kept for callers that invoke this method directly.
        """
        self.rslt = root_path / f"resources/rslt/{rjust(self.cycle_day,3,0)}/rslt.pkl"
        D = {'trm':False, 'adm':False, 'reg':False, 'flg':False, 'raw':False, 'term':False, 'raw_df':False, 'reg_df':False, 'X':False, 'Y':False, 'pred':False}
        for x in ['overwrite','show']:
            self[x] = D.copy() if self[x] is None else D.copy() | self[x]
        # Propagate overwrite flags downstream along the dependency chain.
        self.overwrite['raw'] |= self.overwrite['reg'] | self.overwrite['adm'] | self.overwrite['flg']
        self.overwrite['term'] |= self.overwrite['raw']
        self.overwrite['raw_df'] |= self.overwrite['term']
        self.overwrite['reg_df'] |= self.overwrite['term']
        self.overwrite['X'] |= self.overwrite['raw_df']
        self.overwrite['Y'] |= self.overwrite['reg_df'] | self.overwrite['X']
        self.overwrite['pred'] |= self.overwrite['Y']

        # Best-effort cache load: a missing or unreadable pickle simply means
        # starting from scratch. ``Exception`` (rather than a bare ``except``)
        # lets KeyboardInterrupt / SystemExit propagate.
        try:
            self.__dict__ = read(self.rslt).__dict__ | self.__dict__
        except Exception:
            pass
        # Discard cached stages that were flagged for recomputation.
        for k, v in self.overwrite.items():
            if v and k in self:
                del self[k]
        for k in ['fill','term','pred','trf_grid','imp_grid']:
            if k not in self:
                self[k] = dict()

        self.term_codes = uniquify([*listify(self.term_codes), self.infer_term])
        self.crse = uniquify(['_total', *listify(self.crse)])
        self.mlt_grp = ['crse','levl_code','styp_code','term_code']
        # Every combination of per-column transformer choices ...
        self.trf_list = cartesian({k: sorted(setify(v), key=str) for k,v in self.trf_grid.items()})
        # ... minus the columns whose choice is 'drop', None or ''.
        self.trf_list = [mysort({k:v for k,v in t.items() if v not in ['drop',None,'']}) for t in self.trf_list]
        imp_default = {'iterations':3, 'mmc':0, 'mmf':'mean_match_default', 'datasets':5, 'tune':True}
        self.imp_list = cartesian(self.imp_grid)
        # Grid values override the defaults above.
        self.imp_list = [mysort(imp_default | v) for v in self.imp_list]
        self.params_list = [mysort({'imp':imp, 'trf':trf}) for trf, imp in it.product(self.trf_list,self.imp_list)]
        return self

    def get_terms(self):
        """Load raw data for every term in ``self.term_codes`` not yet in ``self.term``.

        Each missing term is built as ``TERM(term_code=..., cycle_day, overwrite, show)``
        and the result of its ``get_raw()`` is stored under the term code.
        """
        shared = {name: self[name] for name in ['cycle_day','overwrite','show']}
        for code in self.term_codes:
            if code in self.term:
                continue
            print(f'get {code}')
            self.term[code] = TERM(term_code=code, **shared).get_raw()


    def preprocess(self):
        """Build whichever of ``raw_df``, ``reg_df``, ``X`` and ``Y`` (with ``mlt``) are missing, then dump.

        Each stage is skipped when the attribute already exists on the instance
        (for example because it was restored from the cached pickle).

        Returns:
            The value returned by ``self.dump()``.

        Raises:
            AssertionError: if any of the sanity checks on ``X`` fails for at least one row.
        """
        def get(nm):
            """Return True (after printing a notice) when attribute ``nm`` still has to be built."""
            if nm in self:
                return False
            print(f'get {nm}')
            return True

        # Term data is only needed when one of the concatenated frames is missing.
        # NOTE: ``get`` is called again below, so the notice can print twice.
        if get('raw_df') or get('reg_df'):
            self.get_terms()

        if get('raw_df'):
            with warnings.catch_warnings(action='ignore'):
                # Stack the raw frames of all terms and drop columns that are entirely empty.
                self.raw_df = pd.concat([term.raw for term in self.term.values()], ignore_index=True).dropna(axis=1, how='all').prep()

        if get('reg_df'):
            with warnings.catch_warnings(action='ignore'):
                # One registration frame per snapshot ('cur' and 'end'), restricted to the requested courses.
                self.reg_df = {k: pd.concat([term.reg[k].query("crse in @self.crse") for term in self.term.values()]).prep() for k in ['cur','end']}

        # Row filter: levl_code 'ug' and styp_code in n / r / t; returns a copy.
        where = lambda x: x.query("levl_code == 'ug' and styp_code in ('n','r','t')").copy()
        if get('X'):
            R = self.raw_df.copy()
            repl = {'ae':0, 'n1':1, 'n2':2, 'n3':3, 'n4':4, 'r1':1, 'r2':2, 'r3':3, 'r4':4}
            # Bin hs_pctl into left-closed intervals labelled 4..0 (>= 90 -> 0); where that is
            # missing, fall back to the value mapped from apdc_code.
            R['hs_qrtl'] = pd.cut(R['hs_pctl'], bins=[-1,25,50,75,90,101], labels=[4,3,2,1,0], right=False).combine_first(R['apdc_code'].map(repl))
            R['remote'] = R['camp_code'] != 's'
            R['resd'] = R['resd_code'] == 'r'
            R['lgcy'] = ~R['lgcy_code'].isin(['n','o'])
            # Recode major / college codes and descriptions.
            R['majr_code'] = R['majr_code'].replace({'0000':'und', 'eled':'eted', 'agri':'unda'})
            R['coll_code'] = R['coll_code'].replace({'ae':'an', 'eh':'ed', 'hs':'hl', 'st':'sm', '00':pd.NA})
            R['coll_desc'] = R['coll_desc'].replace({
                'ag & environmental_sciences':'ag & natural_resources',
                'education & human development':'education',
                'health science & human_service':'health sciences',
                'science & technology':'science & mathematics'})
            majr = ['majr_desc','dept_code','dept_desc','coll_code','coll_desc']
            # For each majr_code keep the descriptors from its latest cycle_date row and apply
            # them to every row with that major.
            S = R.sort_values('cycle_date').drop_duplicates(subset='majr_code', keep='last')[['majr_code',*majr]]
            X = where(R.drop(columns=majr).merge(S, on='majr_code', how='left')).prep().prep_bool()

            # Sanity checks: each expression must evaluate true on every row of X.
            checks = [
                'cycle_day >= 0',
                'apdc_day >= cycle_day',
                'appl_day >= apdc_day',
                'birth_day >= appl_day',
                'birth_day >= 5000',
                'distance >= 0',
                'hs_pctl >=0',
                'hs_pctl <= 100',
                'hs_qrtl >= 0',
                'hs_qrtl <= 4',
                'act_equiv >= 1',
                'act_equiv <= 36',
                'gap_score >= 0',
                'gap_score <= 100',
            ]
            for check in checks:
                mask = X.eval(check)
                # The message (failed check plus a few offending rows) is only built on failure.
                assert mask.all(), [check,X[~mask].disp(5)]

            # Fill missing values column by column as configured in ``self.fill``
            # (``impute`` is a project helper; its semantics are not visible here).
            for k, v in self.fill.items():
                X[k] = X.impute(k, *listify(v))
            # Index by the attribute columns (kept as columns too) and prefix every column with '__'.
            self.X = X.prep().prep_bool().set_index(self.attr, drop=False).rename(columns=lambda x:'__'+x)
            self.X.missing().disp(100)

        if get('Y'):
            # credit_hr per snapshot attached to X's index (``self.X[[]]`` has the index but no columns).
            Y = {k: self.X[[]].join(y.set_index(['pidm','term_code','crse'])['credit_hr']) for k, y in self.reg_df.items()}
            # Number of rows with positive credit hours per mlt_grp, after the ``where`` filter.
            agg = lambda y: where(y).groupby(self.mlt_grp)['credit_hr'].agg(lambda x: (x>0).sum())
            A = agg(self.reg_df['end'])
            B = agg(Y['end'])
            # Multiplier = count over all end-of-term registrations / count among rows matched to X;
            # infinite ratios become missing and rows of the inference term are excluded.
            M = (A / B).replace(np.inf, pd.NA).rename('mlt').reset_index().query(f"term_code != {self.infer_term}").prep()
            # Copy of every remaining multiplier row relabelled with the inference term.
            N = M.assign(term_code=self.infer_term)
            self.mlt = pd.concat([M, N], axis=0).set_index(self.mlt_grp)
            # Pivot the last index level (presumably crse) into columns; missing -> 0.
            Y = {k: y.squeeze().unstack().dropna(how='all', axis=1).fillna(0) for k, y in Y.items()}
            # Features: 'cur' credit hours with a '_cur' suffix; targets: whether 'end' credit hours are positive.
            self.Y = Y['cur'].rename(columns=lambda x:x+'_cur').join(Y['end']>0).prep()
        return self.dump()


    def predict(self, params, crse, train_term, styp_code='all'):
        """Fit an imputation model on one training term and predict ``crse`` for every row.

        The outcome column ``crse`` is blanked for all rows outside ``train_term``,
        a miceforest kernel is fitted on that frame, and the kernel is then used
        to impute ``crse`` for all rows (outcome fully blanked).

        Args:
            params: dict with keys 'trf' (column -> transformer for the
                ColumnTransformer) and 'imp' (imputation settings). NOTE: the keys
                'iterations', 'datasets', 'tune', 'mmc' and 'mmf' are popped from
                ``params['imp']``, so pass a copy if the dict is reused.
            crse: name of the outcome column in ``self.Y``.
            train_term: term code whose true outcomes are visible during fitting.
            styp_code: restrict ``X`` to this student type; 'all' keeps every row.

        Returns:
            dict with 'details' (true and predicted value per row and imputed
            dataset) and 'summary' (aggregates per group, training term and dataset).
        """
        # Disabled cache lookup, kept for reference:
        # for p, P in self.pred.items():
        #     if p == str(params):
        #         for c, C in P.items():
        #             if c == crse:
        #                 for t, T in C.items():
        #                     if t == train_term:
        #                         for s, S in T.items():
        #                             if s == styp_code:
        #                                 # print(ljust(crse,8), train_term, styp_code, 'reusing')
        #                                 return S, False

        print(ljust(crse,8), train_term, styp_code, 'creating')
        X = self.X.copy()
        if styp_code != 'all':
            X = X.query(f"styp_code==@styp_code")
        # Apply the chosen transformer to each '__'-prefixed column; columns not listed are dropped.
        trf = ColumnTransformer([(c,t,["__"+c]) for c,t in params['trf'].items()], remainder='drop', verbose_feature_names_out=False)
        cols = uniquify(['_total_cur',crse+'_cur',crse])
        Z = trf.fit_transform(X).join(self.Y[cols]).prep().prep_bool().prep_category().sort_index()
        # Keep the true outcome before it is hidden.
        y = Z[crse].copy().rename('true').to_frame()
        # Hide the outcome for every row outside the training term.
        Z.loc[Z.eval("term_code!=@train_term"), crse] = pd.NA

        # Pull out the settings handled here; whatever remains in params['imp'] goes to ImputationKernel.
        iterations = params['imp'].pop('iterations')
        datasets = params['imp'].pop('datasets')
        tune = params['imp'].pop('tune')
        mmc = params['imp'].pop('mmc')
        mmf = params['imp'].pop('mmf')
        # A mean-match scheme is only configured when candidates are requested.
        if mmc > 0 and mmf is not None:
            params['imp']['mean_match_scheme'] = getattr(mf, mmf).copy()
            params['imp']['mean_match_scheme'].set_mean_match_candidates(mmc)
        
        if tune:
            # print('tuning')
            # Tune on a throw-away single-dataset kernel after one iteration.
            imp = mf.ImputationKernel(Z, datasets=1, **params['imp'])
            imp.mice(iterations=1)
            optimal_parameters, losses = imp.tune_parameters(dataset=0, optimization_steps=5)
        else:
            # print('not tuning')
            optimal_parameters = None
        imp = mf.ImputationKernel(Z, datasets=datasets, **params['imp'])
        imp.mice(iterations=iterations, variable_parameters=optimal_parameters)
        if self.inspect:
            imp.inspect()

        # Blank the outcome for all rows (training term included) and let the fitted kernel impute it.
        Z.loc[:, crse] = pd.NA
        P = imp.impute_new_data(Z)
        # One copy of the true values per imputed dataset ('sim'), with that dataset's predictions.
        details = pd.concat([y
                .assign(pred=P.complete_data(k)[crse], train_term=train_term, crse=crse, sim=k)
                .set_index(['train_term','crse','sim'], append=True)
            for k in range(P.dataset_count())]).prep_bool()
        # Per-group aggregates; min_count=1 makes a sum missing when every value is missing.
        agg = lambda x: pd.Series({
            'pred': x['pred'].sum(min_count=1),
            'true': x['true'].sum(min_count=1),
            'mse_pct': ((1*x['pred'] - x['true'])**2).mean()*100,
            'f1_inv_pct': (1-f1_score(x.dropna()['true'], x.dropna()['pred'], zero_division=np.nan))*100,
        })
        summary = details.groupby([*self.mlt_grp,'train_term','sim']).apply(agg).join(self.mlt).rename_axis(index={'term_code':'pred_term'})
        # Scale both counts by the multiplier computed in preprocess.
        for x in ['pred','true']:
            summary[x] = summary[x] * summary['mlt']
        summary.insert(2, 'err', summary['pred'] - summary['true'])
        # Relative error, clipped to +/-100 %.
        summary.insert(3, 'err_pct', (summary['err'] / summary['true']).clip(-1, 1) * 100)
        S = {'details':details, 'summary':summary.drop(columns='mlt').prep()}#, 'trf':trf, 'imp':imp}
        # S['summary'].disp(5)
        return S
        # return S, True


    def analyze(self, params):
        """Summarise the predictions stored under ``self.pred[params]`` and dump the instance.

        Builds ``self.pred[params]['rslt']``, a dict of pivot tables:
        'pred' is computed from the rows of the inference term only, while
        'err', 'err_pct', 'mse_pct' and 'f1_inv_pct' are computed from all other
        prediction terms. The ' 50%' rows of the 'err_pct' table and a
        description of their ``abs_mean`` column are displayed.

        Args:
            params: key into ``self.pred`` (``main`` passes ``str(params)``).

        Returns:
            The value returned by ``self.dump()``.
        """
        def pivot(df, val):
            """Pivot ``val`` into one column per train_term, one row per statistic, plus ``abs_mean``."""
            Y = (
                df
                .reset_index()
                .pivot_table(columns='train_term', index=['crse','styp_code','pred_term'], values=val, aggfunc=['count',pctl(0),pctl(25),pctl(50),pctl(75),pctl(100)])
                .rename_axis(columns=[val,'train_term'])
                .stack(0, future_stack=True)
                .assign(abs_mean = lambda x: x.abs().mean(axis=1))
            )
            return Y
        v = self.pred[params]
        df = v['summary']
        past = df.eval(f"pred_term!={self.infer_term}")
        # 'pred' is reported for the inference term only. The original also pivoted
        # 'pred' for the other terms and immediately overwrote it; that redundant
        # pivot is skipped here. Keys, key order and values are unchanged.
        rslt = {'pred': pivot(df[~past], "pred")}
        for stat in ["err","err_pct","mse_pct","f1_inv_pct"]:
            rslt[stat] = pivot(df[past], stat)
        v['rslt'] = rslt
        R = v['rslt']['err_pct'].query("err_pct in [' 50%']")
        R.disp(200)
        R[['abs_mean']].describe().T.disp(200)
        return self.dump()


    def main(self, styp_codes=('n','t','r')):
        """Run every parameter combination, cache its predictions and report progress.

        For each entry of ``self.params_list`` that is not yet in ``self.pred``
        (keyed by ``str(params)``), ``predict`` is called for every course,
        training term and student type. The results are merged bottom-up (student
        types, then terms, then courses) into one 'details' and one 'summary'
        frame, stored in ``self.pred`` and dumped. ``analyze`` then runs for the
        entry whether it was freshly computed or cached, and timing is printed.

        Args:
            styp_codes: student type code(s); each is passed to ``predict`` in turn
                (normalised with ``listify``).

        Returns:
            The object returned by ``self.preprocess()``, on which all work is done.
        """
        self = self.preprocess()

        def combine(Y):
            """Concatenate the 'details' and 'summary' frames of every dict-valued child of ``Y``."""
            return {k: pd.concat([y[k] for y in Y.values() if isinstance(y, dict) and k in y]).sort_index() for k in ['details','summary']}

        start_time = time.perf_counter()
        L = len(self.params_list)
        for done, params in enumerate(self.params_list, start=1):
            key = str(params)
            print(key)
            if key not in self.pred:
                P = dict()
                for crse in self.crse:
                    for train_term in self.term_codes:
                        for styp_code in listify(styp_codes):
                            # predict() pops entries from its params, so give it a private copy.
                            S = self.predict(copy.deepcopy(params), crse, train_term, styp_code)
                            nest([crse,train_term,styp_code], P, S)
                        # Collapse the per-student-type results of this term. Paths are spelled
                        # out rather than reusing a list leaked from the inner loop.
                        nest([crse,train_term], P, combine(nest([crse,train_term], P)))
                    # Collapse the per-term results of this course.
                    nest([crse], P, combine(nest([crse], P)))
                self.pred[key] = combine(P)
                self.dump()

            self.analyze(key)
            elapsed = (time.perf_counter() - start_time) / 60
            rate = elapsed / done
            remaining = rate * (L - done)
            print(f"{done} / {L} = {round(done/L*100,1)}% complete, elapsed = {round(elapsed,1)} min, remaining = {round(remaining,1)} min @ {round(rate,1)} min per model")
            print("\n========================================================================================================\n")
        return self



    # def main(self, styp_codes=('n','t','r')):
    #     self = self.preprocess()
    #     # g = lambda Y: {k: pd.concat([y[k] for y in Y.values() if isinstance(y, dict) and k in y.keys()]).sort_index() for k in ['details','summary']}
    #     # g = lambda Y: Y | {k: pd.concat([y[k] for y in Y.values() if isinstance(y, dict) and k in y.keys()]).sort_index() for k in ['summary']}
    #     start_time = time.perf_counter()
    #     L = len(self.params_list)
    #     k = 0
    #     P = self.pred
    #     for params in self.params_list:
    #         print(str(params))
    #         for crse in self.crse:
    #             dump = False
    #             for train_term in self.term_codes:
    #                 for styp_code in listify(styp_codes):
    #                     S, new = self.predict(copy.deepcopy(params), crse, train_term, styp_code)
    #                     path = [str(params),crse,train_term,styp_code]
    #                     P = nest(path, P, S)
    #                     dump |= new
    #                 path.pop(-1)
    #                 P = nest(path, P, g(nest(path, P)))
    #             path.pop(-1)
    #             P = nest(path, P, g(nest(path, P)))
    #         path.pop(-1)
    #         P = nest(path, P, g(nest(path, P)))
    #         # if dump:
    #         #     self.dump()

    #         self.analyze(str(params))
    #         k += 1
    #         elapsed = (time.perf_counter() - start_time) / 60
    #         rate = elapsed / k
    #         remaining = rate * (L - k)
    #         print(f"{k} / {L} = {round(k/L*100,1)}% complete, elapsed = {round(elapsed,1)} min, remaining = {round(remaining,1)} min @ {round(rate,1)} min per model")
    #         print("\n========================================================================================================\n")
    #         return self
    
    # def main(self, styp_codes=('n','t','r')):
    #     self = self.preprocess()
    #     # g = lambda Y: Y | {k: pd.concat([y[k] for y in Y.values() if isinstance(y, dict) and k in y.keys()]).sort_index() for k in ['details','summary']}
    #     g = lambda Y: Y | {k: pd.concat([y[k] for y in Y.values() if isinstance(y, dict) and k in y.keys()]).sort_index() for k in ['summary']}
    #     start_time = time.perf_counter()
    #     L = len(self.params_list)
    #     k = 0
    #     for params in self.params_list:
    #         print(str(params))
    #         self.pred.setdefault(str(params), dict())
    #         for crse in self.crse:
    #             self.pred[str(params)].setdefault(crse, dict())
    #             for train_term in self.term_codes:
    #                 self.pred[str(params)][crse].setdefault(train_term, dict())
    #                 for styp_code in listify(styp_codes):
    #                     S, new = self.predict(copy.deepcopy(params), crse, train_term, styp_code)
    #                     self.pred[str(params)][crse][train_term][styp_code] = S
    #                     if new:
    #                         self.dump()
    #                 self.pred[str(params)][crse][train_term] = g(self.pred[str(params)][crse][train_term])
    #             self.pred[str(params)][crse] = g(self.pred[str(params)][crse])
    #         self.pred[str(params)] = g(self.pred[str(params)])
    #         self.analyze(str(params))
    #         k += 1
    #         elapsed = (time.perf_counter() - start_time) / 60
    #         rate = elapsed / k
    #         remaining = rate * (L - k)
    #         print(f"{k} / {L} = {round(k/L*100,1)}% complete, elapsed = {round(elapsed,1)} min, remaining = {round(remaining,1)} min @ {round(rate,1)} min per model")
    #         print("\n========================================================================================================\n")
    #     return self


def code_desc(x):
    """Return the ``<x>_code`` / ``<x>_desc`` column-name pair for the stem ``x``."""
    return [x + suffix for suffix in ('_code', '_desc')]
# Candidate lists for the transformer grid: keep the column as is, or
# additionally try dropping it.
passthru = ['passthrough']
passdrop = [*passthru, 'drop']
# passthru = passdrop


def bintrf(n_bins):
    """Return an ordinal, uniform-width KBinsDiscretizer with ``n_bins`` bins."""
    return KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform', subsample=None)


# Standardise, then apply a power transform.
pwrtrf = make_pipeline(StandardScaler(), PowerTransformer())
# Constructor arguments for AMP. Commented-out entries are switches that can be
# re-enabled for a particular run.
kwargs = {
    'term_codes': np.arange(2020,2025)*100+8,
    'infer_term': 202408,
    'show': {
        # 'reg':True,
        # 'adm':True,
    },
    # column -> fill value or arguments passed to X.impute in AMP.preprocess
    'fill': {
        'birth_day': ['median',['term_code','styp_code']],
        'remote': False,
        'international': False,
        **{f'race_{r}': False for r in ['american_indian','asian','black','pacific','white','hispanic']},
        'lgcy': False,
        'resd': False,
        'waiver': False,
        'fafsa_app': False,
        'schlship_app': False,
        'finaid_accepted': False,
        'ssb': False,
        'math': False,
        'reading': False,
        'writing': False,
        'gap_score': 0,
        'oriented': 'n',
    },
    # columns that form the index of X
    'attr': [
        'pidm',
        *code_desc('term'),
        *code_desc('apdc'),
        *code_desc('levl'),
        *code_desc('styp'),
        *code_desc('admt'),
        *code_desc('camp'),
        *code_desc('coll'),
        *code_desc('dept'),
        *code_desc('majr'),
        *code_desc('cnty'),
        *code_desc('stat'),
        *code_desc('natn'),
        *code_desc('resd'),
        *code_desc('lgcy'),
        'international',
        'gender',
        *[f'race_{r}' for r in ['american_indian','asian','black','pacific','white','hispanic']],
        'waiver',
        'birth_day',
        'distance',
        'hs_qrtl',
    ],
    # 'cycle_day' used to appear twice in this literal: the dynamic value below was
    # evaluated and then silently replaced by the fixed one. Only the effective
    # entry stays active; swap the two lines to run "as of today".
    # 'cycle_day': (TERM(term_code=202408).cycle_date-pd.Timestamp.now()).days+1,
    'cycle_day': 183,
    'crse': [
        # 'engl1301',
        # 'biol1406',
        # 'biol2401',
        # 'math1314',
        # 'math2412',
        # 'agri1419',
        # 'psyc2301',
        # 'ansc1319',
        # 'comm1311',
        # 'hist1301',
        # 'govt2306',
        # 'math1324',
        # 'chem1411',
        # 'univ0301',
        # 'univ0204',
        # 'univ0304',
        # 'agri1100',
        # 'comm1315',
        # 'agec2317',
        # 'govt2305',
        # 'busi1301',
        # 'arts1301',
        # 'math1342',
        # 'math2413',
        ],
    # column -> candidate transformers; AMP expands the cartesian product
    'trf_grid': {
        'appl_day': passdrop,
        'apdc_day': passdrop,
        'birth_day': [*passthru, pwrtrf],#, ],
        # 'levl_code': passthru,
        # 'styp_code': passthru,
        # 'admt_code': passdrop,
        # 'camp_code': passdrop,
        'remote': passthru,
        'coll_code': passdrop,
        'international': passthru,
        **{f'race_{r}': passthru for r in ['american_indian','asian','black','pacific','white','hispanic']},
        'gender': passthru,
        'lgcy': passdrop,
        'resd': passthru,
        'waiver': passdrop,
        # 'fafsa_app': passthru,
        'schlship_app': passthru,
        # 'finaid_accepted': passthru,
        'ssb': passthru,
        'math': passthru,
        'reading': passthru,
        'writing': passthru,
        'gap_score': passthru,
        'oriented': passthru,
        'hs_qrtl': passthru,
        'act_equiv': passthru,
        'distance': [*passthru, pwrtrf],#, bintrf(5)],
        },
    # imputation settings; anything omitted falls back to the defaults in AMP.__post_init__
    'imp_grid': {
        'mmc': 10
        # 'mmc': range(0, 41, 5),
        # 'datasets': 1,
        # 'iterations': 1,
        # 'tune': False,
    },
    'overwrite': {
        # # 'trm':True,
        # 'reg':True,
        # 'adm':True,
        # 'flg':True,
        # 'raw':True,
        # 'term': True,
        # 'raw_df': True,
        # 'reg_df': True,
        # 'X': True,
        # 'Y': True,
        # 'pred': True,
    },
    # 'inspect': True,
}

# FLAGS().run()
# Build the run object; __post_init__ merges any previously dumped results.
self = AMP(**kwargs)
# Build raw_df / reg_df / X / Y while the inference term is still in term_codes.
self = self.preprocess()
# Train only on the other terms: drop the inference term before the model loop.
self.term_codes.remove(self.infer_term)
# self.params_list = list(reversed(self.params_list))
# main() calls preprocess() again; with every stage already present it only re-dumps.
self.main(styp_codes='n')
# len(self.params_list)
# for x in self.params_list:
#     print(x)
# T = TERM(202008, cycle_day=184, show={'adm':True}).get_adm(184)

{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 10, 'mmf': 'mean_match_default', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'birth_day': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('powertransformer', PowerTransformer())]), 'distance': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('powertransformer', PowerTransformer())]), 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'writing': 'passthrough'}}
_total   202008 n creating
_total   202108 n creating
_total   202208 n creating
_total   202308 n creating


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_term,202008,202108,202208,202308,abs_mean
crse,styp_code,pred_term,err_pct,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
_total,n,202008,50%,0.715465,-3.026968,-0.165107,-5.503577,2.352779
_total,n,202108,50%,15.252153,-0.98401,0.738007,-10.332103,6.826568
_total,n,202208,50%,14.593176,-0.104987,0.472441,-9.973753,6.286089
_total,n,202308,50%,14.855072,2.400362,0.13587,0.407609,4.449728


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
train_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abs_mean,4.0,4.978791,2.024776,2.352779,3.925491,5.367909,6.421209,6.826568


1 / 128 = 0.8% complete, elapsed = 3.3 min, remaining = 423.1 min @ 3.3 min per model


{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 10, 'mmf': 'mean_match_default', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'birth_day': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('powertransformer', PowerTransformer())]), 'distance': 'passthrough', 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'writing': 'passthrough'}}
_total   202008 n creating
_total   202108 n creating
_total   202208 n creating
_total   202308 n creating


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_term,202008,202108,202208,202308,abs_mean
crse,styp_code,pred_term,err_pct,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
_total,n,202008,50%,0.110072,-3.137039,-0.770501,-6.274078,2.572922
_total,n,202108,50%,12.915129,0.430504,0.184502,-9.04059,5.642681
_total,n,202208,50%,14.645669,-0.682415,-0.787402,-7.979003,6.023622
_total,n,202308,50%,14.130435,3.940217,-1.358696,-0.226449,4.913949


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
train_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abs_mean,4.0,4.788294,1.547005,2.572922,4.328693,5.278315,5.737917,6.023622


2 / 128 = 1.6% complete, elapsed = 6.6 min, remaining = 412.7 min @ 3.3 min per model


{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 10, 'mmf': 'mean_match_default', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'birth_day': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('powertransformer', PowerTransformer())]), 'distance': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('powertransformer', PowerTransformer())]), 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'waiver': 'passthrough', 'writing': 'passthrough'}

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_term,202008,202108,202208,202308,abs_mean
crse,styp_code,pred_term,err_pct,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
_total,n,202008,50%,1.706109,7.374794,-1.816181,-6.7694,4.416621
_total,n,202108,50%,13.530135,-0.553506,0.553506,-14.02214,7.164822
_total,n,202208,50%,14.698163,-5.616798,0.262467,-19.370079,9.986877
_total,n,202308,50%,14.085145,-0.271739,1.268116,-0.271739,3.974185


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
train_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abs_mean,4.0,6.385626,2.784971,3.974185,4.306012,5.790721,7.870335,9.986877


3 / 128 = 2.3% complete, elapsed = 9.8 min, remaining = 409.5 min @ 3.3 min per model


{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 10, 'mmf': 'mean_match_default', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'birth_day': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('powertransformer', PowerTransformer())]), 'distance': 'passthrough', 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'waiver': 'passthrough', 'writing': 'passthrough'}}
_total   202008 n creating
_total   202108 n creating
_total   202208 n creating
_total   202308 

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_term,202008,202108,202208,202308,abs_mean
crse,styp_code,pred_term,err_pct,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
_total,n,202008,50%,-0.055036,6.274078,-1.155751,-4.072647,2.889378
_total,n,202108,50%,14.514145,0.430504,2.02952,-9.409594,6.595941
_total,n,202208,50%,14.593176,-5.249344,0.419948,-11.023622,7.821522
_total,n,202308,50%,14.402174,1.902174,0.724638,0.04529,4.268569


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
train_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abs_mean,4.0,5.393853,2.2269,2.889378,3.923771,5.432255,6.902336,7.821522


4 / 128 = 3.1% complete, elapsed = 13.0 min, remaining = 404.5 min @ 3.3 min per model


{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 10, 'mmf': 'mean_match_default', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'birth_day': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('powertransformer', PowerTransformer())]), 'distance': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('powertransformer', PowerTransformer())]), 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'lgcy': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'writing': 'passthrough'}}

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_term,202008,202108,202208,202308,abs_mean
crse,styp_code,pred_term,err_pct,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
_total,n,202008,50%,0.275179,-2.86186,-3.192075,-13.098514,4.856907
_total,n,202108,50%,14.698647,-0.922509,0.676507,-12.97663,7.318573
_total,n,202208,50%,13.753281,0.997375,-0.104987,-14.330709,7.296588
_total,n,202308,50%,13.632246,2.626812,-1.086957,0.09058,4.359149


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
train_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abs_mean,4.0,5.957804,1.571805,4.359149,4.732467,6.076747,7.302084,7.318573


5 / 128 = 3.9% complete, elapsed = 16.4 min, remaining = 404.2 min @ 3.3 min per model


{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 10, 'mmf': 'mean_match_default', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'birth_day': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('powertransformer', PowerTransformer())]), 'distance': 'passthrough', 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'lgcy': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'writing': 'passthrough'}}
_total   202008 n creating
_total   202108 n creating
_total   202208 n creating
_total   202308 n

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_term,202008,202108,202208,202308,abs_mean
crse,styp_code,pred_term,err_pct,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
_total,n,202008,50%,-0.330215,-2.036324,-1.651073,-7.870116,2.971932
_total,n,202108,50%,13.591636,0.676507,1.291513,-8.241082,5.950185
_total,n,202208,50%,14.908136,-0.15748,-1.312336,-10.498688,6.71916
_total,n,202308,50%,13.903986,1.630435,0.951087,0.271739,4.189312


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
train_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abs_mean,4.0,4.957647,1.695235,2.971932,3.884967,5.069748,6.142428,6.71916


6 / 128 = 4.7% complete, elapsed = 19.4 min, remaining = 394.7 min @ 3.2 min per model


{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 10, 'mmf': 'mean_match_default', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'birth_day': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('powertransformer', PowerTransformer())]), 'distance': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('powertransformer', PowerTransformer())]), 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'lgcy': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'waiver': 'passthrough', '

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_term,202008,202108,202208,202308,abs_mean
crse,styp_code,pred_term,err_pct,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
_total,n,202008,50%,-0.38525,7.539901,-0.055036,-4.678041,3.164557
_total,n,202108,50%,15.375154,-1.04551,-1.414514,-11.746617,7.395449
_total,n,202208,50%,14.540682,-4.304462,1.049869,-15.065617,8.740157
_total,n,202308,50%,14.764493,1.856884,-0.815217,0.362319,4.449728


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
train_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abs_mean,4.0,5.937473,2.574437,3.164557,4.128435,5.922589,7.731626,8.740157


7 / 128 = 5.5% complete, elapsed = 22.7 min, remaining = 392.0 min @ 3.2 min per model


{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 10, 'mmf': 'mean_match_default', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'birth_day': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('powertransformer', PowerTransformer())]), 'distance': 'passthrough', 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'lgcy': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'waiver': 'passthrough', 'writing': 'passthrough'}}
_total   202008 n creating
_total   202108 n creating
_total   202208 n c

In [None]:
def g(Y):
    """Combine the 'details' and 'summary' frames held by the children of Y.

    For each of the two keys, every value of ``Y`` that is a dict containing
    that key contributes its entry; the entries are concatenated with
    ``pd.concat`` and the result is sorted by index.
    """
    return {
        k: pd.concat([y[k] for y in Y.values() if isinstance(y, dict) and k in y]).sort_index()
        for k in ['details', 'summary']
    }


# Copy the selected leaves of self.pred into Q, then replace each level of Q
# (train_term, then crse, then params) with the combined frames of its children.
# NOTE(review): nest() comes from LiveAMP; it is used here as nest(path, Q, value)
# to store and nest(path, Q) to read back the node at `path` - confirm.
Q = dict()
for params, P in self.pred.items():
    params_key = str(params)
    print(params_key)
    params_hit = False
    for crse, C in P.items():
        if crse not in self.crse:
            continue
        crse_hit = False
        for train_term, T in C.items():
            if train_term not in self.term_codes:
                continue
            term_hit = False
            for styp_code, S in T.items():
                if styp_code in ['n', 'r', 't']:
                    nest([params_key, crse, train_term, styp_code], Q, S)
                    term_hit = True
            # Each roll-up builds its own path rather than popping a list left
            # over from a deeper loop, and is skipped when nothing was stored
            # below it, so an empty level can no longer redirect the
            # aggregation to an unrelated node of Q.
            if term_hit:
                path = [params_key, crse, train_term]
                nest(path, Q, g(nest(path, Q)))
                crse_hit = True
        if crse_hit:
            path = [params_key, crse]
            nest(path, Q, g(nest(path, Q)))
            params_hit = True
    if params_hit:
        path = [params_key]
        nest(path, Q, g(nest(path, Q)))
    # Author's earlier alternatives, kept for reference:
    # nest(path, self.pred[params], g(nest(path, Q)))
    # self.pred[params] = g(Q)
    # break


In [None]:
# Re-run self.predict for every (params, crse, train_term) combination with
# styp_code 'n', store each result via nest(), and roll the stored results up
# one level at a time before writing the outcome to self.pred[str(params)].
#
# g: for each of 'details' and 'summary', concatenate that entry from every
# dict-valued child of Y that has it, then sort the result by index.
g = lambda Y: {k: pd.concat([y[k] for y in Y.values() if isinstance(y, dict) and k in y.keys()]).sort_index() for k in ['details','summary']}
for params, P in self.pred.items():
    print(str(params))
    for crse in self.crse:
        for train_term in self.term_codes:
            for styp_code in ['n']:
                # predict receives a deep copy of params, so the object used as the self.pred key is not handed to it directly
                S, new = self.predict(copy.deepcopy(params), crse, train_term, styp_code)
                # NOTE(review): presumably `new` marks a freshly computed (not cached) result - confirm against self.predict
                assert new is False
                # NOTE(review): path starts with str(params) although P is already self.pred[params] - verify how nest() resolves this
                path = [str(params),crse,train_term,styp_code]
                # P is rebound to whatever nest() returns; NOTE(review): presumably the updated root mapping - confirm
                P = nest(path, P, S)
                # NOTE(review): `dump` is never assigned in this cell; it must already exist in the notebook namespace or this line raises NameError
                dump |= new
            # `path` is the list built in the last inner iteration; dropping styp_code leaves the train_term level
            path.pop(-1)
            # replace the train_term node with the combined frames of its children
            P = nest(path, P, g(nest(path, P)))
        # drop train_term -> crse level, then roll up again
        path.pop(-1)
        P = nest(path, P, g(nest(path, P)))
    # drop crse -> only str(params) remains in path
    path.pop(-1)
    # NOTE(review): this assigns into self.pred while iterating self.pred.items(); if self.pred is a plain dict and
    # str(params) is not already a key, the insertion raises RuntimeError - confirm the keys are strings
    self.pred[str(params)] = nest(path, P, g(nest(path, P)))
