In [2]:
from LiveAMP import *
import miceforest as mf
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer, KBinsDiscretizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import f1_score
from sklearn import set_config
set_config(transform_output="pandas")

def feature_importance_df(self, dataset=0, iteration=None, normalize=True):
    """Return the kernel's feature importances as a labelled DataFrame.

    Rows are predictor variable names and columns are imputed (target)
    variable names, both ordered by their sorted scalar ids.  With
    ``normalize=True`` every column is rescaled to sum to 100.
    """
    def names(scalars):
        # Translate variable ids into names, in ascending id order.
        return [self._get_var_name_from_scalar(int(s)) for s in np.sort(scalars)]

    target_names = names(self.imputation_order)
    feature_names = names(self.predictor_vars)
    raw = self.get_feature_importance(dataset, iteration)
    importance = pd.DataFrame(raw, index=target_names, columns=feature_names).T
    if not normalize:
        return importance
    return importance / importance.sum() * 100
# Monkey-patch: expose the helper above as a method on every miceforest ImputationKernel.
mf.ImputationKernel.feature_importance_df = feature_importance_df

def inspect(self, **kwargs):
    """Show the kernel's diagnostic plots and its feature-importance table.

    Keyword arguments are forwarded unchanged to ``feature_importance_df``.
    Returns the feature-importance DataFrame that was displayed.
    """
    # First figure: imputed-value distributions.
    self.plot_imputed_distributions(wspace=0.3, hspace=0.3)
    plt.show()

    # Second figure: mean convergence.
    self.plot_mean_convergence(wspace=0.3, hspace=0.4)
    plt.show()

    importance = self.feature_importance_df(**kwargs)
    importance.disp(100)
    return importance
# Monkey-patch: expose the helper above as a method on every miceforest ImputationKernel.
mf.ImputationKernel.inspect = inspect


@dataclasses.dataclass
class AMP(MyBaseClass):
    # cycle_day: used to build the result-file path and forwarded to every TERM that is loaded.
    cycle_day: int
    # term_codes: term codes to load; infer_term is appended to this list in __post_init__.
    term_codes: typing.List
    # infer_term: term code treated separately from the others (excluded when multipliers are computed,
    # and reported under "proj" by analyze).
    infer_term: int
    # crse: course codes to model; '_total' is always prepended in __post_init__.
    crse: typing.List
    # attr: column names used as the index of the feature table X.
    attr: typing.List
    # fill: column -> arguments handed to X.impute() during preprocessing.
    fill: typing.Dict = None
    # trf_grid: column -> candidate transformer(s); expanded into every combination.
    trf_grid: typing.Dict = None
    # imp_grid: imputation setting -> candidate value(s); expanded and merged over built-in defaults.
    imp_grid: typing.Dict = None
    # overwrite: per-stage flags; a true flag discards the cached attribute of that name.
    overwrite: typing.Dict = None
    # show: per-stage flags forwarded to TERM.
    show: typing.Dict = None
    # inspect: when true, predict() calls the kernel's inspect() after fitting.
    inspect: bool = False

    def dump(self):
        """Write this object to ``self.rslt`` (always overwriting) and return what ``write`` returns."""
        destination = self.rslt
        return write(destination, self, overwrite=True)

    def __post_init__(self):
        """Finish construction after the dataclass fields are set.

        Steps, in order:
        1. Derive the result-file path ``self.rslt`` from ``cycle_day``.
        2. Fill ``overwrite`` and ``show`` with all-False defaults, then cascade
           the ``overwrite`` flags so that forcing an early stage also forces
           every stage that is derived from it.
        3. Try to restore previously dumped state from ``self.rslt``; values set
           on this instance take precedence over restored ones.
        4. Drop every attribute whose ``overwrite`` flag is true and make sure
           the dict-valued attributes exist.
        5. Normalize ``term_codes`` / ``crse`` and expand the transformer and
           imputation grids into ``params_list``.
        """
        self.rslt = root_path / f"resources/rslt/{rjust(self.cycle_day,3,0)}/rslt.pkl"
        D = {'trm':False, 'adm':False, 'reg':False, 'flg':False, 'raw':False, 'term':False, 'raw_df':False, 'reg_df':False, 'X':False, 'Y':False, 'pred':False}
        for x in ['overwrite','show']:
            self[x] = D.copy() if self[x] is None else D.copy() | self[x]
        # Cascade: overwriting a stage implies overwriting everything computed from it.
        self.overwrite['raw'] |= self.overwrite['reg'] | self.overwrite['adm'] | self.overwrite['flg']
        self.overwrite['term'] |= self.overwrite['raw']
        self.overwrite['raw_df'] |= self.overwrite['term']
        self.overwrite['reg_df'] |= self.overwrite['term']
        self.overwrite['X'] |= self.overwrite['raw_df']
        self.overwrite['Y'] |= self.overwrite['reg_df'] | self.overwrite['X']
        self.overwrite['pred'] |= self.overwrite['Y']

        try:
            # Restored attributes go first so the freshly supplied ones win the merge.
            self.__dict__ = read(self.rslt).__dict__ | self.__dict__
        except Exception:
            # Best effort: with no usable cache we simply start from scratch.
            # (Narrower than a bare ``except`` so Ctrl-C / SystemExit still propagate.)
            pass
        for k, v in self.overwrite.items():
            if v and k in self:
                del self[k]
        for k in ['fill','term','pred','trf_grid','imp_grid']:
            if k not in self:
                self[k] = dict()

        self.term_codes = uniquify([*listify(self.term_codes), self.infer_term])
        self.crse = uniquify(['_total', *listify(self.crse)])
        self.mlt_grp = ['crse','levl_code','styp_code','term_code']
        # Every combination of candidate transformers; entries marked 'drop', None or '' are removed.
        self.trf_list = cartesian({k: sorted(setify(v), key=str) for k,v in self.trf_grid.items()})
        self.trf_list = [mysort({k:v for k,v in t.items() if v not in ['drop',None,'']}) for t in self.trf_list]
        imp_default = {'iterations':3, 'mmc':0, 'mmf':'mean_match_default', 'datasets':5, 'tune':True}
        # Every combination of imputation settings, layered over the defaults above.
        self.imp_list = cartesian(self.imp_grid)
        self.imp_list = [mysort(imp_default | v) for v in self.imp_list]
        self.params_list = [mysort({'imp':imp, 'trf':trf}) for trf, imp in it.product(self.trf_list,self.imp_list)]
        return self

    def get_terms(self):
        """Load raw data for every term code that is not yet present in ``self.term``."""
        shared = {key: self[key] for key in ['cycle_day','overwrite','show']}
        for code in self.term_codes:
            if code in self.term:
                # Already loaded (or restored from cache).
                continue
            print(f'get {code}')
            self.term[code] = TERM(term_code=code, **shared).get_raw()


    def preprocess(self):
        """Build (or reuse) the modelling inputs and persist the object.

        Each of ``raw_df``, ``reg_df``, ``X`` and ``Y`` is computed only when it
        is not already present on ``self``.  Also sets ``self.mlt`` whenever
        ``Y`` is rebuilt.  Returns whatever ``self.dump()`` returns.
        """
        def get(nm):
            # True (after announcing it) when attribute `nm` still has to be computed.
            if nm in self:
                return False
            print(f'get {nm}')
            return True

        # Term-level data is only needed when one of the combined frames is missing.
        if get('raw_df') or get('reg_df'):
            self.get_terms()

        if get('raw_df'):
            with warnings.catch_warnings(action='ignore'):
                # Stack every term's raw table and drop columns that are entirely missing.
                self.raw_df = pd.concat([term.raw for term in self.term.values()], ignore_index=True).dropna(axis=1, how='all').prep()

        if get('reg_df'):
            with warnings.catch_warnings(action='ignore'):
                # One stacked table per key ('cur' and 'end'), restricted to the courses in self.crse.
                self.reg_df = {k: pd.concat([term.reg[k].query("crse in @self.crse") for term in self.term.values()]).prep() for k in ['cur','end']}

        # Row filter shared by X and the multiplier computation: levl_code 'ug' with styp_code n, r or t.
        where = lambda x: x.query("levl_code == 'ug' and styp_code in ('n','r','t')").copy()
        if get('X'):
            R = self.raw_df.copy()
            repl = {'ae':0, 'n1':1, 'n2':2, 'n3':3, 'n4':4, 'r1':1, 'r2':2, 'r3':3, 'r4':4}
            # Bucket hs_pctl into labels 4..0 (left-closed bins); where that yields nothing,
            # fall back to the value mapped from apdc_code.
            R['hs_qrtl'] = pd.cut(R['hs_pctl'], bins=[-1,25,50,75,90,101], labels=[4,3,2,1,0], right=False).combine_first(R['apdc_code'].map(repl))
            R['remote'] = R['camp_code'] != 's'
            R['resd'] = R['resd_code'] == 'r'
            R['lgcy'] = ~R['lgcy_code'].isin(['n','o'])
            # Remap selected major/college codes and descriptions
            # (presumably older names -> current ones; TODO confirm).
            R['majr_code'] = R['majr_code'].replace({'0000':'und', 'eled':'eted', 'agri':'unda'})
            R['coll_code'] = R['coll_code'].replace({'ae':'an', 'eh':'ed', 'hs':'hl', 'st':'sm', '00':pd.NA})
            R['coll_desc'] = R['coll_desc'].replace({
                'ag & environmental_sciences':'ag & natural_resources',
                'education & human development':'education',
                'health science & human_service':'health sciences',
                'science & technology':'science & mathematics'})
            majr = ['majr_desc','dept_code','dept_desc','coll_code','coll_desc']
            # For each majr_code keep the descriptors from its latest row by cycle_date,
            # then apply those descriptors to every row with that major.
            S = R.sort_values('cycle_date').drop_duplicates(subset='majr_code', keep='last')[['majr_code',*majr]]
            X = where(R.drop(columns=majr).merge(S, on='majr_code', how='left')).prep().prep_bool()

            # Sanity checks: each expression must hold for all rows of X.
            checks = [
                'cycle_day >= 0',
                'apdc_day >= cycle_day',
                'appl_day >= apdc_day',
                'birth_day >= appl_day',
                'birth_day >= 5000',
                'distance >= 0',
                'hs_pctl >=0',
                'hs_pctl <= 100',
                'hs_qrtl >= 0',
                'hs_qrtl <= 4',
                'act_equiv >= 1',
                'act_equiv <= 36',
                'gap_score >= 0',
                'gap_score <= 100',
            ]
            for check in checks:
                mask = X.eval(check)
                # On failure the message shows the check and (via disp) a few offending rows.
                assert mask.all(), [check,X[~mask].disp(5)]
            
            # Fill missing values column by column using the configured impute arguments.
            for k, v in self.fill.items():
                X[k] = X.impute(k, *listify(v))
            # Index by the attribute columns (kept as columns too) and prefix every column with '__'.
            self.X = X.prep().prep_bool().set_index(self.attr, drop=False).rename(columns=lambda x:'__'+x)
            self.X.missing().disp(100)

        if get('Y'):
            # Attach credit_hr from each registration table to the rows of X (index only, no feature columns).
            Y = {k: self.X[[]].join(y.set_index(['pidm','term_code','crse'])['credit_hr']) for k, y in self.reg_df.items()}
            # Number of rows with credit_hr > 0 per group in self.mlt_grp.
            agg = lambda y: where(y).groupby(self.mlt_grp)['credit_hr'].agg(lambda x: (x>0).sum())
            A = agg(self.reg_df['end'])
            B = agg(Y['end'])
            # Multiplier = count in the full 'end' table / count among rows matched to X,
            # computed for every term except infer_term.
            M = (A / B).replace(np.inf, pd.NA).rename('mlt').reset_index().query(f"term_code != {self.infer_term}").prep()
            # Re-label a copy of those multipliers with infer_term so they can be applied to it.
            N = M.assign(term_code=self.infer_term)
            self.mlt = pd.concat([M, N], axis=0).set_index(self.mlt_grp)
            # Pivot crse into columns; missing combinations count as 0.
            Y = {k: y.squeeze().unstack().dropna(how='all', axis=1).fillna(0) for k, y in Y.items()}
            # Final targets: 'cur' values under '<crse>_cur' plus boolean 'end' > 0 under '<crse>'.
            self.Y = Y['cur'].rename(columns=lambda x:x+'_cur').join(Y['end']>0).prep()
        return self.dump()


    def predict(self, params, crse, train_term, styp_code='all'):
        """Fit an imputation model on one training term and predict `crse` for all rows.

        Parameters
        ----------
        params : dict with keys 'trf' (column -> transformer) and 'imp'
            (imputation settings).  NOTE: params['imp'] is mutated (entries are
            popped), so callers should pass a copy.
        crse : name of the target column in self.Y.
        train_term : term_code whose true outcomes are visible to the model.
        styp_code : restrict rows to this styp_code, or 'all' for no restriction.

        Returns
        -------
        dict with 'details' (per-row true/pred for every simulated dataset) and
        'summary' (per-group aggregates with error metrics).
        """
        # for p, P in self.pred.items():
        #     if p == str(params):
        #         for c, C in P.items():
        #             if c == crse:
        #                 for t, T in C.items():
        #                     if t == train_term:
        #                         for s, S in T.items():
        #                             if s == styp_code:
        #                                 # print(ljust(crse,8), train_term, styp_code, 'reusing')
        #                                 return S, False

        print(ljust(crse,8), train_term, styp_code, 'creating')
        X = self.X.copy()
        if styp_code != 'all':
            X = X.query(f"styp_code==@styp_code")
        # Apply each configured transformer to its '__'-prefixed column; unlisted columns are dropped.
        trf = ColumnTransformer([(c,t,["__"+c]) for c,t in params['trf'].items()], remainder='drop', verbose_feature_names_out=False)
        cols = uniquify(['_total_cur',crse+'_cur',crse])
        Z = trf.fit_transform(X).join(self.Y[cols]).prep().prep_bool().prep_category().sort_index()
        # Keep the true outcome aside before it is masked below.
        y = Z[crse].copy().rename('true').to_frame()
        # Hide the outcome everywhere except the training term so only that term informs the model.
        Z.loc[Z.eval("term_code!=@train_term"), crse] = pd.NA

        # Pull out the settings handled here; whatever remains in params['imp'] is
        # forwarded to ImputationKernel as keyword arguments.
        iterations = params['imp'].pop('iterations')
        datasets = params['imp'].pop('datasets')
        tune = params['imp'].pop('tune')
        mmc = params['imp'].pop('mmc')
        mmf = params['imp'].pop('mmf')
        if mmc > 0 and mmf is not None:
            # Look the scheme up by name on miceforest, copy it, and set its candidate count.
            params['imp']['mean_match_scheme'] = getattr(mf, mmf).copy()
            params['imp']['mean_match_scheme'].set_mean_match_candidates(mmc)
        
        if tune:
            # print('tuning')
            # Tune on a throwaway single-dataset kernel, then reuse the parameters below.
            imp = mf.ImputationKernel(Z, datasets=1, **params['imp'])
            imp.mice(iterations=1)
            optimal_parameters, losses = imp.tune_parameters(dataset=0, optimization_steps=5)
        else:
            # print('not tuning')
            optimal_parameters = None
        imp = mf.ImputationKernel(Z, datasets=datasets, **params['imp'])
        imp.mice(iterations=iterations, variable_parameters=optimal_parameters)
        if self.inspect:
            imp.inspect()

        # Blank the outcome for every row (training term included) and impute it with the fitted kernel.
        Z.loc[:, crse] = pd.NA
        P = imp.impute_new_data(Z)
        # One copy of the true values per imputed dataset, each paired with that dataset's predictions.
        details = pd.concat([y
                .assign(pred=P.complete_data(k)[crse], train_term=train_term, crse=crse, sim=k)
                .set_index(['train_term','crse','sim'], append=True)
            for k in range(P.dataset_count())]).prep_bool()
        # Per-group aggregates: summed pred/true plus two error measures expressed in percent.
        agg = lambda x: pd.Series({
            'pred': x['pred'].sum(min_count=1),
            'true': x['true'].sum(min_count=1),
            'mse_pct': ((1*x['pred'] - x['true'])**2).mean()*100,
            'f1_inv_pct': (1-f1_score(x.dropna()['true'], x.dropna()['pred'], zero_division=np.nan))*100,
        })
        summary = details.groupby([*self.mlt_grp,'train_term','sim']).apply(agg).join(self.mlt).rename_axis(index={'term_code':'pred_term'})
        # Scale both counts by the group multiplier from self.mlt.
        for x in ['pred','true']:
            summary[x] = summary[x] * summary['mlt']
        summary.insert(2, 'err', summary['pred'] - summary['true'])
        # Relative error, clipped to the range [-100, 100] percent.
        summary.insert(3, 'err_pct', (summary['err'] / summary['true']).clip(-1, 1) * 100)
        S = {'details':details, 'summary':summary.drop(columns='mlt').prep()}#, 'trf':trf, 'imp':imp}
        # S['summary'].disp(5)
        return S
        # return S, True


    def analyze(self, df):
        """Turn a prediction summary table into one pivot table per statistic.

        Rows whose ``pred_term`` differs from ``self.infer_term`` feed the
        "pred", "err", "err_pct", "mse_pct" and "f1_inv_pct" tables; the
        remaining rows feed the "proj" table, built from their "pred" values.
        Returns a dict keyed by those six names.
        """
        def pivot(frame, val):
            # Rows: (crse, styp_code, pred_term) x aggregate; columns: train_term.
            table = frame.reset_index().pivot_table(
                columns='train_term',
                index=['crse','styp_code','pred_term'],
                values=val,
                aggfunc=['count',pctl(0),pctl(25),pctl(50),pctl(75),pctl(100)],
            )
            table = table.rename_axis(columns=[val,'train_term'])
            table = table.stack(0, future_stack=True)
            # Extra column: mean absolute value across the train_term columns.
            return table.assign(abs_mean=lambda x: x.abs().mean(axis=1))

        not_inferred = df.eval(f"pred_term!={self.infer_term}")
        observed = df[not_inferred]
        rslt = {}
        for stat in ["pred","err","err_pct","mse_pct","f1_inv_pct"]:
            rslt[stat] = pivot(observed, stat)
        rslt["proj"] = pivot(df[~not_inferred], "pred")
        return rslt


    def main(self, styp_codes=('n','t','r')):
        """Run (or reuse) predictions for every parameter combination and persist them.

        For each entry of ``self.params_list`` the cached result in ``self.pred``
        is reused when it already has a 'summary'; otherwise ``predict`` is run
        for every (crse, train_term, styp_code) and the pieces are concatenated.
        The analysis tables are then (re)computed, the object is dumped, and a
        short absolute-error report is displayed.

        Parameters
        ----------
        styp_codes : one styp_code or an iterable of them to model separately.

        Returns
        -------
        The (preprocessed) object itself.
        """
        self = self.preprocess()

        def combine(Y):
            # Concatenate the 'details' and 'summary' frames of all dict-valued children of Y.
            return {k: pd.concat([y[k] for y in Y.values() if isinstance(y, dict) and k in y.keys()]).sort_index() for k in ['details','summary']}

        done = 0
        for params in self.params_list:
            key = str(params)
            try:
                P = self.pred[key]
                S = P['summary']
            except (KeyError, TypeError):
                # Nothing usable cached for this combination: compute it from scratch.
                print(key)
                P = dict()
                for crse in self.crse:
                    for train_term in self.term_codes:
                        for styp_code in listify(styp_codes):
                            # predict() mutates its params argument, hence the deep copy.
                            S = self.predict(copy.deepcopy(params), crse, train_term, styp_code)
                            nest([crse, train_term, styp_code], P, S)
                        # Collapse the per-styp_code results of this training term.
                        term_path = [crse, train_term]
                        nest(term_path, P, combine(nest(term_path, P)))
                    # Collapse the per-term results of this course.
                    crse_path = [crse]
                    nest(crse_path, P, combine(nest(crse_path, P)))
                P = combine(P)
                S = P['summary']
            P['rslt'] = self.analyze(S)
            self.pred[key] = P
            self.dump()
            done += 1
            print(done)
            # NOTE(review): the training term 202308 is still hard-coded in this report.
            S.query(f"train_term==202308 & pred_term!={self.infer_term}")["err_pct"].abs().describe().to_frame().T.disp(200)
        return self

    # def main(self, styp_codes=('n','t','r')):
    #     self = self.preprocess()
    #     # g = lambda Y: {k: pd.concat([y[k] for y in Y.values() if isinstance(y, dict) and k in y.keys()]).sort_index() for k in ['details','summary']}
    #     # g = lambda Y: Y | {k: pd.concat([y[k] for y in Y.values() if isinstance(y, dict) and k in y.keys()]).sort_index() for k in ['summary']}
    #     start_time = time.perf_counter()
    #     L = len(self.params_list)
    #     k = 0
    #     P = self.pred
    #     for params in self.params_list:
    #         print(str(params))
    #         for crse in self.crse:
    #             dump = False
    #             for train_term in self.term_codes:
    #                 for styp_code in listify(styp_codes):
    #                     S, new = self.predict(copy.deepcopy(params), crse, train_term, styp_code)
    #                     path = [str(params),crse,train_term,styp_code]
    #                     P = nest(path, P, S)
    #                     dump |= new
    #                 path.pop(-1)
    #                 P = nest(path, P, g(nest(path, P)))
    #             path.pop(-1)
    #             P = nest(path, P, g(nest(path, P)))
    #         path.pop(-1)
    #         P = nest(path, P, g(nest(path, P)))
    #         # if dump:
    #         #     self.dump()

    #         self.analyze(str(params))
    #         k += 1
    #         elapsed = (time.perf_counter() - start_time) / 60
    #         rate = elapsed / k
    #         remaining = rate * (L - k)
    #         print(f"{k} / {L} = {round(k/L*100,1)}% complete, elapsed = {round(elapsed,1)} min, remaining = {round(remaining,1)} min @ {round(rate,1)} min per model")
    #         print("\n========================================================================================================\n")
    #         return self
    
    # def main(self, styp_codes=('n','t','r')):
    #     self = self.preprocess()
    #     # g = lambda Y: Y | {k: pd.concat([y[k] for y in Y.values() if isinstance(y, dict) and k in y.keys()]).sort_index() for k in ['details','summary']}
    #     g = lambda Y: Y | {k: pd.concat([y[k] for y in Y.values() if isinstance(y, dict) and k in y.keys()]).sort_index() for k in ['summary']}
    #     start_time = time.perf_counter()
    #     L = len(self.params_list)
    #     k = 0
    #     for params in self.params_list:
    #         print(str(params))
    #         self.pred.setdefault(str(params), dict())
    #         for crse in self.crse:
    #             self.pred[str(params)].setdefault(crse, dict())
    #             for train_term in self.term_codes:
    #                 self.pred[str(params)][crse].setdefault(train_term, dict())
    #                 for styp_code in listify(styp_codes):
    #                     S, new = self.predict(copy.deepcopy(params), crse, train_term, styp_code)
    #                     self.pred[str(params)][crse][train_term][styp_code] = S
    #                     if new:
    #                         self.dump()
    #                 self.pred[str(params)][crse][train_term] = g(self.pred[str(params)][crse][train_term])
    #             self.pred[str(params)][crse] = g(self.pred[str(params)][crse])
    #         self.pred[str(params)] = g(self.pred[str(params)])
    #         self.analyze(str(params))
    #         k += 1
    #         elapsed = (time.perf_counter() - start_time) / 60
    #         rate = elapsed / k
    #         remaining = rate * (L - k)
    #         print(f"{k} / {L} = {round(k/L*100,1)}% complete, elapsed = {round(elapsed,1)} min, remaining = {round(remaining,1)} min @ {round(rate,1)} min per model")
    #         print("\n========================================================================================================\n")
    #     return self


def code_desc(x):
    """Return the pair of column names ``[x + '_code', x + '_desc']``."""
    suffixes = ('_code', '_desc')
    return [x + suffix for suffix in suffixes]


# Candidate lists for the transformer grid: keep the column as is ...
passthru = ['passthrough']
# ... or additionally try leaving it out.
passdrop = [*passthru, 'drop']
# passthru = passdrop
def bintrf(n_bins):
    """Build a KBinsDiscretizer with ``n_bins`` uniform bins, ordinal output and no subsampling."""
    return KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform', subsample=None)


# Candidate transformer: standardize, then power-transform.
pwrtrf = make_pipeline(StandardScaler(), PowerTransformer())
# Configuration passed to AMP(**kwargs) below.
kwargs = {
    # 202008, 202108, ..., 202408
    'term_codes': np.arange(2020,2025)*100+8,
    'infer_term': 202408,
    'show': {
        # 'reg':True,
        # 'adm':True,
    },
    # column -> impute arguments (a bare value, or [method, grouping columns])
    'fill': {
        'birth_day': ['median',['term_code','styp_code']],
        'remote': False,
        'international': False,
        **{f'race_{r}': False for r in ['american_indian','asian','black','pacific','white','hispanic']},
        'lgcy': False,
        'resd': False,
        'waiver': False,
        'fafsa_app': False,
        'schlship_app': False,
        'finaid_accepted': False,
        'ssb': False,
        'math': False,
        'reading': False,
        'writing': False,
        'gap_score': 0,
        'oriented': 'n',
    },
    # columns that form the index of the feature table
    'attr': [
        'pidm',
        *code_desc('term'),
        *code_desc('apdc'),
        *code_desc('levl'),
        *code_desc('styp'),
        *code_desc('admt'),
        *code_desc('camp'),
        *code_desc('coll'),
        *code_desc('dept'),
        *code_desc('majr'),
        *code_desc('cnty'),
        *code_desc('stat'),
        *code_desc('natn'),
        *code_desc('resd'),
        *code_desc('lgcy'),
        'international',
        'gender',
        *[f'race_{r}' for r in ['american_indian','asian','black','pacific','white','hispanic']],
        'waiver',
        'birth_day',
        'distance',
        'hs_qrtl',
    ],
    # Pinned cycle day.  A dict literal keeps only the last value of a repeated key, so the
    # computed alternative is kept as a comment instead of a second 'cycle_day' entry:
    # 'cycle_day': (TERM(term_code=202408).cycle_date-pd.Timestamp.now()).days+1,
    'cycle_day': 183,
    # courses to model in addition to '_total' (which AMP always adds)
    'crse': [
        # 'engl1301',
        # 'biol1406',
        # 'biol2401',
        # 'math1314',
        # 'math2412',
        # 'agri1419',
        # 'psyc2301',
        # 'ansc1319',
        # 'comm1311',
        # 'hist1301',
        # 'govt2306',
        # 'math1324',
        # 'chem1411',
        # 'univ0301',
        # 'univ0204',
        # 'univ0304',
        # 'agri1100',
        # 'comm1315',
        # 'agec2317',
        # 'govt2305',
        # 'busi1301',
        # 'arts1301',
        # 'math1342',
        # 'math2413',
        ],
    # column -> candidate transformers; AMP tries every combination
    'trf_grid': {
        'appl_day': passdrop,
        'apdc_day': passdrop,
        'birth_day': [*passthru, pwrtrf],#, ],
        # 'levl_code': passthru,
        # 'styp_code': passthru,
        # 'admt_code': passdrop,
        # 'camp_code': passdrop,
        'remote': passthru,
        'coll_code': passdrop,
        'international': passthru,
        **{f'race_{r}': passthru for r in ['american_indian','asian','black','pacific','white','hispanic']},
        'gender': passthru,
        'lgcy': passdrop,
        'resd': passthru,
        'waiver': passdrop,
        # 'fafsa_app': passthru,
        'schlship_app': passthru,
        # 'finaid_accepted': passthru,
        'ssb': passthru,
        'math': passthru,
        'reading': passthru,
        'writing': passthru,
        'gap_score': passthru,
        'oriented': passthru,
        'hs_qrtl': passthru,
        'act_equiv': passthru,
        'distance': [*passthru, pwrtrf],#, bintrf(5)],
        },
    # imputation setting -> candidate value(s), layered over AMP's defaults
    'imp_grid': {
        'mmc': 10
        # 'mmc': range(0, 41, 5),
        # 'datasets': 1,
        # 'iterations': 1,
        # 'tune': False,
    },
    # uncomment a stage to force it (and everything derived from it) to be recomputed
    'overwrite': {
        # # 'trm':True,
        # 'reg':True,
        # 'adm':True,
        # 'flg':True,
        # 'raw':True,
        # 'term': True,
        # 'raw_df': True,
        # 'reg_df': True,
        # 'X': True,
        # 'Y': True,
        # 'pred': True,
    },
    # 'inspect': True,
}

# FLAGS().run()
# Build the model object from the configuration above and prepare its inputs.
# NOTE: the module-level name `self` is reused by the later notebook cells.
self = AMP(**kwargs)
self = self.preprocess()
# main() iterates self.term_codes as training terms, so take the inference term out of that list.
self.term_codes.remove(self.infer_term)
# self.params_list = list(reversed(self.params_list))
# Run every parameter combination for styp_code 'n' only.
self.main(styp_codes='n')
# len(self.params_list)
# for x in self.params_list:
#     print(x)
# T = TERM(202008, cycle_day=184, show={'adm':True}).get_adm(184)

_total   202108 n creating
_total   202208 n creating
_total   202308 n creating
92


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,9.04207,7.774839,0.09058,1.179113,9.314449,16.521729,18.081181


{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 10, 'mmf': 'mean_match_default', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'appl_day': 'passthrough', 'birth_day': 'passthrough', 'coll_code': 'passthrough', 'distance': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('powertransformer', PowerTransformer())]), 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'lgcy': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'writing': 'passthrough'}}
_total   202008 n creating
_total   202108 n creating
_total   202208 n creating
_total   202308 n creating
93


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,10.991921,6.662147,0.09058,6.968045,10.821492,16.912669,19.434194


94


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,8.952582,5.232257,0.181159,6.892084,10.308821,12.677698,15.252153


95


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,8.414736,7.63908,0.330215,0.792572,7.139143,16.592236,18.215223


96


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,8.772022,7.259692,0.04529,1.751256,9.180177,15.285106,18.204182


97


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,8.573376,6.382236,0.407609,2.200428,9.246617,14.140347,18.511685


98


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,10.45248,6.482041,0.996377,4.897611,11.83831,15.469608,20.479705


99


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,9.130482,6.855482,0.0,2.541965,9.746781,15.175616,19.580052


100


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,10.296203,7.758646,0.181159,3.562563,11.522454,17.258961,20.524934


101


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,9.513538,5.560523,0.09058,6.055942,10.855366,14.237659,16.297663


102


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,10.16424,6.978291,0.181159,4.786249,10.564485,15.727959,22.509225


103


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,5.962719,5.719137,0.110072,2.242934,3.630449,8.097113,18.582677


104


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,9.810763,7.023091,0.317029,2.892245,10.827776,16.266913,18.265683


105


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,9.813367,6.550571,0.226449,4.760308,9.728404,15.346607,18.634686


106


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,9.748789,6.351941,0.181159,4.523253,11.365046,14.720414,18.511685


107


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,10.374049,7.981668,0.04529,2.825026,11.904339,17.952756,20.418204


108


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,9.382394,7.865928,0.13587,1.774904,10.272684,16.732283,20.233702


109


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,9.028014,5.787542,0.0,6.023837,10.0,12.670144,18.819188


110


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,10.605924,6.840864,0.634058,4.396842,11.969992,16.378146,20.479705


111


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,10.085115,7.198582,0.181159,3.326941,12.037061,16.001225,20.367454


112


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,10.343565,7.19936,0.04529,2.962043,11.755418,16.620541,20.419948


113


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,8.688501,6.873427,0.226449,1.656519,9.114112,15.979555,18.142681


114


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,11.872948,7.273064,0.0,7.754168,14.452171,17.139108,20.418204


115


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,8.700048,8.028128,0.09058,0.661146,8.236445,16.388631,18.573186


116


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,8.222557,6.591452,0.04529,1.030344,9.361531,14.000663,17.322835


117


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,9.789028,6.307295,0.362319,4.546614,10.956091,14.887677,18.204182


118


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,8.747053,6.125828,0.362319,2.825026,9.195394,14.181582,19.126691


119


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,6.88635,5.546867,0.317029,1.42663,6.990598,12.776753,13.648294


120


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,9.794705,8.168588,0.181159,1.930122,10.562451,17.358549,21.207349


121


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,10.131391,6.569998,0.498188,5.075187,10.83585,15.942049,19.00369


122


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,11.699538,7.253605,0.452899,7.300839,13.224374,16.971724,21.771218


123


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,8.814251,6.571932,0.0,3.29627,8.717421,15.033511,18.267717


124


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,10.691263,8.051541,0.13587,3.945377,11.081974,18.635171,21.094711


125


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,9.318596,6.454992,0.13587,4.796855,9.987057,14.493976,19.126691


126


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,14.728211,8.379817,0.271739,12.623687,18.106769,19.787823,24.661747


127


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,8.935528,8.110028,0.0,1.031921,9.817895,16.30989,21.156212


128


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
err_pct,20.0,9.848626,7.728257,0.181159,2.899554,10.106778,17.107461,19.557196


AMP(cycle_day=183, term_codes=[202008, 202108, 202208, 202308], infer_term=202408, crse=['_total'], attr=['pidm', 'term_code', 'term_desc', 'apdc_code', 'apdc_desc', 'levl_code', 'levl_desc', 'styp_code', 'styp_desc', 'admt_code', 'admt_desc', 'camp_code', 'camp_desc', 'coll_code', 'coll_desc', 'dept_code', 'dept_desc', 'majr_code', 'majr_desc', 'cnty_code', 'cnty_desc', 'stat_code', 'stat_desc', 'natn_code', 'natn_desc', 'resd_code', 'resd_desc', 'lgcy_code', 'lgcy_desc', 'international', 'gender', 'race_american_indian', 'race_asian', 'race_black', 'race_pacific', 'race_white', 'race_hispanic', 'waiver', 'birth_day', 'distance', 'hs_qrtl'], fill={'birth_day': ['median', ['term_code', 'styp_code']], 'remote': False, 'international': False, 'race_american_indian': False, 'race_asian': False, 'race_black': False, 'race_pacific': False, 'race_white': False, 'race_hispanic': False, 'lgcy': False, 'resd': False, 'waiver': False, 'fafsa_app': False, 'schlship_app': False, 'finaid_accepted':

In [None]:
# Scratch: show which keys are stored for the parameter combination at position 10 of self.pred.
k = list(self.pred.keys())[10]
self.pred[k].keys()#['rslt']

In [4]:
# Persist the current state of the object to its result file.
self.dump()

AMP(cycle_day=183, term_codes=[202008, 202108, 202208, 202308], infer_term=202408, crse=['_total'], attr=['pidm', 'term_code', 'term_desc', 'apdc_code', 'apdc_desc', 'levl_code', 'levl_desc', 'styp_code', 'styp_desc', 'admt_code', 'admt_desc', 'camp_code', 'camp_desc', 'coll_code', 'coll_desc', 'dept_code', 'dept_desc', 'majr_code', 'majr_desc', 'cnty_code', 'cnty_desc', 'stat_code', 'stat_desc', 'natn_code', 'natn_desc', 'resd_code', 'resd_desc', 'lgcy_code', 'lgcy_desc', 'international', 'gender', 'race_american_indian', 'race_asian', 'race_black', 'race_pacific', 'race_white', 'race_hispanic', 'waiver', 'birth_day', 'distance', 'hs_qrtl'], fill={'birth_day': ['median', ['term_code', 'styp_code']], 'remote': False, 'international': False, 'race_american_indian': False, 'race_asian': False, 'race_black': False, 'race_pacific': False, 'race_white': False, 'race_hispanic': False, 'lgcy': False, 'resd': False, 'waiver': False, 'fafsa_app': False, 'schlship_app': False, 'finaid_accepted':

In [None]:
# Scratch: spot-check the raw_df rows for a single pidm.
self.raw_df.query('pidm==1121725').disp(1)

In [5]:
# Scratch: print the keys stored for every cached parameter combination.
for v in self.pred.values():
    print(v.keys())
    # pass
# A = list(self.pred.values())[0]
# A.keys()
# self.analyze(A['summary'])
# v['rslt']['proj']

dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', 'rslt'])
dict_keys(['details', 'summary', '

In [None]:
# Scratch: absolute err_pct statistics for one (crse, train_term, styp_code) slice of the
# first cached parameter combination, excluding pred_term 202408.
crse = '_total'
train_term = 202308
styp_code = 'n'
A = list(self.pred.values())[0]['summary'].query(f"crse==@crse & train_term==@train_term & styp_code==@styp_code & pred_term!=202408")
A['err_pct'].abs().describe().to_frame().T

In [None]:
# Scratch: rebuild aggregated results into Q from a nested self.pred.
# NOTE(review): this assumes each self.pred[params] is nested as crse -> train_term -> styp_code.
# For an entry that only holds 'details'/'summary'/'rslt' keys none of the inner loops run,
# so `path` is never assigned before the final path.pop(-1) — confirm before re-running.
g = lambda Y: {k: pd.concat([y[k] for y in Y.values() if isinstance(y, dict) and k in y.keys()]).sort_index() for k in ['details','summary']}
Q = dict()
for params, P in self.pred.items():
    print(str(params))
    for crse, C in P.items():
        if crse in self.crse:
            for train_term, T in C.items():
                if train_term in self.term_codes:
                    for styp_code, S in T.items():
                        if styp_code in ['n','r','t']:
                            path = [str(params),crse,train_term,styp_code]
                            nest(path, Q, S)
                    # Collapse the styp_code level for this training term.
                    path.pop(-1)
                    nest(path, Q, g(nest(path, Q)))
            # Collapse the train_term level for this course.
            path.pop(-1)
            nest(path, Q, g(nest(path, Q)))
    # Collapse the crse level for this parameter combination.
    path.pop(-1)
    nest(path, Q, g(nest(path, Q)))
    # nest(path, self.pred[params], g(nest(path, Q)))
    # self.pred[params] = g(Q)
    # break


In [None]:
# Scratch: re-aggregate cached predictions in place.
# NOTE(review): this cell looks stale and is unlikely to run as written — confirm before use:
#   * it unpacks predict() into (S, new), i.e. it expects a 2-item result;
#   * `dump` is updated with |= but never initialized in this cell;
#   * `params` here is a key of self.pred, while predict() indexes params['trf'] / params['imp'].
g = lambda Y: {k: pd.concat([y[k] for y in Y.values() if isinstance(y, dict) and k in y.keys()]).sort_index() for k in ['details','summary']}
for params, P in self.pred.items():
    print(str(params))
    for crse in self.crse:
        for train_term in self.term_codes:
            for styp_code in ['n']:
                S, new = self.predict(copy.deepcopy(params), crse, train_term, styp_code)
                assert new is False
                path = [str(params),crse,train_term,styp_code]
                P = nest(path, P, S)
                dump |= new
            # Collapse the styp_code level for this training term.
            path.pop(-1)
            P = nest(path, P, g(nest(path, P)))
        # Collapse the train_term level for this course.
        path.pop(-1)
        P = nest(path, P, g(nest(path, P)))
    # Collapse the crse level and store the result back.
    path.pop(-1)
    self.pred[str(params)] = nest(path, P, g(nest(path, P)))


In [None]:
# Scratch: pick the cached parameter combination at position 25 and list what is stored for it.
k = 25
params = list(self.pred.keys())[k]
rslt = self.pred[params]
rslt.keys()

In [None]:
# Scratch: recompute the analysis tables for the cached parameter combination at position 25.
k = 25
params = list(self.pred.keys())[k]
rslt = self.pred[params]
df = rslt['summary']
# df.query(f"train_term==202308 & pred_term!=202408")['err_pct'].abs().describe().to_frame().T.disp(200)

def analyze(df):
    """Turn a prediction summary table into one pivot table per statistic.

    Rows whose pred_term differs from self.infer_term feed the "pred", "err",
    "err_pct", "mse_pct" and "f1_inv_pct" tables; the remaining rows feed the
    "proj" table (built from their "pred" values), as in AMP.analyze.
    """
    def pivot(df, val):
        Y = (
            df
            .reset_index()
            .pivot_table(columns='train_term', index=['crse','styp_code','pred_term'], values=val, aggfunc=['count',pctl(0),pctl(25),pctl(50),pctl(75),pctl(100)])
            .rename_axis(columns=[val,'train_term'])
            .stack(0, future_stack=True)
            .assign(abs_mean = lambda x: x.abs().mean(axis=1))
        )
        return Y
    mask = df.eval(f"pred_term!={self.infer_term}")
    # The projection table gets its own key; storing it under 'pred' would silently
    # replace the 'pred' table computed from the non-inference rows.
    rslt = {stat: pivot(df[mask], stat) for stat in ["pred","err","err_pct","mse_pct","f1_inv_pct"]} | {'proj': pivot(df[~mask], "pred")}

    return rslt

rslt = analyze(df)
# Display the projection table for the inference term.
rslt['proj']
# rslt.keys()
# for k, (params, rslt) in enumerate(self.pred.items()):
#     print(k, params)
#     df = rslt['summary']
#     df.query(f"train_term==202308 & pred_term!=202408")['err_pct'].abs().describe().to_frame().T.disp(200)