In [2]:
from LiveAMP import *
import miceforest as mf
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer, KBinsDiscretizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import f1_score
from sklearn import set_config
set_config(transform_output="pandas")

def feature_importance_df(self, dataset=0, iteration=None, normalize=True):
    """Return the kernel's feature importances as a labelled DataFrame.

    The array from ``self.get_feature_importance(dataset, iteration)`` is
    labelled with variable names (imputed variables x predictor variables,
    both in sorted integer-code order) and then transposed, so the result has
    one row per predictor and one column per imputed variable.

    Parameters
    ----------
    dataset, iteration
        Forwarded unchanged to ``get_feature_importance``.
    normalize : bool
        When true, every column is rescaled to sum to 100 (percent share).
    """
    def names_for(codes):
        # Variable codes are sorted first so labels line up with the matrix axes.
        labels = []
        for code in np.sort(codes):
            labels.append(self._get_var_name_from_scalar(int(code)))
        return labels

    imputed_names = names_for(self.imputation_order)
    predictor_names = names_for(self.predictor_vars)
    raw = self.get_feature_importance(dataset, iteration)
    importance = pd.DataFrame(raw, index=imputed_names, columns=predictor_names).T
    if not normalize:
        return importance
    return importance / importance.sum() * 100
# Monkeypatch: attach the helper above to miceforest's ImputationKernel as a method.
mf.ImputationKernel.feature_importance_df = feature_importance_df

def inspect(self, **kwargs):
    """Show the kernel's diagnostic plots, display feature importances, and return them.

    Draws the imputed-distribution plot and the mean-convergence plot (each
    followed by ``plt.show()``), then builds the feature-importance frame via
    ``self.feature_importance_df(**kwargs)``, displays it with ``.disp(100)``
    and returns it.
    """
    plot_steps = (
        (self.plot_imputed_distributions, {'wspace': 0.3, 'hspace': 0.3}),
        (self.plot_mean_convergence, {'wspace': 0.3, 'hspace': 0.4}),
    )
    for draw, spacing in plot_steps:
        draw(**spacing)
        plt.show()

    importance = self.feature_importance_df(**kwargs)
    importance.disp(100)
    return importance
# Monkeypatch: expose the diagnostic helper above as ImputationKernel.inspect().
mf.ImputationKernel.inspect = inspect

@dataclasses.dataclass
class AMP(MyBaseClass):
    """Configuration and cached state for one cycle-day prediction run.

    On construction the object merges any previously saved state from
    ``rslt.pkl``, discards the artifacts flagged in ``overwrite``, expands the
    transformer/imputation grids into ``params_list`` and loads one ``TERM``
    per term code.  ``preprocess`` builds the feature frame ``X`` and target
    frame ``Y``; ``main`` fits one miceforest kernel per
    (params, course, training term, student type) and stores the results in
    ``self.pred``.
    """
    cycle_day: int                  # selects the results directory; also passed to TERM
    term_codes: typing.List         # terms to load; infer_term is appended in __post_init__
    infer_term: int                 # term whose rows are excluded from the error tables in analyze()
    crse: typing.List               # course codes; '_total' is always prepended
    attr: typing.List               # columns used as the index of X
    fill: typing.Dict = None        # {column: impute spec} applied through X.impute
    trf_grid: typing.Dict = None    # {column: transformer options}, expanded with cartesian()
    imp_grid: typing.Dict = None    # imputation option grid, expanded with cartesian()
    overwrite: typing.Dict = None   # {artifact: bool} cached artifacts to discard and rebuild
    show: typing.Dict = None        # {artifact: bool} forwarded to TERM
    inspect: bool = False           # when true, predict() calls imp.inspect() after fitting

    def dump(self):
        """Write this object to ``self.rslt`` and return the result of ``write``.

        Callers rebind ``self`` to the returned value, so ``write`` is expected
        to hand back the object it saved -- TODO confirm against LiveAMP.write.
        """
        return write(self.rslt, self, overwrite=True)

    def __post_init__(self):
        """Merge cached state, apply overwrite flags, expand grids and load terms."""
        D = {'adm':False, 'reg':False, 'flg':False, 'raw':False, 'term':False, 'raw_df':False, 'reg_df':False, 'X':False, 'Y':False, 'pred':False}
        for x in ['overwrite','show']:
            self[x] = D.copy() if self[x] is None else D.copy() | self[x]

        # Cascade the overwrite flags downstream: rebuilding an upstream
        # artifact forces every artifact derived from it to be rebuilt too.
        self.overwrite['raw'] |= self.overwrite['reg'] | self.overwrite['adm'] | self.overwrite['flg']
        self.overwrite['term'] |= self.overwrite['raw']
        self.overwrite['raw_df'] |= self.overwrite['term']
        self.overwrite['reg_df'] |= self.overwrite['term']
        self.overwrite['X'] |= self.overwrite['raw_df']
        self.overwrite['Y'] |= self.overwrite['reg_df'] | self.overwrite['X']
        self.overwrite['pred'] |= self.overwrite['Y']

        self.path = root_path / f"resources/rslt/{rjust(self.cycle_day,3,0)}"
        self.rslt = self.path / f"rslt.pkl"
        self.tune = self.path / f"tune.pkl"
        try:
            # Values set by the constructor win over values from the saved object.
            self.__dict__ = read(self.rslt).__dict__ | self.__dict__
        except Exception:
            # Best effort: with no usable saved state we simply start fresh.
            # (Narrowed from a bare ``except`` so Ctrl-C / SystemExit are not swallowed.)
            pass
        for k, v in self.overwrite.items():
            if v and k in self:
                del self[k]

        # Optional dict attributes: treat both "absent" and "left at the None
        # default" as an empty dict so the .items() calls below cannot hit None.
        for k in ['fill','term','pred','trf_grid','imp_grid']:
            self[k] = self[k] if k in self and self[k] is not None else dict()

        self.term_codes = uniquify([*listify(self.term_codes), self.infer_term])
        self.crse = uniquify(['_total', *listify(self.crse)])
        self.mlt_grp = ['crse','levl_code','styp_code','term_code']

        # Expand the transformer grid; entries set to 'drop', None or '' are removed.
        self.trf_list = cartesian({k: sorted(setify(v), key=str) for k,v in self.trf_grid.items()})
        self.trf_list = [mysort({k:v for k,v in t.items() if v not in ['drop',None,'']}) for t in self.trf_list]

        # Expand the imputation grid on top of the defaults.
        imp_default = {'iterations':3, 'mmc':0, 'mmf':'mean_match_default', 'datasets':5, 'tune':True}
        self.imp_list = cartesian(self.imp_grid)
        self.imp_list = [mysort(imp_default | v) for v in self.imp_list]
        self.params_list = [mysort({'imp':imp, 'trf':trf}) for trf, imp in it.product(self.trf_list,self.imp_list)]

        # Load every term that is not already cached.
        opts = {x:self[x] for x in ['cycle_day','overwrite','show']}
        for nm in self.term_codes:
            if nm not in self.term:
                print(f'get {nm}')
                self.term[nm] = TERM(term_code=nm, **opts).get_raw()
        return self.dump()


    def preprocess(self):
        """Build ``raw_df``, ``reg_df``, ``X``, ``mlt`` and ``Y`` if missing, then dump.

        Each artifact is rebuilt only when it is not already present on the
        object.  Returns the value of ``self.dump()``.
        """
        def get(nm):
            # True (after announcing it) when artifact `nm` still has to be built.
            if nm in self:
                return False
            print(f'get {nm}')
            return True

        if get('raw_df'):
            self.raw_df = pd.concat([term.raw for term in self.term.values()], ignore_index=True).dropna(axis=1, how='all').prep()

        if get('reg_df'):
            with warnings.catch_warnings(action='ignore'):
                self.reg_df = {k: pd.concat([term.reg[k].query("crse in @self.crse") for term in self.term.values()]) for k in ['cur','end']}

        # Restrict to undergraduates with student type n, r or t.
        where = lambda x: x.query("levl_code == 'ug' and styp_code in ('n','r','t')").copy()
        if get('X'):
            R = self.raw_df.copy()
            repl = {'ae':0, 'n1':1, 'n2':2, 'n3':3, 'n4':4, 'r1':1, 'r2':2, 'r3':3, 'r4':4}
            # hs_qrtl comes from the hs_pctl bins; where that is missing, fall back to the apdc_code mapping.
            R['hs_qrtl'] = pd.cut(R['hs_pctl'], bins=[-1,25,50,75,90,101], labels=[4,3,2,1,0], right=False).combine_first(R['apdc_code'].map(repl))
            R['remote'] = R['camp_code'] != 's'
            R['resd'] = R['resd_code'] == 'r'
            R['lgcy'] = ~R['lgcy_code'].isin(['n','o'])
            R['majr_code'] = R['majr_code'].replace({'0000':'und', 'eled':'eted', 'agri':'unda'})
            R['coll_code'] = R['coll_code'].replace({'ae':'an', 'eh':'ed', 'hs':'hl', 'st':'sm', '00':pd.NA})
            R['coll_desc'] = R['coll_desc'].replace({
                'ag & environmental sciences':'ag & natural resources',
                'education & human development':'education',
                'health science & human service':'health sciences',
                'science & technology':'science & mathematics'})
            # Re-attach major/department/college attributes from the row with
            # the latest cycle_date for each majr_code.
            majr = ['majr_desc','dept_code','dept_desc','coll_code','coll_desc']
            S = R.sort_values('cycle_date').drop_duplicates(subset='majr_code', keep='last')[['majr_code',*majr]]
            X = where(R.drop(columns=majr).merge(S, on='majr_code', how='left')).prep().binarize()

            # Sanity checks on value ranges; on failure the offending rows are displayed.
            checks = [
                'cycle_day >= 0',
                'apdc_day >= cycle_day',
                'appl_day >= apdc_day',
                'birth_day >= appl_day',
                'birth_day >= 5000',
                'distance >= 0',
                'hs_pctl >=0',
                'hs_pctl <= 100',
                'hs_qrtl >= 0',
                'hs_qrtl <= 4',
                'act_equiv >= 1',
                'act_equiv <= 36',
                'gap_score >= 0',
                'gap_score <= 100',
            ]
            for check in checks:
                mask = X.eval(check)
                assert mask.all(), [check,X[~mask].disp(5)]

            # Applied in dict order.
            for k, v in self.fill.items():
                X[k] = X.impute(k, *listify(v))

            # attr columns become the index (and stay as columns); every column
            # is prefixed with '__' so it cannot collide with an index level name.
            self.X = X.prep().binarize().set_index(self.attr, drop=False).rename(columns=lambda x:'__'+x)
            self.X.missing().disp(100)

        if get('Y'):
            # Attach credit hours to the index of X (X[[]] is X with no columns).
            self.Y = {k: self.X[[]].join(y.set_index(['pidm','term_code','crse'])['credit_hr']) for k, y in self.reg_df.items()}
            # Number of rows with positive credit hours per mlt_grp.
            agg = lambda y: where(y).groupby(self.mlt_grp)['credit_hr'].agg(lambda x: (x>0).sum())
            A = agg(self.reg_df['end'])   # all end-of-term registrations
            B = agg(self.Y['end'])        # only those joined onto X
            M = (A / B).replace(np.inf, pd.NA).rename('mlt').reset_index().query(f"term_code != {self.infer_term}").prep()
            # NOTE(review): N relabels *every* historical row with infer_term, so
            # infer_term ends up with one multiplier row per historical term.
            N = M.assign(term_code=self.infer_term)
            self.mlt = pd.concat([M, N], axis=0).set_index(self.mlt_grp)
            Y = {k: y.squeeze().unstack().dropna(how='all', axis=1).fillna(0) for k, y in self.Y.items()}
            # Current credit hours get a '_cur' suffix; end-of-term targets are booleans (credit_hr > 0).
            self.Y = Y['cur'].rename(columns=lambda x:x+'_cur').join(Y['end']>0).prep()
        return self.dump()


    def analyze(self, params):
        """Pivot the summary for ``params`` into per-statistic tables, display the median error, and dump.

        ``params`` is the string key into ``self.pred``.  The tables are stored
        under ``self.pred[params]['rslt']``.
        """
        def pivot(df, val):
            # Percentile table of `val` by (crse, styp_code, pred_term) x train_term,
            # plus the row-wise mean of absolute values.
            Y = (
                df
                .reset_index()
                .pivot_table(columns='train_term', index=['crse','styp_code','pred_term'], values=val, aggfunc=['count',pctl(0),pctl(25),pctl(50),pctl(75),pctl(100)])
                .rename_axis(columns=[val,'train_term'])
                .stack(0, future_stack=True)
                .assign(abs_mean = lambda x: x.abs().mean(axis=1))
            )
            return Y
        v = self.pred[params]
        df = v['summary']
        mask = df.eval(f"pred_term!={self.infer_term}")
        # NOTE(review): the right-hand dict replaces the 'pred' entry, so 'pred'
        # ends up holding the infer_term rows only while the other statistics
        # cover every other pred_term.
        v['rslt'] = {stat: pivot(df[mask], stat) for stat in ["pred","err","err_pct","mse_pct","f1_inv_pct"]} | {'pred': pivot(df[~mask], "pred")}
        R = v['rslt']['err_pct'].query("err_pct in [' 50%']")
        R.disp(200)
        R[['abs_mean']].describe().T.disp(200)
        return self.dump()


    def main(self):
        """Run (or reuse) predictions for every params/course/term/student-type combination.

        Results are nested as ``self.pred[str(params)][crse][train_term][styp_code]``;
        at each level the children's 'details' and 'summary' frames are
        concatenated and stored alongside them.  Returns the (rebound) object.
        """
        self = self.preprocess()
        # Merge the 'details' / 'summary' frames of all dict-valued children into the parent dict.
        g = lambda Y: Y | {k: pd.concat([y[k] for y in Y.values() if isinstance(y, dict) and k in y.keys()]).sort_index() for k in ['details','summary']}
        for params in self.params_list:
            print(str(params))
            self.pred.setdefault(str(params), dict())
            for crse in self.crse:
                self.pred[str(params)].setdefault(crse, dict())
                for train_term in self.term_codes:
                    self.pred[str(params)][crse].setdefault(train_term, dict())
                    for styp_code in ['n']:#,'r','t']:
                        S, new = self.predict(copy.deepcopy(params), crse, train_term, styp_code)
                        self.pred[str(params)][crse][train_term][styp_code] = S
                        if new:
                            # Checkpoint after every newly fitted model.
                            self.dump()
                    self.pred[str(params)][crse][train_term] = g(self.pred[str(params)][crse][train_term])
                self.pred[str(params)][crse] = g(self.pred[str(params)][crse])
            self.pred[str(params)] = g(self.pred[str(params)])
            self.analyze(str(params))
        return self


    def predict(self, params, crse, train_term, styp_code='all'):
        """Fit an imputation kernel on one training term and predict ``crse`` for all rows.

        Parameters
        ----------
        params : dict
            ``{'imp': {...}, 'trf': {...}}`` as produced in ``params_list``.
            It is not modified.
        crse : str
            Target column in ``self.Y``.
        train_term
            Term whose true targets are visible while fitting; targets of all
            other terms are blanked out first.
        styp_code : str
            Student type to restrict ``X`` to, or ``'all'`` for no restriction.

        Returns
        -------
        (result, new) : (dict, bool)
            ``result`` holds 'details', 'summary', 'trf' and 'imp'.  ``new`` is
            False when the result was found in ``self.pred`` and True when it
            was just computed.
        """
        # Reuse a cached result when every level of the nested key exists.
        node = self.pred
        for key in (str(params), crse, train_term, styp_code):
            if not isinstance(node, dict) or key not in node:
                break
            node = node[key]
        else:
            return node, False

        print(ljust(crse,8), train_term, styp_code, 'creating')
        X = self.X.copy()
        if styp_code != 'all':
            X = X.query("styp_code==@styp_code")
        # Feature columns carry a '__' prefix in X (see preprocess).
        trf = ColumnTransformer([(c,t,["__"+c]) for c,t in params['trf'].items()], remainder='drop', verbose_feature_names_out=False)
        cols = uniquify(['_total_cur',crse+'_cur',crse])
        Z = trf.fit_transform(X).join(self.Y[cols]).prep().categorize().sort_index()
        y = Z[crse].copy().rename('true').to_frame()
        # Hide the targets of every term except the training term.
        Z.loc[Z.eval("term_code!=@train_term"), crse] = pd.NA

        # Work on a copy so the caller's params dict is left untouched.
        imp_kwargs = dict(params['imp'])
        iterations = imp_kwargs.pop('iterations')
        datasets = imp_kwargs.pop('datasets')
        tune = imp_kwargs.pop('tune')
        mmc = imp_kwargs.pop('mmc')
        mmf = imp_kwargs.pop('mmf')
        if mmc > 0 and mmf is not None:
            # mmf names a mean-match scheme object on the miceforest module.
            imp_kwargs['mean_match_scheme'] = getattr(mf, mmf).copy()
            imp_kwargs['mean_match_scheme'].set_mean_match_candidates(mmc)

        if tune:
            # Tune on a throwaway single-dataset kernel, then reuse the parameters below.
            imp = mf.ImputationKernel(Z, datasets=1, **imp_kwargs)
            imp.mice(iterations=1)
            optimal_parameters, losses = imp.tune_parameters(dataset=0, optimization_steps=5)
        else:
            optimal_parameters = None
        imp = mf.ImputationKernel(Z, datasets=datasets, **imp_kwargs)
        imp.mice(iterations=iterations, variable_parameters=optimal_parameters)

        if self.inspect:
            imp.inspect()
        # Blank every target (training term included) and impute them all.
        Z.loc[:, crse] = pd.NA
        P = imp.impute_new_data(Z)
        details = pd.concat([y
                .assign(pred=P.complete_data(k)[crse], train_term=train_term, crse=crse, sim=k)
                .set_index(['train_term','crse','sim'], append=True)
            for k in range(P.dataset_count())]).binarize()
        agg = lambda x: pd.Series({
            'pred': x['pred'].sum(min_count=1),
            'true': x['true'].sum(min_count=1),
            'mse_pct': ((1*x['pred'] - x['true'])**2).mean()*100,
            'f1_inv_pct': (1-f1_score(x.dropna()['true'], x.dropna()['pred'], zero_division=np.nan))*100,
        })
        summary = details.groupby([*self.mlt_grp,'train_term','sim']).apply(agg).join(self.mlt).rename_axis(index={'term_code':'pred_term'})
        # Scale the counts by the multiplier computed in preprocess.
        for x in ['pred','true']:
            summary[x] = summary[x] * summary['mlt']
        summary.insert(2, 'err', summary['pred'] - summary['true'])
        # Relative error, clipped to +/-100 %.
        summary.insert(3, 'err_pct', (summary['err'] / summary['true']).clip(-1, 1) * 100)
        S = {'details':details, 'summary':summary.drop(columns='mlt').prep(), 'trf':trf, 'imp':imp}
        return S, True


def code_desc(x):
    """Return the ``[<x>_code, <x>_desc]`` column-name pair for stem ``x``."""
    return [f'{x}{suffix}' for suffix in ('_code', '_desc')]
# Option lists for trf_grid entries.
passthru = ['passthrough']
# Same list object as passthru for now; switch to ['passthrough', 'drop']
# to also try dropping the column.
passdrop = passthru


def bintrf(n_bins):
    """Build a uniform-width ordinal binning transformer with ``n_bins`` bins."""
    return KBinsDiscretizer(
        n_bins=n_bins,
        encode='ordinal',
        strategy='uniform',
        subsample=None,
    )


# Standardise, then power-transform (step names match make_pipeline's defaults).
pwrtrf = Pipeline([
    ('standardscaler', StandardScaler()),
    ('powertransformer', PowerTransformer()),
])
# Run configuration for AMP.  The inference term and the race flag names are
# defined once here so the entries that depend on them cannot drift apart.
INFER_TERM = 202408
RACES = ['american_indian','asian','black','pacific','white','hispanic']

kwargs = {
    'term_codes': np.arange(2020,2025)*100+8,
    'infer_term': INFER_TERM,
    'show': {
        # 'reg':True,
        # 'adm':True,
    },
    # Key order matters: AMP.preprocess applies these fills in dict order.
    'fill': {
        'birth_day': ['median',['term_code','styp_code']],
        'remote': False,
        'international': False,
        **{f'race_{r}': False for r in RACES},
        'lgcy': False,
        'resd': False,
        'waiver': False,
        'fafsa_app': False,
        'schlship_app': False,
        'finaid_accepted': False,
        'ssb': False,
        'math': False,
        'reading': False,
        'writing': False,
        'gap_score': 0,
        'oriented': 'n',
    },
    # Columns that become the index of X (order defines the index levels).
    'attr': [
        # 'index',
        'pidm',
        *code_desc('term'),
        *code_desc('apdc'),
        *code_desc('levl'),
        *code_desc('styp'),
        *code_desc('admt'),
        *code_desc('camp'),
        *code_desc('coll'),
        *code_desc('dept'),
        *code_desc('majr'),
        *code_desc('cnty'),
        *code_desc('stat'),
        *code_desc('natn'),
        *code_desc('resd'),
        *code_desc('lgcy'),
        'international',
        'gender',
        *[f'race_{r}' for r in RACES],
        'waiver',
        'birth_day',
        'distance',
        'hs_qrtl',
    ],
    # Days from today to the inference term's cycle_date (+1); changes every
    # day the notebook is run.  Pin a literal (below) to reproduce an earlier run.
    'cycle_day': (TERM(term_code=INFER_TERM).cycle_date-pd.Timestamp.now()).days+1,
    # 'cycle_day': 186,
    # '_total' is always added by AMP; uncomment individual courses to include them.
    'crse': [
        # 'engl1301',
        # 'biol1406',
        # 'biol2401',
        # 'math1314',
        # 'math2412',
        # 'agri1419',
        # 'psyc2301',
        # 'ansc1319',
        # 'comm1311',
        # 'hist1301',
        # 'govt2306',
        # 'math1324',
        # 'chem1411',
        # 'univ0301',
        # 'univ0204',
        # 'univ0304',
        # 'agri1100',
        # 'comm1315',
        # 'agec2317',
        # 'govt2305',
        # 'busi1301',
        # 'arts1301',
        # 'math1342',
        # 'math2413',
        ],
    # Each value is a list of transformer options for that column.
    'trf_grid': {
        'appl_day': passthru,
        'apdc_day': passthru,
        'birth_day': [*passthru],#, pwrtrf],#, bintrf(5)],
        # 'levl_code': passthru,
        # 'styp_code': passthru,
        # 'admt_code': passdrop,
        # 'camp_code': passdrop,
        'remote': passdrop,
        'coll_code': passdrop,
        'international': passdrop,
        **{f'race_{r}': passthru for r in RACES},
        'gender': passthru,
        'lgcy': passthru,
        'resd': passthru,
        'waiver': passthru,
        # 'fafsa_app': passthru,
        'schlship_app': passthru,
        # 'finaid_accepted': passthru,
        'ssb': passthru,
        'math': passthru,
        'reading': passthru,
        'writing': passthru,
        'gap_score': passthru,
        'oriented': passthru,
        'hs_qrtl': passthru,
        'act_equiv': passthru,
        'distance': [*passthru],#, pwrtrf],#, bintrf(5)],
        },
    # Imputation options to sweep; unspecified keys fall back to AMP's defaults.
    'imp_grid': {
        # 'datasets': 1,
        # 'iterations': 1,
        # 'mmc': 5,
        'mmc': range(0, 41, 5),
        # 'mmf': 'mean_match_default',
        # 'mmf': 'mean_match_shap',
        'mmf': ['mean_match_default', 'mean_match_shap'],
        # 'tune': [False, True],
    },
    # Set an entry to True to discard that cached artifact (and everything derived from it).
    'overwrite': {
        # 'reg':True,
        # 'adm':True,
        # 'flg':True,
        # 'raw':True,
        # 'term': True,
        # 'raw_df': True,
        # 'reg_df': True,
        # 'X': True,
        # 'Y': True,
        # 'pred': True,
    },
    # 'inspect': True,
}

# FLAGS().run()
# Build the run object; __post_init__ merges saved state and loads every term,
# including infer_term.
self = AMP(**kwargs)
# preprocess() returns the value of dump(); rebinding keeps working with that object.
self = self.preprocess()
# Drop infer_term from the training terms only now, after its data has been
# loaded: main() iterates term_codes as train_term, so infer_term is still
# predicted for but never trained on.
self.term_codes.remove(self.infer_term)
# Long-running: fits or reuses one model per params/course/term combination.
P = self.main()
# for x in self.params_list:
#     print(x)

{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 0, 'mmf': 'mean_match_default', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'apdc_day': 'passthrough', 'appl_day': 'passthrough', 'birth_day': 'passthrough', 'coll_code': 'passthrough', 'distance': 'passthrough', 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'lgcy': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'waiver': 'passthrough', 'writing': 'passthrough'}}


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_term,202008,202108,202208,202308,abs_mean
crse,styp_code,pred_term,err_pct,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
_total,n,202008,50%,0.884467,11.111111,3.537866,-5.91487,5.362078
_total,n,202108,50%,5.67201,0.431566,-2.281134,-23.859433,8.061036
_total,n,202208,50%,6.078224,-2.008457,-1.374207,-22.938689,8.099894
_total,n,202308,50%,10.688406,2.98913,3.21558,-0.13587,4.257246


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
train_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abs_mean,4.0,6.445064,1.941583,4.257246,5.08587,6.711557,8.07075,8.099894


{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 0, 'mmf': 'mean_match_shap', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'apdc_day': 'passthrough', 'appl_day': 'passthrough', 'birth_day': 'passthrough', 'coll_code': 'passthrough', 'distance': 'passthrough', 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'lgcy': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'waiver': 'passthrough', 'writing': 'passthrough'}}


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_term,202008,202108,202208,202308,abs_mean
crse,styp_code,pred_term,err_pct,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
_total,n,202008,50%,-0.442233,10.834715,10.779436,-5.030404,6.771697
_total,n,202108,50%,6.22688,0.0,2.096178,-23.24291,7.891492
_total,n,202208,50%,6.289641,-2.378436,0.422833,-22.463002,7.888478
_total,n,202308,50%,10.960145,5.842391,6.476449,-0.271739,5.887681


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
train_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abs_mean,4.0,7.109837,0.970441,5.887681,6.550693,7.330087,7.889231,7.891492


{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 5, 'mmf': 'mean_match_default', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'apdc_day': 'passthrough', 'appl_day': 'passthrough', 'birth_day': 'passthrough', 'coll_code': 'passthrough', 'distance': 'passthrough', 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'lgcy': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'waiver': 'passthrough', 'writing': 'passthrough'}}


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_term,202008,202108,202208,202308,abs_mean
crse,styp_code,pred_term,err_pct,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
_total,n,202008,50%,-0.552792,7.739082,3.648425,-1.879491,3.454947
_total,n,202108,50%,5.425401,-0.246609,-2.651048,-21.578298,7.475339
_total,n,202208,50%,6.92389,-2.536998,-1.004228,-22.357294,8.205603
_total,n,202308,50%,9.827899,6.793478,1.585145,-0.362319,4.64221


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
train_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abs_mean,4.0,5.944525,2.261996,3.454947,4.345394,6.058775,7.657905,8.205603


{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 5, 'mmf': 'mean_match_shap', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'apdc_day': 'passthrough', 'appl_day': 'passthrough', 'birth_day': 'passthrough', 'coll_code': 'passthrough', 'distance': 'passthrough', 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'lgcy': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'waiver': 'passthrough', 'writing': 'passthrough'}}
_total   202108 n creating
_total   202208 n creating
_total   202308 n creating


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_term,202008,202108,202208,202308,abs_mean
crse,styp_code,pred_term,err_pct,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
_total,n,202008,50%,-1.271421,11.498065,2.266446,-6.633499,5.417358
_total,n,202108,50%,4.808878,-0.678175,2.15783,-24.845869,8.122688
_total,n,202208,50%,5.073996,-4.439746,-0.951374,-20.137421,7.650634
_total,n,202308,50%,12.771739,6.793478,11.322464,-0.04529,7.733243


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
train_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abs_mean,4.0,7.230981,1.226478,5.417358,7.092315,7.691939,7.830604,8.122688


{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 10, 'mmf': 'mean_match_default', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'apdc_day': 'passthrough', 'appl_day': 'passthrough', 'birth_day': 'passthrough', 'coll_code': 'passthrough', 'distance': 'passthrough', 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'lgcy': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'waiver': 'passthrough', 'writing': 'passthrough'}}
_total   202008 n creating
_total   202108 n creating
_total   202208 n creating
_total   202308 n creating


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_term,202008,202108,202208,202308,abs_mean
crse,styp_code,pred_term,err_pct,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
_total,n,202008,50%,0.110558,10.55832,3.703704,-6.301824,5.168601
_total,n,202108,50%,4.377312,0.246609,-0.924784,-23.24291,7.197904
_total,n,202208,50%,4.175476,-1.585624,0.0,-22.991543,7.188161
_total,n,202308,50%,7.744565,5.615942,2.98913,0.362319,4.177989


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
train_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abs_mean,4.0,5.933164,1.509942,4.177989,4.920948,6.178381,7.190596,7.197904


{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 10, 'mmf': 'mean_match_shap', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'apdc_day': 'passthrough', 'appl_day': 'passthrough', 'birth_day': 'passthrough', 'coll_code': 'passthrough', 'distance': 'passthrough', 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'lgcy': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'waiver': 'passthrough', 'writing': 'passthrough'}}
_total   202008 n creating
_total   202108 n creating
_total   202208 n creating
_total   202308 n creating


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_term,202008,202108,202208,202308,abs_mean
crse,styp_code,pred_term,err_pct,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
_total,n,202008,50%,-0.276396,9.83969,3.869541,-7.683803,5.417358
_total,n,202108,50%,5.240444,-3.267571,-3.082614,-23.366215,8.739211
_total,n,202208,50%,5.338266,-5.232558,-1.109937,-20.77167,8.113108
_total,n,202308,50%,11.594203,8.740942,3.940217,-1.494565,6.442482


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
train_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abs_mean,4.0,7.17804,1.522351,5.417358,6.186201,7.277795,8.269634,8.739211


{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 15, 'mmf': 'mean_match_default', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'apdc_day': 'passthrough', 'appl_day': 'passthrough', 'birth_day': 'passthrough', 'coll_code': 'passthrough', 'distance': 'passthrough', 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'lgcy': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'waiver': 'passthrough', 'writing': 'passthrough'}}
_total   202008 n creating
_total   202108 n creating
_total   202208 n creating
_total   202308 n creating


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_term,202008,202108,202208,202308,abs_mean
crse,styp_code,pred_term,err_pct,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
_total,n,202008,50%,0.276396,10.834715,8.789386,-3.26147,5.790492
_total,n,202108,50%,5.178792,0.493218,7.151665,-20.961776,8.446363
_total,n,202208,50%,6.976744,-1.479915,0.845666,-19.556025,7.214588
_total,n,202308,50%,9.012681,5.525362,11.231884,0.181159,6.487772


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
train_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abs_mean,4.0,6.984803,1.134662,5.790492,6.313452,6.85118,7.522531,8.446363


{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 15, 'mmf': 'mean_match_shap', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'apdc_day': 'passthrough', 'appl_day': 'passthrough', 'birth_day': 'passthrough', 'coll_code': 'passthrough', 'distance': 'passthrough', 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'lgcy': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'waiver': 'passthrough', 'writing': 'passthrough'}}
_total   202008 n creating
_total   202108 n creating
_total   202208 n creating
_total   202308 n creating


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_term,202008,202108,202208,202308,abs_mean
crse,styp_code,pred_term,err_pct,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
_total,n,202008,50%,-1.3267,10.116086,3.206191,-9.065782,5.92869
_total,n,202108,50%,4.31566,-2.096178,-5.178792,-24.784217,9.093711
_total,n,202208,50%,3.382664,-3.488372,-2.219873,-24.365751,8.364165
_total,n,202308,50%,12.63587,7.518116,5.842391,-0.226449,6.555707


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
train_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abs_mean,4.0,7.485568,1.488463,5.92869,6.398952,7.459936,8.546552,9.093711


{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 20, 'mmf': 'mean_match_default', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'apdc_day': 'passthrough', 'appl_day': 'passthrough', 'birth_day': 'passthrough', 'coll_code': 'passthrough', 'distance': 'passthrough', 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'lgcy': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'waiver': 'passthrough', 'writing': 'passthrough'}}
_total   202008 n creating
_total   202108 n creating
_total   202208 n creating
_total   202308 n creating


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_term,202008,202108,202208,202308,abs_mean
crse,styp_code,pred_term,err_pct,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
_total,n,202008,50%,-1.381979,10.392482,5.030404,-1.658375,4.61581
_total,n,202108,50%,6.041924,-0.308261,-2.281134,-21.393342,7.506165
_total,n,202208,50%,5.919662,-1.744186,-0.739958,-20.295983,7.174947
_total,n,202308,50%,9.873188,6.15942,1.721014,0.951087,4.676178


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
train_term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abs_mean,4.0,5.993275,1.561766,4.61581,4.661086,5.925562,7.257752,7.506165


{'imp': {'datasets': 5, 'iterations': 3, 'mmc': 20, 'mmf': 'mean_match_shap', 'tune': True}, 'trf': {'act_equiv': 'passthrough', 'apdc_day': 'passthrough', 'appl_day': 'passthrough', 'birth_day': 'passthrough', 'coll_code': 'passthrough', 'distance': 'passthrough', 'gap_score': 'passthrough', 'gender': 'passthrough', 'hs_qrtl': 'passthrough', 'international': 'passthrough', 'lgcy': 'passthrough', 'math': 'passthrough', 'oriented': 'passthrough', 'race_american_indian': 'passthrough', 'race_asian': 'passthrough', 'race_black': 'passthrough', 'race_hispanic': 'passthrough', 'race_pacific': 'passthrough', 'race_white': 'passthrough', 'reading': 'passthrough', 'remote': 'passthrough', 'resd': 'passthrough', 'schlship_app': 'passthrough', 'ssb': 'passthrough', 'waiver': 'passthrough', 'writing': 'passthrough'}}
_total   202008 n creating
_total   202108 n creating


KeyboardInterrupt: 

In [8]:
# Scratch cell: copy miceforest's default mean-match scheme and set its
# candidate count to 40, mirroring what AMP.predict does for mmc > 0.
# NOTE(review): `x` is not used anywhere else in the visible notebook.
x = mf.mean_match_default.copy()
x.set_mean_match_candidates(40)