In [1]:
from LiveAMP import *
from miceforest import ImputationKernel
from miceforest.mean_matching_functions import default_mean_match, mean_match_kdtree_classification
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer, QuantileTransformer, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn import set_config
set_config(transform_output="pandas")

def feature_importance_df(self, dataset=0, normalize=True, iteration=None):
    """Return the kernel's feature importances as a labelled DataFrame.

    The matrix from ``get_feature_importance(dataset, iteration)`` is
    labelled with imputed-variable names on one axis and predictor-variable
    names on the other (both in ascending variable-index order) and then
    transposed, so rows are predictors and columns are imputed variables.

    With ``normalize`` truthy each column is rescaled to sum to 100;
    otherwise the raw values are returned.
    """
    def sorted_names(indices):
        # Map variable indices to names, in ascending index order.
        return [self._get_variable_name(int(idx)) for idx in np.sort(indices)]

    imputed_names = sorted_names(self.imputation_order)
    predictor_names = sorted_names(self.predictor_vars)
    raw = self.get_feature_importance(dataset, iteration)
    importance = pd.DataFrame(raw, index=imputed_names, columns=predictor_names).T
    if not normalize:
        return importance
    return importance / importance.sum() * 100
ImputationKernel.feature_importance_df = feature_importance_df

@dataclasses.dataclass
class AMP(MyBaseClass):
    cycle_day: int
    term_codes: typing.List
    infer: int
    crse: typing.List
    feat: typing.Dict
    attr: typing.List
    sch: bool = True
    overwrite: typing.Dict = None
    show: typing.Dict = None

    def dump(self):
        """Write this object to ``self.rslt`` (overwriting) and return ``write``'s result."""
        target = self.rslt
        return write(target, self, overwrite=True)

    def __post_init__(self):
        """Normalise options, restore any cached result, and load every term.

        Steps, in order:
          1. ``term_codes`` is passed through ``listify``; ``overwrite`` and
             ``show`` are completed with ``False`` for every stage not given.
          2. Overwrite flags cascade: X or Y forces Z; reg, adm or flg forces
             raw; raw forces term.
          3. The object previously saved at ``self.rslt`` (if it can be read)
             is merged underneath the current attributes, so values passed
             to the constructor win over cached ones.
          4. Every attribute whose overwrite flag is set is deleted so it
             gets rebuilt; ``pred`` and ``term`` are (re)created when absent.
          5. A ``TERM`` is built for each term code not yet loaded, and the
             object is persisted with ``dump``.
        """
        self.term_codes = listify(self.term_codes)
        D = {'adm':False, 'reg':False, 'flg':False, 'raw':False, 'term':False, 'X':False, 'Y':False, 'Z':False, 'pred':False}
        for x in ['overwrite','show']:
            self[x] = D.copy() if self[x] is None else D.copy() | self[x]

        # Rebuilding an upstream stage invalidates everything derived from it.
        self.overwrite['Z'] |= self.overwrite['X'] | self.overwrite['Y']
        self.overwrite['raw'] |= self.overwrite['reg'] | self.overwrite['adm'] | self.overwrite['flg']
        self.overwrite['term'] |= self.overwrite['raw']

        self.path = root_path / f"resources/rslt/{rjust(self.cycle_day,3,0)}"
        self.rslt = self.path / "rslt.pkl"
        self.tune = self.path / "tune.pkl"
        try:
            self.__dict__ = read(self.rslt).__dict__ | self.__dict__
        except Exception:
            # Best effort: an unreadable or missing cache just means starting
            # fresh. Only ordinary errors are ignored, so Ctrl-C and
            # SystemExit still propagate.
            pass

        for k, v in self.overwrite.items():
            if v and k in self:
                del self[k]
        if 'pred' not in self:
            self['pred'] = list()
        if 'term' not in self:
            self['term'] = dict()

        opts = {x:self[x] for x in ['cycle_day','overwrite','show']}
        for nm in self.term_codes:
            if nm not in self.term:
                print(f'get {nm}')
                self.term[nm] = TERM(term_code=nm, **opts).get_raw()
        return self.dump()

    def get_X(self):
        """Assemble the cleaned, validated applicant table and cache it as ``self.X``.

        Returns ``self`` unchanged when ``X`` already exists.  Otherwise the
        raw tables of all loaded terms are stacked, a few derived columns are
        added, legacy major/college codes are recoded, major descriptors are
        replaced by the ones on each major's latest ``cycle_date`` row, and
        sanity checks are run before the table is stored and the object
        dumped.

        Raises:
            Exception: if any sanity check fails; up to 10 offending rows
                are displayed first.
        """
        nm = 'X'
        if nm in self:
            return self
        print(f'get {nm}')

        # Stack every term's raw table and discard columns that are entirely null.
        raw_frames = [term.raw for term in self.term.values()]
        R = pd.concat(raw_frames, ignore_index=True).dropna(axis=1, how='all').prep()

        # hs_qrtl: bucket the percentile when present, otherwise fall back to
        # a value mapped from the admission decision code.
        apdc_to_qrtl = {'ae':0, 'n1':1, 'n2':2, 'n3':3, 'n4':4, 'r1':1, 'r2':2, 'r3':3, 'r4':4}
        pctl_bucket = pd.cut(R['hs_pctl'], bins=[-1,25,50,75,90,101], labels=[4,3,2,1,0], right=False)
        R['hs_qrtl'] = pctl_bucket.combine_first(R['apdc_code'].map(apdc_to_qrtl))
        R['remote'] = R['camp_code'] != 's'

        # Recode old major / college values onto their replacements.
        recodes = {
            'majr_code': {'0000':'und', 'eled':'eted', 'agri':'unda'},
            'coll_code': {'ae':'an', 'eh':'ed', 'hs':'hl', 'st':'sm'},
            'coll_desc': {
                'ag & environmental sciences':'ag & natural resources',
                'education & human development':'education',
                'health science & human service':'health sciences',
                'science & technology':'science & mathematics'},
        }
        for col, mapping in recodes.items():
            R[col] = R[col].replace(mapping)

        # Give every row the descriptors found on its major's most recent row.
        majr = ['majr_desc','dept_code','dept_desc','coll_code','coll_desc']
        latest = R.sort_values('cycle_date').drop_duplicates(subset='majr_code', keep='last')
        S = latest[['majr_code',*majr]]
        R = R.drop(columns=majr).merge(S, on='majr_code', how='left')

        checks = {
            'cycle_day': R['cycle_day']>=0,
            'apdc_day' : R['apdc_day' ]>=R['cycle_day'],
            'appl_day' : R['appl_day' ]>=R['apdc_day' ],
            'birth_day':(R['birth_day']>=R['appl_day' ]) & (R['birth_day']>=5000),
            'distance': R['distance']>=0,
            'hs_pctl': (R['hs_pctl']>=0) & (R['hs_pctl']<=100),
            'act_equiv': (R['act_equiv']>=1) & (R['act_equiv']<=36),
            'gap_score': (R['gap_score']>=0) & (R['gap_score']<=100),
        }
        for k, passed in checks.items():
            failed = ~passed
            if failed.any():
                R[failed].disp(10)
                raise Exception(f'check failed - {k}')

        self[nm] = R
        return self.dump()

    def preprocess(self):
        """Build the feature table ``Z``, outcome tables ``Y`` and multipliers ``mlt``.

        Returns ``self`` unchanged when ``Z`` already exists; otherwise makes
        sure ``X`` is available, derives the three attributes and returns
        the result of ``self.dump()``.

        Sets:
            self.Z:   imputed feature columns (names prefixed with ``_``),
                      indexed by ``self.attr``.
            self.Y:   dict keyed ``'end'`` / ``'cur'``; each value has one
                      boolean column per course named ``<crse>_<key>``.
            self.mlt: Series named ``mlt`` holding, per
                      (levl_code, styp_code, term_code, crse), the ratio of
                      registrations counted over all registration rows to
                      those counted over rows matched to ``Z``.
        """
        nm = 'Z'
        if nm in self:
            return self
        self.get_X()
        print(f'get {nm}')

        # Keep undergraduate rows with student type n, r or t.
        where = lambda x: x.query("levl_code == 'ug' and styp_code in ('n','r','t')").copy()
        # Index by the attribute columns (kept as columns too) and prefix every
        # column with '_' so features cannot collide with index level names.
        X = where(self.X).set_index(self.attr, drop=False).rename(columns=lambda col:'_'+col)
        X.disp(1)
        # Each feat entry is unpacked into X.impute(...); results are placed side by side.
        self.Z = pd.concat([X.impute(*x) for x in self.feat], axis=1).prep().binarize().categorize()

        g = ['levl_code','styp_code','term_code','crse']
        agg = lambda y, g: where(y).groupby(g)[['credit_hr']].sum()
        with warnings.catch_warnings(action='ignore'):
            # Per registration snapshot ('end', 'cur'): stack all terms, keep only
            # the configured courses, and turn credit_hr into "has credit hours > 0".
            Y = {k: pd.concat([term.reg[k].query("crse in @self.crse")[['pidm',*g,'credit_hr']].assign(credit_hr=lambda x: x['credit_hr'].fillna(0)>0) for term in self.term.values()]) for k in ['end','cur']}
        # A: 'end' counts over every registration row.
        A = agg(Y['end'], g)
        # Restrict both snapshots to rows that match the index of Z.
        Y = {k: self.Z[[]].join(y.set_index(['pidm','term_code','crse'])['credit_hr'], how='inner') for k, y in Y.items()}
        # B: 'end' counts over the matched rows only.
        B = agg(Y['end'], g)
        M = (A / B).query("term_code != @self.infer")
        # NOTE(review): N relabels every remaining term's ratios with the
        # inference term, so that term can carry several rows per group —
        # confirm this is intended.
        N = M.reset_index().assign(term_code=self.infer).set_index(M.index.names)
        self.mlt = pd.concat([M, N], axis=0).replace(np.inf, pd.NA).squeeze().rename('mlt').prep()
        # One column per course, suffixed with the snapshot key; missing combinations become False.
        self.Y = {k: y.squeeze().unstack().fillna(False).rename(columns=lambda x:f'{x}_{k}') for k, y in Y.items()}

        return self.dump()


    def predict(self, crse='_total', styp_code='all', train_term=202208, iterations=3, opts=None):
        """Impute end-of-cycle registration for one course and summarise the error.

        A prediction is identified by its ``meta`` dict (crse, train_term,
        styp_code, iterations, opts).  If ``self.pred`` already holds one with
        identical meta it is returned as is; otherwise a new one is computed,
        appended to ``self.pred``, persisted with ``dump`` and returned.

        Args:
            crse: course whose ``<crse>_cur`` / ``<crse>_end`` columns are used.
            styp_code: ``'all'`` or a single student type to restrict rows to.
            train_term: rows of this term have their end column blanked in the
                model input and are the rows kept in the evaluation frame.
            iterations: number of MICE iterations.
            opts: keyword arguments forwarded to ``ImputationKernel``
                (``None`` means no extra arguments).

        Returns:
            dict with keys ``'meta'`` and ``'rslt'``; ``rslt`` holds the model
            input ``X``, the unmasked table ``T``, the completed datasets
            ``P``, the fitted ``model``, the row-level frame ``full`` and the
            grouped ``summary``.

        NOTE(review): ``self.Y`` is indexed here with 0/1 and filtered on a
        ``crse`` column, while ``preprocess`` stores ``self.Y`` as a dict keyed
        ``'end'``/``'cur'`` with one column per course — confirm which layout
        is current before relying on this method.
        """
        # A fresh dict per call avoids sharing one default object between calls.
        opts = dict() if opts is None else opts
        print(crse,train_term,styp_code, end=': ')
        prediction = {'meta': {'crse':crse, 'train_term':train_term, 'styp_code':styp_code, 'iterations':iterations, 'opts':opts.copy()}}
        for P in self.pred:
            if P['meta'] == prediction['meta']:
                print('reusing')
                return P
        print(f'creating')
        # Column name -> which element of self.Y it is read from.
        d = {crse+'_cur':1, crse+'_end':0,}
        # End columns mapped to the bare course name (the '_end' suffix removed).
        end = {c:c[:-4] for c, i in d.items() if i==0}
        Y = pd.concat([self.Y[i].query("crse == @crse").rename(columns={'credit_hr':c})[c] for c, i in d.items()], axis=1, join='outer')
        T = self.Z.join(Y, how='left').fillna({c:False for c in d.keys()})
        if styp_code != "all":
            T = T.query("styp_code==@styp_code")
        # The inference term has no known outcome: blank it in the reference table.
        T.loc[T.eval("term_code==@self.infer"), end.keys()] = pd.NA
        X = T.copy()
        qry = "term_code==@train_term"
        # Additionally blank the selected term in the model input so it gets imputed.
        X.loc[X.eval(qry), end.keys()] = pd.NA
        model = ImputationKernel(X, **opts)
        model.mice(iterations)

        # Long format: one row per (original index..., crse) with a single value column.
        g = lambda df, nm=None: df[end.keys()].rename(columns=end).melt(ignore_index=False, var_name='crse', value_name=nm).set_index('crse', append=True)
        # Stack every completed dataset, tagged with its dataset number as 'sim'.
        P = pd.concat([model.complete_data(k).assign(sim=k).set_index('sim', append=True) for k in range(model.dataset_count())])
        Y = g(P,'pred').join(g(T,'true')).assign(train_term=train_term).query(qry).prep()
        grp = ['crse','styp_code','term_code','train_term','sim']
        agg = lambda x: pd.Series({
            'pred': x['pred'].sum(min_count=1),
            'true': x['true'].sum(min_count=1),
            'mse%': ((1*x['pred'] - x['true'])**2).mean()*100,
            'f1_inv%': (1-f1_score(x.dropna()['true'], x.dropna()['pred'], zero_division=np.nan))*100,
        })
        S = Y.groupby(grp).apply(agg).join(self.mlt).rename_axis(index={'term_code':'pred_term'})
        # Scale both counts by the multiplier computed in preprocess.
        for x in ['pred','true']:
            S[x] = S[x] * S['mlt']
        S.insert(2, 'err', S['pred'] - S['true'])
        S.insert(3, 'err%', (S['err'] / S['true']).clip(-1, 1) * 100)
        prediction['rslt'] = {'X':X,'T':T,'P':P,'model':model, 'full':Y, 'summary': S.drop(columns='mlt').prep()}
        self.pred.append(prediction)
        self.dump()
        return prediction


    def train(self, styp_codes=('n','r','t'), train_terms=None, iterations=3, opts=None):
        """Run ``predict`` over every course / student type / term and tabulate results.

        Args:
            styp_codes: student type(s) to predict for (passed through ``listify``).
            train_terms: term(s) handed to ``predict`` as ``train_term``;
                defaults to ``self.term_codes``.
            iterations: MICE iterations per prediction.
            opts: keyword arguments forwarded to ``predict`` (``None`` means none).

        Returns:
            dict keyed by ``styp_code == 'all'`` (True/False).  Each value
            holds the concatenated ``full`` and ``summary`` frames, a copy of
            ``opts``, and ``rslt``: per (crse, styp_code) pivot tables of
            ``err``, ``err%``, ``mse%`` and ``f1_inv%`` for terms other than
            ``self.infer``, plus ``proj`` with the 25/50/75 pivots of ``pred``
            for ``self.infer``.
        """
        # A fresh dict per call avoids sharing one default object between calls.
        opts = dict() if opts is None else opts
        train_terms = self.term_codes if train_terms is None else train_terms
        def pivot(df, val, q=50):
            # pred_term x train_term table of `val`, aggregated with pctl(q)
            # (presumably the q-th percentile — confirm against its definition).
            Y = df.reset_index().pivot_table(columns='train_term', index='pred_term', values=val, aggfunc=pctl(q))
            # Two passes with a transpose each: margins are added along both
            # axes and the table ends in its original orientation.
            for _ in range(2):
                mr = Y.mean(axis=1)
                ma = Y.abs().mean(axis=1)
                Y = (Y.assign(mean=mr, abs_mean=ma) if Y.shape[1] > 1 else Y).T
            return Y.assign(**{val:f"{q}%"}).set_index(val, append=True).swaplevel(0,1).round(2).prep().T

        def analyze(df):
            r = {stat: pivot(df.query(f"pred_term!={self.infer}"), stat) for stat in ["err","err%","mse%","f1_inv%"]}
            r['proj'] = pd.concat([pivot(df.query(f"pred_term=={self.infer}"), "pred", q) for q in [25,50,75]], axis=1)
            return r

        P = {(crse, styp_code, train_term): self.predict(crse, styp_code, train_term, iterations, opts) for crse in self.crse for styp_code in listify(styp_codes) for train_term in listify(train_terms)}
        # Split predictions into "all student types" versus a specific type.
        R = dict()
        for k,v in P.items():
            R.setdefault(k[1]=='all', []).append(v)

        for b, L in R.items():
            v = {k: pd.concat([Y['rslt'][k] for Y in L]) for k in ['full','summary']}
            v['opts'] = opts.copy()
            v['rslt'] = {g: analyze(df) for g, df in v['summary'].groupby(['crse', 'styp_code'])}
            R[b] = v
        return R


def code_desc(x):
    """Return the paired column names ``<x>_code`` and ``<x>_desc``."""
    return [x + suffix for suffix in ('_code', '_desc')]
def simpimp(fill):
    """Build a ``SimpleImputer`` that replaces ``pd.NA`` with the constant ``fill``."""
    return SimpleImputer(missing_values=pd.NA, strategy='constant', fill_value=fill)
# Constructor arguments for AMP; see the dataclass fields of the same names.
kwargs = {
    # Columns that preprocess() uses as the index of the feature table
    # (they are also kept as columns there).
    'attr': [
        'pidm',
        *code_desc('term'),
        *code_desc('apdc'),
        *code_desc('levl'),
        *code_desc('styp'),
        *code_desc('admt'),
        *code_desc('camp'),
        *code_desc('coll'),
        *code_desc('dept'),
        *code_desc('majr'),
        *code_desc('cnty'),
        *code_desc('stat'),
        *code_desc('natn'),
        'resd',
        'legacy',
        'gender',
        *[f'race_{r}' for r in ['american_indian','asian','black','pacific','white','hispanic']],
        'waiver',
        'birth_day',
        'distance',
        'hs_pctl',
    ],
    # Each entry is unpacked into X.impute(*entry) by preprocess(); the
    # leading '_' matches the prefix preprocess() adds to every column.
    # Presumably [column, fill value or strategy, optional group-by columns]
    # — confirm against the impute helper.
    'feat': [
        ['_apdc_day',np.nan],
        ['_appl_day',np.nan],
        ['_gender',np.nan],
        ['_hs_qrtl',np.nan],
        ['_act_equiv',np.nan],
        # ['_distance','max'],
        ['_distance',np.nan],
        ['_remote',False],
        ['_resd',False],
        ['_legacy',False],
        *[[f'_race_{r}',False] for r in ['american_indian','asian','black','pacific','white','hispanic']],
        ['_waiver',False],
        # ['_fafsa_app',False],
        ['_schlship_app',False],
        # ['_finaid_accepted',False],
        ['_ssb',False],
        ['_math',False],
        ['_reading',False],
        ['_writing',False],
        ['_gap_score',0],
        ['_oriented','n'],
        ['_birth_day','median',['term_code','styp_code']],
    ],
    # Term whose end-of-cycle outcome is blanked and projected.
    'infer': 202408,
    # Whole days from now until the 202408 term's cycle_date, plus one.
    'cycle_day': (TERM(term_code=202408).cycle_date-pd.Timestamp.now()).days+1,
    # 'cycle_day': 197,
    # 202008, 202108, ..., 202408.
    'term_codes': np.arange(2020,2025)*100+8,
    # Courses kept when preprocess() filters the registration tables.
    'crse': [
        '_total',
        'engl1301',
        'biol1406',
        # 'biol2401',
        # 'math1314',
        # 'math2412',
        # 'agri1419',
        # 'psyc2301',
        # 'ansc1319',
        # 'comm1311',
        # 'hist1301',
        # 'govt2306',
        # 'math1324',
        # 'chem1411',
        # 'univ0301',
        # 'univ0204',
        # 'univ0304',
        # 'agri1100',
        # 'comm1315',
        # 'agec2317',
        # 'govt2305',
        # 'busi1301',
        # 'arts1301',
        # 'math1342',
        # 'math2413',
        ],
    # Stages to rebuild instead of reusing the cached result; AMP.__post_init__
    # cascades these flags (e.g. flg -> raw -> term, X or Y -> Z).
    'overwrite': {
        # 'reg':True,
        # 'adm':True,
        'flg':True,
        'raw':True,
        # 'term': True,
        # 'X': True,
        'Y': True,
        'Z': True,
        # 'pred': True,
    },
    # Forwarded to every TERM together with cycle_day and overwrite.
    'show': {
        # 'reg':True,
        # 'adm':True,
    },
    # 'sch': False,
}
# FLAGS().run()
# Constructing AMP restores the cache and loads any missing terms.
self = AMP(**kwargs)
# self = self.get_X()
# Builds X if needed, then Z, Y and mlt; rebinds `self` to what dump() returns.
self = self.preprocess()
# self.term_codes.remove(self.infer)
# iterations = 3

# opts = dict()
# opts['random_state'] = 42
# opts['save_all_iterations'] = False
# opts['datasets'] = 5
# opts['mean_match_candidates'] = 10
# opts['mean_match_function'] = mean_match_kdtree_classification

# # opts['datasets'] = 2
# # opts['mean_match_candidates'] = 1
# # opts['mean_match_function'] = default_mean_match

# P = self.predict(opts=opts)

# R = self.train(iterations=iterations, opts=opts,
#     styp_codes='n',
#     # train_terms=202208,
#     )
# for k in R[False]['rslt'].keys():
#     for b, v in R.items():
#         print(k, b)
#         v['rslt'][k]['err%'].disp(100)

# tune = []
# for func in [mean_match_kdtree_classification, default_mean_match]:
#     opts['mean_match_function'] = func
#     for cand in range(2,41,3):
#         opts['mean_match_candidates'] = cand
#         print(sort(opts))
#         R = self.train(
#             styp_codes='n',
#             iterations=iterations,
#             opts=opts)
#         R[False]['rslt']['_total','n']['err%'].disp(100)
#         tune.append(R)
#         write(self.tune, tune)

get 202008
raw_202008_192.parq not found - creating
flg_202008_192.parq not found - creating
202006 flags cycle day 198 >= 192 on 2020-02-26 00:00:00 missing columns: ['gap_score', 'app_date', 'ftic_gap_score', 't_gap_score']
202008 flags cycle day 198 >= 192 on 2020-02-26 00:00:00 missing columns: ['app_date', 'ftic_gap_score', 't_gap_score']
get 202108
raw_202108_192.parq not found - creating
flg_202108_192.parq not found - creating
202106 flags cycle day 198 >= 192 on 2021-02-24 00:00:00 missing columns: ['app_date']
202108 flags cycle day 198 >= 192 on 2021-02-24 00:00:00 missing columns: []
get 202208
raw_202208_192.parq not found - creating
flg_202208_192.parq not found - creating
202206 flags cycle day 198 >= 192 on 2022-02-23 00:00:00 missing columns: ['gap_score']
202208 flags cycle day 198 >= 192 on 2022-02-23 00:00:00 missing columns: ['gap_score']
get 202308
raw_202308_192.parq not found - creating
flg_202308_192.parq not found - creating
202306 flags cycle day 194 >= 192 o

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0,Unnamed: 20_level_0,Unnamed: 21_level_0,Unnamed: 22_level_0,Unnamed: 23_level_0,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0,Unnamed: 28_level_0,Unnamed: 29_level_0,Unnamed: 30_level_0,Unnamed: 31_level_0,Unnamed: 32_level_0,Unnamed: 33_level_0,Unnamed: 34_level_0,Unnamed: 35_level_0,Unnamed: 36_level_0,Unnamed: 37_level_0,_cycle_day,_apdc_day,_appl_day,_birth_day,_end_date,_cycle_date,_apdc_date,_appl_date,_birth_date,_term_code_entry,_term_code,_term_desc,_pidm,_id,_appl_no,_levl_code,_levl_desc,_styp_code,_styp_desc,_admt_code,_admt_desc,_apst_code,_apst_desc,_apdc_code,_apdc_desc,_camp_code,_camp_desc,_camp_main,_cnty_code,_cnty_desc,_stat_code,_stat_desc,_zip,_natn_code,_natn_desc,_resd_code,_resd_desc,_resd,_majr_code,_gender,_lgcy_code,_lgcy_desc,_legacy,_race_american_indian,_race_asian,_race_black,_race_pacific,_race_white,_race_hispanic,_hs_pctl,_flg_date,_fafsa_app,_finaid_accepted,_disb_req_complete,_schlship_app,_math,_reading,_writing,_ssb,_waiver,_oriented,_verified,_act_equiv,_distance,_gap_score,_hs_qrtl,_remote,_majr_desc,_dept_code,_dept_desc,_coll_code,_coll_desc
pidm,term_code,term_desc,apdc_code,apdc_desc,levl_code,levl_desc,styp_code,styp_desc,admt_code,admt_desc,camp_code,camp_desc,coll_code,coll_desc,dept_code,dept_desc,majr_code,majr_desc,cnty_code,cnty_desc,stat_code,stat_desc,natn_code,natn_desc,resd,legacy,gender,race_american_indian,race_asian,race_black,race_pacific,race_white,race_hispanic,waiver,birth_day,distance,hs_pctl,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1
25534,202008,fall 2020,a2,"admit, probation (readmit)",ug,undergraduate,r,returning,r,"readmit student, undergraduate",s,stephenville,an,ag & natural resources,wlnr,wildlife & natural resources,wses,wildlife sus & ecosystem sci,25,brown,tx,texas,,,1,1,m,0,0,0,0,1,0,,13286,4504.84,65,192,196,198,13286,2020-09-11,2020-03-03,2020-02-28,2020-02-26,1984-04-27,202006,202008,fall 2020,25534,50238,3,ug,undergraduate,r,returning,r,"readmit student, undergraduate",d,decision made,a2,"admit, probation (readmit)",s,stephenville,1,25,brown,tx,texas,76801,,,r,texas resident,1,wses,m,u,uncle,1,0,0,0,0,1,0,65,NaT,,,,,,,,,,,,,4504.84,,2.0,False,wildlife sus & ecosystem sci,wlnr,wildlife & natural resources,an,ag & natural resources
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


ValueError: No axis named _schlship_app for object type DataFrame

In [None]:
def pd_ext(func):
    """Wrap ``func`` so it can be called on either a Series or a DataFrame.

    The wrapper tries, in order, and keeps the first attempt that succeeds:
      1. ``func(X, *args, **kwargs)`` on the input as given;
      2. ``func(pd.DataFrame(X), *args, **kwargs)``;
      3. ``pd.DataFrame(X).apply(func, args=args, **kwargs)``, i.e. ``func``
         applied column by column.
    An exception raised by the last attempt propagates to the caller.
    If ``X`` was a Series, the result is squeezed back when possible.

    Args:
        func: callable taking a pandas object first, then arbitrary arguments.

    Returns:
        The wrapping function, carrying ``func``'s name and metadata.
    """
    import functools

    @functools.wraps(func)
    def wrapper(X, *args, **kwargs):
        try:
            Y = func(X, *args, **kwargs)
        except Exception:
            Y = pd.DataFrame(X)
            try:
                Y = func(Y, *args, **kwargs)
            except Exception:
                # Extra positionals must travel through ``args=``: passed
                # positionally, the first one would be taken as
                # DataFrame.apply's ``axis`` ("No axis named ...").
                Y = Y.apply(func, args=args, **kwargs)
        if isinstance(X, pd.Series):
            try:
                Y = Y.squeeze()
            except AttributeError:
                # Result has no squeeze(); return it unchanged.
                pass
        return Y
    return wrapper

@pd_ext
def binarize(ser):
    """Convert a flag-like Series to the nullable ``boolean`` dtype.

    * only ``'y'`` / ``'Y'`` among the non-null values: True where a value
      is present, False where it is null;
    * only 0 / 1 among the non-null values: cast directly to ``boolean``;
    * all-null, or any other content: returned unchanged.

    The assertion rejects non-Series input, which makes the ``pd_ext``
    wrapper fall back to applying this function column by column.
    """
    assert isinstance(ser, pd.Series)
    observed = set(ser.dropna())
    if not observed:
        return ser
    if observed <= {'y','Y'}:
        return ser.notnull().astype('boolean')
    if observed <= {0,1}:
        return ser.astype('boolean')
    return ser

# Attach each helper to both DataFrame and Series under the helper's own
# name, so it can be called as a method (e.g. df.binarize()).
for func in [disp, to_numeric, prep, categorize, binarize, rnd, vc, missing, impute, unmelt]:
    for cls in [pd.DataFrame, pd.Series]:
        setattr(cls, func.__name__, func)

# self.X['schlship_app'].value_counts()
# self.X['fafsa_app'].value_counts()
# self.X['schlship_app'].dtype
# A = self.X.binarize()['schlship_app']
# A['schlship_app']
binarize(self.X)['schlship_app']

In [None]:
self.X['schlship_app'].groupby

In [None]:
self.X.columns.sort_values()

In [None]:
self.Y['end']

In [None]:
where = lambda x: x.query("levl_code == 'ug' and styp_code in ('n','r','t')").copy()
X = where(self.X).set_index(self.attr, drop=False).rename(columns=lambda col:'_'+col)
self.Z = pd.concat([X.impute(*x) for x in self.feat], axis=1).prep().binarize().categorize()

g = ['levl_code','styp_code','term_code','crse']
with warnings.catch_warnings(action='ignore'):
    Y = {k: pd.concat([term.reg[k].query("crse in @self.crse")[['pidm',*g,'credit_hr']].assign(credit_hr=lambda x: x['credit_hr'].fillna(0)>0) for term in self.term.values()]) for k in ['end','cur']}
agg = lambda y, g: where(y).groupby(g)[['credit_hr']].sum()
A = agg(Y['end'], g)
Y = {k: self.Z[[]].join(y.set_index(['pidm','term_code','crse'])['credit_hr'], how='inner') for k, y in Y.items()}
B = agg(Y['end'], g)
self.mlt = A / B

self.Y = {k: y.squeeze().unstack().fillna(False).rename(columns=lambda x:f'{x}_{k}') for k, y in Y.items()}
Y['end']
# M = (end / cur).query("term_code != @self.infer")
# N = M.reset_index().assign(term_code=self.infer).set_index(M.index.names)
# self.mlt = pd.concat([M, N], axis=0).replace(np.inf, pd.NA).squeeze().rename('mlt').prep()


# end
self.Y['end']
# Y = [self.Y[0].query("crse in @self.crse").set_index('crse', append=True).unstack().droplevel(0,1).rename(columns=lambda x:f"_{x}_end")
# # Y.droplevel?
# Y.disp(1)


In [None]:
Y['end']

In [None]:
# def impute(df, col, val=None, grp=None):
#     val = val if val is not None else 'median' if pd.api.types.is_numeric_dtype(df[col]) else 'mode'
#     if val in ['median']:
#         func = lambda x: x.median()
#     elif val in ['mean','ave','avg','average']:
#         func = lambda x: x.mean()
#     elif val in ['mode','most_frequent']:
#         func = lambda x: x.mode()[0]
#     else:
#         func = lambda x: val
#     df[col] = (df if grp is None else df.groupby(grp))[col].transform(lambda x: x.fillna(func(x)))
#     return df
# pd.DataFrame.impute = impute

self.Z.reset_index(drop=True)
A = self.Z.copy()
c = '_birth_day'
mask = A[c].isnull()
# A.impute('_birth_day', val='median', grp=['term_code','styp_code'])
# A.impute('_birth_day', val=np.nan, grp=['term_code','styp_code'])
A.impute('_birth_day', val=np.nan, grp=['term_code','styp_code'])
A.loc[mask,c].disp(5)
# A.groupby(['term_code','styp_code'])['_birth_day'].median()

In [None]:
# trf = ColumnTransformer(self.feat, remainder='drop',verbose_feature_names_out = False)
# where = lambda x: x.query("levl_code == 'ug' and styp_code in ('n','r','t')").copy()
# with warnings.catch_warnings(action='ignore'):
#     self.Y = [pd.concat([term.reg[k] for term in self.term.values()]).assign(credit_hr=lambda x:x['credit_hr'].fillna(0)>0) for k in [0,1]]
#     self.Z = trf.fit_transform(where(self.X).set_index(self.attr, drop=False)).rename(columns=lambda x:'_'+x)

kwargs = {
    'feat': [
        ['_gender',np.nan],
        ['_appl_day',np.nan],
        ['_apdc_day',np.nan],
        ['_hs_qrtl',np.nan],
        ['_act_equiv',np.nan],
        ['_remote',False],
        ['_resd',False],
        ['_legacy',False],
        *[[f'_race_{r}',False] for r in ['american_indian','asian','black','pacific','white','hispanic']],
        ['_waiver',False],
        # ['_fafsa_app',False],
        ['_schlship_app',False],
        # ['_finaid_accepted',False],
        ['_ssb',False],
        ['_math',False],
        ['_reading',False],
        ['_writing',False],
        ['_gap_score',0],
        ['_oriented','n'],
        ['_distance','max'],
        ['_birth_day','median',['term_code','styp_code']],
    ],
}
where = lambda x: x.query("levl_code == 'ug' and styp_code in ('n','r','t')").copy()
g = lambda col:'_'+col
# cols = [x[0] for x in kwargs['feat']]
# where(self.X).rename(columns=g)[cols].isnull().sum().disp(1000)
X = where(self.X).set_index(self.attr, drop=False).rename(columns=g)
Z = pd.concat([X.impute(*x) for x in self.feat], axis=1).prep().binarize().categorize()
Z.isnull().sum().disp(1000)
Z.dtypes
# L = [Z.impute(col, *val)]
# L = [[col, *listify(val)] for C, val in kwargs['feat'] for col in listify(C)]
# L = [Z.impute(g(col), *listify(val)) for C, val in kwargs['feat'] for col in listify(C)]
# L
#).rename(columns=lambda x:'_'+x)


In [None]:
pd.get_dummies(Z).disp(1)

In [None]:
G = read('/home/scook/institutional_data_analytics/admitted_matriculation_projection/LiveAMP/flags/parq/flg_202308.parq', columns=['gender'])

In [None]:
G.value_counts()

In [None]:
db.head('spbpers')

In [None]:
qry = "select spbpers_sex, count(*) from spbpers group by spbpers_sex"
db.execute(qry)

In [None]:
from LiveAMP import *
from miceforest import ImputationKernel
from miceforest.mean_matching_functions import default_mean_match, mean_match_kdtree_classification
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer, QuantileTransformer, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn import set_config
set_config(transform_output="pandas")

def feature_importance_df(self, dataset=0, normalize=True, iteration=None):
    """Return the kernel's feature importances as a labelled DataFrame.

    The matrix from ``get_feature_importance(dataset, iteration)`` is
    labelled with imputed-variable names on one axis and predictor-variable
    names on the other (both in ascending variable-index order) and then
    transposed, so rows are predictors and columns are imputed variables.

    With ``normalize`` truthy each column is rescaled to sum to 100;
    otherwise the raw values are returned.
    """
    def sorted_names(indices):
        # Map variable indices to names, in ascending index order.
        return [self._get_variable_name(int(idx)) for idx in np.sort(indices)]

    imputed_names = sorted_names(self.imputation_order)
    predictor_names = sorted_names(self.predictor_vars)
    raw = self.get_feature_importance(dataset, iteration)
    importance = pd.DataFrame(raw, index=imputed_names, columns=predictor_names).T
    if not normalize:
        return importance
    return importance / importance.sum() * 100
ImputationKernel.feature_importance_df = feature_importance_df

@dataclasses.dataclass
class AMP(MyBaseClass):
    """Pipeline that loads per-term data, builds features, and imputes course outcomes.

    Results are cached on disk under ``root_path / "rslt/<cycle_day>"`` and
    reloaded by ``__post_init__``; entries flagged in ``overwrite`` are dropped
    from the cache so they get rebuilt.

    NOTE(review): item-style access (``self[k]``, ``k in self``, ``del self[k]``)
    is presumably provided by ``MyBaseClass`` - confirm.
    """
    cycle_day: int
    term_codes: typing.List
    infer: int
    crse: typing.List
    # Annotated as Dict, but used below as a sequence of (name, transformer, columns)
    # entries: iterated with ``f[-1]`` and handed to ColumnTransformer.
    feat: typing.Dict
    attr: typing.List
    sch: bool = True
    overwrite: typing.Dict = None
    show: typing.Dict = None

    def dump(self):
        """Write this object to ``self.rslt`` (always overwriting) and return ``write``'s result."""
        return write(self.rslt, self, overwrite=True)

    def __post_init__(self):
        """Validate ``feat``, normalise flags, merge any cached state, and load missing terms."""
        # check feat lists are disjoint
        seen, dups = set(), set()
        for f in self.feat:
            for x in f[-1]:
                (dups if x in seen else seen).add(x)
        assert not dups, f"feat column lists overlap: {sorted(dups)}"

        self.term_codes = listify(self.term_codes)
        D = {'adm':False, 'reg':False, 'flg':False, 'raw':False, 'term':False, 'X':False, 'Y':False, 'Z':False, 'pred':False}
        for x in ['overwrite','show']:
            self[x] = D.copy() if self[x] is None else D.copy() | self[x]
        # overwriting an upstream stage forces the stages that depend on it
        self.overwrite['Z'] |= self.overwrite['X'] | self.overwrite['Y']
        self.overwrite['raw'] |= self.overwrite['reg'] | self.overwrite['adm'] | self.overwrite['flg']
        self.overwrite['term'] |= self.overwrite['raw']
        self.path = root_path / f"rslt/{rjust(self.cycle_day,3,0)}"
        self.rslt = self.path / "rslt.pkl"
        self.tune = self.path / "tune.pkl"
        try:
            # cached attributes are kept only where this instance has no value of its own
            self.__dict__ = read(self.rslt).__dict__ | self.__dict__
        except Exception:
            # Best effort: no usable cache means everything is rebuilt.  Catching
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
            pass
        for k, v in self.overwrite.items():
            if v and k in self:
                del self[k]
        for k in ['pred']:
            self[k] = self[k] if k in self else list()
        for k in ['term']:
            self[k] = self[k] if k in self else dict()

        opts = {x:self[x] for x in ['cycle_day','overwrite','show']}
        for nm in self.term_codes:
            if nm not in self.term:
                print(f'get {nm}')
                self.term[nm] = TERM(term_code=nm, **opts).get_raw()
        return self.dump()

    def get_X(self):
        """Build ``self.X`` from the raw frames of every loaded term (no-op if already present).

        Concatenates ``term.raw``, derives ``hs_qrtl`` and ``camp_main``, fills
        missing ``distance`` with the maximum, recodes major/college values,
        makes the major-related descriptor columns consistent per ``majr_code``,
        and runs range checks.

        Raises:
            Exception: if any row fails one of the checks (offending rows are displayed first).
        """
        nm = 'X'
        if nm in self:
            return self
        print(f'get {nm}')
        R = pd.concat([term.raw for term in self.term.values()], ignore_index=True).dropna(axis=1, how='all').prep()
        # hs_qrtl comes from hs_pctl bins; where that is missing, fall back to a code derived from apdc_code
        repl = {'ae':0, 'n1':1, 'n2':2, 'n3':3, 'n4':4, 'r1':1, 'r2':2, 'r3':3, 'r4':4}
        R['hs_qrtl'] = pd.cut(R['hs_pctl'], bins=[-1,25,50,75,90,101], labels=[4,3,2,1,0], right=False).combine_first(R['apdc_code'].map(repl))
        R['camp_main'] = R['camp_code'] == 's'
        R['distance'] = R['distance'].fillna(R['distance'].max())
        R['majr_code'] = R['majr_code'].replace({'0000':'und', 'eled':'eted', 'agri':'unda'})
        R['coll_code'] = R['coll_code'].replace({'ae':'an', 'eh':'ed', 'hs':'hl', 'st':'sm'})
        R['coll_desc'] = R['coll_desc'].replace({
            'ag & environmental sciences':'ag & natural resources',
            'education & human development':'education',
            'health science & human service':'health sciences',
            'science & technology':'science & mathematics'})
        # use the row with the latest cycle_date per majr_code as the single source of these descriptors
        majr = ['majr_desc','dept_code','dept_desc','coll_code','coll_desc']
        S = R.sort_values('cycle_date').drop_duplicates(subset='majr_code', keep='last')[['majr_code',*majr]]
        R = R.drop(columns=majr).merge(S, on='majr_code', how='left')

        checks = {
            'cycle_day': R['cycle_day']>=0,
            'apdc_day' : R['apdc_day' ]>=R['cycle_day'],
            'appl_day' : R['appl_day' ]>=R['apdc_day' ],
            'birth_day':(R['birth_day']>=R['appl_day' ]) & (R['birth_day']>=5000),
            'distance': R['distance']>=0,
            'hs_pctl': (R['hs_pctl']>=0) & (R['hs_pctl']<=100),
            'act_equiv': (R['act_equiv']>=1) & (R['act_equiv']<=36),
            'gap_score': (R['gap_score']>=0) & (R['gap_score']<=100),
        }
        for k, mask in checks.items():
            if (~mask).any():
                R[~mask].disp(10)
                raise Exception(f'check failed - {k}')
        self[nm] = R
        return self.dump()

    def preprocess(self):
        """Build the feature frame ``self.Z``, outcomes ``self.Y`` and multipliers ``self.mlt``.

        No-op when ``Z`` is already present.  ``Z`` is ``self.X`` (filtered to
        undergraduate n/r/t student types, indexed by ``self.attr``) passed
        through a ColumnTransformer built from ``self.feat``, with every output
        column prefixed by ``_``.
        """
        nm = 'Z'
        if nm in self:
            return self
        self.get_X()
        print(f'get {nm}')

        trf = ColumnTransformer(self.feat, remainder='drop',verbose_feature_names_out = False)
        where = lambda x: x.query("levl_code == 'ug' and styp_code in ('n','r','t')").copy()
        with warnings.catch_warnings(action='ignore'):
            # Y[k] stacks term.reg[k] over all terms; credit_hr becomes a boolean "has any credit hours"
            self.Y = [pd.concat([term.reg[k] for term in self.term.values()]).assign(credit_hr=lambda x:x['credit_hr'].fillna(0)>0) for k in [0,1]]
            self.Z = trf.fit_transform(where(self.X).set_index(self.attr, drop=False)).rename(columns=lambda x:'_'+x)
            self.Z.missing().disp(100)
            # record missingness as explicit indicator features
            for c in ['_hs_qrtl', '_act_equiv']:
                self.Z[c+'_missing'] = self.Z[c].isnull()
            self.Z = self.Z.prep().binarize().categorize()
        agg = lambda y, g: where(y).groupby(g)[['credit_hr']].sum()
        grp = ['levl_code','styp_code','term_code','crse']
        # totals over all of Y[0], before restricting to the rows present in Z
        end = agg(self.Y[0], grp)

        self.Y = [self.Z[[]].join(y.set_index(['pidm','term_code'])[['crse','credit_hr']], how='inner') for y in self.Y]
        # totals over the restricted Y[0]
        cur = agg(self.Y[0], grp)

        # ratio of unrestricted to restricted totals; the rows of every other term are
        # also copied under the inference term_code
        M = (end / cur).query("term_code != @self.infer")
        N = M.reset_index().assign(term_code=self.infer).set_index(M.index.names)
        self.mlt = pd.concat([M, N], axis=0).replace(np.inf, pd.NA).squeeze().rename('mlt').prep()
        return self.dump()


    def predict(self, crse='_total', styp_code='all', train_term=202208, iterations=3, opts=None):
        """Impute the ``<crse>_end`` outcome with miceforest and summarise the result.

        A prediction whose ``meta`` (all arguments) matches one already in
        ``self.pred`` is returned from the cache instead of being recomputed.

        Args:
            crse: course label selecting rows of ``self.Y``.
            styp_code: student type to keep, or ``"all"`` for no filtering.
            train_term: term whose ``_end`` values are blanked before imputation
                and whose rows are kept for the summary.
            iterations: number of MICE iterations.
            opts: keyword arguments for ``ImputationKernel`` (default: none).

        Returns:
            dict with ``meta`` and ``rslt`` (``X``, ``T``, ``P``, ``model``, ``full``, ``summary``).
        """
        # None instead of a shared mutable default dict
        opts = dict() if opts is None else opts
        print(crse,train_term,styp_code, end=': ')
        prediction = {'meta': {'crse':crse, 'train_term':train_term, 'styp_code':styp_code, 'iterations':iterations, 'opts':opts.copy()}}
        for P in self.pred:
            if P['meta'] == prediction['meta']:
                print('reusing')
                return P
        print(f'creating')
        # d = {'_total_cur':1, crse+'_cur':1, crse+'_end':0}
        # column name -> index into self.Y (1 feeds "_cur", 0 feeds "_end")
        d = {crse+'_cur':1, crse+'_end':0,}
        # "_end" column name -> bare course label (strip the 4-char suffix)
        end = {c:c[:-4] for c, i in d.items() if i==0}
        Y = pd.concat([self.Y[i].query("crse == @crse").rename(columns={'credit_hr':c})[c] for c, i in d.items()], axis=1, join='outer')
        T = self.Z.join(Y, how='left').fillna({c:False for c in d.keys()})
        if styp_code != "all":
            T = T.query("styp_code==@styp_code")
        # the inference term has no known end value
        T.loc[T.eval("term_code==@self.infer"), end.keys()] = pd.NA
        X = T.copy()
        # qry = "term_code!=@train_term"
        # NOTE(review): this blanks and later scores the rows of train_term itself;
        # the commented alternative above does the opposite - confirm which is intended.
        qry = "term_code==@train_term"
        X.loc[X.eval(qry), end.keys()] = pd.NA
        model = ImputationKernel(X, **opts)
        model.mice(iterations)
        # with warnings.catch_warnings(action='ignore'):
        #     imp.plot_imputed_distributions(wspace=0.2,hspace=0.4)
            # assert 1==2
        #     imp.plot_mean_convergence()#wspace=0.3, hspace=0.4)
        #     # imp.plot_correlations()

        # long format: one row per (original index, crse) holding the "_end" value under name nm
        g = lambda df, nm=None: df[end.keys()].rename(columns=end).melt(ignore_index=False, var_name='crse', value_name=nm).set_index('crse', append=True)
        # stack every completed dataset, tagged by its index in the 'sim' level
        P = pd.concat([model.complete_data(k).assign(sim=k).set_index('sim', append=True) for k in range(model.dataset_count())])
        Y = g(P,'pred').join(g(T,'true')).assign(train_term=train_term).query(qry).prep()
        grp = ['crse','styp_code','term_code','train_term','sim']
        agg = lambda x: pd.Series({
            'pred': x['pred'].sum(min_count=1),
            'true': x['true'].sum(min_count=1),
            'mse%': ((1*x['pred'] - x['true'])**2).mean()*100,
            'f1_inv%': (1-f1_score(x.dropna()['true'], x.dropna()['pred'], zero_division=np.nan))*100,
        })
        S = Y.groupby(grp).apply(agg).join(self.mlt).rename_axis(index={'term_code':'pred_term'})
        # scale the counts by the multiplier computed in preprocess
        for x in ['pred','true']:
            S[x] = S[x] * S['mlt']
        S.insert(2, 'err', S['pred'] - S['true'])
        # relative error is capped at +/-100%
        S.insert(3, 'err%', (S['err'] / S['true']).clip(-1, 1) * 100)
        prediction['rslt'] = {'X':X,'T':T,'P':P,'model':model, 'full':Y, 'summary': S.drop(columns='mlt').prep()}
        self.pred.append(prediction)
        self.dump()
        return prediction


    def train(self, styp_codes=('n','r','t'), train_terms=None, iterations=3, opts=None):
        """Run ``predict`` for every (course, student type, train term) and summarise.

        Args:
            styp_codes: student type(s) to predict for.
            train_terms: train term(s); defaults to ``self.term_codes``.
            iterations: number of MICE iterations per prediction.
            opts: keyword arguments for ``ImputationKernel`` (default: none).

        Returns:
            dict keyed by ``styp_code == 'all'`` (bool); each value holds the
            concatenated ``full`` and ``summary`` frames, a copy of ``opts``,
            and ``rslt``: per (crse, styp_code) pivot tables of the error
            statistics and of the projections for the inference term.
        """
        # None instead of a shared mutable default dict
        opts = dict() if opts is None else opts
        train_terms = self.term_codes if train_terms is None else train_terms
        def pivot(df, val, q=50):
            # q-th percentile of `val` per pred_term (rows) and train_term (columns)
            Y = df.reset_index().pivot_table(columns='train_term', index='pred_term', values=val, aggfunc=pctl(q))
            # two passes with a transpose each: append mean / abs-mean along both axes
            # (skipped on an axis that has a single entry)
            for _ in range(2):
                mr = Y.mean(axis=1)
                ma = Y.abs().mean(axis=1)
                Y = (Y.assign(mean=mr, abs_mean=ma) if Y.shape[1] > 1 else Y).T
            return Y.assign(**{val:f"{q}%"}).set_index(val, append=True).swaplevel(0,1).round(2).prep().T

        def analyze(df):
            # error statistics over the non-inference terms, projections for the inference term
            r = {stat: pivot(df.query(f"pred_term!={self.infer}"), stat) for stat in ["err","err%","mse%","f1_inv%"]}
            r['proj'] = pd.concat([pivot(df.query(f"pred_term=={self.infer}"), "pred", q) for q in [25,50,75]], axis=1)
            return r

        P = {(crse, styp_code, train_term): self.predict(crse, styp_code, train_term, iterations, opts) for crse in self.crse for styp_code in listify(styp_codes) for train_term in listify(train_terms)}
        # split predictions into the "all student types" runs and the per-type runs
        R = dict()
        for k,v in P.items():
            R.setdefault(k[1]=='all', []).append(v)

        for b, L in R.items():
            v = {k: pd.concat([Y['rslt'][k] for Y in L]) for k in ['full','summary']}
            v['opts'] = opts.copy()
            v['rslt'] = {g: analyze(df) for g, df in v['summary'].groupby(['crse', 'styp_code'])}
            R[b] = v
        return R


def code_desc(x):
    """Return the ``<x>_code`` / ``<x>_desc`` column-name pair for a field stem."""
    return [f"{x}_code", f"{x}_desc"]


def simpimp(fill):
    """Return a constant-fill SimpleImputer that treats ``pd.NA`` as the missing marker."""
    return SimpleImputer(missing_values=pd.NA, strategy='constant', fill_value=fill)
# Configuration for one AMP run; the dict is unpacked straight into AMP(**kwargs).
kwargs = {
    # columns used as the index of the feature frame (AMP.preprocess sets them as index with drop=False)
    'attr': [
        'pidm',
        *code_desc('term'),
        *code_desc('apdc'),
        *code_desc('levl'),
        *code_desc('styp'),
        *code_desc('admt'),
        *code_desc('camp'),
        *code_desc('coll'),
        *code_desc('dept'),
        *code_desc('majr'),
        *code_desc('cnty'),
        *code_desc('stat'),
        *code_desc('natn'),
        'resd',
        'legacy',
        'gender',
        *[f'race_{r}' for r in ['american_indian','asian','black','pacific','white','hispanic']],
        'waiver',
        'birth_day',
        'distance',
        'hs_pctl',
    ],
    # (name, transformer, columns) entries handed to ColumnTransformer in AMP.preprocess;
    # AMP.__post_init__ asserts that no column appears in more than one entry
    'feat': [
        # standardise, then power-transform
        ('scl', make_pipeline(StandardScaler(), PowerTransformer()), [
            'distance',
            'birth_day',
            # 'gap_score',
            # 'hs_pctl',
            'act_equiv',
        ]),
        # kept unchanged
        ('pass', 'passthrough', [
            'gender',
            # 'styp_code',
            # 'camp_code',
            # 'coll_code',
            # 'verified',
            # 'term_code',
            'appl_day',
            'apdc_day',
            'hs_qrtl',
        ]),
        # missing values become False
        ('false', simpimp(False), [
            'camp_main',
            'resd',
            'legacy',
            *[f'race_{r}' for r in ['american_indian','asian','black','pacific','white','hispanic']],
            'waiver',
            # 'fafsa_app',
            'schlship_app',
            # 'finaid_accepted',
            'ssb',
            'math',
            'reading',
            'writing',
        ]),
        # missing values become 0
        ('0', simpimp(0), [
            'gap_score',
        ]),
        # missing values become 'n'
        ('n', simpimp('n'), [
            'oriented',
        ]),
    ],
    # term whose outcomes are unknown and get projected
    'infer': 202408,
    # whole days from now until the 202408 term's cycle_date, plus one
    'cycle_day': (TERM(term_code=202408).cycle_date-pd.Timestamp.now()).days+1,
    # 'cycle_day': 197,
    # 202008, 202108, ..., 202408
    'term_codes': np.arange(2020,2025)*100+8,
    # course labels to predict; only '_total' is currently enabled
    'crse': [
        '_total',
        # 'engl1301',
        # 'biol1406',
        # 'biol2401',
        # 'math1314',
        # 'math2412',
        # 'agri1419',
        # 'psyc2301',
        # 'ansc1319',
        # 'comm1311',
        # 'hist1301',
        # 'govt2306',
        # 'math1324',
        # 'chem1411',
        # 'univ0301',
        # 'univ0204',
        # 'univ0304',
        # 'agri1100',
        # 'comm1315',
        # 'agec2317',
        # 'govt2305',
        # 'busi1301',
        # 'arts1301',
        # 'math1342',
        # 'math2413',
        ],
    # cached stages to discard and rebuild (AMP.__post_init__ deletes the flagged entries)
    'overwrite': {
        # 'reg':True,
        # 'adm':True,
        # 'flg':True,
        # 'raw':True,
        # 'term': True,
        # 'X': True,
        # 'Y': True,
        # 'Z': True,
        'pred': True,
    },
    'show': {
        # 'reg':True,
        # 'adm':True,
    },
    # 'sch': False,
}
# FLAGS().run()
self = AMP(**kwargs)
self = self.preprocess()
# AMP.train defaults train_terms to self.term_codes, so drop the inference term from it
self.term_codes.remove(self.infer)
iterations = 3

# keyword arguments forwarded to ImputationKernel by AMP.predict
opts = dict()
opts['random_state'] = 42
opts['save_all_iterations'] = False
opts['datasets'] = 5
opts['mean_match_candidates'] = 10
opts['mean_match_function'] = mean_match_kdtree_classification

# # opts['datasets'] = 2
# # opts['mean_match_candidates'] = 1
# # opts['mean_match_function'] = default_mean_match

# P = self.predict(opts=opts)

# run every (course, train term) combination for student type 'n'
R = self.train(iterations=iterations, opts=opts,
    styp_codes='n',
    # train_terms=202208,
    )
# for k in R[False]['rslt'].keys():
#     for b, v in R.items():
#         print(k, b)
#         v['rslt'][k]['err%'].disp(100)

# tune = []
# for func in [mean_match_kdtree_classification, default_mean_match]:
#     opts['mean_match_function'] = func
#     for cand in range(2,41,3):
#         opts['mean_match_candidates'] = cand
#         print(sort(opts))
#         R = self.train(
#             styp_codes='n',
#             iterations=iterations,
#             opts=opts)
#         R[False]['rslt']['_total','n']['err%'].disp(100)
#         tune.append(R)
#         write(self.tune, tune)

In [None]:
x = None
match x:
    case 2:
        print(2)
    case 10:
        print(11)
    case None:
        print('hi')

In [None]:
self.X['styp_code'].mode()
# self.Z['_birth_day']['median']()
df = pd.DataFrame()
df['a'] = [1,1,2,2]
df['b'] = ['a','a','a','a',]
df.mode()

In [None]:
def impute(df, col, val=None, grp=None):
    """Fill missing values of ``df[col]`` in place and return ``df``.

    Args:
        df: DataFrame to modify.
        col: name of the column to fill.
        val: fill strategy or constant.  ``'median'``; ``'mean'``/``'ave'``/
            ``'avg'``/``'average'``; ``'mode'``/``'most_frequent'``; anything
            else is used as a literal fill value.  ``None`` picks ``'median'``
            for numeric columns and ``'mode'`` otherwise.
        grp: optional groupby key(s); the statistic is then computed and
            applied within each group.

    Returns:
        The same ``df`` object, with ``df[col]`` replaced.

    A group (or column) whose fill value is itself missing - e.g. the mode or
    median of an all-null group - is left unchanged instead of raising.
    """
    if val is None:
        val = 'median' if pd.api.types.is_numeric_dtype(df[col]) else 'mode'

    def first_mode(x):
        # Series.mode() is empty when every value is null
        modes = x.mode()
        return modes.iloc[0] if len(modes) else float('nan')

    # only strings can name a strategy; comparing pd.NA against strings would raise
    strategy = val if isinstance(val, str) else None
    if strategy == 'median':
        func = lambda x: x.median()
    elif strategy in ('mean', 'ave', 'avg', 'average'):
        func = lambda x: x.mean()
    elif strategy in ('mode', 'most_frequent'):
        func = first_mode
    else:
        func = lambda x: val

    def fill(x):
        value = func(x)
        # nothing to fill with: keep the values as they are
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return x
        return x.fillna(value)

    df[col] = fill(df[col]) if grp is None else df.groupby(grp)[col].transform(fill)
    return df
pd.DataFrame.impute = impute

self.Z.reset_index(drop=True)
A = self.Z.copy()
c = '_birth_day'
mask = A[c].isnull()
# A.impute('_birth_day', val='median', grp=['term_code','styp_code'])
A.impute('_birth_day', val=np.nan, grp=['term_code','styp_code'])
A.loc[mask,c].disp(5)
# A.groupby(['term_code','styp_code'])['_birth_day'].median()

In [None]:
.102924

In [None]:
P = self.pred[0]
R = P['rslt']
self.Z.dtypes
# R['P'].dtypes#.values.astype(float)
# model = self.pred[0]['rslt']['model']
# model.feature_importance_df()
# model.plot_correlations()

In [None]:
self.pred[0]

In [None]:
write(self.path / 'predictions.csv', R[False]['summary'])
write(self.path / 'predictions.parq', R[False]['summary'])

In [None]:
self.pred

In [None]:
R = pd.concat([term.raw for term in self.term.values()]).dropna(axis=1, how='all').reset_index(drop=True).prep()
repl = {'ae':0, 'n1':1, 'n2':2, 'n3':3, 'n4':4, 'r1':1, 'r2':2, 'r3':3, 'r4':4}
R['hs_qrtl'] = pd.cut(R['hs_pctl'], bins=[-1,25,50,75,90,101], labels=[4,3,2,1,0], right=False).combine_first(R['apdc_code'].map(repl))

In [None]:
R = pd.concat([term.raw for term in self.term.values()]).dropna(axis=1, how='all').reset_index(drop=True).prep()
repl = {'ae':0, 'n1':1, 'n2':2, 'n3':3, 'n4':4, 'r1':1, 'r2':2, 'r3':3, 'r4':4}
# R['hs_qrtl'] = 
R['A'] = pd.cut(R['hs_pctl'], bins=[-1,25,50,75,90,101], labels=[4,3,2,1,0], right=False)
R['B'] = R['apdc_code'].map(repl)
A
# R['hs_qrtl'] = R['A'].combine_first(R['B'])
# pd.concat([A,B],axis=1)
R
# A

In [None]:
db.head('stvapdc', 200)

In [None]:
where = lambda x: x.query("levl_code == 'ug' and styp_code in ('n','r','t')").copy()
idx = ['pidm','styp_code','apdc_code','apdc_desc']
# P = self.X.set_index(idx)[['hs_pctl']]
P = where(self.X).filter([*idx, 'hs_pctl'])
repl = {
    # 'a2':pd.NA,
    # 'aa':pd.NA,
    # 'ac':pd.NA,
    # 'ad':pd.NA,
    'ag':pd.NA,
    'ai':pd.NA,
    'at':pd.NA,
    'ae':0,
    'n1':1,
    'n2':2,
    'n3':3,
    'n4':4,
    'r1':1,
    'r2':2,
    'r3':3,
    'r4':4,
}
repl = {'ae':0, 'n1':1, 'n2':2, 'n3':3, 'n4':4, 'r1':1, 'r2':2, 'r3':3, 'r4':4}

# bins = [100,89.9,74.9,49.9,24.9,0]
# bool, default False
repl = {'ae':0, 'n1':1, 'n2':2, 'n3':3, 'n4':4, 'r1':1, 'r2':2, 'r3':3, 'r4':4}
P['hs_qrtl'] = pd.cut(P['hs_pctl'], right=False, bins=[-1,25,50,75,90,101], labels=[4,3,2,1,0]).combine_first(P['apdc_code'].map(repl))
# P.query('hs_qrtl==2')
# P.query("apdc_code=='n2'")
# P.vc(['apdc_desc','hs_qrtl']).disp(200)
# Q = P.query("hs_qrtl.isnull()")
# P.groupby(['apdc_code','hs_qrtl']).size()
P.groupby(['apdc_code','apdc_desc'])['hs_qrtl'].value_counts(normalize=True, dropna=False).round(2).sort_index().to_frame().disp(200)
# Q.vc(['styp_code','apdc_code','apdc_desc']).disp(200)
# P['hs_qrtl'].isnull().sum()
# P.query("hs_qrtl.isnull()").vc('apdc_desc')

In [None]:
bins = [100,89.9,74.9,49.9,24.9,0]
np.arange(4,-1,-1)

In [None]:
# P.query("apdc_code=='n2' & hs_pctl.notnull()" ).disp(2000)
P.query("apdc_code=='n2'" ).disp(2000)
# P.query("apdc_code=='n2'").vc('hs_qrtl')

In [None]:
P.query("apdc_desc=='admitted (nr1)' & hs_qrtl==2")

In [None]:
P.query('hs_pctl.isnull()').vc('apdc_desc')

In [None]:
repl = {
    'a2':pd.NA,
    'aa':pd.NA,
    'ac':pd.NA,
    'ad':pd.NA,
    'ae':5,
    'ag':pd.NA,
    'ai':pd.NA,
    'at':pd.NA,
    'n1':1,
    'n2':3,
    'n3':4,
    'n4':4,
    'r1':1,
    'r2':2,
    'r3':3,
    'r4':4,
}
P['q'] = P['']

In [None]:
P.vc(['apdc_code','apdc_desc'])
# {'n1':1}
# set(P.reset_index()['apdc_code'])

In [None]:
# self.Z.filter(like='_hs_pctl').query('_hs_pctl.isnull()').vc('apdc_desc')
# # self.X.groupby('apdc_desc')['hs_pctl'].describe()
# P = self.X[['apdc_desc','hs_pctl']]
# pd.cut(self.X['hs_pctl'],4)
# P = pd.cut(self.X.set_index(['pidm','apdc_desc'])['hs_pctl'], bins=[-1,25,50,75,100], labels=[1,2,3,4])
P.vc(['apdc_desc','hs_qrtl']).disp(200)
# P.groupby('apdc_desc').describe()
# (P==2).sum()

In [None]:
R[False]['rslt'][('_total', 'n')]['proj']

In [None]:
M = A['summary'].query("pred_term!=202408 & styp_code=='n' & pred_term!=train_term")#['err%']
import seaborn as sns
sns.boxplot(M, hue='train_term', y='err%', x='pred_term',
    # fill=False,
    whis=(0, 100),
    dodge = True,
    palette='tab10',
    )

In [None]:
# Rank predictors by their importance for the imputed '_total_end' outcome.
# AMP.predict stores the fitted kernel under the 'model' key of 'rslt' ('Pmodel' does not exist).
self.pred[0]['rslt']['model'].feature_importance_df().sort_values('_total_end', ascending=False)

In [None]:
# R[False]['rslt']['_total','n']['err%']
R[False]['rslt']['_total','n'].keys()#['model']

In [None]:
P['rslt']['model'].feature_importance_df().sort_values('_total_end', ascending=False)

In [None]:
def feature_importance_df(self, dataset=0, normalize=True, iteration=None):
    """Return the kernel's feature importances as a labelled DataFrame.

    Rows are predictor variable names and columns are imputed variable names,
    both taken in ascending order of their variable ids.

    Args:
        dataset: forwarded to ``self.get_feature_importance`` (defaults to 0).
        normalize: when true, divide every column by its sum so it adds up to 1.
        iteration: forwarded to ``self.get_feature_importance``.

    Returns:
        pandas.DataFrame of importances (fractions of the column total if normalized).
    """
    imputed_var_names = [self._get_variable_name(int(i)) for i in np.sort(self.imputation_order)]
    predictor_var_names = [self._get_variable_name(int(i)) for i in np.sort(self.predictor_vars)]
    # pass the `dataset` argument through (it was previously misspelled, raising NameError)
    I = pd.DataFrame(self.get_feature_importance(dataset, iteration), index=imputed_var_names, columns=predictor_var_names).T
    return I / I.sum() if normalize else I
ImputationKernel.feature_importance_df = feature_importance_df

In [None]:
model.plot_feature_importance?

In [None]:
model = self.pred[0]['rslt']['model']
# model.plot_feature_importance??
imputed_var_names = [model._get_variable_name(int(i)) for i in np.sort(model.imputation_order)]
predictor_var_names = [model._get_variable_name(int(i)) for i in np.sort(model.predictor_vars)]
# model.
c = '_total_end'
I = pd.DataFrame(model.get_feature_importance(0), index=imputed_var_names, columns=predictor_var_names).T
I *= 100 / I.sum()
I[c].sort_values(ascending=False)
# I.T['_total_end']
# (0).shape
#(0)

In [None]:
R[False]['rslt']['_total','n'].keys()

# ['rslt']['_total','n'].keys()
# model = R[False]['rslt']['_total','n']
#['model']
# model.plot_feature_importance?
# (dataset=0, annot=True,cmap="YlGnBu",vmin=0, vmax=1)

In [None]:
sum((len(f[-1]) for f in self.feat))
L = [x for f in self.feat for x in f[-1]]
len(L), len(set(L))
# {x for f in self.feat for x in f[-1]}
# {*self.feat[0][-1]}

In [None]:
F = read('/home/scook/institutional_data_analytics/admitted_matriculation_projection/LiveAMP/flags/parq/flg_202308.parq')

In [None]:
F.columns
F['styp_code']

In [None]:
# self.Z.isnull().sum().sort_index().disp(1000)
# self.Z.dtypes
# .vc('oriented')
# hs_pctlact_equiv
mask = self.Z['birth_day'].isnull()
self.Z[mask]

In [None]:
qry = "select * from spbpers where spbpers_pidm=1115874"
db.execute(qry)

In [None]:
self.Z.select_dtypes('string')

In [None]:
self.Z.isnull().sum().sort_values(ascending=False).to_frame('missing').query('missing>0')
# self.Z.vc('writing')
# self.Z.dtypes

In [None]:
from sklearn.impute import SimpleImputer
feat = [
    ('scl', make_pipeline(StandardScaler(), PowerTransformer()), [
        'distance',
        'birth_day',
    ]),
    # ('nom', FunctionTransformer(lambda x: x.astype('category')), [
    ('nom', 'passthrough', [
        'gender',
        'oriented',
        'styp_code',
        # 'camp_code',
        'coll_code',
        # 'verified',
    ]),
    ('pass', 'passthrough', [
        'term_code',
        'math',
        'reading',
        'writing',
        'hs_pctl',
        'appl_day',
        'apdc_day',
        'act_equiv',
    ]),
    ('false', SimpleImputer(strategy='constant', fill_value=False), [
        'camp_main',
        'resd',
        'legacy',
        *[f'race_{r}' for r in ['american_indian','asian','black','pacific','white','hispanic']],
        'waiver',
        # 'fafsa_app',
        'schlship_app',
        # 'finaid_accepted',
        'ssb',
    ]),
    ('0', SimpleImputer(strategy='constant', fill_value=0), [
        'gap_score',
    ]),
    # ('n', SimpleImputer(strategy='constant', fill_value='n'), [
    #     'oriented',
    # ]),

]

# trf = make_pipeline(ColumnTransformer(feat,remainder='drop',verbose_feature_names_out = False), ft)
# trf = ColumnTransformer(feat,remainder='drop',verbose_feature_names_out = False)
# Z = trf.fit_transform(self.X).binarize()
# # Z = Z.apply(f)
# # Z.isnull().sum()
# Z.dtypes
self.X.fillna({c:'' for c in self.X.select_dtypes('string').columns}, inplace=True)
self.X.select_dtypes('string').isnull().sum().disp(300)
# self.X.select_dtypes('string').fillna('')
# self.X.select_dtypes('string').isnull().sum()

# .fillna('')
# Z
# pd.api.types.is_string_dtype(Z['gender'])

In [None]:
self.Z.isnull().sum()

In [None]:
db.head('opeir.admissions_fall2022',2).T.sort_index()

In [None]:
self.Z.waiver

In [None]:
self.X.query("waiver.isnull()").vc(['cycle_day'])

In [None]:
self.X.isnull().sum().disp(1000)

In [None]:
R[False]['rslt']['_total','n']['err%'].disp(100)

In [None]:
for k in R[False]['rslt'].keys():
    for b, v in R.items():
        print(k, b)
        v['rslt'][k]['err%'].disp(100)

In [None]:
# {k:v for k,v in R.items() if k[1]!='all'}.keys()
R = {True:[], False:[]}
for k,v in P.items():
    R[k[1]=='all'].append(v)
# q[True][0]['rslt'].keys()
# for b,L in R.items():
    # print(type(v))
    # print(v[0]['rslt'].keys())

S = {b: {k: pd.concat([Y['rslt'][k] for Y in L]) for k in ['full','summary']} for b,L in R.items()}
S[False]['summary']

In [None]:
    # def predict(self, crse='_total', train_term=202208, iterations=3, opts=dict()):
    #     for styp_code in ["n","r","t","all"]:
    #         print(crse,train_term,styp_code, end=": ")
    #         prediction = {'meta': {'crse':crse, 'train_term':train_term, 'styp_code':styp_code, 'iterations':iterations, 'opts':opts.copy()}}
    #         for P in self.pred:
    #             if P['meta'] == prediction['meta']:
    #                 print('reusing')
    #                 return P
    #         print(f'creating')

    #         d = {'_total_cur':1, crse+'_cur':1, crse+'_end':0}
    #         end = {c:c[:-4] for c, i in d.items() if i==0}
    #         Y = pd.concat([self.Y[i].query("crse == @crse").rename(columns={'credit_hr':c})[c] for c, i in d.items()], axis=1, join='outer')
    #         T = self.Z.join(Y, how='left').fillna({c:False for c in d.keys()})
    #         if styp_code != "all":
    #             T = T.query("styp_code==@styp_code")
    #         X = T.copy()
    #         X.loc[X.eval("term_code!=@train_term or term_code==@self.infer"), end.keys()] = pd.NA
    #         imp = ImputationKernel(X, **opts)
    #         imp.mice(iterations)
    #         # with warnings.catch_warnings(action='ignore'):
    #         #     imp.plot_imputed_distributions(wspace=0.2,hspace=0.4)
    #         #     imp.plot_mean_convergence()#wspace=0.3, hspace=0.4)
    #         #     # imp.plot_correlations()

    #         g = lambda df, nm=None: df[end.keys()].rename(columns=end).melt(ignore_index=False, var_name='crse', value_name=nm).set_index('crse', append=True)
    #         P = pd.concat([imp.complete_data(k).assign(sim=k).set_index('sim', append=True) for k in range(imp.dataset_count())])
    #         Y = g(P,'pred').join(g(T,'true')).assign(train_term=train_term).query('term_code != train_term').prep()
    #         grp = ['crse','styp_code','term_code','train_term','sim']
    #         agg = lambda x: pd.Series({
    #             'pred': x['pred'].sum(min_count=1),
    #             'true': x['true'].sum(min_count=1),
    #             'mse%': ((1*x['pred'] - x['true'])**2).mean()*100,
    #             'f1_inv%': (1-f1_score(x.dropna()['true'], x.dropna()['pred'], zero_division=np.nan))*100,
    #         })
    #         S = Y.groupby(grp).apply(agg).join(self.mlt).rename_axis(index={'term_code':'pred_term'})
    #         for x in ['pred','true']:
    #             S[x] = S[x] * S['mlt']
    #         S.insert(2, 'err', S['pred'] - S['true'])
    #         S.insert(3, 'err%', (S['err'] / S['true']).clip(-1, 1) * 100)
    #         prediction['rslt'] = {'full':Y, 'summary': S.drop(columns='mlt').prep()}
    #         self.pred.append(prediction)
    #         self.dump()
    #     return prediction

# class MM():
#     def __init__(self, func, candidates):
#         assert func in [mean_match_kdtree_classification, default_mean_match]
#         self.func = func
#         self.candidates = candidates
#     def __call__(self, *args, **kwargs):
#         return self.func(*args, **kwargs)
#     def __str__(self):
#         return join([x for x in ['kdtree','default'] if x in self.func.__name__]+[self.candidates], "_")

# class kdtree():
#     def __call__(self, *args, **kwargs):
#         return mean_match_kdtree_classification(*args, **kwargs)
#     def __str__(self):
#         return 'kdtree__mean_match'

# class default():
#     def __call__(self, *args, **kwargs):
#         return default_mean_match(*args, **kwargs)
#     def __str__(self):
#         return 'default_mean_match'


In [None]:
Z = it.product(self.crse, ['n','r','t','all'])
[[crse, styp_code, train_term] for crse, styp_code in Z for train_term in self.term_codes]

In [None]:
Z = it.product(self.crse, ['n','r','t','all'])
list(Z)

In [None]:
R

In [None]:
d = {
    ('a','b'):7,
    ('a','c'):71}
d['a','c']

In [None]:
qry = f"""
select
        A.sfrstcr_term_code,
        A.sfrstcr_pidm,
        B.ssbsect_subj_code,
        B.ssbsect_crse_numb,
        B.ssbsect_credit_hrs,
        A.sfrstcr_credit_hr
from sfrstcr A, ssbsect B
where
        A.sfrstcr_term_code = B.ssbsect_term_code
        and A.sfrstcr_crn = B.ssbsect_crn
        and A.sfrstcr_term_code = 202308
        and A.sfrstcr_ptrm_code not in ('28','R3')
        and  trunc(to_date('18-Sep-23')) - trunc(A.sfrstcr_add_date) >= 197  -- added before cycle_day
        and (trunc(to_date('18-Sep-23')) - trunc(A.sfrstcr_rsts_date) < 197 or A.sfrstcr_rsts_code in ('DC','DL','RD','RE','RW','WD','WF')) -- dropped after cycle_day or still enrolled
        and B.ssbsect_subj_code <> 'INST'
        and A.sfrstcr_credit_hr <> B.ssbsect_credit_hrs
"""
db.head(qry, show=True)

In [None]:
qry = f"styp_code=='n' & pred_term!={self.infer}"
val = "err%"
q=50
P = A['summary'].reset_index().query(qry).pivot_table(columns='train_term', index='pred_term', values=val, aggfunc=pctl(q))
for _ in range(2):
    P = (P.assign(mean=lambda x:x.mean(axis=1)) if P.shape[1] > 1 else P).T
P.assign(**{val:f"{q}%"}).set_index(val, append=True).swaplevel(0,1).round(0).prep().T


In [None]:
kdtree = mean_match_kdtree_classification
kdtree.__name__ = 'a'
setattr(kdtree,'__str__','a')
setattr(kdtree,'__repr__','a')

print(kdtree)

In [None]:
from LiveAMP import *
from miceforest.mean_matching_functions import default_mean_match, mean_match_kdtree_classification
class MM():
    """Callable wrapper around a mean-matching function with a readable label.

    ``str(mm)`` is ``"<kind>_<candidates>"`` where ``<kind>`` is ``kdtree`` or
    ``default``, whichever occurs in the wrapped function's ``__name__``.
    """
    def __init__(self, func, candidates):
        self.func = func
        self.candidates = candidates
    def __call__(self, *args, **kwargs):
        # delegate straight to the wrapped function
        return self.func(*args, **kwargs)
    def __str__(self):
        # 'default' was previously misspelled, so default_mean_match was never tagged
        tags = [x for x in ['kdtree','default'] if x in self.func.__name__]
        return "_".join(str(part) for part in [*tags, self.candidates])

mm = MM(mean_match_kdtree_classification, 3)
print(mm)
# type(mean_match_kdtree_classification)

In [None]:
mean_match_kdtree_classification.__name__

In [None]:
x = default_mean_match
x.__name__

In [None]:

            # A[styp_code] = {
            #     'proj': pd.concat([pivot(f"styp_code=='{styp_code}' & pred_term=={self.infer}", "pred", q) for q in [25,50,75]], axis=1),
            #     **{stat: pivot(f"styp_code=='{styp_code}' & pred_term!={self.infer}", stat) for stat in ["err","err%","mse%","f1_inv%"]}



    # R = {styp_code: {
    #         'proj': pd.concat([pivot(f"styp_code=='{styp_code}' & pred_term=={self.infer}", "pred", q) for q in [25,50,75]], axis=1),
    #         **{stat: pivot(f"styp_code=='{styp_code}' & pred_term!={self.infer}", stat) for stat in ["err","err%","mse%","f1_inv%"]}
    #     } for styp_code in ["n"]}

        # R['n']['proj'].disp(100)
        # R['n']['err%'].disp(100)
# B = (
#     A['summary']
#     .grpby(['crse','styp_code','train_term','pred_term'])
#     # .grpby(['crse','styp_code','pred_term'])
#     [['pred','err%','mse%','f1_inv%']]
#     .agg(summary)
#     .stack(0, sort=False)
#     .rename_axis(index={None:'kind'})
#     .query(f"(pred_term == {self.infer} and kind == 'pred') or (pred_term != {self.infer} and kind == 'err%')")
#     .reset_index()
#     # .sort_values(['crse','styp_code','pred_term','train_term'],ascending=[True,True,False,False])
#     .prep()
# )
# M = A['summary'].query("pred_term != @self.infer & styp_code=='n'").pivot_table(index='train_term', columns='pred_term', values='err%', margins=True)
# M.disp(10)
# B.disp(10)

In [None]:
len(self.pred)

In [None]:
R['n']['err%'].disp(100)

In [None]:
A['summary']

In [None]:
A = {k: pd.concat([p['rslt'][k] for p in P]) for k in ['full','summary']}
def pivot(qry, val, q=50):
    """Pivot the q-th percentile of ``val`` from ``A['summary']`` for rows matching ``qry``.

    The pivot has pred_term rows and train_term columns plus a 'mean' margin
    on each axis; a margin is dropped on an axis that holds a single real
    entry.  The result is transposed, with the columns labelled by ``"<q>%"``.
    """
    rows = A['summary'].reset_index().query(qry)
    tbl = rows.pivot_table(
        columns='train_term',
        index='pred_term',
        values=val,
        aggfunc=pctl(q),
        margins=True,
        margins_name='mean',
    )
    # one pass per axis: two rows means one real entry plus its margin, so keep only the entry
    for _ in range(2):
        if tbl.shape[0] == 2:
            tbl = tbl.head(1)
        tbl = tbl.T
    labelled = tbl.assign(**{val: f"{q}%"}).set_index(val, append=True)
    return labelled.swaplevel(0, 1).round(0).prep().T

R = {styp_code: {
    'proj': pd.concat([pivot(f"styp_code=='{styp_code}' & pred_term=={self.infer}", "pred", q) for q in [25,50,75]], axis=1),
    **{stat: pivot(f"styp_code=='{styp_code}' & pred_term!={self.infer}", stat) for stat in ["err","err%","mse%","f1_inv%"]}
} for styp_code in ["n"]}

R['n']['proj'].disp(100)
R['n']['err%'].disp(100)
# }}
# projections = pd.concat([piv("pred_term == @self.infer & styp_code=='n'", 'pred', q) for q in [25,50,75]], axis=1)
# errors = piv("pred_term != @self.infer & styp_code=='n'", 'err%', 50)
# Q
# M


In [None]:
class _Percentile:
    """Callable returning the p-th percentile of its argument; prints as '<p>%'."""

    def __init__(self, p):
        self.p = p
        # Keep the same __name__ the original lambda was given, so anything that
        # labels callables by __name__ sees an unchanged value.
        self.__name__ = f'{p}%'

    def __call__(self, x):
        return x.quantile(self.p / 100)

    def __repr__(self):
        # str() and f-strings fall back to __repr__, so one method covers all three.
        return self.__name__


def g(p):
    """Return a callable that computes ``x.quantile(p / 100)`` and displays as ``'<p>%'``.

    p: percentile on a 0-100 scale.

    Assigning strings to ``__str__``/``__repr__`` on a function object has no effect,
    because Python looks special methods up on the type, not the instance. A small
    callable class is used instead so that ``str``, ``repr`` and f-strings all
    yield the label.
    """
    return _Percentile(p)
# Bind the helper's result first: `f` exists only as a local inside g(), so
# display(f) / str(f) would otherwise raise NameError on a fresh kernel or
# silently inspect a stale global left over from another cell.
f = g(25)
print(f"{f}")
display(f)
str(f)

In [None]:
# Experiment: try to change how the callable returned by pctl(50) prints.
# NOTE(review): str(), repr() and f-strings look up __str__/__repr__ on the object's
# type, not on the instance, so these two assignments do not change what f'{f}'
# produces (assuming pctl returns an ordinary function or object; confirm).
f = pctl(50)
f.__repr__ = 'a'
f.__str__ = 'a'
f'{f}'
# print(f)
# f.__qualname__
# print(f)

In [None]:
# Check whether the callable returned by pctl(50) exposes a __name__ attribute.
w = pctl(50)
hasattr(w, '__name__')

In [None]:
# Same probe on a plain string: str instances have no __name__, so this evaluates to False.
x = 'hi'
# x.__name__ = x
hasattr(x, '__name__')

In [None]:
# Echo the 'summary' entry of A as the cell output.
A['summary']

In [None]:
def piv(qry, val, q=50):
    """Build a pred_term x train_term table of the q-th percentile of column `val`.

    qry: pandas query string applied to A['summary'] after reset_index().
    val: column to aggregate.
    q:   percentile passed to pctl() to obtain the aggregation function.

    Margins are requested under the label 'mean'. The returned frame has the label
    f"{q}%" as an extra column level and is rounded to 0 decimals.
    """
    filtered = A['summary'].reset_index().query(qry)
    out = filtered.pivot_table(
        values=val,
        index='pred_term',
        columns='train_term',
        aggfunc=pctl(q),
        margins=True,
        margins_name='mean',
    )
    # One pass per axis: an axis with exactly two entries is cut down to its first
    # entry, then the frame is transposed; two transposes restore the orientation.
    for _ in (0, 1):
        trimmed = out.iloc[:1] if len(out) == 2 else out
        out = trimmed.T
    # Add the percentile label as a new leading index level before the final transpose.
    with_label = out.assign(**{val: f"{q}%"}).set_index(val, append=True)
    return with_label.swaplevel(0, 1).round(0).prep().T
# Rebuild the combined results, then derive:
#   Q: 25/50/75th-percentile 'pred' tables for the inferred term, side by side
#   M: median 'err%' table over the non-inferred terms
# (both restricted to styp_code 'n').
A = {
    key: pd.concat([run['rslt'][key] for run in P])
    for key in ['full', 'summary']
}
Q = pd.concat(
    [piv("pred_term == @self.infer & styp_code=='n'", 'pred', pct) for pct in [25, 50, 75]],
    axis=1,
)
M = piv("pred_term != @self.infer & styp_code=='n'", 'err%', 50)
# Bare expressions: the notebook echoes only the last one.
Q
M
# q = Q[0]
# q
# Q[0]
# piv("styp_code=='n'", 'err%')

In [None]:
# Experiment with the labelling step: transpose q, add a constant column a=50, append it
# to the index, swap index levels 0 and 1, and transpose back.
# NOTE(review): no live code visible nearby assigns `q`; it must come from earlier
# notebook state. Confirm before re-running.
q.T.assign(a=50).set_index('a', append=True).swaplevel(0,1).T

In [None]:
# NOTE(review): this rebinds the global A, which other cells index as A['summary'];
# rebuild A before running those cells again.
A = Q[0]
# The renamed result is only echoed, not assigned.
A.rename('a')

In [None]:
# Build B from A['summary']: per (crse, styp_code, train_term, pred_term) group, aggregate
# the four metric columns with `summary`, then keep only the 'pred' rows for the inferred
# term and the 'err%' rows for every other term.
# NOTE(review): .grpby, .prep and `summary` are project helpers not visible here.
B = (
    A['summary']
    .grpby(['crse','styp_code','train_term','pred_term'])
    [['pred','err%','mse%','f1_inv%']]
    .agg(summary)
    # move column level 0 into the index; the new, unnamed level is named 'kind' next
    .stack(0, sort=False)
    .rename_axis(index={None:'kind'})
    .query(f"(pred_term == {self.infer} and kind == 'pred') or (pred_term != {self.infer} and kind == 'err%')")
    .reset_index()
    # .sort_values(['crse','styp_code','pred_term','train_term'],ascending=[True,True,False,False])
    .prep()
)

B

In [None]:
# Default-aggregated 'err%' by train_term (rows) x pred_term (columns), with margins,
# for styp_code 'n' and every pred_term except 202408.
# NOTE(review): 202408 is hard-coded here while other cells filter on self.infer;
# confirm they are meant to be the same term.
# M = A['summary'].query("pred_term!=202408 & styp_code=='n'")['err%'].groupby(['train_term','pred_term']).mean().reset_index()#.unstack()
# M.pivot_table(index='train_term',columns='pred_term', margins=True)

# M
A['summary'].reset_index().query("pred_term!=202408 & styp_code=='n'").pivot_table(index='train_term', columns='pred_term', values='err%', margins=True)

In [None]:
# Show M and B through the project's .disp helper (argument 10; meaning not visible here).
M.disp(10)
B.disp(10)

In [None]:
# Echo the 'summary' entry of A as the cell output.
A['summary']

In [None]:
# Pairwise percentage difference between the per-term values returned by
# self.Z.vc('term_code'): entry (i, j) is (v_i / v_j - 1) * 100, rounded.
# NOTE(review): the n x n result relies on t.values being 2-D with shape (n, 1), so that
# v / v.T broadcasts; if .vc returns a 1-D Series, v.T equals v and the division is
# elementwise instead. Confirm what the project's .vc helper returns.
t = self.Z.vc('term_code')
v = t.values
pd.DataFrame((v / v.T - 1) * 100, index=t.index, columns=t.index).round().prep(0)

In [None]:
# Echo B ordered by train_term then pred_term, both descending, with a fresh index.
B.sort_values(['train_term','pred_term'], ascending=False).reset_index()

In [None]:
# Show M and B through the project's .disp helper (argument 10; meaning not visible here).
M.disp(10)
B.disp(10)

In [None]:
# Echo B as the cell output.
B

In [None]:
# Echo M as the cell output.
M

In [None]:
# Mean 'err%' per (train_term, pred_term) for styp_code 'n', skipping pred_term 202408 and
# the pairs where a term predicts itself; unstack() moves pred_term to the columns.
# NOTE(review): grouping the Series by name assumes train_term and pred_term are index
# levels of A['summary']; confirm. 202408 is hard-coded rather than taken from self.infer.
M = A['summary'].query("pred_term!=202408 & styp_code=='n' & pred_term!=train_term")['err%'].groupby(['train_term','pred_term']).mean().unstack()
M

In [None]:
# Show A['summary'] through the project's .disp helper (argument 500; meaning not visible here).
A['summary'].disp(500)

In [None]:
# Scratch version of the multiplier computation: ratio of credit-hour counts taken from
# the full registration data (`end`) to the counts restricted to the rows of self.Z (`cur`).
# Statement order matters: `end` must be computed before self.Y is rebound below.

# Keep undergraduate rows whose student type is 'n', 'r' or 't'; .copy() detaches the result.
where = lambda x: x.query("levl_code == 'ug' and styp_code in ('n','r','t')").copy()

# catch_warnings(action=...) requires Python 3.11+.
with warnings.catch_warnings(action='ignore'):
    # For k in (0, 1): concatenate term.reg[k] across all terms and turn credit_hr into a
    # boolean flag (missing counts as 0, flag is True when credit_hr > 0).
    self.Y = [pd.concat([term.reg[k] for term in self.term.values()]).assign(credit_hr=lambda x:x['credit_hr'].fillna(0)>0) for k in [0,1]]
    # self.Z = trf.fit_transform(where(self.X).set_index(self.attr, drop=False))
# agg = lambda y, g: y.groupby(g)[['credit_hr']].sum()
# grp = ['styp_code','term_code','crse']
# end = agg(where(self.Y[0]), grp)
# self.Y = [self.Z[[]].join(y.set_index(['pidm','term_code'])[['crse','credit_hr']], how='inner') for y in self.Y]
# cur = agg(self.Y[0], grp)
# M = (end / cur).query("term_code != @self.infer")
# N = M.reset_index().assign(term_code=self.infer).set_index(M.index.names)
# self.mlt = pd.concat([M, N], axis=0).replace(np.inf, pd.NA).squeeze().rename('mlt').prep()
# return self.dump()

# Filter with `where`, then sum the boolean credit_hr flag per group (i.e. count True rows).
# NOTE(review): the parameter name `g` shadows any notebook-level function called g inside this lambda.
agg = lambda y, g: where(y).groupby(g)[['credit_hr']].sum()
grp = ['levl_code','styp_code','term_code','crse']
# Counts from the unrestricted registration data.
end = agg(self.Y[0], grp)
# Rebind self.Y: inner-join each frame's crse/credit_hr (keyed by pidm, term_code) onto the
# column-less index of self.Z, so only rows present in self.Z survive.
self.Y = [self.Z[[]].join(y.set_index(['pidm','term_code'])[['crse','credit_hr']], how='inner') for y in self.Y]
# Same counts after the restriction to self.Z.
# NOTE(review): assumes the grouping/filter keys are resolvable on the joined frame
# (as columns or index levels); confirm.
cur = agg(self.Y[0], grp)
# Ratio of the two counts, excluding the inferred term; a zero in `cur` yields inf here
# (the commented-out version above replaced inf with NA).
M = (end / cur).query("term_code != @self.infer")
M
# agg(self.Y[0], grp).disp(500)