In [1]:
import os
import pandas as pd
import numpy as np
import random
import operator
import functools
import json
import pickle
import re
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# data
dat_dir = 'I:/NCES/NCES_Dev/sunjoo_LEE_MOVE/CRDC_outlier_2021_2/0_processed_data/'
the_data_file = '{}crdc_prepped_formod.csv'.format(dat_dir)
dfa = pd.read_csv(the_data_file, dtype={'combokey':str, 'leaid':str})

In [3]:
# module: columns in module
with open('{}mod_col.txt'.format(dat_dir), 'rb') as handle:
    new_mod_dict = json.load(handle)

In [4]:
from pyod.models import cblof
from keras import losses
from sklearn.model_selection import ParameterGrid
from sklearn.cluster import KMeans

In [5]:
# read in param_grid_all
import param_raw

param_grid = {}

# classifiers
clfs = {'CBLOF': cblof.CBLOF()}

# only these classifiers in this notebook
clf_this_nb = ['CBLOF']
for c in clf_this_nb:
    param_grid[c] = param_raw.param_grid_all[c]

In [6]:
# create list of params to go through and update during modeling -- this has all the specifications for each model!
all_clf_mods = []
for clf_name in clf_this_nb:
    parameter_values = param_grid[clf_name] #creates a set of params for each combination in hyper-param lists
    for p in ParameterGrid(parameter_values):
        seed = random.randint(0, 1000000)
        clf_param_id = clf_name + '_' + str(seed)
        clf_param_info = {clf_param_id: {}} #create dictionary to keep track of everything about the clf and particular params
        clf_param_info[clf_param_id]['params'] = p
        clf_param_info[clf_param_id]['clf'] = clf_name
        clf_param_info[clf_param_id]['seed'] = seed
        clf_param_info[clf_param_id]['modules_done'] = []
        clf_param_info[clf_param_id]['ten_fold_done'] = 0
        all_clf_mods.append(clf_param_info)

In [7]:
mod_dir = 'I:/NCES/NCES_Dev/sunjoo_LEE_MOVE/CRDC_outlier_2021_2/2_modeling/'

if not os.path.isdir('{}models/'.format(mod_dir)):
    os.mkdir('{}models/'.format(mod_dir))
    print('Made models dir')

if not os.path.isdir('{}results/'.format(mod_dir)):
    os.mkdir('{}results/'.format(mod_dir))    
    print('Made results dir')

# this is to prevent accidentally overwriting models list -- manually go and delete it first if you want to replace it
if not os.path.isfile('{}models/all_clf_mods_bymod.pickle'.format(mod_dir)):    
    with open('{}models/all_clf_mods_bymod.pickle'.format(mod_dir), 'wb') as handle:
        pickle.dump(all_clf_mods, handle)
    print('Saved all model specs')

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# do not transform ratios and indicators
cols = dfa.columns
all_transform_cols = [col for col in cols if '_ind' not in col and '_ratio' not in col and '_mean' not in col and col!='pov_per_5-17']

In [9]:
# weights for some variables
hs_15 = ['sch_grade_09', 'sch_grade_10', 'sch_grade_11', 'sch_grade_12', 'sch_ugdetail_hs'] #each 1.15 weight
stat_20 = ['sch_status_sped','sch_status_charter','sch_status_magnet', 'sch_altfocus_pre_mean'] #each 1.2 weight
imp_25 = ['sch_grade_ps','sch_altfocus_post_mean','tot_enrl','pov_per_5-17'] #each 1.25 weight
d1 = {key:1.15 for key in hs_15}
d2 = {key:1.2 for key in stat_20}
d3 = {key:1.25 for key in imp_25}
col_weights = {**d1, **d2, **d3}

In [None]:
# reopen to pick up where you left off
with open('{}models/all_clf_mods_bymod.pickle'.format(mod_dir), 'rb') as handle:
    all_clf_mods = pickle.load(handle)
    
# run, baby, run
for clf_i, mod_dict in enumerate(all_clf_mods): #this is a list of dictionaries
    for clf_param_id, info_dict in mod_dict.items(): #there is only one
        if len(info_dict['modules_done']) == len(new_mod_dict): #this classifier with params is already done
            continue
            
        else:
            print('')
            print(clf_param_id)
            clf_name = info_dict['clf'] #COPOD
            p = info_dict['params'] #dict of params
            clf = clfs[clf_name] #base classifier
            # set params for classifier
            clf.set_params(**p)
            if clf_name in ['CBLOF','IForest','AUTO_ENC']:
                clf.set_params(random_state=info_dict['seed']) #use same seed
                if clf_name == 'CBLOF':	
                    #set params for KMeans in CBLOF (doesn't seem to happen automatically)	
                    clf.clustering_estimator.set_params(**{'n_clusters': info_dict['params']['n_clusters'], 'random_state':info_dict['seed']})	

            
            # check if this is a new model with no runs -- make new df_scores
            if (len(info_dict['modules_done']) == 0) and (info_dict['ten_fold_done'] == 0):
                df_scores = pd.DataFrame()
                print('Made new df_scores for', clf_param_id)

            # otherwise add to old dataframe
            else:
                df_scores = pd.read_csv('{}results/df_scores_{}.csv'.format(mod_dir, clf_param_id))
            
            # subset to only modules not run yet
            modules_to_run = [m for m in new_mod_dict.keys() if m not in info_dict['modules_done']]
            
            for mod in modules_to_run:
                print('')
                print('NO', mod)
                
                # read-in data
                dfa = pd.read_csv(the_data_file, dtype={'combokey':str, 'leaid':str})
                ids = dfa.iloc[:,:2] # index, school and lea id
                dfa = dfa.iloc[:,2:] # just data
                new_col = dfa.columns # all cols in new data (aggregated cols)

                # if first of 10, initialize all rows to 0
                if info_dict['ten_fold_done'] == 0:
                    df_scores['NO-'+ mod] = [0] * dfa.shape[0]
                    print('Initialized ', clf_param_id, 'NO', mod, 'with 0s')
                
                # take out columns for one module
                cols_in_mod = new_mod_dict[mod]
                cols_to_run = [c for c in dfa.columns if c not in cols_in_mod]
                
                df = dfa[cols_to_run]
                print('Cols to run length:', len(cols_to_run))
                del dfa

                # divide into 10
                random.seed(info_dict['seed'])
                samp = [random.randint(1,10) for x in range(df.shape[0])]
                
                # already done i
                done_i = info_dict['ten_fold_done']
                for i in range(done_i+1, 11):
                    print('Running', clf_param_id, 'NO', mod, i)
                    # divide into training and testing
                    idx = [ii for ii,e in enumerate(samp) if e!=i]
                    idx_t = [ii for ii in range(df.shape[0]) if ii not in idx]
                    
                    # get cols to transform
                    transform_cols = [col for col in df.columns if col in all_transform_cols]
                    col_transformer = ColumnTransformer(
                        transformers=[('ss', StandardScaler(), transform_cols)],
                        remainder='passthrough',
                        transformer_weights=col_weights
                        )
                    
                    # train on 9/10, fit on 1/10
                    X_train = df.iloc[idx,:]
                    X_train_transformed = col_transformer.fit_transform(X_train)
                    del X_train
                    X_test = df.iloc[idx_t,:]
                    X_test_transformed = col_transformer.transform(X_test)
                    del X_test
                    # train
                    clf.fit(X_train_transformed)

                    # get outlier scores for last tenth of data
                    y_test_scores = clf.decision_function(X_test_transformed)  # outlier scores
                    df_scores.iloc[idx_t, df_scores.shape[1]-1] = y_test_scores

                    # save outlier scores
                    df_scores.to_csv('{}results/df_scores_{}.csv'.format(mod_dir, clf_param_id), index=False)

                    # save modeling done - i
                    info_dict['ten_fold_done'] = i
                    
                    if clf_name == 'CBLOF':
                        print('Cluster sizes:', list(clf.cluster_sizes_))
                        
                        if 'inertia' in info_dict.keys():
                            info_dict['inertia'].append(clf.clustering_estimator_.inertia_)
                        else:
                            info_dict['inertia'] = [clf.clustering_estimator_.inertia_]
                    
                    if i == 10:
                        # save modeling done - mod
                        info_dict['modules_done'].append(mod)
                        info_dict['ten_fold_done'] = 0

                    
                    all_clf_mods[clf_i] = {clf_param_id: info_dict}
                    with open('{}models/all_clf_mods_bymod.pickle'.format(mod_dir),'wb') as handle:
                        pickle.dump(all_clf_mods, handle)


CBLOF_50035

NO PSCH
Initialized  CBLOF_50035 NO PSCH with 0s
Cols to run length: 815
Running CBLOF_50035 NO PSCH 1
Cluster sizes: [3588, 523, 43822, 865, 1923, 97, 4322, 4, 389, 13, 3, 261, 2350, 298, 60, 23102, 11, 511, 172, 5544]
Running CBLOF_50035 NO PSCH 2
Cluster sizes: [18138, 16, 3, 509, 90, 994, 37775, 2, 95, 2100, 239, 6108, 7589, 6, 237, 983, 3313, 399, 8132, 1147]
Running CBLOF_50035 NO PSCH 3
Cluster sizes: [7355, 2570, 189, 22287, 151, 3637, 43463, 3, 289, 129, 42, 2603, 1110, 1471, 258, 254, 1, 257, 18, 1790]
Running CBLOF_50035 NO PSCH 4
