In [1]:
import os
import pandas as pd
import numpy as np
import random
import operator
import functools
import pickle
import json
import re

In [2]:
from pyod.models import cblof
from keras import losses
from sklearn.model_selection import ParameterGrid
from sklearn.cluster import KMeans

In [3]:
import pyreadr

# Load the prepared modelling table from the processed-data directory and
# replace every missing value with 0 (no region subsetting happens here).
dat_dir = 'I:/NCES/NCES_Dev/sunjoo_LEE_MOVE/CRDC_outlier_2021_2/0_processed_data/'
the_data_file = '{}crdc_prepped_formod.csv'.format(dat_dir)
df = pd.read_csv(the_data_file).fillna(0)

In [4]:
# mod_col.txt is JSON of the form {module: [columns in module]}.
# Concatenate the per-module lists, in file order, into one flat column list.
with open('{}mod_col.txt'.format(dat_dir), 'rb') as handle:
    new_mod_dict = json.load(handle)
cols_to_run = []
for module_cols in new_mod_dict.values():
    cols_to_run.extend(module_cols)

In [5]:
# Keep only the module columns, in cols_to_run order.
df = df.loc[:, cols_to_run]

In [6]:
# Detector instances, keyed by the name used in the model specs.
clfs = {'CBLOF': cblof.CBLOF()}

# Base clusterer handed to CBLOF. Its n_clusters (8 here) and random_state are
# overridden per model spec in the scoring loop before fitting.
# NOTE(review): algorithm='full', n_jobs and precompute_distances may not be
# accepted by newer scikit-learn releases -- confirm against the pinned version.
kmeans_estimator = KMeans(algorithm='full', copy_x=True, init='k-means++', max_iter=1000,
                          n_clusters=8, n_init=10, n_jobs=None, precompute_distances='auto',
                          random_state=None, tol=0.0001, verbose=0)

# Hyper-parameter grid (one candidate value per parameter); CBLOF only.
param_grid = {
    'CBLOF': {
        'n_clusters': [20],
        'contamination': [0.01],
        'clustering_estimator': [kmeans_estimator],
        'alpha': [0.9],
        'beta': [5],
        'use_weights': [False],
        'check_estimator': [False],
    }
}

In [7]:
# Build one spec per hyper-parameter combination. Each spec is a single-entry
# dict {"<clf>_<seed>": {...}} holding everything needed to (re)run that model.
all_clf_mods = []
clf_name = 'CBLOF'
parameter_values = param_grid[clf_name]  # lists of candidate values per parameter
for p in ParameterGrid(parameter_values):
    # The randomly drawn seed doubles as the unique suffix of the spec id.
    seed = random.randint(0, 1000000)
    clf_param_id = clf_name + '_' + str(seed)
    clf_param_info = {
        clf_param_id: {
            'params': p,
            'clf': clf_name,
            'seed': seed,
            'modules_done': [],
            'ten_fold_done': 0,
        }
    }
    all_clf_mods.append(clf_param_info)


mod_dir = 'I:/NCES/NCES_Dev/sunjoo_LEE_MOVE/CRDC_outlier_2021_2/2_modeling/'

# Create the output folders on first use.
models_dir = '{}models/'.format(mod_dir)
if not os.path.isdir(models_dir):
    os.mkdir(models_dir)
    print('Made models dir')

results_dir = '{}results/'.format(mod_dir)
if not os.path.isdir(results_dir):
    os.mkdir(results_dir)
    print('Made results dir')

# Never overwrite an existing spec list -- delete the file manually to replace it.
specs_path = '{}models/all_clf_mods.pickle'.format(mod_dir)
if not os.path.isfile(specs_path):
    with open(specs_path, 'wb') as handle:
        pickle.dump(all_clf_mods, handle)
    print('Saved all model specs')
else:
    # The in-memory all_clf_mods was built with freshly drawn seeds and is not
    # reloaded from disk, so it differs from the saved file. Say so explicitly
    # rather than letting an unrecorded seed go unnoticed.
    print('Model specs file already exists and was NOT overwritten; '
          'the in-memory specs (and their seeds) are not saved')

Saved all model specs


In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Columns to standardise: every column except those whose name contains
# '_ind', '_ratio' or '_mean', and except 'pov_per_5-17'.
cols = df.columns
untransformed_markers = ('_ind', '_ratio', '_mean')
all_transform_cols = [
    col for col in cols
    if col != 'pov_per_5-17'
    and not any(marker in col for marker in untransformed_markers)
]

In [15]:
# Per-column emphasis weights, defined in three tiers.
hs_15 = ['sch_grade_09', 'sch_grade_10', 'sch_grade_11', 'sch_grade_12', 'sch_ugdetail_hs']  # weight 1.15
stat_20 = ['sch_status_sped', 'sch_status_charter', 'sch_status_magnet', 'sch_altfocus_pre_mean']  # weight 1.2
imp_25 = ['sch_grade_ps', 'sch_altfocus_post_mean', 'tot_enrl', 'pov_per_5-17']  # weight 1.25
d1 = dict.fromkeys(hs_15, 1.15)
d2 = dict.fromkeys(stat_20, 1.2)
d3 = dict.fromkeys(imp_25, 1.25)

# Merge the tiers into one {column: weight} lookup (later tiers win on clashes).
col_weights = {}
for tier in (d1, d2, d3):
    col_weights.update(tier)

In [10]:
# Ten-fold out-of-fold outlier scoring for every model spec.
#
# For each spec: configure the detector, assign every row of df to one of ten
# folds (reproducibly, from the spec's seed), then for each fold fit the scaler
# and detector on the other nine folds and score the held-out fold. Scores are
# collected in df_scores['score'], positionally aligned with df.
#
# NOTE(review): 'ten_fold_done' is read as a resume point but is never updated
# in the visible code, and df_scores is re-created for every spec, so only the
# last spec's scores survive and a resumed run would leave skipped folds at 0.
for clf_i, mod_dict in enumerate(all_clf_mods):  # list of single-entry dicts
    for clf_param_id, info_dict in mod_dict.items():  # there is only one

        print('')
        print(clf_param_id)
        clf_name = info_dict['clf']  # key into clfs, e.g. 'CBLOF'
        p = info_dict['params']  # dict of params
        clf = clfs[clf_name]  # base classifier (shared instance, reconfigured in place)
        # set params for classifier
        clf.set_params(**p)
        if clf_name in ['CBLOF', 'IForest', 'AUTO_ENC']:
            clf.set_params(random_state=info_dict['seed'])  # use same seed
            if clf_name == 'CBLOF':
                # set params for KMeans in CBLOF (doesn't seem to happen automatically)
                clf.clustering_estimator.set_params(**{'n_clusters': info_dict['params']['n_clusters'],
                                                       'random_state': info_dict['seed']})

        # Fold assignment: one label in 1..10 per row, reproducible from the seed.
        # This runs for every classifier, not only the seeded ones above.
        random.seed(info_dict['seed'])
        samp = [random.randint(1, 10) for x in range(df.shape[0])]

        # Float column so assigning decision scores needs no dtype upcast.
        df_scores = pd.DataFrame({'score': np.zeros(df.shape[0])})

        # Loop-invariant column bookkeeping. ColumnTransformer emits the scaled
        # columns first and then the passthrough remainder in original order;
        # feature_weights follows that same output order.
        transform_cols = [col for col in df.columns if col in all_transform_cols]
        scaled_cols = set(transform_cols)
        output_cols = transform_cols + [col for col in df.columns if col not in scaled_cols]
        # ColumnTransformer's transformer_weights is keyed by transformer name,
        # so a dict keyed by column name is ignored there. Apply the per-column
        # weights directly to the transformed matrix instead (default weight 1).
        feature_weights = np.array([col_weights.get(col, 1.0) for col in output_cols])

        # already done i
        done_i = info_dict['ten_fold_done']
        for i in range(done_i + 1, 11):
            # divide into training (all other folds) and testing (fold i) positions
            idx = [ii for ii, e in enumerate(samp) if e != i]
            idx_t = [ii for ii, e in enumerate(samp) if e == i]

            col_transformer = ColumnTransformer(
                transformers=[('ss', StandardScaler(), transform_cols)],
                remainder='passthrough'
                )

            # fit the scaler on 9/10, apply it to the held-out 1/10, then weight
            X_train_transformed = col_transformer.fit_transform(df.iloc[idx, :]) * feature_weights
            X_test_transformed = col_transformer.transform(df.iloc[idx_t, :]) * feature_weights

            # train
            clf.fit(X_train_transformed)

            # get outlier scores for the held-out tenth of the data
            y_test_scores = clf.decision_function(X_test_transformed)  # outlier scores
            df_scores.iloc[idx_t, 0] = y_test_scores
            print(i)



CBLOF_575580
1
2
3
4
5
6
7
8
9
10


In [11]:
# Persist the out-of-fold outlier scores (row index is written as the first column).
scores_path = '{}results/df_scores.csv'.format(mod_dir)
df_scores.to_csv(scores_path)

### Nearest neighbors

In [6]:
# Region lookup table and the prepped table; combokey and leaid are read as strings.
geo_path = '{}ocr_region.csv'.format(dat_dir)
raw_path = '{}crdc_prepped.csv'.format(dat_dir)
geo = pd.read_csv(geo_path)
raw = pd.read_csv(raw_path, dtype={'combokey': str, 'leaid': str})
print(raw.shape)

(97564, 828)


In [7]:
# Lower-case the state key so it can be joined against raw['state'].
geo['state'] = geo['state'].map(str.lower)

In [8]:
# Attach each row's region via its state. validate='many_to_one' makes the merge
# fail loudly if geo lists a state more than once, which would otherwise
# duplicate rows of raw and break the alignment with df below.
raw = pd.merge(raw, geo, how='left', on='state', validate='many_to_one')
# The concat aligns on the index, so raw and df must have the same number of
# rows for 'region' to land on the right rows.
# NOTE(review): this also presumes raw and df share row order -- confirm.
assert len(raw) == len(df), 'raw and df must have the same number of rows'
df = pd.concat([df, raw['region']], axis=1)
print(raw.shape)

(97564, 829)


In [12]:
# Keep only rows that were matched to a region; raw is no longer needed.
has_region = df['region'].notna()
df = df.loc[has_region]
del raw
print(df.shape)

(97564, 825)


In [18]:
from sklearn import neighbors

# num neighbors
N = 20

# NOTE(review): these names must match the 'region' values from ocr_region.csv
# exactly (including the spelling "San Fransisco") -- confirm against the data.
regs = ["Atlanta", "Seattle", "Denver", "Kansas City", "San Fransisco", "Boston", "Philadelphia", "Chicago", "Dallas", "Cleveland", "New York", "DC"]

# Loop-invariant column bookkeeping. ColumnTransformer emits the scaled columns
# first and then the passthrough remainder in original order; feature_weights
# follows that same output order.
feature_cols = [col for col in df.columns if col != 'region']
transform_cols = [col for col in feature_cols if col in all_transform_cols]
scaled_cols = set(transform_cols)
output_cols = transform_cols + [col for col in feature_cols if col not in scaled_cols]
# ColumnTransformer's transformer_weights is keyed by transformer name, so a
# dict keyed by column name is ignored there. Apply the per-column weights
# directly to the transformed matrix instead (default weight 1).
feature_weights = np.array([col_weights.get(col, 1.0) for col in output_cols])

for reg in regs:
    # feature rows of this region only (region label itself excluded)
    d = df.loc[df['region'] == reg, feature_cols]
    if d.empty:
        # A misspelled or absent region would otherwise fail inside the scaler.
        print('No rows for region {}; skipping'.format(reg))
        continue
    # make transformer
    col_transformer = ColumnTransformer(
                        transformers=[('ss', StandardScaler(), transform_cols)],
                        remainder='passthrough'
                        )
    # scale and weight within the region, then find each row's N nearest neighbours
    X_train_transformed = col_transformer.fit_transform(d) * feature_weights
    neigh = neighbors.NearestNeighbors(n_neighbors=N, n_jobs=-1)
    neigh.fit(X_train_transformed)
    # kneighbors() with no argument queries the fitted rows themselves
    neighs = neigh.kneighbors()
    # neighs[1] holds neighbour indices: row positions within this region's
    # subset d, not labels of df.
    neigh_idx = pd.DataFrame(neighs[1])
    neigh_idx.to_csv('{}results/neigh_{}.csv'.format(mod_dir, reg), index=False)