#### This notebook runs the r2 feature selection function to determine the best features to use in the clustering models. I use the 2015-2016 SQR data to inform feature selection in order to prevent overfitting the data.  

In [1]:
import pandas as pd
import numpy as np
from r2FeatureSelection import r2FeatureSelection
from sklearn.preprocessing import Imputer

##### For this first set of analyses, I chose to drop records with missing (NA) values on the target variables rather than impute a mean or median value across missing records. Schools with cohort sizes of fewer than 20 students are not reported in the NYC data, so imputing values for these schools would overestimate the size of the effect between the features and the various targets. 

In [20]:
# Load the full SQR analysis table and take an independent copy of the
# 2015-2016 school-year rows.
data = pd.read_csv("data/sqrAnalysisData.csv")
isTargetYear = data["schoolYear"] == '2015_2016'
sy = data.loc[isTargetYear].copy()

# Feature columns used as predictors; this list is also passed to
# r2FeatureSelection as featureNames.
syFeatureNames = [
    'averageGrade8EnglishProficiency',
    'averageGrade8MathProficiency',
    'percentEnglishLanguageLearners',
    'percentStudentswithDisabilities',
    'percentSelfContained',
    'economicNeedIndex',
    'percentinTempHousing',
    'percentHRAEligible',
    'percentAsian',
    'percentBlack',
    'percentHispanic',
    'percentWhite',
]

# Outcome columns. A row is kept only when all four are populated
# (dropna's default is to drop a row if any listed column is missing).
syTargetNames = [
    '4YearGraduationRate',
    'collegeandCareerPreparatoryCourseIndex',
    'metricCityRating4YearGraduationRateBlackorHispanicMalesinLowestThirdCitywide',
    '4YearGraduationRateEnglishLanguageLearners',
]
naDropSy = sy.dropna(subset=syTargetNames)

In [22]:
# Fill missing feature values with the mean of each column. The columns are
# selected with syFeatureNames so the resulting matrix uses exactly the same
# columns, in the same order, as the names later passed to r2FeatureSelection.
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# on newer versions use sklearn.impute.SimpleImputer(missing_values=np.nan,
# strategy='mean') and update the import cell accordingly.
imp = Imputer(missing_values='NaN', strategy='mean')
syFeatures = imp.fit_transform(naDropSy.loc[:, syFeatureNames])

# The imputer discards any column that has no observed values at fit time,
# which would silently shift the pairing between matrix columns and feature
# names. Fail loudly instead of producing mislabeled results.
assert syFeatures.shape[1] == len(syFeatureNames), (
    "Imputer dropped one or more all-missing feature columns; "
    "syFeatures no longer lines up with syFeatureNames"
)

In [25]:
# Pull each outcome column out of the NA-filtered frame; these are the
# targets handed to r2FeatureSelection, one model per target.
gradRate = naDropSy['4YearGraduationRate']
ccpci = naDropSy['collegeandCareerPreparatoryCourseIndex']
bhLowestThirdGradRate = naDropSy[
    'metricCityRating4YearGraduationRateBlackorHispanicMalesinLowestThirdCitywide'
]
ellGradRate = naDropSy['4YearGraduationRateEnglishLanguageLearners']

In [26]:
# Run the feature-selection routine once per target. Every run shares the
# imputed feature matrix and its column names; only the target changes.
sharedArgs = dict(data=syFeatures, featureNames=syFeatureNames)

gradModel = r2FeatureSelection(target=gradRate, **sharedArgs)
ccpciModel = r2FeatureSelection(target=ccpci, **sharedArgs)
bhModel = r2FeatureSelection(target=bhLowestThirdGradRate, **sharedArgs)
ellModel = r2FeatureSelection(target=ellGradRate, **sharedArgs)

In [None]:
# TODO: create a function that drops rows with missing values separately for each target, rather than dropping every row that is missing any one of the targets