In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported local
# modules are picked up on the next cell run
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
## Fixing the module import problem
# Append the shared `core` directory to sys.path so that the project-local
# module imported below can be found (presumably `missing_module` lives
# there -- confirm).
# NOTE(review): the path is relative, so it is resolved against the kernel's
# current working directory; start the kernel from this notebook's folder.

import sys
# cwd = '/home/ec2-user/SageMaker/imputation-fairness/data/Adult/code'
cwd = '../../../core'
sys.path.append(cwd)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.decomposition import PCA
import matplotlib.patches as mpatches

from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler

from missing_module import * 

### Pandas / NumPy display: show every row and column, never truncate ###
for _display_option in ('display.max_rows', 'display.max_columns',
                        'display.width', 'display.max_colwidth'):
    # None removes the corresponding display limit.
    pd.set_option(_display_option, None)

# Print NumPy arrays in full instead of summarising them with "...".
np.set_printoptions(threshold=np.inf)

### Suppress every warning for the rest of the session ###
import warnings
warnings.simplefilter('ignore')


In [3]:
# Locations of the two raw CSV extracts, relative to the notebook's working directory.
student_pets_csv = "../data/hsls_17_student_pets_sr_v1_0.csv"
school_csv = "../data/hsls_09_school_v1_0.csv"

# Read each file whole into its own DataFrame.
student_pets = pd.read_csv(student_pets_csv)
school = pd.read_csv(school_csv)

FileNotFoundError: [Errno 2] No such file or directory: '../data/hsls_17_student_pets_sr_v1_0.csv'

In [5]:
# Sanity check: (rows, columns) of the student-level table loaded above.
student_pets.shape

(23503, 9614)

In [6]:
# Sanity check: (rows, columns) of the school-level table loaded above.
school.shape

(944, 687)

In [4]:
# Test-score columns; the preprocessing cells build `gradebin` from X1TXMSCR
# and drop both before modelling.
grade_vars = ['X1TXMSCR', 'X2TXMSCR']

# Columns grouped under "student" (X1* and S1* names).
student_vars = [
    'X1SEX', 'X1RACE',
    'X1MTHID', 'X1MTHUTI', 'X1MTHEFF', 'X1MTHINT',
    'X1PAR1EDU', 'X1PAR2EDU', 'X1PAR1OCC2', 'X1PAR2OCC2', 'X1FAMINCOME',
    'S1ENGCOMP', 'S1MTHCOMP', 'S1SCICOMP', 'S1APCALC', 'S1IBCALC',
]

# Columns grouped under "parent" (P1* names).
parent_vars = [
    'P1JOBNOW1', 'P1JOBONET1_STEM1', 'P1JOBONET2_STEM1',
    'P1MTHHWEFF', 'P1SCIHWEFF', 'P1ENGHWEFF',
    'P1MTHCOMP', 'P1SCICOMP', 'P1ENGCOMP',
    'P1CAMPMS', 'P1CAMPOTH', 'P1NOOUTSCH', 'P1MUSEUM',
    'P1COMPUTER', 'P1FIXED', 'P1SCIFAIR', 'P1SCIPROJ', 'P1STEMDISC',
    'P1EDUASPIRE', 'P1EDUEXPECT',
]

# Full selection, in the order: scores, student columns, parent columns.
all_vars = [*grade_vars, *student_vars, *parent_vars]

In [5]:
# Number of columns listed in student_vars.
len(student_vars)

16

<br/>

## Preprocessing Steps (Race)

In [8]:
## Selecting relevant features ##
# Values <= -7 are treated as missing (presumably the survey's reserve codes
# -- confirm against the codebook). `.mask` returns a new frame, so nothing is
# assigned into a selection taken from `student_pets`.
df = student_pets[all_vars]
df = df.mask(df <= -7)

## Dropping rows that are missing race, sex, or the X1TXMSCR score ##
has_required = df[['X1RACE', 'X1SEX', 'X1TXMSCR']].notna().all(axis=1)
# Explicit copy: the column assignments below must write to an independent
# frame rather than to a filtered selection (SettingWithCopy pattern).
df = df.loc[has_required].copy()

## Creating racebin & gradebin variable ##
# racebin = 1 where X1RACE == 8, else 0 (NOTE(review): confirm which group
# code 8 denotes in the codebook).
df['racebin'] = (df['X1RACE'] == 8).astype(int)
# gradebin = 1 when the score is strictly above the median of the retained rows.
df['gradebin'] = (df['X1TXMSCR'] > df['X1TXMSCR'].median()).astype(int)

## Dropping race and 12th grade data just to focus on the 9th grade prediction ##
# X1TXMSCR is dropped as well because gradebin was derived from it.
df = df.drop(columns=['X1RACE', 'X1TXMSCR', 'X2TXMSCR'])

## Scaling ##
# Min-max scale every remaining column; rebuild the DataFrame so column names
# and the row index survive the round trip through a NumPy array.
scaler = MinMaxScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

## Balancing data to have roughly equal racebin=0 and racebin=1 ##
df = balance_data(df, 'racebin')

df.describe()

Unnamed: 0,X1SEX,X1MTHID,X1MTHUTI,X1MTHEFF,X1MTHINT,X1PAR1EDU,X1PAR2EDU,X1PAR1OCC2,X1PAR2OCC2,X1FAMINCOME,S1ENGCOMP,S1MTHCOMP,S1SCICOMP,S1APCALC,S1IBCALC,P1JOBNOW1,P1JOBONET1_STEM1,P1JOBONET2_STEM1,P1MTHHWEFF,P1SCIHWEFF,P1ENGHWEFF,P1MTHCOMP,P1SCICOMP,P1ENGCOMP,P1CAMPMS,P1CAMPOTH,P1NOOUTSCH,P1MUSEUM,P1COMPUTER,P1FIXED,P1SCIFAIR,P1SCIPROJ,P1STEMDISC,P1EDUASPIRE,P1EDUEXPECT,racebin,gradebin
count,19180.0,18924.0,16757.0,16715.0,16403.0,14654.0,11441.0,14145.0,11056.0,14654.0,18541.0,18521.0,18463.0,18227.0,17882.0,13981.0,13909.0,10792.0,13662.0,13637.0,13647.0,12901.0,12877.0,12880.0,13422.0,13422.0,13422.0,13492.0,13492.0,13492.0,13492.0,13492.0,13492.0,13643.0,13684.0,19180.0,19180.0
mean,0.492857,0.50739,0.726989,0.652691,0.551132,0.331036,0.322583,0.464851,0.505275,0.298405,0.369074,0.489552,0.509979,0.476162,0.658465,0.721551,0.059801,0.049553,0.477529,0.372553,0.288525,0.558193,0.540654,0.350427,0.04217,0.261809,0.126807,0.532315,0.858361,0.450341,0.179884,0.398755,0.659057,0.801015,0.637942,0.5,0.496559
std,0.499962,0.287366,0.207121,0.219412,0.218443,0.230181,0.248316,0.303299,0.347508,0.252946,0.21767,0.23031,0.215575,0.347214,0.332682,0.448252,0.157915,0.144791,0.39054,0.342972,0.335684,0.204574,0.168964,0.198086,0.200983,0.439636,0.332769,0.498973,0.348693,0.497546,0.384105,0.48966,0.474044,0.250841,0.261322,0.500013,0.500001
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.332378,0.582988,0.519824,0.396476,0.166667,0.166667,0.227273,0.136364,0.083333,0.25,0.5,0.5,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.75,0.5,0.0,0.0
50%,0.0,0.504298,0.701245,0.665198,0.5837,0.333333,0.166667,0.454545,0.590909,0.25,0.5,0.5,0.5,0.666667,0.666667,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.5,0.5,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.7,0.5,0.0
75%,1.0,0.667622,0.914938,0.764317,0.687225,0.5,0.5,0.727273,0.818182,0.416667,0.5,0.5,0.5,0.666667,1.0,1.0,0.0,0.0,1.0,0.5,0.5,0.75,0.5,0.5,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.9,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


<br/>

## Preprocessing Steps (Sex)

* `X1SEX` is coded 1 = Male, 2 = Female in the raw data. It is recoded into the binary column `sexbin`: 0 = Female, 1 = Male.

In [18]:
## Selecting relevant features ##
# Values <= -7 are treated as missing (presumably the survey's reserve codes
# -- confirm against the codebook). `.mask` returns a new frame, so nothing is
# assigned into a selection taken from `student_pets`.
df = student_pets[all_vars]
df = df.mask(df <= -7)

## Dropping rows that are missing race, sex, or the X1TXMSCR score ##
has_required = df[['X1RACE', 'X1SEX', 'X1TXMSCR']].notna().all(axis=1)
# Explicit copy: the column assignments below must write to an independent
# frame rather than to a filtered selection (SettingWithCopy pattern).
df = df.loc[has_required].copy()

## Creating sexbin & gradebin variable (9thgrade) ##
# sexbin = 1 where X1SEX == 1, else 0.
df['sexbin'] = (df['X1SEX'] == 1).astype(int)
# gradebin = 1 when the score is strictly above the median of the retained rows.
df['gradebin'] = (df['X1TXMSCR'] > df['X1TXMSCR'].median()).astype(int)

## Dropping sex and 12th grade data just to focus on the 9th grade prediction ##
# X1TXMSCR is dropped as well because gradebin was derived from it.
# X1RACE stays in the frame as an ordinary feature.
df = df.drop(columns=['X1SEX', 'X1TXMSCR', 'X2TXMSCR'])

## Scaling ##
# Min-max scale every remaining column; rebuild the DataFrame so column names
# and the row index survive the round trip through a NumPy array.
scaler = MinMaxScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

## Balancing data to have roughly equal sexbin=0 and sexbin=1 ##
df = balance_data(df, 'sexbin')

df.describe()

Unnamed: 0,X1RACE,X1MTHID,X1MTHUTI,X1MTHEFF,X1MTHINT,X1PAR1EDU,X1PAR2EDU,X1PAR1OCC2,X1PAR2OCC2,X1FAMINCOME,S1ENGCOMP,S1MTHCOMP,S1SCICOMP,S1APCALC,S1IBCALC,P1JOBNOW1,P1JOBONET1_STEM1,P1JOBONET2_STEM1,P1MTHHWEFF,P1SCIHWEFF,P1ENGHWEFF,P1MTHCOMP,P1SCICOMP,P1ENGCOMP,P1CAMPMS,P1CAMPOTH,P1NOOUTSCH,P1MUSEUM,P1COMPUTER,P1FIXED,P1SCIFAIR,P1SCIPROJ,P1STEMDISC,P1EDUASPIRE,P1EDUEXPECT,sexbin,gradebin
count,21114.0,20835.0,18524.0,18478.0,18123.0,16186.0,12696.0,15672.0,12303.0,16186.0,20425.0,20406.0,20344.0,20090.0,19714.0,15443.0,15416.0,12018.0,15101.0,15076.0,15083.0,14256.0,14231.0,14227.0,14829.0,14829.0,14829.0,14905.0,14905.0,14905.0,14905.0,14905.0,14905.0,15077.0,15124.0,21114.0,21114.0
mean,0.753833,0.506687,0.724827,0.652347,0.549939,0.333663,0.326533,0.46117,0.500103,0.303467,0.368348,0.489525,0.510052,0.477949,0.660427,0.725442,0.05988,0.049491,0.479637,0.371551,0.284725,0.55873,0.541283,0.350179,0.040866,0.267921,0.123879,0.536598,0.862865,0.450252,0.176048,0.394498,0.660718,0.798435,0.635077,0.5,0.500095
std,0.30965,0.287474,0.206867,0.219082,0.218394,0.228838,0.248195,0.303113,0.348404,0.25401,0.216759,0.229229,0.214467,0.346621,0.331494,0.446306,0.157889,0.14438,0.390405,0.34052,0.332708,0.202148,0.166792,0.196318,0.197986,0.442891,0.329454,0.498675,0.344002,0.497536,0.380874,0.488759,0.473482,0.25082,0.260517,0.500012,0.500012
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.571429,0.332378,0.582988,0.519824,0.396476,0.166667,0.166667,0.227273,0.136364,0.083333,0.25,0.5,0.5,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.75,0.5,0.0,0.0
50%,1.0,0.504298,0.665975,0.665198,0.579295,0.333333,0.166667,0.454545,0.545455,0.25,0.5,0.5,0.5,0.666667,0.666667,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.5,0.5,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.7,0.5,1.0
75%,1.0,0.667622,0.914938,0.764317,0.687225,0.5,0.5,0.727273,0.818182,0.416667,0.5,0.5,0.5,0.666667,1.0,1.0,0.0,0.0,1.0,0.5,0.5,0.75,0.5,0.5,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.9,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
## Writing `df` out as a pickle in the format used for the fair ensemble tree ##
# `df` is whatever the most recently run preprocessing cell produced; because
# the sensitive attribute passed here is 'sexbin', run the "Sex" preprocessing
# cell immediately before this one.
# df_ms_to_pickle is not defined in this notebook (presumably it comes from
# the star import of missing_module -- confirm).
df_ms_to_pickle(
    df,
    sens_attr='sexbin',
    filename='hsls_missing_sex.pkl',
)