In [1]:
import os
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
def import_data(file_path, cols, index_col="STU_ID", low_memory=False):
    """Load selected columns of a CSV file into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.
    cols : list of str (or any other value accepted as ``usecols``)
        Columns to keep. The object passed in is never modified. When it is
        a list/tuple of column names and ``index_col`` is a column name that
        is missing from it, ``index_col`` is added to the columns that are
        read so the index can still be built.
    index_col : str, optional
        Column to use as the row index. Defaults to ``"STU_ID"``.
    low_memory : bool, optional
        Forwarded to ``pandas.read_csv``. Defaults to ``False``.

    Returns
    -------
    pandas.DataFrame
        The requested columns, indexed by ``index_col``.
    """
    usecols = cols
    names_only = isinstance(cols, (list, tuple)) and all(
        isinstance(c, str) for c in cols
    )
    if isinstance(index_col, str) and names_only and index_col not in cols:
        # read_csv cannot build the index from a column that usecols has
        # filtered out, so make sure the index column is actually read.
        usecols = [index_col, *cols]

    return pd.read_csv(
        file_path, index_col=index_col, usecols=usecols, low_memory=low_memory
    )

## Import Data

In [3]:
# Path to the ELS:2002 student-level CSV, relative to the working directory.
els_file_path = os.path.join("data", "ELS-2002", "els_02_12_byf3pststu_v1_0.csv")
# nels_file_path = os.path.join("data", "NELS-1988", "nels_88_00_byf4stu_v1_0.csv")

# Columns to load. "STU_ID" is listed first because it becomes the index;
# the remaining names are in the order the final frame should have.
cols = [
    "STU_ID",  # index
    # parameters to input into the model
    "BYSEX", "BYRACE", "BYSTLANG", "BYPARED", "BYINCOME",
    "BYURBAN", "BYREGION", "BYRISKFC", "BYHMWRK", "BYWRKHRS",
    "BYS42", "BYS43", "BYTVVIGM", "BYS46B", "BYS44C",
    "BYS20E", "BYS87C", "BYS20D", "BYS23C", "BYS37",
    "BYS27I", "BYS90D", "BYS38A", "BYS20J", "BYS24C",
    "BYS24D", "BYS54I", "BYS84D", "BYS84I", "BYS85A",
    # parameters to predict
    "F2HSSTAT", "F2EVERDO", "F1RGPP2",
]

df = import_data(els_file_path, cols)

# "STU_ID" is now the index rather than a column, so take it out of the list
# before using the list to put the columns in the order given above.
del cols[cols.index("STU_ID")]
df = df.loc[:, cols]
df

Unnamed: 0_level_0,BYSEX,BYRACE,BYSTLANG,BYPARED,BYINCOME,BYURBAN,BYREGION,BYRISKFC,BYHMWRK,BYWRKHRS,...,BYS20J,BYS24C,BYS24D,BYS54I,BYS84D,BYS84I,BYS85A,F2HSSTAT,F2EVERDO,F1RGPP2
STU_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101101,2,5,1,5,10,1,1,2,7,6,...,3,3,1,3,1,1,4,1,0,2
101102,2,2,0,5,11,1,1,0,5,0,...,3,2,1,2,1,1,4,1,0,4
101104,2,7,1,2,10,1,1,-9,-9,0,...,-9,2,1,2,1,1,4,1,0,4
101105,2,3,1,2,2,1,1,-4,11,0,...,-9,3,1,1,1,1,4,1,0,4
101106,2,4,0,1,6,1,1,2,10,3,...,3,2,1,3,1,1,4,1,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461230,2,4,0,2,9,1,3,-4,4,7,...,3,2,1,3,0,1,1,3,0,2
461231,2,4,0,3,3,1,3,5,6,0,...,4,3,1,3,0,1,3,8,1,3
461232,2,5,0,1,5,1,3,3,4,0,...,3,3,1,2,1,1,4,8,1,2
461233,2,4,0,1,6,1,3,4,25,4,...,3,3,1,2,0,1,3,3,0,5


# Clean data

### Anyone meeting any of the following criteria is coded as 1 (i.e., will graduate high school or earn an equivalent); these are `F2HSSTAT` codes 1–8
* high school graduate
* enrolled in high school or working towards GED
* received GED or certificate of attendance

### Anyone meeting any of the following criteria is coded as 0 (i.e., will not graduate high school or earn an equivalent); these are `F2HSSTAT` codes 9–10
* no diploma, not in high school, not pursuing GED
* status undetermined

In [4]:
# Collapse the F2HSSTAT status codes into a binary label: codes 1-8 become 1
# and codes 9-10 become 0. Any other value is left untouched by this step.
df = df.assign(
    F2HSSTAT=lambda frame: (
        frame["F2HSSTAT"]
        .replace(to_replace=list(range(1, 9)), value=1)
        .replace(to_replace=[9, 10], value=0)
    )
)

### Convert all missing, unknown, skipped, and nonrespondent codes (any negative value, or 97–99) to a value of -1. These values will be imputed using scikit-learn.

In [5]:
# Overwrite, in place and column by column, every value that is negative or
# lies in the inclusive range 97-99 with the single placeholder -1.
# NOTE(review): the 97-99 rule is applied to every column, including ones
# that may hold genuine counts in that range - confirm against the codebook.
for col in df.columns:
    values = df[col]
    is_missing = values.lt(0) | values.between(97, 99)
    df.loc[is_missing, col] = -1

In [6]:
# Write the cleaned frame to the working directory; the index is written as
# the first column of the file.
df.to_csv(path_or_buf="clean_student_data.csv", index=True)