In [11]:
import pandas as pd
import numpy as np
import sys

sys.path.append('data/processing')

from utilities import get_variables

import seaborn as sns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor


from util.cleaning import agg_race_cah, agg_hispanicity, filter_na_response, has_bachelors, get_race, get_race_ta17, is_hispanic_cah, get_env_type, get_region, live_w_both_parents, is_race

In [12]:
from os import environ
from sqlalchemy import create_engine

# Connection string for the local PSID database.
# The dialect must be spelled "postgresql": SQLAlchemy 1.4+ removed the old
# "postgres" alias and raises NoSuchModuleError for it.
# PSID_DB_URI may be set in the environment to point at a different database
# without editing the notebook; the default matches the original local setup.
uri = environ.get("PSID_DB_URI", "postgresql+psycopg2://zhou@localhost:5432/psid")
engine = create_engine(uri, echo=False)

```sql
SELECT * from ind17 
LEFT JOIN child02 on child02.indid01=ind17.indid01 AND child02.famid01=ind17.famid01 
LEFT JOIN assess ON assess.indid01 = ind17.indid01 AND assess.famid01=ind17.famid01 
LEFT JOIN demog ON demog.indid01 = ind17.indid01 AND demog.famid01=ind17.famid01 
LEFT JOIN fam01 ON fam01.indid01 = ind17.indid01 AND fam01.famid68=ind17.famid68
WHERE ind17.indid01<>0 AND ind17.cds_interview=1
```

In [13]:
# One wide row per ind17 respondent: every other table is LEFT JOINed onto
# ind17, so respondents without a match in a given table are kept (with NULLs).
# Only rows with a non-zero indid01 and cds_interview = 1 are selected.
# The query is assembled from adjacent string literals; each fragment ends with
# a single space so the final text is one continuous SQL statement.
sql_get_join_data = (
    "SELECT * FROM ind17 "
    "LEFT JOIN child02 on child02.indid01 = ind17.indid01 AND child02.famid01 = ind17.famid01 "
    "LEFT JOIN assess ON assess.indid01 = ind17.indid01 AND assess.famid01 = ind17.famid01 "
    "LEFT JOIN demog ON demog.indid01 = ind17.indid01 AND demog.famid01 = ind17.famid01 "
    "LEFT JOIN fam01 ON fam01.famid01 = ind17.famid01 AND fam01.famid68 = ind17.famid68 "
    "LEFT JOIN ta17 ON ta17.famid17 = ind17.famid17 AND ta17.indid17 = ind17.indid17 "
    "LEFT JOIN fam_map ON fam_map.famid68 = ind17.famid68 AND fam_map.indid68 = ind17.indid68 "
    "LEFT JOIN pcg02 ON pcg02.famid01 = ind17.famid01 AND pcg02.indid01 = ind17.indid01 "
    "LEFT JOIN wlth01 ON wlth01.famid01 = ind17.famid01 "
    "WHERE ind17.indid01<>0 AND ind17.cds_interview=1"
)

# SELECT * across the joins returns the shared key columns once per table;
# the duplicates are still present in this frame.
prelim_data_df = pd.read_sql_query(sql=sql_get_join_data, con=engine)

In [14]:
# Keep only the first occurrence of each column label; later columns with a
# repeated name are dropped.
is_first_occurrence = ~prelim_data_df.columns.duplicated(keep="first")
prelim_data_df = prelim_data_df.loc[:, is_first_occurrence]

In [15]:
# Load the whole cah table, then collapse it to one row per
# (famid68, indid68) pair using the project's aggregation helpers.
get_cah = "SELECT * FROM cah"
cah_data_df = pd.read_sql_query(get_cah, con=engine)

# hispanicity has its own aggregator; the three race-code columns share one.
cah_aggregators = {"hispanicity": agg_hispanicity}
for race_col in ("race_code_cah_1", "race_code_cah_2", "race_code_cah_3"):
    cah_aggregators[race_col] = agg_race_cah

cah_by_person = cah_data_df.groupby(["famid68", "indid68"])
cah_data_df_grouped = cah_by_person.agg(cah_aggregators).reset_index()

In [16]:
# Attach the aggregated cah columns to every respondent. A left join keeps
# respondents with no cah record; their cah columns are filled with NaN.
data_df = pd.merge(
    prelim_data_df,
    cah_data_df_grouped,
    how="left",
    on=["famid68", "indid68"],
)

In [17]:
# filter_na_response is evaluated once per row; rows for which it returns
# a truthy mask value are kept.
rows_to_keep = data_df.apply(filter_na_response, axis=1)
data_df = data_df[rows_to_keep]

In [18]:

# data_df is the result of a boolean filter; take an explicit copy so the
# column assignments below write to an independent frame rather than to a
# slice (avoids SettingWithCopyWarning and ambiguous writes).
data_df = data_df.copy()


def _is_missing(value):
    """Return True when `value` is None, pd.NA, or a float NaN."""
    return value is None or value is pd.NA or (isinstance(value, float) and np.isnan(value))


def _missing_to_empty_list(value):
    """Replace a missing cell with an empty list; pass real values through unchanged."""
    return [] if _is_missing(value) else value


# Respondents with no cah record get NaN in these columns after the left merge.
# Normalise those to empty lists while keeping real aggregated values.
# (The previous `x if not x else []` was inverted: it replaced every truthy
# value -- including the actual race codes -- with [].)
for cah_col in ("race_code_cah_1", "race_code_cah_2", "race_code_cah_3"):
    data_df[cah_col] = data_df[cah_col].apply(_missing_to_empty_list)

# Decode the raw race codes with the project helpers.
data_df["race"] = data_df["race_code"].apply(get_race)
data_df["race17_1"] = data_df["race_code17_1"].apply(get_race_ta17)
data_df["race17_2"] = data_df["race_code17_2"].apply(get_race_ta17)

# Per-row race indicators; is_race(name) builds the row predicate.
data_df["asian"] = data_df.apply(is_race("asian"), axis=1)
data_df["black"] = data_df.apply(is_race("black"), axis=1)
data_df["white"] = data_df.apply(is_race("white"), axis=1)

_is_hispanic_by_race = is_race("hispanic")


def _hispanic_flag(row):
    """Return 1 if the row is hispanic by race predicate or by cah hispanicity, else 0."""
    if _is_hispanic_by_race(row):
        return 1
    cah_hispanicity = row["hispanicity"]
    # NaN is truthy in Python, so a missing cah value must be excluded
    # explicitly; otherwise every respondent without a cah record would be
    # flagged hispanic.
    if _is_missing(cah_hispanicity):
        return 0
    return 1 if cah_hispanicity else 0


data_df["hispanic"] = data_df.apply(_hispanic_flag, axis=1)

# white_only: white and none of the other three indicators set.
data_df["white_only"] = (
    (data_df["white"] == 1)
    & (data_df["hispanic"] == 0)
    & (data_df["black"] == 0)
    & (data_df["asian"] == 0)
).astype(int)

data_df["live_w_both_parents"] = data_df.apply(live_w_both_parents, axis=1)
data_df["environment_type"] = data_df["rural_urban_code01"].apply(get_env_type)
data_df["region"] = data_df["geo_region_code01"].apply(get_region)

# Presumably shifts the age recorded in the 2001 wave to the 2017 wave
# (16 years later) -- TODO confirm against the codebook.
data_df["age"] = data_df["age_01"] + 16

# has_bachelors yields a boolean per row; store it as 1/0.
data_df["grad_bach"] = data_df.apply(has_bachelors, axis=1).map({True: 1, False: 0})

In [19]:
# Tame income and wealth outliers (provisional approach): values above
# mean + 2 standard deviations are clipped down to that ceiling. Only the
# upper tail is capped; the raw columns are left untouched.
raw_income = data_df["total_fam_income00"]
inc_cap = raw_income.mean() + 2 * raw_income.std()
data_df["total_fam_income00_cap"] = raw_income.clip(upper=inc_cap)

raw_wealth = data_df["wealth_w_equity01"]
wealth_cap = raw_wealth.mean() + 2 * raw_wealth.std()
data_df["wealth_w_equity01_cap"] = raw_wealth.clip(upper=wealth_cap)

# Turn the value 999 in the two score columns into NaN
# (presumably 999 is the "missing" code for these scores -- confirm in codebook).
data_df = data_df.replace(
    to_replace={"math_score02": 999, "reading_score02": 999},
    value=np.nan,
)

In [20]:
# Impute missing math/reading scores jointly with capped wealth and income,
# the primary continuous variables. All four columns go through the imputer,
# so missing wealth/income values are filled as well.
data_df = data_df.replace(
    to_replace={"math_score02": 999, "reading_score02": 999},
    value=np.nan,
)
temp_impute_df = data_df[
    ["math_score02", "reading_score02", "wealth_w_equity01_cap", "total_fam_income00_cap"]
]

imp = IterativeImputer(estimator=RandomForestRegressor(), random_state=0)
imputed_values = imp.fit_transform(temp_impute_df)

# Rebuild a labelled frame: fit_transform returns a bare array, so the
# original index and column names are re-attached here.
data_df_imputed = pd.DataFrame(
    imputed_values,
    index=temp_impute_df.index,
    columns=temp_impute_df.columns,
)

In [21]:
# Restrict the sample to rows that have an environment type and an age of
# at least 22.
has_environment_type = data_df["environment_type"].notnull()
meets_minimum_age = data_df["age"] >= 22
data_df = data_df[has_environment_type & meets_minimum_age]

In [22]:
# Overwrite the four imputation inputs with their imputed values.
# data_df_imputed was built before the sample filter and therefore has more
# rows; the assignment matches rows by index label.
imputed_columns = [
    "math_score02",
    "reading_score02",
    "wealth_w_equity01_cap",
    "total_fam_income00_cap",
]
data_df[imputed_columns] = data_df_imputed[imputed_columns]

In [23]:
# Give the modelling columns short, wave-agnostic names.
data_df = data_df.rename(
    columns={
        "math_score02": "math_score",
        "reading_score02": "reading_score",
        "wealth_w_equity01_cap": "wealth",
        "total_fam_income00_cap": "income",
    }
)

In [24]:
# Final column set (and column order) for the exported dataset.
export_columns = [
    'math_score',
    'reading_score',
    'wealth',
    'income',
    'grad_bach',
    'survey_weight',
    'environment_type',
    'age',
    'white_only',
    'black',
    'asian',
    'live_w_both_parents',
    'food_security',
    'hispanic',
]
data_df = data_df[export_columns]

In [25]:
# Write the final dataset next to the notebook; the row index is not exported.
output_csv_path = "data_2.csv"
data_df.to_csv(output_csv_path, index=False)