In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

## Data Cleaning part 2 and EDA

In [15]:
# Read the parquet file produced by the earlier cleaning step into `df`.
df = pd.read_parquet(path="../data/usa_00001_clean.parquet")

# Show the column labels so the available variables are visible at a glance.
df.columns

Index(['YEAR', 'SAMPLE', 'SERIAL', 'CBSERIAL', 'HHWT', 'CLUSTER', 'STATEFIP',
       'STRATA', 'GQ', 'PERNUM', 'PERWT', 'SEX', 'AGE', 'MARST', 'RACE',
       'RACED', 'BPL', 'BPLD', 'CITIZEN', 'YRIMMIG', 'LANGUAGE', 'LANGUAGED',
       'SPEAKENG', 'EDUC', 'EDUCD', 'EMPSTAT', 'EMPSTATD', 'LABFORCE',
       'CLASSWKR', 'CLASSWKRD', 'OCC', 'IND', 'WKSWORK2', 'UHRSWORK',
       'FTOTINC', 'INCWAGE'],
      dtype='object')

In [16]:
# Build the analysis sample from `df`:
#   1. keep rows with positive wage income whose EMPSTAT is 1 or 2
#      (employed or unemployed),
#   2. keep only the columns used downstream,
#   3. keep only CITIZEN codes 2 and 3 and recode them to a 0/1 indicator.
# Row and column selection happen in a single `.loc` call, and the result is
# explicitly copied, so the column assignments below operate on an independent
# frame instead of a slice of `df` (avoids SettingWithCopyWarning).
df_cleaned = df.loc[
    (df["INCWAGE"] > 0) & df["EMPSTAT"].isin([1, 2]),
    ["AGE", "SEX", "RACE", "MARST", "STATEFIP", # Demographics
     "YRIMMIG", "EDUC", "OCC", "IND", # Personal info
     "PERWT", # Personal Weight, may be helpful
     "INCWAGE", # Treatment
     "CITIZEN", # Outcome
     ],
]
df_cleaned = df_cleaned[df_cleaned["CITIZEN"].isin([2, 3])].copy()
df_cleaned["CITIZEN"] = df_cleaned["CITIZEN"].map({2: 1, 3: 0}) # 1 if naturalized (code 2), 0 for code 3
df_cleaned.head()

Unnamed: 0,AGE,SEX,RACE,MARST,STATEFIP,YRIMMIG,EDUC,OCC,IND,PERWT,INCWAGE,CITIZEN
842,49,2,6,6,1,1976,6,8800,3470,5.0,7000.0,1
1125,49,2,6,6,1,1976,6,8800,3470,36.0,7000.0,1
1875,49,2,6,6,1,1976,6,8800,3470,38.0,7000.0,1
2945,53,2,7,1,1,1980,3,9600,6390,108.0,37400.0,1
2946,46,1,7,1,1,2000,0,4230,7690,115.0,360.0,0


In [17]:
# Report (rows, columns) of the filtered sample.
print((len(df_cleaned.index), len(df_cleaned.columns)))

(223823, 12)


In [18]:
# Count the distinct codes in OCC and IND (one number per line, OCC first).
for code_col in ("OCC", "IND"):
    print(len(df_cleaned[code_col].unique()))

529
264


In [19]:
# Add the log of wage income and mark the coded variables as categoricals.
#
# The log is taken from `df_cleaned["INCWAGE"]`, not from the unfiltered `df`:
# `df_cleaned` was restricted to INCWAGE > 0, whereas `df` may still contain
# non-positive wages, for which np.log emits RuntimeWarnings and produces
# -inf/NaN values that are then silently discarded by index alignment.
categorical_vars = ['SEX', 'RACE', 'MARST', 'EDUC', 'STATEFIP', 'OCC', 'IND']
# `assign` and `astype` each return a new frame, so re-running this cell
# yields the same result and never writes into a slice of another frame.
df_cleaned = (
    df_cleaned
    .assign(INCWAGE_LOG=np.log(df_cleaned["INCWAGE"]))
    .astype({col: 'category' for col in categorical_vars})
)
df_cleaned.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,AGE,SEX,RACE,MARST,STATEFIP,YRIMMIG,EDUC,OCC,IND,PERWT,INCWAGE,CITIZEN,INCWAGE_LOG
842,49,2,6,6,1,1976,6,8800,3470,5.0,7000.0,1,8.853665
1125,49,2,6,6,1,1976,6,8800,3470,36.0,7000.0,1,8.853665
1875,49,2,6,6,1,1976,6,8800,3470,38.0,7000.0,1,8.853665
2945,53,2,7,1,1,1980,3,9600,6390,108.0,37400.0,1,10.529426
2946,46,1,7,1,1,2000,0,4230,7690,115.0,360.0,0,5.886104


In [20]:
# Persist the cleaned sample as CSV; index=False leaves the row index out of the file.
df_cleaned.to_csv(path_or_buf="../data/cleaned.csv", index=False)