### Data Cleaner

In [38]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

* Import csv

In [39]:
csvpath = os.path.join("data", "ny_chs_2013.csv")
ny_chs_df = pd.read_csv(csvpath)
#ny_chs_df.head()

* Make new data frame with selected variables

In [40]:
truncated_df = ny_chs_df.loc[:, ["generalhealth", "insured", "neighpovgroup4_0812",
                                    "exercise13", "fruitveg", "agegroup5", "sex", "newrace","education", "smoker", "drinker"]]
#truncated_df.head()

In [41]:
truncated_df["generalhealth"].value_counts()

good         2838
very good    2151
excellent    1551
fair         1459
poor          644
Name: generalhealth, dtype: int64

* Create numerical values for DV

In [42]:
cleanup_health = {"generalhealth": {"excellent": 5, "very good": 4, "good": 3, "fair": 2, "poor": 1}
                 }

In [43]:
truncated_df.replace(cleanup_health, inplace=True)
#truncated_df.head()

* Drop any sample with a missing value

In [45]:
truncated_df = truncated_df[~truncated_df.isin(['NaN']).any(axis=1)]
truncated_df.count()

generalhealth          7913
insured                7913
neighpovgroup4_0812    7913
exercise13             7913
fruitveg               7913
agegroup5              7913
sex                    7913
newrace                7913
education              7913
smoker                 7913
drinker                7913
dtype: int64

* Create dummy variables

In [46]:
insured_dummy = truncated_df['insured'].replace(2, 0)
truncated_df['Insured'] = insured_dummy
del truncated_df['insured']
exercise_dummy = truncated_df['exercise13'].replace(2, 0)
truncated_df['Exercise'] = exercise_dummy
del truncated_df['exercise13']
sex_dummy = truncated_df['sex'].replace(2, 0)
truncated_df['Sex'] = sex_dummy
del truncated_df['sex']
truncated_df.head()

# New variables for smoking and drinking
drinker_dummy = truncated_df['drinker'].replace(2, 0)
truncated_df['Drinker'] = drinker_dummy
del truncated_df['drinker']
smoker_dummy = truncated_df['smoker'].replace(2, 0)
truncated_df['Smoker'] = drinker_dummy
del truncated_df['smoker']

* Rename columns

In [47]:
truncated_df = truncated_df.rename(index=str, columns={"generalhealth": "General Health",
                                                      "neighpovgroup4_0812": "(%) of Population Under FPL",
                                                      "fruitveg": "Eaten Fruits or Veggies Yesterday",
                                                      "agegroup5": "Age Group",
                                                      "newrace": "Race",
                                                      "education": "Education"                                                  
                                                     })

In [48]:
truncated_df.head()

Unnamed: 0,General Health,(%) of Population Under FPL,Eaten Fruits or Veggies Yesterday,Age Group,Race,Education,Insured,Exercise,Sex,Drinker,Smoker
0,4.0,2.0,2.0,4.0,2,3.0,1.0,1.0,0,0.0,0.0
1,4.0,3.0,2.0,2.0,4,2.0,1.0,1.0,0,0.0,0.0
2,4.0,2.0,1.0,5.0,1,4.0,1.0,1.0,1,1.0,1.0
4,2.0,1.0,2.0,5.0,1,2.0,1.0,1.0,0,0.0,0.0
5,3.0,1.0,2.0,4.0,1,4.0,1.0,1.0,0,1.0,1.0


* Write new csv of clean data

In [49]:
clean_csv = os.path.join("data", "clean_data.csv")
truncated_df.to_csv(clean_csv)