Import the pandas and NumPy libraries and load the data files
---

In [None]:
import pandas as pd
import numpy as np

# Read the two CSV inputs into DataFrames; the rest of the notebook works on
# these two frames under the names `demo` and `bmx`.
demo = pd.read_csv('Demographics.csv')
bmx = pd.read_csv('BodyMeasures.csv')

How much data is missing from each column?
---

In [None]:
# Missing entries per column = number of rows minus that column's non-null count.
total_rows = demo.shape[0]
valid_entries = demo.count(axis='index')
missing_data = total_rows - valid_entries
missing_data.head()

As a percentage
---

In [None]:
# Express the per-column missing counts as a share of all rows, in percent.
missing_percentage = missing_data.div(total_rows).mul(100)
missing_percentage.head()

How much data is missing from each row?
---

In [None]:
# Count the missing entries in each row (axis=1 sums across the columns).
# The row-level results get their own names so they do not overwrite the
# column-level `missing_data` / `missing_percentage` computed earlier; with
# shared names, re-running the column percentage cell would silently use
# row-level counts.
row_missing_data = demo.isnull().sum(axis=1)
num_cols = len(demo.columns)
row_missing_percentage = row_missing_data / num_cols * 100

row_missing_percentage.head()

What about entries equivalent to unknown?
---

DMDSCHOL - Now attending school? ([codebook entry](https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/DEMO.htm#DMDSCHOL))

|Code|Meaning|
|:--- |:--- |
|1| In school|
|2|	On vacation from school (between grades)|
|3|	Neither in school or on vacation from school (between grades)|
|7|	Refused|
|9|	Don't know|

In [None]:
# Share of rows with no DMDSCHOL value, in percent (mean of the boolean
# "is missing" mask is the missing fraction).
perc = demo['DMDSCHOL'].isna().mean() * 100
# '%d' would truncate the fraction (e.g. 12.9 -> 12), so print two decimals.
print('Percent missing: %.2f' % perc)

# Codebook: 7 = "Refused", 9 = "Don't know". Series.sum() counts the True
# entries of each comparison without iterating the Series in Python.
num_refused = (demo['DMDSCHOL'] == 7).sum()
num_dontknow = (demo['DMDSCHOL'] == 9).sum()
print('Number refused: %d' % num_refused)
print('Number unknown: %d' % num_dontknow)

In [None]:
# Treat every DMDSCHOL code above 3 as missing; in the codebook table those
# are 7 ("Refused") and 9 ("Don't know").
unknown_ind = demo['DMDSCHOL'].gt(3)
demo.loc[unknown_ind, 'DMDSCHOL'] = np.nan
print(demo['DMDSCHOL'].unique())

In [None]:
# Same summary as before the recoding, repeated so the effect of replacing
# codes above 3 with NaN is visible: the missing share can only grow and both
# code counts should now be zero.
perc = demo['DMDSCHOL'].isna().mean() * 100
# '%d' would truncate the fraction (e.g. 12.9 -> 12), so print two decimals.
print('Percent missing: %.2f' % perc)

# Series.sum() counts the True entries of each comparison without iterating
# the Series in Python.
num_refused = (demo['DMDSCHOL'] == 7).sum()
num_dontknow = (demo['DMDSCHOL'] == 9).sum()
print('Number refused: %d' % num_refused)
print('Number unknown: %d' % num_dontknow)

Wrong Information
---

In [None]:
# Show the dtypes of the first few columns of each frame, one labelled
# section per frame.
for label, frame in (('Demographics:', demo), ('\nBody Measures:', bmx)):
    print(label)
    print(frame.dtypes.head())

In [None]:
# A strict conversion raises if SEQN holds any value that cannot be parsed as
# a number. Catch and display the error so the notebook still runs top to
# bottom; the following cell retries with errors='coerce'.
try:
    # Plain column assignment replaces the column, so it takes the converted
    # dtype (`.loc[:, 'SEQN'] = ...` may write into the existing column and
    # keep its old dtype).
    bmx['SEQN'] = pd.to_numeric(bmx['SEQN'], downcast='integer')
except (ValueError, TypeError) as err:
    print('Could not convert SEQN: %s' % err)

In [None]:
# errors='coerce' turns unparseable SEQN values into NaN instead of raising.
# Assign with bmx['SEQN'] = ... so the column is replaced and takes the new
# dtype; in pandas >= 2.0 `bmx.loc[:, 'SEQN'] = ...` first tries to write into
# the existing column, which would leave the old dtype in the output below.
bmx['SEQN'] = pd.to_numeric(bmx['SEQN'], errors='coerce', downcast='integer')
bmx.dtypes.head()

In [None]:
# Flag the rows whose SEQN is NaN. Series.isna() also works when the column
# is object dtype, where np.isnan would raise a TypeError.
ind = bmx['SEQN'].isna()
# Keep the remaining rows; .copy() makes the result an independent frame so
# the column assignment below does not act on a slice of the old one.
bmx = bmx.loc[~ind, :].copy()
# With the NaN rows gone, whole-number values can be downcast to an integer
# dtype. Column assignment (not `.loc[:, 'SEQN'] = ...`) lets the column take
# that new dtype.
bmx['SEQN'] = pd.to_numeric(bmx['SEQN'], errors='coerce', downcast='integer')
bmx.dtypes.head()

Sensitive Data: Minor Marital Status
---

In [None]:
# Flag the rows with RIDAGEYR below 18, then count how many of them carry a
# non-null DMDMARTL value.
minor_ind = demo['RIDAGEYR'].lt(18)
print(demo.loc[minor_ind, 'DMDMARTL'].notna().sum())

In [None]:
# Blank out DMDMARTL for every row flagged by minor_ind, then recount the
# non-null entries that remain for those rows.
demo.loc[minor_ind, 'DMDMARTL'] = np.nan
print(demo.loc[minor_ind, 'DMDMARTL'].notna().sum())

Individual Practice
---

1. Find the column with the highest percentage of missing information in demographics
2. Remove confidential pregnancy status for minors (columns 'RIDEXPRG' and 'RIDPREG')


In [None]:
# Put your code here

Save Data Files
---

In [None]:
# Saving is left disabled. NOTE(review): the target paths are the same
# 'Demographics.csv' and 'BodyMeasures.csv' files read at the top of the
# notebook, so uncommenting these lines overwrites the original inputs with
# the cleaned frames.
# demo.to_csv('Demographics.csv', index=False)
# bmx.to_csv('BodyMeasures.csv', index=False)