Import the pandas and NumPy libraries and load the data files
---

In [33]:
import pandas as pd
import numpy as np

# Read the three CSV inputs (demographics, body measures, occupation) in one
# pass; the files are read in the same order as they are listed.
demo, bmx, ocq = (
    pd.read_csv(csv_name)
    for csv_name in ('Demographics.csv', 'BodyMeasures.csv', 'Occupation.csv')
)

Impossible Data - Continuous Values
---

BMXWT - weight in kg

In [2]:
# Summary statistics for BMXWT (weight in kg); inspect min and max for impossible values.
bmx['BMXWT'].describe()

count     9185.000000
mean        67.506522
std        282.389202
min       -149.000000
25%         39.100000
50%         63.000000
75%         79.700000
max      12870.000000
Name: BMXWT, dtype: float64

In [3]:
# A negative weight is impossible, so mark those entries as missing (NaN)
# and re-check the summary statistics.
negative_weight = bmx['BMXWT'] < 0
bmx.loc[negative_weight, 'BMXWT'] = np.nan
bmx['BMXWT'].describe()

count     9179.000000
mean        67.606840
std        282.452412
min          3.100000
25%         39.200000
50%         63.000000
75%         79.750000
max      12870.000000
Name: BMXWT, dtype: float64

Impossible Data - Categorical Values
---
BMIWT - Weight Comment ([codebook entry](https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/BMX.htm#BMIWT))


|Code|Meaning|
|:--- |:--- |
|1|	Could not obtain|	
|2|	Exceeds capacity|	
|3|	Clothing|
|4|	Medical appliance|

In [4]:
# List the distinct weight-comment codes present, to compare against the codebook values 1-4.
bmx['BMIWT'].unique()

array([ 3., nan,  4.,  1., 11.,  7.])

In [5]:
# BMIWT is a categorical code and the codebook defines only the values 1-4.
# Whitelisting the valid codes (rather than testing `> 4`) also rejects any
# code below the valid range or between two valid codes.
valid_bmiwt_codes = [1, 2, 3, 4]
invalid_code = ~bmx['BMIWT'].isin(valid_bmiwt_codes)
# Rows that are already NaN are also flagged here; re-assigning NaN to them is a no-op.
bmx.loc[invalid_code, 'BMIWT'] = np.nan
bmx['BMIWT'].unique()

array([ 3., nan,  4.,  1.])

Extreme Data
---

Heaviest human weight on record: 635 kg

In [6]:
# Re-check the weight distribution; compare the maximum against the heaviest recorded human weight.
bmx['BMXWT'].describe()

count     9179.000000
mean        67.606840
std        282.452412
min          3.100000
25%         39.200000
50%         63.000000
75%         79.750000
max      12870.000000
Name: BMXWT, dtype: float64

In [7]:
# The heaviest human weight on record is 635 kg, so anything above that is
# treated as bad data and set to missing.
max_plausible_kg = 635
too_heavy = bmx['BMXWT'] > max_plausible_kg
bmx.loc[too_heavy, 'BMXWT'] = np.nan
bmx['BMXWT'].describe()

count    9170.000000
mean       59.957381
std        29.842889
min         3.100000
25%        39.200000
50%        62.970000
75%        79.600000
max       193.300000
Name: BMXWT, dtype: float64

In [8]:
# Z-scores of the extreme weights: how many standard deviations the smallest
# and largest remaining values lie from the mean. The np.nan* functions skip
# NaN entries; np.nanstd is called with its default ddof (0, the population
# standard deviation), whereas pandas' describe() reports the ddof=1 value.
weights = bmx['BMXWT']
mean_wt, std_wt = np.nanmean(weights), np.nanstd(weights)
min_wt, max_wt = np.nanmin(weights), np.nanmax(weights)

low_wt_zscore, high_wt_zscore = (
    (extreme - mean_wt) / std_wt for extreme in (min_wt, max_wt)
)

print('Max weight z-score: ' + str(high_wt_zscore))
print('Min weight z-score: ' + str(low_wt_zscore))

Max weight z-score: 4.468397485658741
Min weight z-score: -1.9053276258294776


Saturated Data
---

Survey instructions: list ages 85 and above as 85

In [9]:
# Largest recorded age, ignoring NaN; the survey lists ages 85 and above as 85, so a larger value is out of range.
np.nanmax(demo['RIDAGEYR'])

109.0

In [10]:
# The survey records ages 85 and above as 85, so pull any larger value
# down to that cap.
age_cap = 85
over_cap = demo['RIDAGEYR'] > age_cap
demo.loc[over_cap, 'RIDAGEYR'] = age_cap

Individual Practice
---

1. Find and remove any nonsensical categorical values in the occupation file
    * Check OCQ130, OCQ140, OCQ150, OCQ160
2. Find the z-score of the maximum and minimum values in the occupation column OCQ180 - Hours worked last week at all jobs ([see codebook for more information](https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/OCQ.htm#OCQ180))
    * Exclude the special (non-hour) codes 77777 and 99999


In [31]:
# Inspect the distinct OCQ130 codes present before cleaning.
# (A preceding `len(ocq.columns)` expression was dropped: it was not the last
# line of the cell, so its value was computed and silently discarded.)
ocq['OCQ130'].unique()

array([nan,  7.,  3.,  2.,  1.,  4.,  6.,  5., 38., 99., 21., 54., 68.,
       29., 17., 58., 69., 48., 14., 34., 42.])

In [None]:
# Keep only the OCQ130 codes on the whitelist; every other entry becomes NaN.
allowed_ocq130 = [1, 2, 3, 4, 5, 6, 7, 77, 99]
is_allowed = ocq['OCQ130'].isin(allowed_ocq130)
ocq.loc[~is_allowed, 'OCQ130'] = np.nan

In [29]:
# Confirm that only whitelisted OCQ130 codes (and NaN) remain.
ocq['OCQ130'].unique()


array([nan,  7.,  3.,  2.,  1.,  4.,  6.,  5., 99.])

In [36]:
# Inspect the distinct OCQ150 codes present before cleaning.
ocq['OCQ150'].unique()

array([ 2.,  1.,  4., nan,  3., 66., 99., 44.,  7.,  9., 11.,  8., 33.,
       77.])

In [37]:
# Flag rows whose OCQ150 code is one of the retained values; `ind` is read by the following cell.
ind = ocq['OCQ150'].isin([1, 2, 3, 4, 7, 9])

In [39]:
# Blank out every OCQ150 entry that is not flagged in `ind` (computed in the preceding cell).
ocq.loc[~ind, 'OCQ150'] = np.nan

In [40]:
# Confirm that only the retained OCQ150 codes (and NaN) remain.
ocq['OCQ150'].unique()

array([ 2.,  1.,  4., nan,  3.,  7.,  9.])

In [41]:
# Summary statistics for OCQ180 (hours worked last week at all jobs); check the maximum for special codes.
ocq['OCQ180'].describe()

count     3193.000000
mean        70.381459
std       1769.058543
min          1.000000
25%         30.000000
50%         40.000000
75%         46.000000
max      99999.000000
Name: OCQ180, dtype: float64

In [53]:
# Drop the out-of-range OCQ180 entries with a cutoff of 1000. Rows where OCQ180
# is NaN are not flagged by the comparison, so they stay in the selection.
# `ocq180` is a one-column DataFrame because the column is selected with a list.
above_cutoff = ocq['OCQ180'] > 1000
ocq180 = ocq.loc[~above_cutoff, ['OCQ180']]

In [51]:
# NaN-aware summary statistics of the filtered OCQ180 values, used for z-scores.
omax = np.nanmax(ocq180)
# Use nanmin here: computing this with nanmean would make `omin` a copy of
# `omean`, so a z-score for the minimum would always come out as 0.
omin = np.nanmin(ocq180)
omean = np.nanmean(ocq180)
# np.nanstd is called with its default ddof (0, population standard deviation).
ostd = np.nanstd(ocq180)

In [54]:
# z-score of the largest retained OCQ180 value
(omax - omean) / ostd

4.3764058992483355

Save The Data
---

1316

In [12]:
# Saving is left disabled. Uncommenting these lines writes the cleaned frames
# to the same file names that were read at the top of the notebook, which
# overwrites the original CSV files.
# demo.to_csv('Demographics.csv', index=False)
# bmx.to_csv('BodyMeasures.csv', index=False)
# ocq.to_csv('Occupation.csv', index=False)