# Data Cleaning

## Team : 14 (Vidisha, Yijin, Yvette)


In [1]:
import pandas as pd
# Location of the 2020 adult data file, relative to the notebook.
adult_2020_csv = 'adult20csv/adult20.csv'

# Read the 2020 data into a DataFrame and show it as the cell's output.
data_20 = pd.read_csv(adult_2020_csv)
data_20

Unnamed: 0,URBRRL,RATCAT_A,INCGRP_A,INCTCFLG_A,FAMINCTC_A,IMPINCFLG_A,RJWKCLSOFT_A,RJWCLSNOSD_A,RJWRKCLSSD_A,RECJOBSD_A,...,PHSTAT_A,PROXYREL_A,PROXY_A,AVAIL_A,HHSTAT_A,INTV_MON,RECTYPE,WTFA_A,HHX,POVRATTC_A
0,3,14,5,0,100000,0,,,,,...,2,,,1,1,11,10,4526.109,H066706,6.47
1,3,11,4,0,75000,0,,,,,...,2,,,1,1,8,10,12809.039,H034928,3.64
2,3,14,4,0,90000,0,,,,,...,3,,,1,1,8,10,10322.534,H018289,6.76
3,3,11,3,0,65000,0,,,,,...,1,,,1,1,3,10,7743.375,H006876,3.79
4,3,8,1,0,25762,2,,,,,...,3,,,1,1,6,10,4144.724,H028842,2.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31563,4,13,4,0,79000,0,,,,,...,3,,,1,1,2,10,2857.585,H065697,4.61
31564,4,11,3,0,60000,0,,,,,...,3,,,1,1,2,10,2994.763,H061937,3.50
31565,4,8,1,0,27500,0,,,,,...,2,,,1,1,2,10,1328.907,H005331,2.24
31566,4,8,3,0,61880,0,,,,,...,2,,,1,1,2,10,3481.003,H047025,2.38


### Step 1: Dealing with missing values

Since we are using survey data, there are bound to be several missing values in various attributes due to non-response. In order to deal with such missing observations, we consider only those columns in the data set with number of missing values less than 10% of the total observations. 

We consider 10% a reasonable threshold for a data set with 31,586 observations. For the columns that remain (those with fewer than 10% missing values), we drop the rows that contain missing values, as these are a small proportion of the total observations.

In [2]:
# Share of missing entries in each column (missing count / number of rows).
prop_missing = data_20.isna().sum() / len(data_20)

# Keep only the columns whose missing share is strictly below 10%.
below_threshold = prop_missing < 0.1
keep = prop_missing.loc[below_threshold].index.tolist()
data_20 = data_20[keep]

### Remaining Missing Values


In [3]:
# Percentage of missing entries per remaining column.
n_rows = data_20.shape[0]
percent_missing = data_20.isna().sum() * 100 / n_rows

# Two-column summary: variable name next to its missing percentage.
missing_values_df = pd.DataFrame(
    {
        'column_name': data_20.columns,
        'percent_missing': percent_missing,
    }
)
print(missing_values_df)

# Remove every row that has a missing value in any remaining column.
# NOTE(review): row-wise deletion across many columns can remove far more rows
# than any single column's missing share suggests -- compare the row count
# before and after this step.
data_20 = data_20.dropna(axis=0, how='any')

print(data_20)

           column_name  percent_missing
URBRRL          URBRRL              0.0
RATCAT_A      RATCAT_A              0.0
INCGRP_A      INCGRP_A              0.0
INCTCFLG_A  INCTCFLG_A              0.0
FAMINCTC_A  FAMINCTC_A              0.0
...                ...              ...
INTV_MON      INTV_MON              0.0
RECTYPE        RECTYPE              0.0
WTFA_A          WTFA_A              0.0
HHX                HHX              0.0
POVRATTC_A  POVRATTC_A              0.0

[212 rows x 2 columns]
       URBRRL  RATCAT_A  INCGRP_A  INCTCFLG_A  FAMINCTC_A  IMPINCFLG_A  PPSU  \
0           3        14         5           0      100000            0     2   
3           3        11         3           0       65000            0     2   
5           3         9         2           0       36000            0     2   
6           3         3         1           0       30105            2     2   
7           2         2         1           0        9000            1     2   
...       ...   

### Step 2: Removing irrelevant columns

Our original dataset included over 600 variables. In this step, we manually selected variables using the dataset codebook (https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Dataset_Documentation/NHIS/2020/adult-codebook.pdf) and created an Excel sheet (relevant_variables.xlsx) listing all the variables relevant to our study topic. We decided to take this approach given the large number of variables that were similar. For example, there were several variables related to food insecurity (e.g., couldn't afford to eat balanced meals; food didn't last; worry food would run out; receive food stamps, past 12m). For each group of similar variables, we chose the one that captured the most information.

We also chose to manually select our remaining variables since there were many features irrelevant to our research questions. We chose to manually eliminate these variables as well.

In [4]:
# Sheet of manually chosen variables; its "Variable Names" column lists the
# columns to keep.
relevant_variables = pd.read_excel('relevant_variables.xlsx')
selected_columns = relevant_variables["Variable Names"].tolist()

# Restrict the data to the selected columns, in the order given by the sheet.
data_20 = data_20[selected_columns]

data_20

Unnamed: 0,AGEP_A,ANXEV_A,ANXFREQ_A,ANXMED_A,ARTHEV_A,BMICAT_A,CHDEV_A,CHLEV_A,COGMEMDFF_A,COPDEV_A,...,SEX_A,SLPFLL_A,SLPHOURS_A,SLPREST_A,SLPSTY_A,SMKCIGST_A,SOCSCLPAR_A,STRFREQW_A,URBRRL,VIGFREQW_A
0,85,2,5,2,2,2,2,2,2,2,...,1,2,9,4,1,4,1,94,3,94
3,32,2,3,2,2,2,2,2,1,2,...,1,1,8,3,2,4,1,3,3,5
5,70,2,4,2,1,3,2,1,1,2,...,2,2,7,2,2,4,1,2,3,94
6,32,2,1,2,2,4,2,2,1,2,...,2,1,8,1,1,4,1,94,3,94
7,77,2,5,2,2,9,2,2,1,2,...,2,2,8,2,2,4,1,94,2,94
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31558,56,2,4,2,1,4,2,2,1,2,...,1,2,8,3,1,3,1,94,4,0
31560,60,2,4,2,2,2,2,1,1,2,...,2,1,7,3,2,3,1,2,4,2
31564,55,2,4,2,2,4,2,1,1,2,...,2,2,8,3,2,4,1,2,4,94
31565,66,2,4,2,1,4,2,2,2,2,...,2,1,6,3,2,3,1,94,4,94


### 

### Step 3: Assessing Categorical Variables

Now that we have our final features, we assess how many categories each categorical variable has.

In [5]:
# For each column, print its name followed by the frequency of each distinct value.
for col in data_20.columns:
    print('variable:', col)
    category_counts = data_20[col].value_counts()
    print(category_counts)

variable: AGEP_A
85    516
69    484
64    483
67    476
70    475
     ... 
19    137
84    117
18    114
97     41
99      2
Name: AGEP_A, Length: 70, dtype: int64
variable: ANXEV_A
2    18640
1     3224
7       17
9        9
Name: ANXEV_A, dtype: int64
variable: ANXFREQ_A
4    7337
5    6736
2    2966
1    2395
3    2374
9      58
7      24
Name: ANXFREQ_A, dtype: int64
variable: ANXMED_A
2    19068
1     2795
7       16
9       11
Name: ANXMED_A, dtype: int64
variable: ARTHEV_A
2    16360
1     5505
9       14
7       11
Name: ARTHEV_A, dtype: int64
variable: BMICAT_A
3    7707
2    6871
4    6593
9     434
1     285
Name: BMICAT_A, dtype: int64
variable: CHDEV_A
2    20693
1     1158
9       29
7       10
Name: CHDEV_A, dtype: int64
variable: CHLEV_A
2    14754
1     7087
9       39
7       10
Name: CHLEV_A, dtype: int64
variable: COGMEMDFF_A
1    18312
2     3237
3      329
9        5
7        4
4        3
Name: COGMEMDFF_A, dtype: int64
variable: COPDEV_A
2    20808
1     1059
9

### Step 4: Transforming Categorical Variables

In our dataset, categorical variables are encoded as integers. Below, we convert the categorical features that are int64 to the categorical datatype. This allows us to transform the categorical variables into dummy variables.

In [6]:
print(data_20.dtypes) #check which variables are categorical

# Names of the features marked "Categorical" in the relevant_variables sheet.
# A single .loc[row_mask, column] lookup replaces the chained
# .loc[mask]["Variable Names"] indexing.
categorical_cols = relevant_variables.loc[
    relevant_variables["Type"] == "Categorical", "Variable Names"
].tolist()

# Cast all categorical features in one astype call. astype returns a new frame,
# so nothing is assigned column-by-column into data_20 (which was produced by
# column selection and can raise SettingWithCopyWarning on in-place assignment).
data_20 = data_20.astype({col: "category" for col in categorical_cols})

# TODO: dropping null/other responses is not implemented; those response codes
# are currently kept and each becomes its own dummy column below.

# Expand every categorical column into indicator columns; drop_first=True omits
# the first level of each variable so the dummies are not perfectly collinear.
data_20 = pd.get_dummies(data_20, drop_first = True)

print(data_20)

# Persist the cleaned, dummy-encoded data (the row index is written as well).
data_20.to_csv("data_2020_cleaned.csv")

AGEP_A            int64
ANXEV_A           int64
ANXFREQ_A         int64
ANXMED_A          int64
ARTHEV_A          int64
BMICAT_A          int64
CHDEV_A           int64
CHLEV_A           int64
COGMEMDFF_A       int64
COPDEV_A          int64
DEMENEV_A         int64
DEPEV_A           int64
DEPFREQ_A         int64
DEPMED_A          int64
DIBEV_A           int64
DRKSTAT_A         int64
EDUC_A            int64
EMERG12MTC_A      int64
FAMINCTC_A        int64
FGEFRQTRD_A       int64
FUNWLK_A          int64
HYPEV_A           int64
INCGRP_A          int64
INCWELF_A       float64
MARSTAT_A         int64
MHTHRPY_A         int64
MODFREQW_A        int64
NOTCOV_A          int64
ORIENT_A          int64
PEOPLEWLK_A       int64
RACEALLP_A        int64
REGION            int64
SCHCURENR_A       int64
SEX_A             int64
SLPFLL_A          int64
SLPHOURS_A        int64
SLPREST_A         int64
SLPSTY_A          int64
SMKCIGST_A        int64
SOCSCLPAR_A       int64
STRFREQW_A        int64
URBRRL          

### Step 5: Visualization

In [7]:
# Plain-text preview of the current data_20 frame (pandas truncates long/wide output).
print(data_20)

       AGEP_A  EMERG12MTC_A  FAMINCTC_A  MODFREQW_A  STRFREQW_A  VIGFREQW_A  \
0          85             2      100000          94          94          94   
3          32             0       65000           5           3           5   
5          70             0       36000           7           2          94   
6          32             0       30105           1          94          94   
7          77             2        9000          94          94          94   
...       ...           ...         ...         ...         ...         ...   
31558      56             0       85000           1          94           0   
31560      60             1       34000           5           2           2   
31564      55             0       60000           3           2          94   
31565      66             0       27500          94          94          94   
31566      37             0       61880          94          94          94   

       ANXEV_A_2  ANXEV_A_7  ANXEV_A_9  ANXFREQ_A_2

In [8]:
# List the dtype of every column in the current data_20 frame.
print(data_20.dtypes)

AGEP_A           int64
EMERG12MTC_A     int64
FAMINCTC_A       int64
MODFREQW_A       int64
STRFREQW_A       int64
                 ...  
SOCSCLPAR_A_7    uint8
SOCSCLPAR_A_9    uint8
URBRRL_2         uint8
URBRRL_3         uint8
URBRRL_4         uint8
Length: 193, dtype: object


### Step 6: Preliminary Test