# Data Imputation
- Temperature has the highest % of missing values at 47%
- Removed any counties that were missing values over all of the recorded dates
    - 744 counties from 1877 were removed (40%)
- Used SimpleImputer for categorical features
- Used KNN imputation for numeric features
- Imputed using non-missing values from columns

In [161]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

In [102]:
data = pd.read_csv("/Users/Huey.ts/Desktop/STAT_390/data/data_final/us-covid-raw-data.csv")

In [10]:
# NULL Values
null = data.isnull().sum()
null[null != 0]

social_distancing_total_grade              122020
social_distancing_visitation_grade         324202
social_distancing_encounters_grade         122020
social_distancing_travel_distance_grade    122020
daily_state_test                           132248
precipitation                              256910
temperature                                352950
ventilator_capacity_ratio                   14340
icu_beds_ratio                              14340
Religious_congregation_ratio                  478
percent_insured                               239
deaths_per_100000                           20076
dtype: int64

In [11]:
# Percent % of NULL Values
null_per = data.isnull().mean()*100
null_per[null_per != 0]

social_distancing_total_grade              16.249011
social_distancing_visitation_grade         43.172938
social_distancing_encounters_grade         16.249011
social_distancing_travel_distance_grade    16.249011
daily_state_test                           17.611041
precipitation                              34.211879
temperature                                47.001217
ventilator_capacity_ratio                   1.909612
icu_beds_ratio                              1.909612
Religious_congregation_ratio                0.063654
percent_insured                             0.031827
deaths_per_100000                           2.673456
dtype: float64

In [114]:
null_features = [null_per[null_per != 0].index]

## Found and removed counties that were missing values over all of the recorded dates

In [115]:
null_dat = data.groupby("county_name", dropna=False).apply(lambda x: x.isnull().mean()*100)
null_dat

Unnamed: 0_level_0,date,county_fips,county_name,state_fips,state_name,covid_19_confirmed_cases,covid_19_deaths,social_distancing_total_grade,social_distancing_visitation_grade,social_distancing_encounters_grade,...,age_45_49,age_50_54,age_55_59,age_60_64,age_65_69,age_70_74,age_75_79,age_80_84,age_85_or_higher,immigrant_student_ratio
county_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abbeville County,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.807531,100.000000,13.807531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Acadia Parish,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.807531,13.807531,13.807531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Accomack County,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.807531,13.807531,13.807531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ada County,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.807531,13.807531,13.807531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adair County,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.807531,56.903766,13.807531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yukon-Koyukuk Census Area,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.000000,100.000000,100.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Yuma County,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.807531,56.903766,13.807531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zapata County,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.807531,13.807531,13.807531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zavala County,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.807531,100.000000,13.807531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [116]:
counties = []
for i in null_features:
    counties.append(null_dat.iloc[np.where(null_dat[i] == 100)[0]].index)

In [117]:
counties = np.concatenate(counties)
counties = np.unique(counties)

In [118]:
# Number of counties to remove
len(counties)

744

In [119]:
new_data = data.loc[~data["county_name"].isin(counties)].reset_index()
new_data

Unnamed: 0,index,date,county_fips,county_name,state_fips,state_name,covid_19_confirmed_cases,covid_19_deaths,social_distancing_total_grade,social_distancing_visitation_grade,...,age_45_49,age_50_54,age_55_59,age_60_64,age_65_69,age_70_74,age_75_79,age_80_84,age_85_or_higher,immigrant_student_ratio
0,239,01/22/20,1003,Baldwin County,1,Alabama,0.0,0.0,,,...,7,7,7,7,6,4,3,2,2,0.021048
1,240,01/23/20,1003,Baldwin County,1,Alabama,0.0,0.0,,,...,7,7,7,7,6,4,3,2,2,0.021048
2,241,01/24/20,1003,Baldwin County,1,Alabama,0.0,0.0,,,...,7,7,7,7,6,4,3,2,2,0.021048
3,242,01/25/20,1003,Baldwin County,1,Alabama,0.0,0.0,,,...,7,7,7,7,6,4,3,2,2,0.021048
4,243,01/26/20,1003,Baldwin County,1,Alabama,0.0,0.0,,,...,7,7,7,7,6,4,3,2,2,0.021048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
557821,749977,09/12/20,56037,Sweetwater County,56,Wyoming,0.0,0.0,D+,F,...,7,8,7,5,3,2,1,1,1,0.025714
557822,749978,09/13/20,56037,Sweetwater County,56,Wyoming,0.0,0.0,D+,F,...,7,8,7,5,3,2,1,1,1,0.025714
557823,749979,09/14/20,56037,Sweetwater County,56,Wyoming,0.0,0.0,D+,F,...,7,8,7,5,3,2,1,1,1,0.025714
557824,749980,09/15/20,56037,Sweetwater County,56,Wyoming,0.0,0.0,D+,F,...,7,8,7,5,3,2,1,1,1,0.025714


## Imputation 

In [139]:
pd.DataFrame(data.dtypes).loc[null_features[0]]

Unnamed: 0,0
social_distancing_total_grade,object
social_distancing_visitation_grade,object
social_distancing_encounters_grade,object
social_distancing_travel_distance_grade,object
daily_state_test,float64
precipitation,float64
temperature,float64
ventilator_capacity_ratio,float64
icu_beds_ratio,float64
Religious_congregation_ratio,float64


### SimpleImputer - Categorical Columns

In [167]:
cat_cols = ["social_distancing_total_grade", "social_distancing_visitation_grade",
            "social_distancing_encounters_grade", "social_distancing_travel_distance_grade"]

In [168]:
cat_to_impute = new_data[[i for i in cat_cols]]

In [169]:
imputer = SimpleImputer(missing_values = np.nan, strategy = "most_frequent")
imputed_cat_data = pd.DataFrame(imputer.fit_transform(cat_to_impute), columns = cat_to_impute.columns)

In [170]:
imputed_cat_data

Unnamed: 0,social_distancing_total_grade,social_distancing_visitation_grade,social_distancing_encounters_grade,social_distancing_travel_distance_grade
0,F,F,F,F
1,F,F,F,F
2,F,F,F,F
3,F,F,F,F
4,F,F,F,F
...,...,...,...,...
557821,D+,F,A,F
557822,D+,F,A,F
557823,D+,F,A,F
557824,D+,F,A,F


### KNN - Numerical Columns

In [171]:
num_cols = ["daily_state_test", "precipitation", "temperature", "ventilator_capacity_ratio",
           "icu_beds_ratio", "Religious_congregation_ratio", "percent_insured",
           "deaths_per_100000"]

In [172]:
num_to_impute = new_data[[i for i in num_cols]]

In [173]:
imputer = KNNImputer(n_neighbors=5)
imputed_col_data = pd.DataFrame(imputer.fit_transform(num_to_impute), columns = num_to_impute.columns)

## Merge Imputed Data

In [201]:
non_null_features = list(set(new_data.columns)-set(null_features[0]))
non_null_data = new_data[[i for i in non_null_features]]

In [205]:
imputed_data = pd.concat([non_null_data, imputed_cat_data, imputed_col_data], axis = 1)
imputed_data

Unnamed: 0,less_than_high_school_diploma,airport_distance,age_50_54,covid_19_deaths,longitude,houses_density,age_35_39,percent_diabetes,state_fips,date,...,social_distancing_encounters_grade,social_distancing_travel_distance_grade,daily_state_test,precipitation,temperature,ventilator_capacity_ratio,icu_beds_ratio,Religious_congregation_ratio,percent_insured,deaths_per_100000
0,9.7,58.533710,7,0.0,-87.722071,73.3630,6,10.7,1,01/22/20,...,F,F,4580.0,0.0,2.0,0.000261,0.000234,53.0,88.666596,1081.1
1,9.7,58.533710,7,0.0,-87.722071,73.3630,6,10.7,1,01/23/20,...,F,F,4483.0,0.0,4.7,0.000261,0.000234,53.0,88.666596,1081.1
2,9.7,58.533710,7,0.0,-87.722071,73.3630,6,10.7,1,01/24/20,...,F,F,5397.6,79.0,11.4,0.000261,0.000234,53.0,88.666596,1081.1
3,9.7,58.533710,7,0.0,-87.722071,73.3630,6,10.7,1,01/25/20,...,F,F,3891.0,0.0,10.0,0.000261,0.000234,53.0,88.666596,1081.1
4,9.7,58.533710,7,0.0,-87.722071,73.3630,6,10.7,1,01/26/20,...,F,F,2591.0,5.0,10.8,0.000261,0.000234,53.0,88.666596,1081.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
557821,9.0,172.698662,8,0.0,-108.882788,1.9024,6,8.8,56,09/12/20,...,A,F,33.0,0.0,11.1,0.000325,0.000302,48.0,85.535005,752.6
557822,9.0,172.698662,8,0.0,-108.882788,1.9024,6,8.8,56,09/13/20,...,A,F,49.0,0.0,12.5,0.000325,0.000302,48.0,85.535005,752.6
557823,9.0,172.698662,8,0.0,-108.882788,1.9024,6,8.8,56,09/14/20,...,A,F,2569.0,0.0,14.8,0.000325,0.000302,48.0,85.535005,752.6
557824,9.0,172.698662,8,0.0,-108.882788,1.9024,6,8.8,56,09/15/20,...,A,F,739.0,0.0,15.3,0.000325,0.000302,48.0,85.535005,752.6


In [206]:
imputed_data.isnull().mean()*100

less_than_high_school_diploma              0.0
airport_distance                           0.0
age_50_54                                  0.0
covid_19_deaths                            0.0
longitude                                  0.0
houses_density                             0.0
age_35_39                                  0.0
percent_diabetes                           0.0
state_fips                                 0.0
date                                       0.0
high_school_diploma_only                   0.0
percent_smokers                            0.0
age_65_69                                  0.0
age_60_64                                  0.0
age_15_19                                  0.0
meat_plants                                0.0
female_percent                             0.0
index                                      0.0
county_fips                                0.0
median_household_income                    0.0
age_10_14                                  0.0
total_populat

## Export Imputed Data as CSV

In [209]:
compression_opts = dict(method='zip',archive_name='imputed_data.csv')

In [210]:
imputed_data.to_csv("/Users/Huey.ts/Desktop/STAT_390/data/imputed_data.zip",compression=compression_opts) 