# Logistic Regression 

## Data Exploration/Inspection

In [80]:
import pandas as pd

# Load the COVID-19 case surveillance public-use extract (with geography)
# from the notebook-relative data/ directory.
csv_path = 'data/COVID-19_Case_Surveillance_Public_Use_Data_with_Geography_20241111.csv'
data = pd.read_csv(csv_path)

In [81]:
# Preview the first five rows to check that the columns parsed as expected.
data.head(n=5)

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2024-06,VA,51,VIRGINIA BEACH CITY,51810.0,65+ years,Female,White,Non-Hispanic/Latino,0.0,0.0,Routine surveillance,Yes,Laboratory-confirmed case,Symptomatic,No,No,No,Yes
1,2024-06,VA,51,VIRGINIA BEACH CITY,51810.0,65+ years,Female,White,Non-Hispanic/Latino,1.0,0.0,Routine surveillance,Yes,Probable Case,Symptomatic,No,No,No,Yes
2,2024-06,VA,51,VIRGINIA BEACH CITY,51810.0,65+ years,Female,White,Non-Hispanic/Latino,0.0,0.0,Routine surveillance,Yes,Probable Case,Symptomatic,Yes,No,No,Yes
3,2024-06,VA,51,VIRGINIA BEACH CITY,51810.0,65+ years,Female,White,Non-Hispanic/Latino,0.0,0.0,Routine surveillance,Yes,Probable Case,Symptomatic,No,No,No,Yes
4,2024-06,VA,51,VIRGINIA BEACH CITY,51810.0,65+ years,Female,White,Non-Hispanic/Latino,0.0,0.0,Routine surveillance,Yes,Probable Case,Symptomatic,No,No,No,Yes


In [82]:
# Columns treated as categorical features (one-hot encoded further down).
categorical_cols = [
    # geography
    'state_fips_code',
    'county_fips_code',
    # demographics
    'age_group',
    'sex',
    'race',
    'ethnicity',
    # case reporting / clinical flags
    'process',
    'exposure_yn',
    'symptom_status',
    'hosp_yn',
    'icu_yn',
    'underlying_conditions_yn',
]

In [83]:
import textwrap

In [84]:
def print_na_count(df, col):
    """Print how many rows of ``df`` hold a missing value in column ``col``.

    Args:
        df: DataFrame to inspect.
        col: Label of the column whose missing values are counted.
    """
    # Summing the boolean mask counts the True (missing) entries directly.
    na_count = int(df[col].isna().sum())
    print(f'{col} null values: {na_count}')


In [85]:
def print_unique(df, col):
    """Print the shape of the distinct values in ``df[col]`` and a preview of them.

    The shape of the unique-value array is printed first. Arrays with at most
    ten distinct values are printed in full; longer ones are collapsed onto a
    single line and truncated with "..." if they exceed 70 characters.

    Args:
        df: DataFrame to inspect.
        col: Label of the column whose distinct values are shown.
    """
    distinct = df[col].unique()
    print(distinct.shape)

    if distinct.shape[0] > 10:
        # Too many values to list: show a one-line, width-limited preview.
        print(textwrap.shorten(str(distinct), width=70, placeholder="..."))
        return

    print(distinct)

In [86]:
# For every categorical column, show its distinct values and then its null count.
for col in categorical_cols:
    for report in (print_unique, print_na_count):
        report(data, col)

(30,)
[51 39 23 16 55 21 11 41 31 20 32 28 78 12 50 36 2 18 49 42 19 72...
state_fips_code null values: 0
(464,)
[51810. nan 51177. 39049. 23019. 23031. 51087. 51710. 23005. 39139....
county_fips_code null values: 11756
(4,)
['65+ years' '18 to 49 years' '50 to 64 years' '0 - 17 years']
age_group null values: 0
(2,)
['Female' 'Male']
sex null values: 0
(6,)
['White' 'Black' 'Multiple/Other' 'Asian' 'American Indian/Alaska Native'
 'Native Hawaiian/Other Pacific Islander']
race null values: 0
(2,)
['Non-Hispanic/Latino' 'Hispanic/Latino']
ethnicity null values: 0
(9,)
['Routine surveillance' 'Laboratory reported' 'Provider reported'
 'Clinical evaluation' 'Multiple' 'Other'
 'Contact tracing of case patient' 'Routine physical examination'
 'Autopsy']
process null values: 0
(1,)
['Yes']
exposure_yn null values: 0
(2,)
['Symptomatic' 'Asymptomatic']
symptom_status null values: 0
(2,)
['No' 'Yes']
hosp_yn null values: 0
(2,)
['No' 'Yes']
icu_yn null values: 0
(2,)
['Yes' 'No']
underlying_c

`county_fips_code` appears to be the only categorical column with null values (11,756 rows).

## One Hot Encoding

In [87]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [88]:
# Seed NumPy's global random generator so later np.random draws are repeatable.
SEED = 777
np.random.seed(SEED)

In [89]:
# One-hot encode the categorical columns as a dense array. Every other column
# is passed through unchanged (remainder='passthrough').
encoder = OneHotEncoder(sparse_output=False)
onehot_step = ('onehot', encoder, categorical_cols)
ct = ColumnTransformer(transformers=[onehot_step], remainder='passthrough')

transformed_array = ct.fit_transform(data)
print(transformed_array.shape)

(175407, 533)


The 12 categorical columns are now encoded as 526 one-hot columns; together with the 7 passthrough columns, the transformed array has 533 columns.

In [90]:
# Names of the one-hot columns, as reported by the fitted encoder.
onehot_features = ct.named_transformers_['onehot'].get_feature_names_out(categorical_cols)

# Columns that were NOT one-hot encoded. ColumnTransformer(remainder='passthrough')
# appends them to the right of the encoded block in their original DataFrame
# order, so the labels must keep that order as well. Index.difference() sorts
# its result by default, which would attach the wrong names to these columns;
# a boolean mask keeps the original column order.
# NOTE: despite the variable name, these are simply "all other columns" and
# include string-valued ones (e.g. case_month, res_state).
numeric_features = data.columns[~data.columns.isin(categorical_cols)]

# Full label vector: encoded columns first, passthrough columns last.
all_features = np.concatenate([onehot_features, numeric_features])

print(f'onehot {onehot_features.shape} + numeric {numeric_features.shape} = all {all_features.shape}')
print('random one hot features:')
print(onehot_features[np.random.randint(0, onehot_features.shape[0], size=(10))])

onehot (526,) + numeric (7,) = all (533,)
random one hot features:
['county_fips_code_19027.0' 'county_fips_code_18039.0'
 'county_fips_code_19127.0' 'county_fips_code_51085.0'
 'ethnicity_Hispanic/Latino' 'county_fips_code_39051.0'
 'county_fips_code_50021.0' 'county_fips_code_8013.0'
 'county_fips_code_39105.0' 'county_fips_code_39151.0']


In [95]:
# Wrap the transformed array in a DataFrame labelled with the assembled feature names.
transformed_data = pd.DataFrame(data=transformed_array, columns=all_features)