# Logistic Regression 

## Data Exploration/Inspection

In [29]:
import pandas as pd

df = pd.read_csv('data/COVID-19_Case_Surveillance_Public_Use_Data_with_Geography_20241111.csv')

In [30]:
df.head(5)

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2024-06,VA,51,VIRGINIA BEACH CITY,51810.0,65+ years,Female,White,Non-Hispanic/Latino,0.0,0.0,Routine surveillance,Yes,Laboratory-confirmed case,Symptomatic,No,No,No,Yes
1,2024-06,VA,51,VIRGINIA BEACH CITY,51810.0,65+ years,Female,White,Non-Hispanic/Latino,1.0,0.0,Routine surveillance,Yes,Probable Case,Symptomatic,No,No,No,Yes
2,2024-06,VA,51,VIRGINIA BEACH CITY,51810.0,65+ years,Female,White,Non-Hispanic/Latino,0.0,0.0,Routine surveillance,Yes,Probable Case,Symptomatic,Yes,No,No,Yes
3,2024-06,VA,51,VIRGINIA BEACH CITY,51810.0,65+ years,Female,White,Non-Hispanic/Latino,0.0,0.0,Routine surveillance,Yes,Probable Case,Symptomatic,No,No,No,Yes
4,2024-06,VA,51,VIRGINIA BEACH CITY,51810.0,65+ years,Female,White,Non-Hispanic/Latino,0.0,0.0,Routine surveillance,Yes,Probable Case,Symptomatic,No,No,No,Yes


In [31]:
df = df.drop(['res_state', 'res_county'], axis=1)

In [32]:
categorical_cols = ['case_month','state_fips_code', 'county_fips_code', 'age_group',
                  'sex', 'race', 'ethnicity', 'process', 'exposure_yn', 'current_status', 
                  'symptom_status', 'hosp_yn', 'icu_yn', 'underlying_conditions_yn']
numerical_cols = ['case_positive_specimen_interval', 'case_onset_interval']

In [33]:
import textwrap

In [34]:
def print_na_count(df, col):
    na_count = df[df[col].isna()].shape[0]
    print(f'{col} null values: {na_count}')


In [35]:
def print_unique(df, col):
    u = df[col].unique()
    print(u.shape)
    if u.shape[0] <= 10:
        print(u)
    else:
        print(textwrap.shorten(str(u), width=70, placeholder="..."))

In [36]:
for col in categorical_cols:
    print(col)
    print_unique(df, col)
    print_na_count(df, col)


case_month
(52,)
['2024-06' '2024-05' '2024-04' '2024-03' '2024-02' '2024-01'...
case_month null values: 0
state_fips_code
(30,)
[51 39 23 16 55 21 11 41 31 20 32 28 78 12 50 36 2 18 49 42 19 72...
state_fips_code null values: 0
county_fips_code
(464,)
[51810. nan 51177. 39049. 23019. 23031. 51087. 51710. 23005. 39139....
county_fips_code null values: 11756
age_group
(4,)
['65+ years' '18 to 49 years' '50 to 64 years' '0 - 17 years']
age_group null values: 0
sex
(2,)
['Female' 'Male']
sex null values: 0
race
(6,)
['White' 'Black' 'Multiple/Other' 'Asian' 'American Indian/Alaska Native'
 'Native Hawaiian/Other Pacific Islander']
race null values: 0
ethnicity
(2,)
['Non-Hispanic/Latino' 'Hispanic/Latino']
ethnicity null values: 0
process
(9,)
['Routine surveillance' 'Laboratory reported' 'Provider reported'
 'Clinical evaluation' 'Multiple' 'Other'
 'Contact tracing of case patient' 'Routine physical examination'
 'Autopsy']
process null values: 0
exposure_yn
(1,)
['Yes']
exposure_yn nul

In [37]:
for col in numerical_cols:
    print(col)
    print_na_count(df, col)

case_positive_specimen_interval
case_positive_specimen_interval null values: 109308
case_onset_interval
case_onset_interval null values: 13813


County fips code seems to be the only categorical column with nulls.\
I may possibly remove case_positive_specimen_interval, case_onset_interval...\
These are calculated weeks difference from pos_spec_dt and onset_dt respectively...\
maybe making them zero if null makes sense since the record wouldnt exist without knowing they have covid...

In [38]:
categorical_cols = [x for x in categorical_cols if x != 'county_fips_code']
df = df.drop(['county_fips_code'], axis=1)
categorical_cols

['case_month',
 'state_fips_code',
 'age_group',
 'sex',
 'race',
 'ethnicity',
 'process',
 'exposure_yn',
 'current_status',
 'symptom_status',
 'hosp_yn',
 'icu_yn',
 'underlying_conditions_yn']

In [39]:
df[numerical_cols] = df[numerical_cols].fillna(0)

## One Hot Encoding

In [40]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [41]:
np.random.seed(777)

In [42]:
encoder = OneHotEncoder(sparse_output=False, drop='if_binary')
ct = ColumnTransformer(
    transformers=[
        ('onehot', encoder, categorical_cols)
    ],
    remainder='passthrough'  # Keep other columns as is
)
transformed_array = ct.fit_transform(df)
print(transformed_array.shape)

(175407, 112)


categorical columns are now 533 one hot columns

In [43]:
onehot_features = ct.named_transformers_['onehot'].get_feature_names_out(categorical_cols)
numeric_features = df.columns.difference(categorical_cols)
all_features = np.concatenate([onehot_features, numeric_features])

print(f'onehot {onehot_features.shape} + numeric {numeric_features.shape} = all {all_features.shape}')
print('random one hot features:')
print(onehot_features[np.random.randint(0, onehot_features.shape[0], size=(10))])

onehot (109,) + numeric (3,) = all (112,)
random one hot features:
['exposure_yn_Yes' 'case_month_2024-02' 'state_fips_code_19'
 'case_month_2023-05' 'race_American Indian/Alaska Native'
 'case_month_2024-04' 'state_fips_code_41' 'case_month_2022-08'
 'case_month_2023-06' 'race_Black']


In [44]:
transformed_df = pd.DataFrame(
    transformed_array,
    columns=all_features
)

## Model Training

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### Define features, target, and train/test split

In [46]:
X = transformed_df[transformed_df.columns.difference(['death_yn'])] 
y = transformed_df['death_yn'].map({'Yes': 1, 'No': 0})
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [47]:
columns_with_null = df.columns[df.isnull().any()]
columns_with_null

Index([], dtype='object')

In [48]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [49]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [50]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [51]:
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [52]:
print(accuracy)
print(class_report)
print(conf_matrix)

0.9815859985177584
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     34409
           1       0.57      0.17      0.26       673

    accuracy                           0.98     35082
   macro avg       0.78      0.58      0.63     35082
weighted avg       0.98      0.98      0.98     35082

[[34321    88]
 [  558   115]]


These score results look iffy...\
Probably need to use some type of cross validation testing

In [65]:
lives = y_train[y_train == 0].shape[0]
dies = y_train[y_train == 1].shape[0]

print(f"live count {lives}, death count {dies}")
print(f"{lives/dies}")

live count 137833, death count 2492
55.31019261637239


imbalanced target category representation!\
lets look at whole dataset not just training...

In [66]:
lives = df[df['death_yn'] == 'No'].shape[0]
dies = df[df['death_yn'] == 'Yes'].shape[0]

print(f"live count {lives}, death count {dies}")
print(f"{lives/dies}")

live count 172242, death count 3165
54.42085308056872


Train/test and evaluating model needs stratifcation split...\
Confusion matrix tells me that flase negatives for Death is a problem.\
Model does relatively well with predicting Living