# K-Nearest Neighbors

**Split data into training and testing sets and scale data**

In [15]:
import pandas as pd
import numpy
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, accuracy_score, \
                            recall_score, f1_score

df = pd.read_csv('preprocessed_data.csv')

# Split data
X = df.drop(columns=['h1n1_vaccine','seasonal_vaccine','respondent_id','Unnamed: 0'])
# Predicting on two values --> Two models
y_h1n1 = df['h1n1_vaccine']
y_seasonal = df['seasonal_vaccine']

# Training and testing sets (20% testing)
X_train, X_test, y_train_h1n1, y_test_h1n1 = train_test_split(X, y_h1n1, test_size= 0.2,random_state=123)
X_train, X_test, y_train_seasonal, y_test_seasonal = train_test_split(X, y_seasonal, test_size= 0.2,random_state=123)

# Scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

**Train Model**

In [16]:
# Find best parameter choice for k based on accuracy (for now)
# knn = KNeighborsClassifier(n_neighbors=5)
# knn.fit(X_train,y_train_h1n1)

# K for h1n1 vaccine prediction
best_pred_h1n1 = [-1, -1]
for i in range(1,50):
    knn = KNeighborsClassifier(n_neighbors=i, weights = 'distance', p=2)
    knn.fit(X_train, y_train_h1n1)
    score = accuracy_score(y_test_h1n1, knn.predict(X_test))
    if score > best_pred_h1n1[1]:
        best_pred_h1n1 = [i,score]
# best_pred_h1n1

# K for seasonal vaccine prediction
best_pred_seasonal = [-1, -1]
for i in range(1,50):
    knn = KNeighborsClassifier(n_neighbors=i, weights = 'distance', p=2)
    knn.fit(X_train, y_train_seasonal)
    score = accuracy_score(y_test_seasonal, knn.predict(X_test))
    if score > best_pred_seasonal[1]:
        best_pred_seasonal = [i,score]
# best_pred_seasonal

In [17]:
# Train and fit model
# H1n1 prediction
knn=KNeighborsClassifier(n_neighbors=best_pred_h1n1[0], weights='distance',p=2)
knn.fit(X_train, y_train_h1n1)
# Prediction
y_pred_h1n1 = knn.predict(X_test)
# Retrain model for seasonal vaccination prediction
knn=KNeighborsClassifier(n_neighbors=best_pred_seasonal[0], weights='distance',p=2)
knn.fit(X_train, y_train_seasonal)
y_pred_seasonal = knn.predict(X_test)

**Model Evaluation**

In [18]:
# Metrics for H1N1 Prediction
accuracy = accuracy_score(y_test_h1n1, y_pred_h1n1)
precision = precision_score(y_test_h1n1, y_pred_h1n1)
recall = recall_score(y_test_h1n1, y_pred_h1n1)
f1 = f1_score(y_test_h1n1, y_pred_h1n1)
print("Prediction for H1N1 Vaccination Status\n")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Prediction for H1N1 Vaccination Status

Accuracy: 0.817109696742793
Precision: 0.6464646464646465
Recall: 0.3336229365768897
F1 Score: 0.44011461318051576


In [19]:
# Metrics for Seasonal Prediction
accuracy = accuracy_score(y_test_seasonal, y_pred_seasonal)
precision = precision_score(y_test_seasonal, y_pred_seasonal)
recall = recall_score(y_test_seasonal, y_pred_seasonal)
f1 = f1_score(y_test_seasonal, y_pred_seasonal)
print("Prediction for Seasonal Vaccination Status\n")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Prediction for Seasonal Vaccination Status

Accuracy: 0.7523399475851741
Precision: 0.7498936622713739
Recall: 0.7057646116893515
F1 Score: 0.7271602392245824


# Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

df = pd.read_csv('preprocessed_data.csv')

# Split data
X = df.drop(columns=['h1n1_vaccine','seasonal_vaccine','respondent_id','Unnamed: 0'])
# Predicting on two values --> Two models
y_h1n1 = df['h1n1_vaccine']
y_seasonal = df['seasonal_vaccine']

# Training and testing sets (20% testing)
# Random state ensures splits are the same for both predictions
X_train, X_test, y_train_h1n1, y_test_h1n1 = train_test_split(X, y_h1n1, test_size= 0.2,random_state=123)
X_train, X_test, y_train_seasonal, y_test_seasonal = train_test_split(X, y_seasonal, test_size= 0.2,random_state=123)

# Scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

**Train Model**

In [34]:
# Train and fit model
# H1n1 prediction
rf = RandomForestClassifier(n_estimators=150)
rf.fit(X_train,y_train_h1n1)
y_pred_h1n1 = rf.predict(X_test)
h1n1_importances=rf.feature_importances_

# Seasonal prediction
rf.fit(X_train,y_train_seasonal)
y_pred_seasonal = rf.predict(X_test)
seasonal_importances=rf.feature_importances_

**Model Evaluation**

In [22]:
# Metrics for H1N1 Prediction
accuracy = accuracy_score(y_test_h1n1, y_pred_h1n1)
precision = precision_score(y_test_h1n1, y_pred_h1n1)
recall = recall_score(y_test_h1n1, y_pred_h1n1)
f1 = f1_score(y_test_h1n1, y_pred_h1n1)
print("Prediction for H1N1 Vaccination Status\n")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Prediction for H1N1 Vaccination Status

Accuracy: 0.8275926619243729
Precision: 0.6785714285714286
Recall: 0.3796698523023458
F1 Score: 0.486908077994429


In [23]:
# Metrics for Seasonal Prediction
accuracy = accuracy_score(y_test_seasonal, y_pred_seasonal)
precision = precision_score(y_test_seasonal, y_pred_seasonal)
recall = recall_score(y_test_seasonal, y_pred_seasonal)
f1 = f1_score(y_test_seasonal, y_pred_seasonal)
print("Prediction for Seasonal Vaccination Status\n")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Prediction for Seasonal Vaccination Status

Accuracy: 0.7736802695619618
Precision: 0.768429820907955
Recall: 0.7385908726981585
F1 Score: 0.7532149418248621


**Extract Importances**

In [35]:

# Sort feature importances in descending order for h1n1 vaccines
sorted_indices = numpy.argsort(h1n1_importances)[::-1]
feature_names = X.columns  # This should already exclude 'respondent_id' and 'Unnamed: 0'

# Print feature importances
for idx in sorted_indices:
    print(f"{feature_names[idx]}: {h1n1_importances[idx]}")


doctor_recc_h1n1: 0.0951423069508681
opinion_h1n1_risk: 0.07846371510270557
hhs_geo_region: 0.06953994218777287
opinion_h1n1_vacc_effective: 0.06157714596539979
opinion_seas_risk: 0.047504074536084934
age_group: 0.0456223983189875
opinion_h1n1_sick_from_vacc: 0.03919853440566445
education: 0.037352841577579586
h1n1_concern: 0.03726417135298778
opinion_seas_sick_from_vacc: 0.035909620871625136
census_msa: 0.03464137589743387
opinion_seas_vacc_effective: 0.03361219049784958
household_adults: 0.03128649999269585
income_poverty: 0.027481490099766012
h1n1_knowledge: 0.027314757640672642
household_children: 0.026855638738250472
employment_status: 0.024303104158197822
doctor_recc_seasonal: 0.021836441126064875
race: 0.021661131408704147
sex: 0.02103760786531477
health_worker: 0.020913013622256525
chronic_med_condition: 0.01831509276621082
behavioral_large_gatherings: 0.018058651250584062
marital_status: 0.01800722796679404
behavioral_outside_home: 0.017636425384427033
behavioral_touch_face: 0

In [36]:
# Sort feature importances in descending order for seasonal vaccines 
sorted_indices = numpy.argsort(seasonal_importances)[::-1]
feature_names = X.columns  # This should already exclude 'respondent_id' and 'Unnamed: 0'

# Print feature importances
for idx in sorted_indices:
    print(f"{feature_names[idx]}: {seasonal_importances[idx]}")


opinion_seas_vacc_effective: 0.09660427230291846
opinion_seas_risk: 0.09561661589784846
age_group: 0.07303491393644465
doctor_recc_seasonal: 0.06576078204694917
hhs_geo_region: 0.06254103281902536
opinion_h1n1_risk: 0.040649014909119104
opinion_h1n1_vacc_effective: 0.038632120291360525
opinion_seas_sick_from_vacc: 0.03629628730340781
education: 0.03566089181690585
h1n1_concern: 0.034517557324170445
opinion_h1n1_sick_from_vacc: 0.034417790402008444
census_msa: 0.03143233945086696
household_adults: 0.028850568508232782
h1n1_knowledge: 0.027957017550395393
income_poverty: 0.026161232022970847
household_children: 0.024805474015431133
employment_status: 0.022884213440099832
race: 0.022324955942162238
sex: 0.019304727837331892
chronic_med_condition: 0.01792805550177738
doctor_recc_h1n1: 0.01735221270378153
marital_status: 0.017209798509492936
behavioral_touch_face: 0.016956334224971132
behavioral_large_gatherings: 0.016262213606014243
behavioral_outside_home: 0.01611906175871104
behavioral_a

**Feature Dropping**

In [25]:
df = pd.read_csv('preprocessed_data.csv')

In [26]:

def modeling_Feature_Importance(X,y,model):
    # Split data
    X = df.drop(columns=['h1n1_vaccine','seasonal_vaccine','respondent_id','Unnamed: 0'])
    
   
    # Initialize a dictionary to store the accuracy for each feature removed
    accuracy_with_feature_removed = {}

    # Iterate over each feature to remove it and evaluate the model performance without it
    for column in X.columns:
        # Drop the current feature
        X_temp = X.drop(column, axis=1)

        # Split the modified dataset
        X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X_temp, y, test_size=0.2, random_state=42)

        # Scale the data
        scaler_temp = StandardScaler()
        X_train_temp_scaled = scaler_temp.fit_transform(X_train_temp)
        X_test_temp_scaled = scaler_temp.transform(X_test_temp)

        # Train and evaluate the KNN model without the current feature
        if (model== "knn"):
            model=KNeighborsClassifier(n_neighbors=5)
            
        else:
            model=RandomForestClassifier()
        model.fit(X_train_temp_scaled, y_train_temp)
        predictions_temp = model.predict(X_test_temp_scaled)
        accuracy_temp = accuracy_score(y_test_temp, predictions_temp)

        # Store the accuracy in the dictionary
        accuracy_with_feature_removed[column] = accuracy_temp

    # Convert the dictionary to a DataFrame for easier analysis and sort by accuracy
    return pd.DataFrame(list(accuracy_with_feature_removed.items()), columns=['Feature Removed', 'Accuracy']).sort_values(by='Accuracy', ascending=False)






In [27]:
#Note higher accuracy means less useful feature 

In [28]:
modeling_Feature_Importance(df,df['h1n1_vaccine'], "knn") #h1h1, knn

Unnamed: 0,Feature Removed,Accuracy
27,employment_status,0.838076
3,behavioral_avoidance,0.837888
4,behavioral_face_mask,0.837888
2,behavioral_antiviral_meds,0.837514
17,opinion_seas_vacc_effective,0.836952
28,hhs_geo_region,0.836952
22,race,0.836765
13,health_worker,0.836391
25,marital_status,0.836204
6,behavioral_large_gatherings,0.836016


In [29]:
modeling_Feature_Importance(df,df['seasonal_vaccine'], "knn") #seasonal, vaccine knn

Unnamed: 0,Feature Removed,Accuracy
5,behavioral_wash_hands,0.781168
12,child_under_6_months,0.778735
16,opinion_h1n1_sick_from_vacc,0.778547
26,rent_or_own,0.778173
27,employment_status,0.777986
22,race,0.777799
14,opinion_h1n1_vacc_effective,0.777611
7,behavioral_outside_home,0.777611
19,opinion_seas_sick_from_vacc,0.777611
4,behavioral_face_mask,0.777237


In [30]:
modeling_Feature_Importance(df,df['h1n1_vaccine'], "random") #h1n1, random forest

Unnamed: 0,Feature Removed,Accuracy
7,behavioral_outside_home,0.839386
24,income_poverty,0.839386
3,behavioral_avoidance,0.839199
5,behavioral_wash_hands,0.838076
12,child_under_6_months,0.837514
22,race,0.83714
4,behavioral_face_mask,0.836952
6,behavioral_large_gatherings,0.836952
29,census_msa,0.836952
23,sex,0.836391


In [32]:
modeling_Feature_Importance(df,df['seasonal_vaccine'], "random") #seasonal, random forest

Unnamed: 0,Feature Removed,Accuracy
2,behavioral_antiviral_meds,0.779858
3,behavioral_avoidance,0.779296
30,household_adults,0.779296
27,employment_status,0.779109
0,h1n1_concern,0.778922
9,doctor_recc_h1n1,0.778173
16,opinion_h1n1_sick_from_vacc,0.777986
26,rent_or_own,0.777799
6,behavioral_large_gatherings,0.777611
25,marital_status,0.777611
