# K-Nearest Neighbors

**Split data into training and testing sets and scale data**

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, accuracy_score, \
                            recall_score, f1_score

df = pd.read_csv('preprocessed_data.csv')

# Split data
X = df.drop(columns=['h1n1_vaccine','seasonal_vaccine'])
# Predicting on two values --> Two models
y_h1n1 = df['h1n1_vaccine']
y_seasonal = df['seasonal_vaccine']

# Training and testing sets (20% testing)
X_train, X_test, y_train_h1n1, y_test_h1n1 = train_test_split(X, y_h1n1, test_size= 0.2,random_state=123)
X_train, X_test, y_train_seasonal, y_test_seasonal = train_test_split(X, y_seasonal, test_size= 0.2,random_state=123)

# Scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

**Train Model**

In [2]:
# Find best parameter choice for k based on accuracy (for now)
# knn = KNeighborsClassifier(n_neighbors=5)
# knn.fit(X_train,y_train_h1n1)

# K for h1n1 vaccine prediction
best_pred_h1n1 = [-1, -1]
for i in range(1,50):
    knn = KNeighborsClassifier(n_neighbors=i, weights = 'distance', p=2)
    knn.fit(X_train, y_train_h1n1)
    score = accuracy_score(y_test_h1n1, knn.predict(X_test))
    if score > best_pred_h1n1[1]:
        best_pred_h1n1 = [i,score]
# best_pred_h1n1

# K for seasonal vaccine prediction
best_pred_seasonal = [-1, -1]
for i in range(1,50):
    knn = KNeighborsClassifier(n_neighbors=i, weights = 'distance', p=2)
    knn.fit(X_train, y_train_seasonal)
    score = accuracy_score(y_test_seasonal, knn.predict(X_test))
    if score > best_pred_seasonal[1]:
        best_pred_seasonal = [i,score]
# best_pred_seasonal

In [3]:
# Train and fit model
# H1n1 prediction
knn=KNeighborsClassifier(n_neighbors=best_pred_h1n1[0], weights='distance',p=2)
knn.fit(X_train, y_train_h1n1)
# Prediction
y_pred_h1n1 = knn.predict(X_test)
# Retrain model for seasonal vaccination prediction
knn=KNeighborsClassifier(n_neighbors=best_pred_seasonal[0], weights='distance',p=2)
knn.fit(X_train, y_train_seasonal)
y_pred_seasonal = knn.predict(X_test)

**Model Evaluation**

In [4]:
# Metrics for H1N1 Prediction
accuracy = accuracy_score(y_test_h1n1, y_pred_h1n1)
precision = precision_score(y_test_h1n1, y_pred_h1n1)
recall = recall_score(y_test_h1n1, y_pred_h1n1)
f1 = f1_score(y_test_h1n1, y_pred_h1n1)
print("Prediction for H1N1 Vaccination Status\n")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Prediction for H1N1 Vaccination Status

Accuracy: 0.8176712841632348
Precision: 0.655536028119508
Recall: 0.32406602953953084
F1 Score: 0.43372093023255814


In [5]:
# Metrics for Seasonal Prediction
accuracy = accuracy_score(y_test_seasonal, y_pred_seasonal)
precision = precision_score(y_test_seasonal, y_pred_seasonal)
recall = recall_score(y_test_seasonal, y_pred_seasonal)
f1 = f1_score(y_test_seasonal, y_pred_seasonal)
print("Prediction for Seasonal Vaccination Status\n")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Prediction for Seasonal Vaccination Status

Accuracy: 0.7502807937102209
Precision: 0.7464013547840813
Recall: 0.7057646116893515
F1 Score: 0.7255144032921811


# Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier

df = pd.read_csv('preprocessed_data.csv')

# Split data
X = df.drop(columns=['h1n1_vaccine','seasonal_vaccine'])
# Predicting on two values --> Two models
y_h1n1 = df['h1n1_vaccine']
y_seasonal = df['seasonal_vaccine']

# Training and testing sets (20% testing)
# Random state ensures splits are the same for both predictions
X_train, X_test, y_train_h1n1, y_test_h1n1 = train_test_split(X, y_h1n1, test_size= 0.2,random_state=123)
X_train, X_test, y_train_seasonal, y_test_seasonal = train_test_split(X, y_seasonal, test_size= 0.2,random_state=123)

# Scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

**Train Model**

In [7]:
# Train and fit model
# H1n1 prediction
rf = RandomForestClassifier(n_estimators=150)
rf.fit(X_train,y_train_h1n1)
y_pred_h1n1 = rf.predict(X_test)

# Seasonal prediction
rf.fit(X_train,y_train_seasonal)
y_pred_seasonal = rf.predict(X_test)

**Model Evaluation**

In [8]:
# Metrics for H1N1 Prediction
accuracy = accuracy_score(y_test_h1n1, y_pred_h1n1)
precision = precision_score(y_test_h1n1, y_pred_h1n1)
recall = recall_score(y_test_h1n1, y_pred_h1n1)
f1 = f1_score(y_test_h1n1, y_pred_h1n1)
print("Prediction for H1N1 Vaccination Status\n")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Prediction for H1N1 Vaccination Status

Accuracy: 0.8283414451516286
Precision: 0.6805555555555556
Recall: 0.3831450912250217
F1 Score: 0.490272373540856


In [9]:
# Metrics for Seasonal Prediction
accuracy = accuracy_score(y_test_seasonal, y_pred_seasonal)
precision = precision_score(y_test_seasonal, y_pred_seasonal)
recall = recall_score(y_test_seasonal, y_pred_seasonal)
f1 = f1_score(y_test_seasonal, y_pred_seasonal)
print("Prediction for Seasonal Vaccination Status\n")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Prediction for Seasonal Vaccination Status

Accuracy: 0.7742418569824036
Precision: 0.7691666666666667
Recall: 0.7389911929543634
F1 Score: 0.7537770518579012
