# K-Nearest Neighbors

**Split data into training and testing sets and scale data**

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, accuracy_score, \
                            recall_score, f1_score

# Load the preprocessed dataset; it holds the features and both target columns.
df = pd.read_csv('preprocessed_data.csv')

# Split data: features are every column except the two vaccination targets
X = df.drop(columns=['h1n1_vaccine','seasonal_vaccine'])
# Predicting on two values --> Two models
y_h1n1 = df['h1n1_vaccine']
y_seasonal = df['seasonal_vaccine']

# Training and testing sets (20% testing).
# Features and both targets are partitioned in ONE call so that every row of
# X_train / X_test is guaranteed to line up with both label vectors, instead of
# relying on two separate calls happening to share the same random_state.
(X_train, X_test,
 y_train_h1n1, y_test_h1n1,
 y_train_seasonal, y_test_seasonal) = train_test_split(
    X, y_h1n1, y_seasonal, test_size=0.2, random_state=123)

# Scale data: fit the scaler on the training rows only, then apply the same
# transformation to the test rows so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

**Train Model**

In [2]:
# Train and fit model
# Both KNN classifiers share the same hyper-parameters:
# 5 neighbours, distance-weighted votes, Minkowski p=2.
knn_params = dict(n_neighbors=5, weights='distance', p=2)

# H1N1 prediction: fit on the H1N1 labels, then predict the test rows
knn = KNeighborsClassifier(**knn_params).fit(X_train, y_train_h1n1)
y_pred_h1n1 = knn.predict(X_test)

# Seasonal prediction: a fresh classifier fitted on the seasonal labels
# (`knn` is rebound, so after this cell it refers to the seasonal model)
knn = KNeighborsClassifier(**knn_params).fit(X_train, y_train_seasonal)
y_pred_seasonal = knn.predict(X_test)

**Model Evaluation**

In [3]:
# Metrics for H1N1 Prediction
# Apply each scorer to the same (true, predicted) pair, in the order reported.
accuracy, precision, recall, f1 = (
    scorer(y_test_h1n1, y_pred_h1n1)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Prediction for H1N1 Vaccination Status\n")
for label, value in (("Accuracy:", accuracy),
                     ("Precision:", precision),
                     ("Recall:", recall),
                     ("F1 Score:", f1)):
    print(label, value)

Prediction for H1N1 Vaccination Status

Accuracy: 0.7998876825159117
Precision: 0.5545212765957447
Recall: 0.3622936576889661
F1 Score: 0.43825538623226484


In [4]:
# Metrics for Seasonal Prediction
# Apply each scorer to the same (true, predicted) pair, in the order reported.
accuracy, precision, recall, f1 = (
    scorer(y_test_seasonal, y_pred_seasonal)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Prediction for Seasonal Vaccination Status\n")
for label, value in (("Accuracy:", accuracy),
                     ("Precision:", precision),
                     ("Recall:", recall),
                     ("F1 Score:", f1)):
    print(label, value)

Prediction for Seasonal Vaccination Status

Accuracy: 0.7135904155746912
Precision: 0.699341021416804
Recall: 0.6797437950360288
F1 Score: 0.6894031668696712


# Random Forest

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Reload the preprocessed dataset so this section starts from unscaled data.
df = pd.read_csv('preprocessed_data.csv')

# Split data: features are every column except the two vaccination targets
X = df.drop(columns=['h1n1_vaccine','seasonal_vaccine'])
# Predicting on two values --> Two models
y_h1n1 = df['h1n1_vaccine']
y_seasonal = df['seasonal_vaccine']

# Training and testing sets (20% testing).
# A single call partitions the features and both targets together, so the
# splits are identical for both predictions by construction rather than by
# repeating the call with a matching random_state.
(X_train, X_test,
 y_train_h1n1, y_test_h1n1,
 y_train_seasonal, y_test_seasonal) = train_test_split(
    X, y_h1n1, y_seasonal, test_size=0.2, random_state=123)

# Scale data: fit the scaler on the training rows only, then apply the same
# transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

**Train Model**

In [6]:
# Train and fit model
# The forest is seeded so that the predictions (and the metrics computed from
# them) are reproducible on a fresh Restart & Run All; 123 matches the seed
# used for the train/test split.

# H1n1 prediction
rf = RandomForestClassifier(n_estimators=150, random_state=123)
rf.fit(X_train, y_train_h1n1)
y_pred_h1n1 = rf.predict(X_test)

# Seasonal prediction: use a fresh estimator, mirroring the KNN section,
# rather than refitting the H1N1 forest in place.
# (`rf` is rebound, so after this cell it refers to the seasonal model)
rf = RandomForestClassifier(n_estimators=150, random_state=123)
rf.fit(X_train, y_train_seasonal)
y_pred_seasonal = rf.predict(X_test)

**Model Evaluation**

In [7]:
# Metrics for H1N1 Prediction
# Evaluate the current H1N1 predictions with each scorer, in reporting order.
h1n1_scorers = (accuracy_score, precision_score, recall_score, f1_score)
accuracy, precision, recall, f1 = [
    score_fn(y_test_h1n1, y_pred_h1n1) for score_fn in h1n1_scorers
]

print("Prediction for H1N1 Vaccination Status\n")
report_lines = zip(("Accuracy:", "Precision:", "Recall:", "F1 Score:"),
                   (accuracy, precision, recall, f1))
for heading, score in report_lines:
    print(heading, score)

Prediction for H1N1 Vaccination Status

Accuracy: 0.8281542493448146
Precision: 0.6864
Recall: 0.37271937445699393
F1 Score: 0.4831081081081081


In [8]:
# Metrics for Seasonal Prediction
# Evaluate the current seasonal predictions with each scorer, in reporting order.
seasonal_scorers = (accuracy_score, precision_score, recall_score, f1_score)
accuracy, precision, recall, f1 = [
    score_fn(y_test_seasonal, y_pred_seasonal) for score_fn in seasonal_scorers
]

print("Prediction for Seasonal Vaccination Status\n")
report_lines = zip(("Accuracy:", "Precision:", "Recall:", "F1 Score:"),
                   (accuracy, precision, recall, f1))
for heading, score in report_lines:
    print(heading, score)

Prediction for Seasonal Vaccination Status

Accuracy: 0.7716211156870086
Precision: 0.7666944908180301
Recall: 0.7353883106485188
F1 Score: 0.7507151614221496
