In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KernelDensity
import torch
import torch.nn as nn
import torch.optim as optim

In [3]:
df = pd.read_csv('../data/clean_data/merged_df.csv')

In [4]:


# Features and target variable
X = df[["Avg_temp_june_value", "Min_temp_june_value", "Precipitation_june_value", "Avg_cooling_degree_days_june"]]
y = df["total_lyme_disease_counts"]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 1. Random Forest
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train_scaled, y_train)
rf_preds = rf_model.predict(X_test_scaled)

# 2. Gradient Boosted Trees
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train_scaled, y_train)
gb_preds = gb_model.predict(X_test_scaled)

# 3. K-Nearest Neighbors
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)
knn_preds = knn_model.predict(X_test_scaled)

# 4. Decision Trees
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train_scaled, y_train)
dt_preds = dt_model.predict(X_test_scaled)

# 5. Bayesian KDE for regression-like task
kde = KernelDensity(kernel='gaussian', bandwidth=1.0).fit(X_train_scaled)
kde_density = kde.score_samples(X_test_scaled)  # Log-density estimates
kde_preds = np.exp(kde_density)  # Convert log-density to probabilities for comparison

# 6. PyTorch Linear Regression
class LinearRegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(LinearRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        return self.linear(x)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

# Model initialization
input_dim = X_train_tensor.shape[1]
lr_model = LinearRegressionModel(input_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(lr_model.parameters(), lr=0.01)

# Training
epochs = 100
for epoch in range(epochs):
    lr_model.train()
    optimizer.zero_grad()
    outputs = lr_model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

# Predictions
lr_model.eval()
with torch.no_grad():
    lr_preds = lr_model(X_test_tensor).numpy().flatten()

# Evaluation
def evaluate_model(y_true, y_preds, model_name):
    rmse = mean_squared_error(y_true, y_preds, squared=False)
    r2 = r2_score(y_true, y_preds)
    print(f"{model_name} - RMSE: {rmse}, R2 Score: {r2}")

evaluate_model(y_test, rf_preds, "Random Forest")
evaluate_model(y_test, gb_preds, "Gradient Boosted Trees")
evaluate_model(y_test, knn_preds, "K-Nearest Neighbors")
evaluate_model(y_test, dt_preds, "Decision Trees")
evaluate_model(y_test, kde_preds, "KDE (Regression)")
evaluate_model(y_test_tensor.numpy().flatten(), lr_preds, "Linear Regression (PyTorch)")


Random Forest - RMSE: 55811.91926540627, R2 Score: -0.12587842411459493
Gradient Boosted Trees - RMSE: 55035.11459427612, R2 Score: -0.09475599498089671
K-Nearest Neighbors - RMSE: 70744.48544472622, R2 Score: -0.8089341852760377
Decision Trees - RMSE: 50935.17759292334, R2 Score: 0.06227988669453488
KDE (Regression) - RMSE: 195503.4882239393, R2 Score: -12.814863657048273
Linear Regression (PyTorch) - RMSE: 195502.203125, R2 Score: -12.81468234240567




None of these models are very strong at all, but the best performing one appears to be the Decision Tree. Thus, Decision Tree will be used to perform predictions for the number of Lyme disease cases in the year 2042, under RCP 8.5's predicted average temperature, minimum temperature, and average precipitation in the month of June in that year. 

In [None]:
# Example new data for 2042
new_data = pd.DataFrame({
    "Avg_temp_june_value": [75.0],  # Replace with projected value for 2042
    "Min_temp_june_value": [60.0],  # Replace with projected value
    "Precipitation_june_value": [3.5],  # Replace with projected value
    "Avg_cooling_degree_days_june": [10.0]  # Replace with projected value
})

# Scale the new data using the same scaler used for training
new_data_scaled = scaler.transform(new_data)

# Predict Lyme disease counts for 2042
predicted_counts_2042 = dt_model.predict(new_data_scaled)
print("Predicted total Lyme disease counts in 2042:", predicted_counts_2042)
