## Supervised Learning Methods & Models 

In this section, I will be using various supervised learning models to predict what the counts of Lymes disease will be in each region in the year 2050 under the RCP 8.5 Climate Scenario. I will compare the accuracy of the trained models to determine the optimal model, and perform my final prediction with that model. 

In [1]:
import pandas as pd
df = pd.read_csv('../data/clean_data/state_data.csv')

# Pivot the data to make years columns for easier calculations
pivot_df = df.pivot(index='State', columns='Year', values='Tree_Cover_Loss')

# Calculate percent change from 2008 to 2012
pivot_df['percent_change'] = ((pivot_df[2022] - pivot_df[2008]) / pivot_df[2008]) * 100

# Reset the index to get a clean DataFrame
result_df = pivot_df[['percent_change']].reset_index()

# Display the result
print(result_df)

Year                 State  percent_change
0                  Alabama      -45.606954
1                   Alaska      547.065173
2                  Arizona       18.955399
3                 Arkansas      -39.249564
4               California      -20.236917
5                 Colorado      -80.585406
6              Connecticut       28.540066
7                 Delaware       20.553360
8     District of Columbia     -100.000000
9                  Florida      -55.580284
10                 Georgia      -33.691634
11                  Hawaii      -39.123103
12                   Idaho      -32.367426
13                Illinois       85.040650
14                 Indiana       74.310181
15                    Iowa      133.971292
16                  Kansas      -50.869565
17                Kentucky      -43.822359
18               Louisiana      -49.443878
19                   Maine       -8.732969
20                Maryland        7.752577
21           Massachusetts       20.172786
22         

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/clean_data/state_data.csv')

# Define a function to calculate the slope for each state
def calculate_slope(sub_df):
    x = sub_df['Year']
    y = sub_df['Tree_Cover_Loss']
    slope, _ = np.polyfit(x, y, 1)  # Linear regression: degree=1
    return slope

# Group by state and calculate the slope
slopes = df.groupby('State').apply(calculate_slope).reset_index()
slopes.columns = ['State', 'slope']

# Display the result
print(slopes)


                   State        slope
0                Alabama -3928.214286
1                 Alaska -1159.689286
2                Arizona  -357.400000
3               Arkansas -2809.800000
4             California  9426.257143
5               Colorado   774.757143
6            Connecticut    49.775000
7               Delaware     8.328571
8   District of Columbia    -0.278571
9                Florida -2553.421429
10               Georgia -3667.125000
11                Hawaii    21.453571
12                 Idaho  -331.335714
13              Illinois    29.692857
14               Indiana    61.657143
15                  Iowa    40.632143
16                Kansas    -2.478571
17              Kentucky  -347.485714
18             Louisiana -3480.146429
19                 Maine   881.085714
20              Maryland    29.514286
21         Massachusetts   130.932143
22              Michigan   660.271429
23             Minnesota   111.739286
24           Mississippi -3001.267857
25          

  slopes = df.groupby('State').apply(calculate_slope).reset_index()


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
df = pd.read_csv('../data/clean_data/state_data.csv')

In [None]:

# Data preparation
def prepare_modeling_data(df, changes_df):
    # Future prediction dataset
    future_data = pd.DataFrame()
    
    # Add your RCP 8.5 climate projections for 2042
    # future_data['Precipitation_avg'] = df[df['Year'] == 2022]['Precipitation_avg']
    # future_data['Min_temp_avg'] = df[df['Year'] == 2022]['Min_temp_avg'] + 2  # Example increment
    # future_data['Avg_temp'] = df[df['Year'] == 2022]['Avg_temp'] + 2  # Example increment
    
    # Specify Total Land Area, as a constant feature
    future_data['Total_Land_Area'] = df[df['Year'] == 2022]['Total_Land_Area']
    
    # Add the calculated percent changes
    future_data = future_data.join(changes_df[['tree_cover_change_pct', 'species_richness_change_pct']])
    
    return future_data

# Train different models and evaluate them
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(),
        'Lasso Regression': Lasso(),
        'Random Forest': RandomForestRegressor(n_estimators=100),
        'SVR': SVR(kernel='rbf')
    }
    
    results = {}
    for name, model in models.items():
        # Train model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        results[name] = {
            'MSE': mse,
            'R2': r2,
            'Model': model
        }
    
    return results

# Main execution
# Load your data
df = pd.read_csv('../data/clean_data/state_data.csv')

# Print initial info about missing values
print("Missing values before imputation:")
print(df.isnull().sum())

# Calculate percent changes
changes_df = calculate_percent_changes(df)

# Prepare features
features = ['Precipitation_avg', 'Min_temp_avg', 'Avg_temp', 'Total_Land_Area', 
           'Tree_Cover_Loss', 'species_richness']
X = df[features].values
y = df['Lyme_cases'].values

# Handle missing values using imputation
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train and evaluate models
results = train_and_evaluate_models(X_train_scaled, X_test_scaled, y_train, y_test)

# Print results
print("\nModel Performance Comparison:")
for name, metrics in results.items():
    print(f"\n{name}:")
    print(f"MSE: {metrics['MSE']:.2f}")
    print(f"R2 Score: {metrics['R2']:.2f}")

# Prepare future prediction data
future_data = prepare_modeling_data(df, changes_df)

# Impute any missing values in future data
future_data = pd.DataFrame(imputer.transform(future_data), columns=future_data.columns)

# Make predictions for 2042
print("\nPredictions for 2042:")
for name, metrics in results.items():
    future_pred = metrics['Model'].predict(scaler.transform(future_data))
    print(f"\n{name} prediction:")
    print(f"Predicted Lyme cases: {future_pred.mean():.0f}")

Missing values before imputation:
Year                  0
Precipitation_avg     0
region                0
Min_temp_avg          0
Avg_temp              0
State                 0
Lyme_cases            0
Total_Land_Area       0
Tree_Cover_Loss       0
species_richness     45
dtype: int64

Model Performance Comparison:

Linear Regression:
MSE: 1452510.92
R2 Score: 0.30

Ridge Regression:
MSE: 1424841.94
R2 Score: 0.32

Lasso Regression:
MSE: 1442448.05
R2 Score: 0.31

Random Forest:
MSE: 794366.63
R2 Score: 0.62

SVR:
MSE: 2559671.87
R2 Score: -0.23

Predictions for 2042:

Linear Regression prediction:
Predicted Lyme cases: 510

Ridge Regression prediction:
Predicted Lyme cases: 464

Lasso Regression prediction:
Predicted Lyme cases: 498

Random Forest prediction:
Predicted Lyme cases: 1213

SVR prediction:
Predicted Lyme cases: 75


