In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import eda, ml, data_cleaning,constants,pandas as pd, numpy as np, geopandas as gpd, warnings, textwrap; warnings.filterwarnings("ignore")

In [32]:
# Baseline model: always predict the mean of the training target.
mean_target = np.mean(y_train)

# Build a constant prediction vector with one entry per test sample.
# dtype=float is forced on purpose: np.full_like otherwise inherits y_test's
# dtype, so an integer-valued target would silently truncate the mean
# (e.g. 3.7 -> 3) and distort the baseline error.
y_baseline = np.full_like(y_test, fill_value=mean_target, dtype=float)

# Mean squared error of the constant-mean baseline on the test split
mse_baseline = mean_squared_error(y_test, y_baseline)
print("Mean Squared Error (Baseline - Mean Prediction):", mse_baseline)

Mean Squared Error (Baseline - Mean Prediction): 6.226041012195832


In [33]:
from sklearn.tree import DecisionTreeRegressor

# Build the decision tree regressor and fit it on the training split in one
# step; fit() hands back the fitted estimator, so the result can be bound directly.
tree_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Score the fitted tree on the held-out test split
y_pred = tree_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (Decision Trees):", mse)

Mean Squared Error (Decision Trees): 13.741316951922567


# Decision Tree Regressor vs Baseline

1. Decision Tree Regressor's MSE of 13.741 indicates substantial prediction errors.
2. Baseline MSE of 6.226 shows the model's performance is worse than a simple mean predictor.
3. Possible overfitting is suggested by the Decision Tree's MSE (13.741) being higher than the baseline's (6.226); the model might need refinement.

In [35]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report

# Fit a 100-tree random forest *regressor* on the training split.
# NOTE(review): despite its name, `rf_classifier` holds a regressor, and
# `classification_report` is not used in this cell; both are kept unchanged
# in case other cells reference them.
rf_classifier = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the forest on the held-out test split
y_pred = rf_classifier.predict(X_test)
mse_random = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (Random Forest):", mse_random)

Mean Squared Error (Random Forest): 4.296747682439488


In [37]:
# Read the tab-separated dataset, then keep only rows whose observation year
# lies in 2014..2023 (both ends included).
ebird_gdf = pd.read_csv('data/final_dataset.tsv', sep='\t')
ebird_gdf = ebird_gdf[ebird_gdf["OBSERVATION YEAR"].between(2014, 2023)]

### 3.1 Aggregate data by month and community

In [38]:
# Grouping keys: the species-level pass groups by month/year/community/species,
# the community-level pass drops the species column.
species_keys = ["OBSERVATION MONTH", "OBSERVATION YEAR", "community", "COMMON NAME"]
community_keys = species_keys[:-1]

# Pass 1: aggregate per species, then collapse each group's COUNT entry to its sum.
# (presumably eda.aggregate_data gathers the COUNT values of a group into an
# iterable — confirm in the eda module)
final_df = eda.aggregate_data(ebird_gdf, species_keys)
final_df["COUNT"] = final_df["COUNT"].apply(lambda counts: sum(counts))

# Pass 2: aggregate per month/year/community and derive the Shannon index
# from each group's COUNT entry.
final_df = eda.aggregate_data(final_df, community_keys)
final_df["shannon_index"] = final_df["COUNT"].apply(eda.shannon_index)

In [39]:
# Tally how many aggregated rows each community has
community_counts = final_df['community'].value_counts()

# Communities that occur exactly once are dropped from the modelling frame
communities_to_remove = community_counts.loc[community_counts.eq(1)].index.tolist()
final_df = final_df.loc[~final_df['community'].isin(communities_to_remove)]

### 3.2 Set features and output for ML analysis

In [40]:
# Model inputs (month, year, community) and the output column to predict
target = 'shannon_index'
features = ['OBSERVATION MONTH', 'OBSERVATION YEAR', 'community']
X, y = final_df[features], final_df[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Empty results table; run_model appends one row of metrics per algorithm
ml_results = pd.DataFrame(columns=["Algorithm", "MSE", "RMSE", "MAE", "R^2"])

### 3.3 Run ML models and Store the Metrics

In [41]:
def run_model(str):
    """Train one algorithm and append its test-set metrics to ``ml_results``.

    Args:
        str: Algorithm key handed to ``ml.MLFactory.get_instance``; it is also
            written as the first value of the new ``ml_results`` row. The name
            shadows the built-in ``str`` and is kept only so existing callers
            continue to work.

    Returns:
        None. The metrics row is appended to the notebook-level ``ml_results``.

    Relies on the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` produced by the train/test split.
    """
    estimator = ml.MLFactory.get_instance(str)
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    # `calulate_metrics` is spelled exactly as the ml module exposes it
    mse, rmse, mae, r2 = estimator.calulate_metrics(y_test, predictions)
    # Append at the next positional label (current row count)
    next_row = len(ml_results.index)
    ml_results.loc[next_row] = [str, mse, rmse, mae, r2]

In [42]:
# Baseline: uses the mean as the prediction value; run_model appends its
# metrics to ml_results.
run_model("Baseline")

In [43]:
# Decision tree regressor; run_model appends its metrics to ml_results.
run_model("DecisionTree")

In [44]:
# Random forest regressor; run_model appends its metrics to ml_results.
run_model("RandomForest")

In [45]:
# Support vector regressor with a linear kernel
# NOTE(review): the kernel is configured inside the ml module, not in this
# notebook — confirm there.
run_model("SVR")

In [46]:
# Gradient boosting regressor; run_model appends its metrics to ml_results.
run_model("GradientBoosting")

In [47]:
# MLPRegressor (neural network) with hidden_layer_sizes=(100, 50) and
# activation='relu'
# NOTE(review): these hyperparameters live inside the ml module and are not
# visible in this notebook — confirm there.
run_model('NeuralNetwork')

In [48]:
# Display the metrics table filled in by the run_model calls
ml_results

Unnamed: 0,Algorithm,MSE,RMSE,MAE,R^2
0,Baseline,0.717444,0.84702,0.680178,-0.001866
1,DecisionTree,0.533931,0.730706,0.498759,0.254399
2,RandomForest,0.343841,0.58638,0.408073,0.519847
3,SVR,0.720204,0.848648,0.673786,-0.00572
4,GradientBoosting,0.398172,0.631009,0.479947,0.443977
5,NeuralNetwork,0.438946,0.66253,0.533919,0.387039


# Random forest regressor vs baseline

1. Random Forest leverages ensemble learning, combining multiple decision trees for improved accuracy.
2. It captures non-linear relationships in data, unlike the Baseline, which predicts a constant mean.
3. Random Forest automatically assesses feature importance, focusing on relevant predictors, reducing prediction errors.

In [27]:
# features = ['PERCENT OF HOUSING CROWDED', 
#             'PERCENT HOUSEHOLDS BELOW POVERTY', 
#             'PERCENT AGED 16+ UNEMPLOYED', 
#             'PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA', 
#             'PERCENT AGED UNDER 18 OR OVER 64',
#             'HARDSHIP INDEX',
#             'PER CAPITA INCOME']
#Mean Squared Error (Decision Trees): 14.384432122771436
#Mean Squared Error (KNN): 4.139463044176787
#Mean Squared Error (Linear Regression): 5.303329023877021
#Mean Squared Error (Random Forest): 4.368482085574034

In [28]:
# features = ['PERCENT OF HOUSING CROWDED', 
#             'PERCENT HOUSEHOLDS BELOW POVERTY', 
#             'PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA', 
#             'PERCENT AGED UNDER 18 OR OVER 64',
#             'HARDSHIP INDEX',
#             'PER CAPITA INCOME']
#Mean Squared Error (Decision Trees): 12.213353087465187
#Mean Squared Error (KNN): 4.117839844458724
#Mean Squared Error (Linear Regression): 4.763298189159179
#Mean Squared Error (Random Forest): 4.2496426590700525

In [29]:
# features = ['PERCENT OF HOUSING CROWDED', 
#             'PERCENT HOUSEHOLDS BELOW POVERTY', 
#             'PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA', 
#             'HARDSHIP INDEX',
#             'PER CAPITA INCOME']
#Mean Squared Error (Decision Trees): 7.32619548931322
#Mean Squared Error (KNN): 4.9332521262880284
#Mean Squared Error (Linear Regression): 4.4638652250276145
#Mean Squared Error (Random Forest): 4.830146615875486

In [30]:
# features = ['PERCENT OF HOUSING CROWDED', 
#             'PERCENT HOUSEHOLDS BELOW POVERTY', 
#             'PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA', 
#             'HARDSHIP INDEX',
#             'PER CAPITA INCOME']
#Mean Squared Error (Decision Trees): 13.741316951922567
#Mean Squared Error (KNN): 4.415633525599799
#Mean Squared Error (Linear Regression): 4.346668043530402
#Mean Squared Error (Random Forest): 4.290840329025155