In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# Read the modelling table from a CSV file in the working directory and
# leave the frame as the cell's last expression so it is displayed.
csv_path = 'MLdf.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,date,siteid,nitrate_mean,spec_conductance,DO,pH,chlorophyll,turbidity,fDOM,mean_temp,...,ln_nitrate_mean,ln_spec_conductance,ln_DO,ln_chlorophyll,ln_turbidity,ln_fDOM,ln_CH4_conc,ln_CO2_conc,ln_N2O_conc,ln_Microbialabundanceper_ml
0,2018-01-01,ARIK,13.700000,539.059444,10.186389,7.822500,2.095000,2.899444,45.220000,2.302477,...,2.617396,6.289826,2.321052,0.739554,1.064519,3.811539,3.250097,7.391244,-0.710933,10.871886
1,2018-01-02,ARIK,13.700000,539.059444,10.186389,7.822500,2.095000,2.899444,45.220000,2.302477,...,2.617396,6.289826,2.321052,0.739554,1.064519,3.811539,3.250097,7.391244,-0.710933,10.871886
2,2018-01-03,ARIK,13.700000,539.059444,10.186389,7.822500,2.095000,2.899444,45.220000,2.302477,...,2.617396,6.289826,2.321052,0.739554,1.064519,3.811539,3.250097,7.391244,-0.710933,10.871886
3,2018-01-04,ARIK,12.535417,546.480042,9.921785,7.812403,3.249465,60.369840,45.220000,2.302477,...,2.528558,6.303498,2.294733,1.178490,4.100490,3.811539,3.250097,7.391244,-0.710933,10.871886
4,2018-01-05,ARIK,10.310417,550.830208,9.721729,7.826743,5.565236,2.548007,45.220000,2.302477,...,2.333155,6.311427,2.274364,1.716539,0.935311,3.811539,3.250097,7.391244,-0.710933,10.871886
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23441,2020-12-27,WLOU,0.001000,298.291892,11.899740,8.146028,0.220042,0.959097,16.215236,17.389790,...,-6.907755,5.698073,2.476517,-1.513938,-0.041763,2.785951,1.032457,6.691841,-0.320212,10.901399
23442,2020-12-28,WLOU,0.041758,12.105146,8.384253,6.651626,0.900684,2.639976,31.655028,10.173155,...,-3.175858,2.493631,2.126355,-0.104601,0.970770,3.454897,1.032457,6.691841,-0.320212,10.901399
23443,2020-12-29,WLOU,4.188542,613.853512,10.900608,7.869108,9.817500,0.600420,4.082917,4.325872,...,1.432353,6.419756,2.388819,2.284167,-0.510126,1.406812,1.032457,6.691841,-0.320212,10.901399
23444,2020-12-30,WLOU,54.432292,176.735399,14.561951,7.948806,1.642281,2.160620,142.774583,3.839380,...,3.996958,5.174654,2.678412,0.496086,0.770395,4.961267,1.032457,6.691841,-0.320212,10.901399


In [3]:
# Build the predictor matrix (X*) and target vector (y*) for each of the three
# model variants. Columns are grouped once so each variant only states which
# groups it excludes.
id_cols = ['date', 'siteid']
target_cols = ['Microbialabundanceper_ml', 'ln_Microbialabundanceper_ml']
level_predictor_cols = ['nitrate_mean', 'spec_conductance', 'DO', 'chlorophyll',
                        'turbidity', 'fDOM', 'CH4_conc', 'CO2_conc', 'N2O_conc']
ln_predictor_cols = ['ln_nitrate_mean', 'ln_spec_conductance', 'ln_DO', 'ln_chlorophyll',
                     'ln_turbidity', 'ln_fDOM', 'ln_CH4_conc', 'ln_CO2_conc', 'ln_N2O_conc']

# Level-level: untransformed predictors, untransformed target.
X = df.drop(columns=id_cols + target_cols + ln_predictor_cols)
y = df['Microbialabundanceper_ml']

# Log-level: the same untransformed predictors, ln-transformed target.
X2 = df.drop(columns=id_cols + target_cols + ln_predictor_cols)
y2 = df['ln_Microbialabundanceper_ml']

# Log-log: ln-transformed predictors, ln-transformed target. Columns that are
# in neither predictor list (e.g. pH, mean_temp) are kept as they are.
X3 = df.drop(columns=id_cols + target_cols + level_predictor_cols)
y3 = df['ln_Microbialabundanceper_ml']

In [4]:
# 80:20 train/test split for each model variant. Every call uses the same
# test fraction and the same seed.
# NOTE(review): rows are shuffled at random. The preview of df suggests that
# consecutive dates at a site can carry identical values, so near-duplicate
# rows may end up on both sides of the split -- confirm this is acceptable.
split_opts = {'test_size': 0.2, 'random_state': 42}

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_opts)

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, **split_opts)

X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, **split_opts)

In [5]:
# Sanity check: print the shape of every train/test piece, one per line, in
# the order X_train, X_test, y_train, y_test for each of the three variants.
split_pieces = (
    X_train, X_test, y_train, y_test,
    X_train2, X_test2, y_train2, y_test2,
    X_train3, X_test3, y_train3, y_test3,
)
for piece in split_pieces:
    print(piece.shape)

(18756, 11)
(4690, 11)
(18756,)
(4690,)
(18756, 11)
(4690, 11)
(18756,)
(4690,)
(18756, 11)
(4690, 11)
(18756,)
(4690,)


## First, build the level-level RF model (untransformed predictors, untransformed target)

In [6]:
# Candidate hyper-parameters. Only max_features takes more than one value, so
# the search compares three forests; the other entries are pinned (including
# the seed, which is set through the grid rather than the constructor).
param_grid = dict(
    n_estimators=[100],
    criterion=['squared_error'],
    max_features=[0.1, 0.5, 1.0],
    random_state=[42],
)

# Base estimator; its hyper-parameters are filled in by the search.
mod1 = RandomForestRegressor()

# 5-fold cross-validated grid search scored by R^2, using all available cores.
grid_search = GridSearchCV(estimator=mod1, param_grid=param_grid,
                           scoring='r2', cv=5, n_jobs=-1)

# Run the search on the level-level training data.
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
winning_params = grid_search.best_params_
winning_score = grid_search.best_score_
print("Best parameters: {}".format(winning_params))
print("Best score: {:.2f}".format(winning_score))


Best parameters: {'criterion': 'squared_error', 'max_features': 1.0, 'n_estimators': 100, 'random_state': 42}
Best score: 0.84


In [7]:
# Keep the winning hyper-parameters available for inspection.
best_params = grid_search.best_params_

# GridSearchCV refits by default (refit=True): after the search it trains a
# forest with best_params on all of X_train and exposes it as best_estimator_.
# Reuse that model instead of training a second, identical one -- the seed is
# fixed in the grid, so a fresh fit would reproduce the same forest.
# If the search is ever built with refit=False, best_estimator_ will not exist
# and the model must be fitted explicitly again.
regr = grid_search.best_estimator_

# Predict the held-out test rows and show the predictions.
y_pred = regr.predict(X_test)
print(y_pred)

[ 527303.17037037 1603294.65555556  137389.41111111 ... 3392541.42222222
  282566.62666667  259067.07777778]


In [8]:
# Test-set metrics for the level-level model (same units as the raw target).

# Mean squared error of the predictions.
mse = mean_squared_error(y_test, y_pred)

# Root mean squared error. Taken as sqrt(MSE) rather than via
# mean_squared_error(..., squared=False): that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
rmse = np.sqrt(mse)

# Mean absolute error of the predictions.
mae = mean_absolute_error(y_test, y_pred)

# Coefficient of determination (R-squared) of the predictions.
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error", mae)
print("R-squared:", r2)

Mean Squared Error: 7718404809034.865
Root Mean Squared Error: 2778201.722163973
Mean Absolute Error 718830.1908181729
R-squared: 0.845858465175842


## Build the log-level RF model (untransformed predictors, ln-transformed target)

In [9]:
# Candidate hyper-parameters for the log-level model. Only max_features takes
# more than one value; the remaining entries (including the seed) are pinned.
param_grid2 = dict(
    n_estimators=[100],
    criterion=['squared_error'],
    max_features=[0.1, 0.5, 1.0],
    random_state=[42],
)

# Base estimator; its hyper-parameters are filled in by the search.
mod2 = RandomForestRegressor()

# 5-fold cross-validated grid search scored by R^2, using all available cores.
grid_search2 = GridSearchCV(estimator=mod2, param_grid=param_grid2,
                            scoring='r2', cv=5, n_jobs=-1)

# Run the search on the log-level training data.
grid_search2.fit(X_train2, y_train2)

# Report the winning combination and its mean cross-validated score.
winning_params2 = grid_search2.best_params_
winning_score2 = grid_search2.best_score_
print("Best parameters: {}".format(winning_params2))
print("Best score: {:.2f}".format(winning_score2))


Best parameters: {'criterion': 'squared_error', 'max_features': 1.0, 'n_estimators': 100, 'random_state': 42}
Best score: 0.84


In [10]:
# Keep the winning hyper-parameters available for inspection.
best_params2 = grid_search2.best_params_

# GridSearchCV refits by default (refit=True): after the search it trains a
# forest with best_params2 on all of X_train2 and exposes it as
# best_estimator_. Reuse that model instead of training a second, identical
# one -- the seed is fixed in the grid, so a fresh fit would reproduce it.
# If the search is ever built with refit=False, best_estimator_ will not exist
# and the model must be fitted explicitly again.
regr2 = grid_search2.best_estimator_

# Predict the held-out test rows and show the predictions.
y_pred2 = regr2.predict(X_test2)
print(y_pred2)

[12.9069008  14.18892249 11.82365703 ... 14.94172306 12.47359275
 10.78714783]


In [11]:
# Test-set metrics for the log-level model. The target here is the ln_ column,
# so the error magnitudes are on that transformed scale and are not directly
# comparable with the level-level MSE/RMSE/MAE.

# Mean squared error of the predictions.
mse2 = mean_squared_error(y_test2, y_pred2)

# Root mean squared error. Taken as sqrt(MSE) rather than via
# mean_squared_error(..., squared=False): that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
rmse2 = np.sqrt(mse2)

# Mean absolute error of the predictions.
mae2 = mean_absolute_error(y_test2, y_pred2)

# Coefficient of determination (R-squared) of the predictions.
r2_2 = r2_score(y_test2, y_pred2)

print("Mean Squared Error:", mse2)
print("Root Mean Squared Error:", rmse2)
print("Mean Absolute Error", mae2)
print("R-squared:", r2_2)

Mean Squared Error: 0.4265277817168195
Root Mean Squared Error: 0.6530909444455799
Mean Absolute Error 0.3203143053050944
R-squared: 0.8557681227867151


## Build the log-log RF model (ln-transformed predictors plus pH and mean_temp, ln-transformed target)

In [12]:
# Candidate hyper-parameters for the log-log model. Only max_features takes
# more than one value; the remaining entries (including the seed) are pinned.
param_grid3 = dict(
    n_estimators=[100],
    criterion=['squared_error'],
    max_features=[0.1, 0.5, 1.0],
    random_state=[42],
)

# Base estimator; its hyper-parameters are filled in by the search.
mod3 = RandomForestRegressor()

# 5-fold cross-validated grid search scored by R^2, using all available cores.
grid_search3 = GridSearchCV(estimator=mod3, param_grid=param_grid3,
                            scoring='r2', cv=5, n_jobs=-1)

# Run the search on the log-log training data.
grid_search3.fit(X_train3, y_train3)

# Report the winning combination and its mean cross-validated score.
winning_params3 = grid_search3.best_params_
winning_score3 = grid_search3.best_score_
print("Best parameters: {}".format(winning_params3))
print("Best score: {:.2f}".format(winning_score3))


Best parameters: {'criterion': 'squared_error', 'max_features': 1.0, 'n_estimators': 100, 'random_state': 42}
Best score: 0.84


In [13]:
# Keep the winning hyper-parameters available for inspection.
best_params3 = grid_search3.best_params_

# GridSearchCV refits by default (refit=True): after the search it trains a
# forest with best_params3 on all of X_train3 and exposes it as
# best_estimator_. Reuse that model instead of training a second, identical
# one -- the seed is fixed in the grid, so a fresh fit would reproduce it.
# If the search is ever built with refit=False, best_estimator_ will not exist
# and the model must be fitted explicitly again.
regr3 = grid_search3.best_estimator_

# Predict the held-out test rows and show the predictions.
y_pred3 = regr3.predict(X_test3)
print(y_pred3)

[12.8960146  14.17453117 11.8236384  ... 14.89361382 12.46561497
 10.68505632]


In [14]:
# Test-set metrics for the log-log model. The target here is the ln_ column,
# so the error magnitudes are on that transformed scale and are not directly
# comparable with the level-level MSE/RMSE/MAE.

# Mean squared error of the predictions.
mse3 = mean_squared_error(y_test3, y_pred3)

# Root mean squared error. Taken as sqrt(MSE) rather than via
# mean_squared_error(..., squared=False): that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
rmse3 = np.sqrt(mse3)

# Mean absolute error of the predictions.
mae3 = mean_absolute_error(y_test3, y_pred3)

# Coefficient of determination (R-squared) of the predictions.
r2_3 = r2_score(y_test3, y_pred3)

print("Mean Squared Error:", mse3)
print("Root Mean Squared Error:", rmse3)
print("Mean Absolute Error", mae3)
print("R-squared:", r2_3)

Mean Squared Error: 0.4262503129055406
Root Mean Squared Error: 0.6528784824954339
Mean Absolute Error 0.3206183970608398
R-squared: 0.8558619498461338
