# Regression on biodiversity index

We are going to test different models on our dataset, trying to get a better results using a grid search and the testing the models on a dataset of a different region.

For each model we're going to use a RandomizedSearchCV to narrow our parameters research and the the GridSearchCV to find the best one.

In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 
from sklearn.tree import DecisionTreeRegressor

In [13]:
regression_label = 'habitat_richness'

df = pd.read_csv("merged_dataset.csv", index_col=['longitude', 'latitude'])

y = df[regression_label].values
X = df.drop(columns=[regression_label]).values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
X = min_max_scaler.fit_transform(X)


cv = KFold(n_splits=3, shuffle=True)

## PCA

In [20]:
pca = PCA()
pca.fit(X)
print("Cumulative levariance with 15 PC: ", pca.explained_variance_ratio_.cumsum()[6])
X_pca = pca.transform(X)[:, 0:6]

Cumulative levariance with 15 PC:  0.8916770812430184


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Random Forest Regressor

Parameters for RandomSearch

In [10]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = [None, 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 100, num = 5)]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
random_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_features': [None, 'sqrt'],
 'max_depth': [10, 32, 55, 77, 100],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [11]:
# Use the random grid to search for best hyperparameters

rf = RandomForestRegressor()
# Random search of parameters, using 5 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = cv, verbose=10, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   35.2s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  7

RandomizedSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=True),
                   estimator=RandomForestRegressor(), n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 32, 55, 77, 100],
                                        'max_features': [None, 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=10)

In [24]:
base_model = RandomForestRegressor(random_state = 42)
base_model.fit(X_train, y_train)
print("Base model score: ", base_model.score(X_test, y_test))
best_random = rf_random.best_estimator_
print("Best grid search: ", best_random.score(X_test, y_test))
rf_random.best_params_

Base model score:  0.5001795536621727


ValueError: Number of features of the model must match the input. Model n_features is 6 and input n_features is 46 

## Random Forest

In [None]:
linear_model = LinearRegression(fit_intercept = True, normalize = True, copy_X = True, n_jobs = 2).fit(X_train, y_train)
rand_forest_model = RandomForestRegressor(n_estimators = 100, random_state = 0).fit(X_train, y_train)

print("Linear Regression score: " , linear_model.score(X_test,y_test))
print("Random forest score: ", rand_forest_model.score(X_test, y_test))

print("Linear Regression cross validation: ", np.mean(cross_val_score(linear_model, X, y, cv=cv)))
print("Random forest cross validation: ", np.mean(cross_val_score(rand_forest_model, X, y, cv=cv)))

In [None]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

rf = RandomForestRegressor()# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

## Linear Regressor