# Regression on biodiversity index

We are going to test different models on our dataset, trying to get a better results using a grid search and the testing the models on a dataset of a different region.

For each model we're going to use a RandomizedSearchCV to narrow our parameters research and the the GridSearchCV to find the best one.

In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor

In [2]:
swi_labels = ['SWI1km-SWI-002', 'SWI1km-SWI-100',
       'SWI1km-SWI-040', 'SWI1km-SWI-005', 'SWI1km-SWI-010', 'SWI1km-SWI-060',
       'SWI1km-SWI-015', 'SWI1km-SWI-020']



regression_label = 'habitat_richness'
folder = "../Dataset"
file = "/france.csv"

df = pd.read_csv(folder + file, index_col=['longitude', 'latitude'])
#df = df[df[regression_label] > 0]
y = df[regression_label].values
X = df.drop(columns=[regression_label]).values #returns a numpy array



cv = KFold(n_splits=10, shuffle=True)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Normalize the data
X_scaler = preprocessing.MinMaxScaler()
y_scaler = preprocessing.MinMaxScaler()


# Test values have to be normalized with the training mean and std
y_scaler.fit(y_train.reshape(-1, 1))
y_train = y_scaler.transform(y_train.reshape(-1, 1)).ravel()
y_test = y_scaler.transform(y_test.reshape(-1, 1)).ravel()
y = y_scaler.transform(y.reshape(-1, 1)).ravel()
X_scaler.fit(X_train)
X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)
X = X_scaler.transform(X)


## Random Forest Regressor

Parameters for RandomSearch

In [4]:
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 200, num = 4)]
max_features = [None, 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 100, num = 4)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
param_grid

{'n_estimators': [50, 100, 150, 200],
 'max_features': [None, 'sqrt'],
 'max_depth': [10, 40, 70, 100, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [41]:
rf = RandomForestRegressor()# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = cv, n_jobs = -1, verbose = 5)
grid_search.fit(X_train, y_train)

NameError: name 'random_grid' is not defined

In [None]:
best_grid = grid_search.best_estimator_
grid_search.best_params_

In [5]:
best_params = {'bootstrap': False,
 'max_depth': 70,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 150}

base_model = RandomForestRegressor(random_state = 42)
base_model.fit(X_train, y_train)
print("Base model score: ", base_model.score(X_test, y_test))
best_grid = RandomForestRegressor(**best_params)
best_grid.fit(X_train, y_train)
print("Best grid search: ", best_grid.score(X_test, y_test))


Base model score:  0.7652010748725563
Best grid search:  0.7627767421836147


## Neural network

In [11]:
hidden_layer_sizes=[(100, 50), (100, 100, 50), (50, 100, 50), (200, 100, 50)]
activation=['logistic', 'tanh', 'relu']
solver=['sgd', 'adam']

param_grid = {'hidden_layer_sizes': hidden_layer_sizes,
               'activation': activation,
             'solver': solver}

nn = MLPRegressor(random_state=42)
grid_search = GridSearchCV(estimator = nn, param_grid = param_grid, 
                          cv = cv, n_jobs = -1, verbose = 5)
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  2.3min finished


GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=True),
             estimator=MLPRegressor(random_state=42), n_jobs=-1,
             param_grid={'activation': ['logistic', 'tanh', 'relu'],
                         'hidden_layer_sizes': [(100, 50), (100, 100, 50),
                                                (50, 100, 50), (200, 100, 50)],
                         'solver': ['sgd', 'adam']},
             verbose=5)

In [12]:
regr = MLPRegressor(hidden_layer_sizes=(100, 50), random_state=42).fit(X_train, y_train)
print("Base model score: ", regr.score(X_test, y_test))

best = grid_search.best_estimator_
best.fit(X_train, y_train)
print("Best grid search: ", best.score(X_test, y_test))


Base model score:  0.665287285092033
Best grid search:  0.5922529917572428


In [13]:
grid_search.best_params_

{'activation': 'relu', 'hidden_layer_sizes': (200, 100, 50), 'solver': 'adam'}