# Regression on biodiversity index

We are going to test different models on our dataset, trying to get a better results using a grid search and the testing the models on a dataset of a different region.

For each model we're going to use a RandomizedSearchCV to narrow our parameters research and the the GridSearchCV to find the best one.

In [164]:
import pandas as pd
import numpy as np
import glob
import os
import sys
import torch

from sklearn.decomposition import PCA

from sklearn import preprocessing

from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor

module_this = os.path.abspath(os.path.join(os.getcwd()))
modules = [ module_this]

for module in modules:
    if module not in sys.path:
        sys.path.append(module)

import utils as ut

## Load Datasets

In [152]:
folder = "../Dataset"
regression_label = 'habitat_richness'
test_size = 0.2

datas = []

paths = [f for f in glob.glob(folder + "/*.csv") if 'france_out_remove_handle_set_null' in f]
for path in paths:
    df = pd.read_csv(path, index_col=['longitude', 'latitude'])

    if(df.isna().any().any()):
        print(path, '\t has ', df.isna().any().sum(), ' row with null values')
    
    y = df[regression_label].values
    X = df.drop(columns=[regression_label]).values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, shuffle=True)
    datas.append({'name': path[11:-4], 'dataframe': df, 'y': y, 'X': X, 
                'X_train': X_train, 'X_test': X_test, 
            'y_train': y_train, 'y_test': y_test})

## Random Forest Regressor

In [100]:
cv = KFold(n_splits=5, shuffle=True, random_state = 42)
scaler = preprocessing.MinMaxScaler()

for data in datas:
    rfr = RandomForestRegressor(random_state = 42)
    model = make_pipeline(scaler, rfr)
    val_score = cross_val_score(model, data['X_train'], data['y_train'], cv=cv, n_jobs=-1, verbose=0)
    data['val_score'] = val_score
    print(data['path'], "\t validation score: \t", "{:.3f}".format(val_score.mean()), " +/- ", "{:.3f}".format(val_score.std()))
    model.fit(data['X_train'], data['y_train'])
    test_score = model.score(data['X_test'], data['y_test'])
    print(data['name'], "\t test score: \t\t", "{:.3f}".format(test_score))
    print('--------------------------------------------')

    

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.0s finished


../Dataset\france_out_remove_handle_set_null.csv 	 validation score: 	 0.734  +/-  0.027
../Dataset\france_out_remove_handle_set_null.csv 	 test score: 		 0.735
--------------------------------------------


Parameters for RandomSearch

In [177]:
n_estimators = [50, 100, 200, 500, 1000]
max_features = ['auto', 'sqrt']
max_depth = [10, 20, 50, 100, None]
max_depth = [50]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

grid_params = {'rfr__n_estimators': n_estimators,
               'rfr__max_features': max_features,
               'rfr__max_depth': max_depth,
               'rfr__min_samples_split': min_samples_split,
               'rfr__min_samples_leaf': min_samples_leaf,
               'rfr__bootstrap': bootstrap}
grid_params

{'rfr__n_estimators': [50, 100, 200, 500, 1000],
 'rfr__max_features': ['auto', 'sqrt'],
 'rfr__max_depth': [50, None],
 'rfr__min_samples_split': [2, 5, 10],
 'rfr__min_samples_leaf': [1, 2, 4],
 'rfr__bootstrap': [True, False]}

In [176]:

for data in datas:
    rfr = RandomForestRegressor()
    model = Pipeline([('scaler', scaler), ('rfr', rfr)])

    grid_search = GridSearchCV(estimator = model, param_grid = grid_params, 
                              cv = cv, n_jobs = -1, verbose = 0)
    grid_search.fit(data['X_train'], data['y_train'])
    print(data['name'], ' cross validation best score: \t',  "{:.3f}".format(grid_search.best_score_))
    torch.save(grid_search.best_params_, data['name'] + "__bset_params")
    data['rfr_best'] = grid_search.best_params_
    
    best_model = grid_search.best_estimator_
    best_model.fit(data['X_train'], data['y_train'])
    test_score = best_model.score(data['X_test'], data['y_test'])
    print(data['name'], "\t best model test score: \t\t", "{:.3f}".format(test_score))
    print('--------------------------------------------')
    

Fitting 2 folds for each of 8 candidates, totalling 16 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed:   17.4s finished


france_out_remove_handle_set_null  cross validation best score: 	 0.717
france_out_remove_handle_set_null 	 best model test score: 		 0.748
--------------------------------------------
