# Regression on biodiversity index

We are going to test different models on our dataset, trying to get a better results using a grid search and the testing the models on a dataset of a different region.

For each model we're going to use a RandomizedSearchCV to narrow our parameters research and the the GridSearchCV to find the best one.

In [29]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor

In [41]:
regression_label = 'habitat_richness'
folder = "../Dataset"

regions = ['france', 'finland']

detect = 'custom_set'
handle = 'mean'

file_france = "/france_yearavg_out_mean_handle_custom_set.csv"
file_finland =  "/finland_yearavg_out_mean_handle_custom_set.csv"

df_france = pd.read_csv(folder + file_france, index_col=['longitude', 'latitude'])
df_finland = pd.read_csv(folder + file_finland, index_col=['longitude', 'latitude'])

df_france = df_france.dropna(axis=1)
df_finland = df_finland.dropna(axis=1)

dfs = [df_france, df_finland]

ys = [df[regression_label].values.reshape(-1,1) for df in dfs]
Xs = [df.drop(columns=[regression_label]).values for df in dfs]

cv = KFold(n_splits=10, shuffle=True)

In [42]:
data = {}

X_all = np.vstack(Xs)
y_all = np.vstack(ys)

for X, y, region in zip(Xs, ys, regions):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    data[region] = {'X': X, 'y': y, 'X_train': X_train, 'y_train': y_train, 'X_test': X_test, 'y_test': y_test}

X_train_all = np.vstack([data[region]['X_train'] for region in data])    
y_train_all = np.vstack([data[region]['y_train'] for region in data])     
X_test_all = np.vstack([data[region]['X_test'] for region in data])    
y_test_all = np.vstack([data[region]['y_test'] for region in data])    

# Normalize the data
X_scaler = preprocessing.MinMaxScaler()
y_scaler = preprocessing.MinMaxScaler()

# Test values have to be normalized with the training mean and std
X_scaler.fit(X_train_all)
y_scaler.fit(y_train_all.reshape(-1, 1))

for region in data:
    data[region]['X'] = X_scaler.transform(data[region]['X'])
    data[region]['X_train'] = X_scaler.transform(data[region]['X_train'])
    data[region]['X_test'] = X_scaler.transform(data[region]['X_test'])
    data[region]['y'] = y_scaler.transform(data[region]['y'])
    data[region]['y_train'] = y_scaler.transform(data[region]['y_train'])
    data[region]['y_test'] = y_scaler.transform(data[region]['y_test'])


X_train_all = X_scaler.transform(X_train_all)
X_test_all = X_scaler.transform(X_test_all)

y_train_all = y_scaler.transform(y_train_all.reshape(-1, 1)).ravel()
y_test_all = y_scaler.transform(y_test_all.reshape(-1, 1)).ravel()
y = y_scaler.transform(y.reshape(-1, 1)).ravel()

In [43]:
df_finland.shape

(18367, 47)

In [44]:
df_france.shape

(2024, 47)

### Choose region

In [67]:
region = 'france'

X_train = data[region]['X_train']
y_train = data[region]['y_train']
X_test = data[region]['X_test']
y_test = data[region]['y_test']

## Random Forest Regressor

Parameters for RandomSearch

In [38]:
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 200, num = 4)]
max_features = [None, 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 100, num = 4)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
param_grid

{'n_estimators': [50, 100, 150, 200],
 'max_features': [None, 'sqrt'],
 'max_depth': [10, 40, 70, 100, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [41]:
rf = RandomForestRegressor()# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = cv, n_jobs = -1, verbose = 5)
grid_search.fit(X_train, y_train)

NameError: name 'random_grid' is not defined

In [56]:
best_grid = grid_search.best_estimator_
grid_search.best_params_

NameError: name 'grid_search' is not defined

In [53]:
df_france.shape

(2024, 47)

In [51]:
best_params = {'bootstrap': False,
 'max_depth': 70,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 150}

base_model = RandomForestRegressor(random_state = 42)
base_model.fit(X_train_all, y_train_all.ravel())
print("Base model score: ", base_model.score(X_test_all, y_test_all.ravel()))
best_grid = RandomForestRegressor(**best_params)
best_grid.fit(X_train, y_train.ravel())
print("Best grid search: ", best_grid.score(X_test, y_test.ravel()))


Base model score:  0.8351895228927788
Best grid search:  0.7341661933117509


### Using finland as train and france as test

In [None]:
rfr = RandomForestRegressor(random_state = 42)
rfr.fit(X_train_all, y_train_all.ravel())
print("score: ", rfr.score(X_test_all, y_test_all.ravel()))

## Neural network

In [55]:
hidden_layer_sizes=[(100, 50), (100, 100, 50), (50, 100, 50), (200, 100, 50)]
activation=['logistic', 'tanh', 'relu']
solver=['sgd', 'adam']

param_grid = {'hidden_layer_sizes': hidden_layer_sizes,
               'activation': activation,
             'solver': solver}

nn = MLPRegressor(random_state=42)
grid_search = GridSearchCV(estimator = nn, param_grid = param_grid, 
                          cv = cv, n_jobs = -1, verbose = 5)
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    9.1s


KeyboardInterrupt: 

In [56]:
def nn_regressor(Ni, No, Ns, alpha=5, max_hidden=10, params={'random_state': 42}):
    '''
    Create a nn with
    
    Nh = Ns / (alpha * (Ni + No))
    
    layers
    
    Parameters
    ----------
    
    Nh : number of hidden layers
    Ns : size of our training data
    Ni : input features
    No : output features
    
    Return
    ------
    nn : MLPRegressor
    
    '''
    
    Nh = int(Ns / (alpha * (Ni + No))) 
    Nh = max_hidden if Nh > max_hidden else Nh
    
    
    hidden_layer_sizes = [ int((Ni*3)/i) for i in range(1, Nh+1)  ]
    print(hidden_layer_sizes)
    return MLPRegressor(hidden_layer_sizes=hidden_layer_sizes, **params)
    
    print(Nh)

X_train.shape

(14693, 46)

In [68]:
params={'solver': 'adam', 'random_state': 42, 'alpha': 0.001}

nn = nn_regressor(38, 1, X_train.shape[0], alpha=10, max_hidden=5, params=params)

[114, 57, 38, 28]


In [69]:
nn.fit(X_train, y_train.ravel())
print("Base model score: ", nn.score(X_test, y_test.ravel()))


Base model score:  0.327966089571858


In [72]:
X_train.shape

(1619, 46)

In [73]:
regr = MLPRegressor(hidden_layer_sizes=(100, 50), random_state=42).fit(X_train, y_train)
print("Base model score: ", regr.score(X_test, y_test))

#best = grid_search.best_estimator_
#best.fit(X_train, y_train)
#print("Best grid search: ", best.score(X_test, y_test))

  return f(**kwargs)


Base model score:  0.27536362267723447


In [17]:
grid_search.best_params_

{'activation': 'relu', 'hidden_layer_sizes': (100, 100, 50), 'solver': 'adam'}

In [49]:
regr = LinearRegression().fit(X_train, y_train)
print("Base model score: ", regr.score(X_test, y_test))


Base model score:  0.5798518318833471
