In [1]:
#Imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn

from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.model_selection import RandomizedSearchCV

from sklearn import metrics 

from data_manager import get_data


In [2]:
#Load data
ds2, ds3 = get_data()
print(ds3)
#print(len(ds3[ds3['target'] == 1]))

        TP    DP    Cl    TN      TempC   Chla  Secchi   NP_Cya_bio  target  \
0     39.2  16.2  13.0  0.61   6.494521   1.41     0.5          0.0     0.0   
1     36.8  14.8  17.5  0.45  13.700000   9.67     1.1          0.0     0.0   
2     50.1  27.4  12.1  0.55  14.500000   2.04     0.7          0.0     0.0   
4     59.6  32.6  12.0  0.65  17.700000   4.13     0.6          0.0     0.0   
5     77.3  47.9  10.5  0.62  22.500000   1.74     0.6          0.0     0.0   
...    ...   ...   ...   ...        ...    ...     ...          ...     ...   
3629  53.4  16.8   8.0  0.69  25.600000  27.50     1.1  389000000.0     0.0   
3631  83.4  33.9   8.3  0.71  23.700000  23.94     1.0  133000000.0     0.0   
3632  94.2  40.7   8.7  0.90  22.300000  50.16     1.0  443000000.0     1.0   
3634  68.8  42.6   9.6  0.74  13.400000  10.22     1.4    9460000.0     0.0   
3636  79.4  49.7   9.2  0.76   9.200000  11.81     1.3    6510000.0     0.0   

            N:P  Month  
0     34.410892      4  
1

In [3]:
#Create X and y
X = np.array(ds3.drop(['target', 'NP_Cya_bio'], axis=1))
y = np.array(ds3['target'])
y_reg = np.array(ds3['NP_Cya_bio']) #for regression

In [4]:
#Split the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify = y, random_state = 42)

In [104]:
#Random forest!! (Should I scale the data? No, not for trees or forests or PCA.)
trees = 500

model = RandomForestClassifier(n_estimators = trees, max_features = 'auto', criterion = 'gini', class_weight = None, random_state = 42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

scores = metrics.classification_report(y_test, y_pred)
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(scores)
print(confusion_matrix)

#Feature importance:
model.fit(X,y)
feature_importances = model.feature_importances_
print(feature_importances)

              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       283
         1.0       1.00      0.50      0.67         4

    accuracy                           0.99       287
   macro avg       1.00      0.75      0.83       287
weighted avg       0.99      0.99      0.99       287

[[283   0]
 [  2   2]]
[0.14112875 0.09231273 0.08191765 0.09365499 0.07206457 0.33692823
 0.05802903 0.08136422 0.04259983]


In [115]:
#Tuning hyperparams properly:
model = RandomForestClassifier(random_state = 42)

#Use CV to find best parameters: 
best_estimators = []
distros = dict(n_estimators = [10, 50, 100, 500], 
               max_features = ['sqrt', 'log2'],
               min_samples_split = [2,3,4,5,6,7],
               criterion = ['gini', 'entropy'],
               class_weight = ['balanced', None])

search = RandomizedSearchCV(model, distros, scoring='f1', refit='f1', verbose=5, cv=5, n_iter=800, n_jobs=4, pre_dispatch='2*n_jobs', random_state = 42)
search = search.fit(X_train, y_train)
best_estimators.append(search.best_estimator_)
best_estimators.append(search.best_score_)
best_estimators

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 192 candidates, totalling 960 fits


[Parallel(n_jobs=4)]: Done  20 tasks      | elapsed:    1.9s
[Parallel(n_jobs=4)]: Done 134 tasks      | elapsed:   20.6s
[Parallel(n_jobs=4)]: Done 224 tasks      | elapsed:   38.4s
[Parallel(n_jobs=4)]: Done 350 tasks      | elapsed:  1.0min
[Parallel(n_jobs=4)]: Done 512 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 710 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done 944 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done 960 out of 960 | elapsed:  3.0min finished


[RandomForestClassifier(bootstrap=True, class_weight='balanced',
                        criterion='entropy', max_depth=None, max_features='sqrt',
                        max_leaf_nodes=None, min_impurity_decrease=0.0,
                        min_impurity_split=None, min_samples_leaf=1,
                        min_samples_split=7, min_weight_fraction_leaf=0.0,
                        n_estimators=500, n_jobs=None, oob_score=False,
                        random_state=42, verbose=0, warm_start=False),
 0.483537296037296]

In [116]:
#Testing the best RF model
model = best_estimators[0]

model.fit(X_train, y_train)

#Testing
y_pred = model.predict(X_test)
recall = metrics.recall_score(y_test, y_pred)
f1 = metrics.f1_score(y_test, y_pred)
print('Recall:', recall)
print('F1', f1)
scores = metrics.classification_report(y_test, y_pred)
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(scores)
print(confusion_matrix)

#Feature importance:
model.fit(X,y)
feature_importances = model.feature_importances_
print(feature_importances)

Recall: 0.5
F1 0.6666666666666666
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       283
         1.0       1.00      0.50      0.67         4

    accuracy                           0.99       287
   macro avg       1.00      0.75      0.83       287
weighted avg       0.99      0.99      0.99       287

[[283   0]
 [  2   2]]
[0.10378074 0.05778298 0.14347255 0.12517641 0.03460575 0.20239728
 0.23739454 0.03527471 0.06011504]


In [117]:
#Just for fun, let's try ExtraTrees, too. 
model = ExtraTreesClassifier(n_estimators = 500, criterion = 'gini', class_weight = None, random_state = 42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

scores = metrics.classification_report(y_test, y_pred)
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(scores)
print(confusion_matrix)

#The same results. Boo.

#Feature importance:
model.fit(X,y)
feature_importances = model.feature_importances_
print(feature_importances)

              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       283
         1.0       1.00      0.50      0.67         4

    accuracy                           0.99       287
   macro avg       1.00      0.75      0.83       287
weighted avg       0.99      0.99      0.99       287

[[283   0]
 [  2   2]]
[0.11703532 0.08118965 0.08003595 0.11125677 0.07682259 0.29985984
 0.08423189 0.08241517 0.06715281]


In [118]:
#Tuning hyperparams properly:
model = ExtraTreesClassifier()

#Use CV to find best parameters: 
best_estimators = []
distros = dict(n_estimators = [10, 50, 100, 500], 
               max_features = ['sqrt', 'log2'],
               min_samples_split = [2,3,4,5,6,7],
               criterion = ['gini', 'entropy'],
               class_weight = ['balanced', None])

search = RandomizedSearchCV(model, distros, scoring='f1', refit='f1', verbose=5, cv=5, n_iter=800, n_jobs=4, pre_dispatch='2*n_jobs')
search = search.fit(X_train, y_train)
best_estimators.append(search.best_estimator_)
best_estimators.append(search.best_score_)
best_estimators

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 192 candidates, totalling 960 fits


[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    2.4s
[Parallel(n_jobs=4)]: Done 158 tasks      | elapsed:   19.0s
[Parallel(n_jobs=4)]: Done 248 tasks      | elapsed:   29.2s
[Parallel(n_jobs=4)]: Done 374 tasks      | elapsed:   44.6s
[Parallel(n_jobs=4)]: Done 536 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 734 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 960 out of 960 | elapsed:  2.0min finished


[ExtraTreesClassifier(bootstrap=False, class_weight='balanced',
                      criterion='entropy', max_depth=None, max_features='sqrt',
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=7, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False),
 0.44454295704295704]

In [119]:
#Testing the best ET model
model = best_estimators[0]
model.fit(X_train, y_train)

#Testing
y_pred = model.predict(X_test)
recall = metrics.recall_score(y_test, y_pred)
f1 = metrics.f1_score(y_test, y_pred)
print('Recall:', recall)
print('F1', f1)
scores = metrics.classification_report(y_test, y_pred)
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(scores)
print(confusion_matrix)

#Feature importance:
model.fit(X,y)
feature_importances = model.feature_importances_
print(feature_importances)

Recall: 0.5
F1 0.6666666666666666
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       283
         1.0       1.00      0.50      0.67         4

    accuracy                           0.99       287
   macro avg       1.00      0.75      0.83       287
weighted avg       0.99      0.99      0.99       287

[[283   0]
 [  2   2]]
[0.1260745  0.05872883 0.08968917 0.10816764 0.04725456 0.19245005
 0.20842596 0.05175708 0.1174522 ]


In [120]:
#Finally, let's try a random forest regression!

#Split the data for regression:
X_train, X_test, yr_train, yr_test = train_test_split(X, y_reg, test_size=0.20, random_state = 42)

trees = 500

model = RandomForestRegressor(n_estimators = trees, max_features = 'auto', oob_score = True, random_state = 42)
model.fit(X_train, yr_train)
print(model.feature_importances_)
print('validation score:', model.oob_score_) #returns R^2 values using out of bag values as test sets
     
#Hmm. Doesn't seem great.
y_pred = model.predict(X_test)
r2 = metrics.r2_score(yr_test, y_pred)
print('test score:', r2)

[0.04585    0.03725391 0.04604401 0.04269727 0.02843661 0.70201554
 0.02394555 0.03594694 0.03781018]
validation score: 0.5747329326549762
test score: 0.45141993840839256


In [51]:
#Tuning hyperparams properly:
model = RandomForestRegressor()

#Use CV to find best parameters: 
best_estimators = []
distros = dict(n_estimators = [10, 50, 100, 500], 
               max_features = ['sqrt', 'log2'],
               min_samples_split = [2,3,4,5,6,7])

search = RandomizedSearchCV(model, distros, scoring='r2', refit='r2', verbose=5, cv=5, n_iter=800, n_jobs=4, pre_dispatch='2*n_jobs')
search = search.fit(X_train, yr_train)
best_estimators.append(search.best_estimator_)
best_estimators.append(search.best_score_)
best_estimators

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    4.4s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   17.2s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:   36.3s
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done 480 out of 480 | elapsed:  2.0min finished


[RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                       max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False), 0.1688903739754343]

In [77]:
#Testing RF Regressor
model = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                       max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=42,
                       verbose=0, warm_start=False)
model.fit(X_train, yr_train)
print(model.feature_importances_)
print(model.oob_score_) #returns R^2 values using out of bag values as test sets

y_pred = model.predict(X_test)
r2 = metrics.r2_score(yr_test, y_pred)
print(r2)

[0.11477587 0.05697833 0.08451433 0.0786059  0.04989842 0.46342385
 0.05082612 0.06512972 0.03584745]
0.538809447659649
0.46226178564522946


#### In Conclusion...
It doesn't seem like Random Forest does that well for this data set. On the other hand, it does a better job than linear regression.

Tune: n_estimators, max_features, and min_samples_split, and class_weight by cross-validated grid search. 

Tuning hyperparams with RandomizedSearchCV did slightly worse than the default parameters + n_estimators chosen with 5-fold cross validation, so let's stick with the simpler models. 

No difference between Random Forest and Extra Trees for classification, so we'll ONLY use results from the first random forest classification and the first random forest regression.
