In [1]:
#Imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn

from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier


from sklearn import metrics 


In [10]:
#Load data
path = os.getcwd() + '/data/AllSites.csv'
ds1 = pd.read_csv(path)

#Remove rows with missing cyanobacteria values
ds1 = ds1.dropna(axis=0, subset = ['NP_Cya_bio'])

#Remove rows with any missing value? 
#Turns out that sklearn can't handle missing values so lets delete those rows for now. 
#Later we can fill them in with monthly means.
#ds1 = ds1.dropna(axis=0, how='any')

print(ds1.shape)
#print(ds1)

(1431, 14)


In [12]:
#Create cleaned dataframe, ds2:
ds2 = ds1.drop(['StationID', 'Station', 'Date', 'Time', 'Stratum', 'Depth'], axis=1)

#Use regex to remove 'H's and 'J's
ds2['TP'] = ds1['TP'].astype(str).str.extract('([-+]?\d*\.\d+|\d+)').astype(float)
ds2['DP'] = ds1['DP'].astype(str).str.extract('([-+]?\d*\.\d+|\d+)').astype(float)
ds2['Cl'] = ds1['Cl'].astype(str).str.extract('([-+]?\d*\.\d+|\d+)').astype(float)
ds2['TN'] = ds1['TN'].astype(str).str.extract('([-+]?\d*\.\d+|\d+)').astype(float)
ds2['TempC'] = ds1['TempC'].astype(str).str.extract('([-+]?\d*\.\d+|\d+)').astype(float)
ds2['Chla'] = ds1['Chla'].astype(str).str.extract('([-+]?\d*\.\d+|\d+)').astype(float)
ds2['Secchi'] = ds1['Secchi'].astype(str).str.extract('([-+]?\d*\.\d+|\d+)').astype(float)
ds2['Month'] = ds1['Date'].astype(str).str.extract('(\d\d)').astype(int) # This is just the month number
ds2['N:P'] = ((ds2['TN']*1e-3)/14.007)/((ds2['TP']*1e-6)/30.974)
ds2['Target'] = [1 if x >= 4e8 else 0 for x in ds2['NP_Cya_bio']]

#Fill in  missing vals with monthly means
for colname in ds2:
    ds2[colname] = ds2[colname].fillna(ds2.groupby('Month')[colname].transform('mean'))


#Take a look
ds2
ds2[ds2['Target'] == 1]

Unnamed: 0,TP,DP,Cl,TN,TempC,Chla,Secchi,NP_Cya_bio,Month,N:P,Target
936,15.7,9.3,12.6,0.35,22.1,10.4,2.3,426000000.0,8,49.297008,1
940,17.9,14.217308,12.3,0.378976,20.2,10.3,1.5,525000000.0,9,40.393809,1
3037,37.0,12.1,10.6,0.74,24.7,28.2,1.1,711000000.0,8,44.226458,1
3038,41.7,13.5,10.9,0.61,20.5,25.38,1.3,728000000.0,8,32.347889,1
3269,75.5,23.2,6.7,0.64,17.3,36.0,1.1,423000000.0,9,18.744989,1
3271,69.4,33.6,7.3,0.63,13.0,12.6,1.3,705000000.0,10,20.073969,1
3307,66.4,16.6,5.9,0.74,21.3,44.9,1.0,1260000000.0,8,24.644261,1
3400,40.2,16.0,7.7,0.8,23.8,45.03,1.5,1170000000.0,7,44.006426,1
3461,62.9,18.0,6.3,0.77,19.3,3.73,1.2,438000000.0,9,27.070249,1
3464,73.7,19.5,6.6,0.8,17.3,42.4,0.8,1540000000.0,9,24.003505,1


In [13]:
#Create X and y
X = np.array(ds2.drop(['Target', 'NP_Cya_bio'], axis=1))
y = np.array(ds2['Target'])
y_reg = np.array(ds2['NP_Cya_bio']) #for regression

In [5]:
#Split the data - I think in this case since we're not really tuning any hyperparams it's ok to use cv for testing.
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify = y)

In [16]:
#Random forest!! (Should I scale the data? No, not for trees or forests or PCA.)
trees = 500

model = RandomForestClassifier(n_estimators = trees, max_features = 'sqrt', criterion = 'entropy', class_weight = 'balanced')
#model.fit(X_train, y_train)
#y_pred = model.predict(X_test)

y_pred = cross_val_predict(model, X, y, cv=5)

scores = metrics.classification_report(y, y_pred)
confusion_matrix = metrics.confusion_matrix(y, y_pred)
print(scores)
print(confusion_matrix)

#Feature importance:
model.fit(X,y)
feature_importances = model.feature_importances_
print(feature_importances)

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1411
           1       0.17      0.45      0.25        20

    accuracy                           0.96      1431
   macro avg       0.58      0.71      0.61      1431
weighted avg       0.98      0.96      0.97      1431

[[1367   44]
 [  11    9]]
[0.11181587 0.05340138 0.11621255 0.131025   0.03522008 0.23593707
 0.21628625 0.06075251 0.03934928]


In [17]:
#Just for fun, let's try ExtraTrees, too. 
model = ExtraTreesClassifier(n_estimators = trees, bootstrap = False, criterion = 'entropy', class_weight = 'balanced')
model.fit(X, y)
print(model.feature_importances_)

y_pred = cross_val_predict(model, X, y, cv=5)

scores = metrics.classification_report(y, y_pred)
confusion_matrix = metrics.confusion_matrix(y, y_pred)
print(scores)
print(confusion_matrix)

#Almost the same results. Boo.

[0.1302294  0.06697338 0.10030273 0.10152492 0.05103371 0.20762102
 0.18262459 0.10621895 0.05347131]
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1411
           1       0.18      0.45      0.26        20

    accuracy                           0.96      1431
   macro avg       0.59      0.71      0.62      1431
weighted avg       0.98      0.96      0.97      1431

[[1371   40]
 [  11    9]]


In [21]:
#Finally, let's try a random forest regression!
trees = 500

model = RandomForestRegressor(n_estimators = trees, max_features = 'auto', oob_score = True)
model.fit(X, y_reg)
print(model.feature_importances_)
model.oob_score_ #returns R^2 values using out of bag values as test sets

#Hmm. Doesn't seem great.

[0.04037452 0.05654107 0.04412462 0.05937667 0.03106359 0.67897487
 0.02131057 0.02837077 0.03986331]


0.5499115281704698

#### In Conclusion...
It doesn't seem like Random Forest is the best choice of model for this data set. Most unfortunately.

On the other hand, it does a better job than linear regression.

Next steps: clean this up: actually tuning hyperparams with CV then doing a separate test set.
Tune: n_estimators, max_features, and min_samples_split, and class_weight by cross-validated grid search. Try both RF and ET (they're slightly different, something about ET not using bootstrapping?) as well as regression.

And avoid using cross val predict for scoring?
Better to use cross_val_score? or not?