In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os 
%matplotlib inline
import sklearn

### Current samples times above the threshold

In [5]:
ds1 = pd.read_csv('data/AllSites.csv')
threshold = np.where(ds1['NP_Cya_bio'] >= 4e8)
target = np.zeros(len(ds1['NP_Cya_bio']))
target[threshold] = 1
ds1['target'] = pd.Series(target)
ds1 = ds1.dropna(axis=0, how='any')

In [6]:
t = np.where(ds1['target'] == 1.0)
ds1.iloc[t]
# ds1

Unnamed: 0,StationID,Station,Date,Time,Stratum,Depth,TP,DP,Cl,TN,TempC,Chla,Secchi,NP_Cya_bio,target
936,9,Otter Creek Segment,08/18/11,1235.0,E,4.6,15.7,9.3,12.6,H0.35,22.1,10.4,2.3,426000000.0,1.0
3037,40,St. Albans Bay,08/06/14,1210.0,U,2.2,37,12.1,10.6,0.74,24.7,28.2,1.1,711000000.0,1.0
3038,40,St. Albans Bay,08/19/14,1010.0,U,2.6,41.7,13.5,10.9,0.61,20.5,25.38,1.3,728000000.0,1.0
3269,50,Missisquoi Bay,09/22/06,1100.0,U,2.2,75.5,23.2,6.7,0.64,17.3,36,1.1,423000000.0,1.0
3271,50,Missisquoi Bay,10/06/06,1100.0,U,2.6,69.4,33.6,7.3,0.63,13.0,12.6,1.3,705000000.0,1.0
3307,50,Missisquoi Bay,08/20/08,1050.0,U,2.0,66.4,16.6,5.9,0.74,21.3,44.9,1.0,1260000000.0,1.0
3400,50,Missisquoi Bay,07/30/13,1020.0,U,3.0,40.2,16.0,7.7,0.8,23.8,45.03,1.5,1170000000.0,1.0
3461,51,Missisquoi Bay Central,09/07/06,1055.0,U,2.4,62.9,18.0,6.3,0.77,19.3,3.73,1.2,438000000.0,1.0
3464,51,Missisquoi Bay Central,09/22/06,1040.0,U,2.0,73.7,19.5,6.6,0.8,17.3,42.4,0.8,1540000000.0,1.0
3499,51,Missisquoi Bay Central,08/06/08,1115.0,U,2.0,74.1,29.7,7.1,0.81,23.2,45.5,1.1,1190000000.0,1.0


### Using regex capture groups to clean 'H's' out of data

In [7]:
print_data = False
show_positive_cases = False
ds2 = ds1.drop(['Station', 'Stratum','Date','StationID','Time'], axis=1)
# ds2
ds2['Depth'] = ds1['Depth'].astype(str).str.extract('([-+]?\d*\.\d+|\d+)').astype(float)
ds2['TP'] = ds1['TP'].astype(str).str.extract('([-+]?\d*\.\d+|\d+)').astype(float)
ds2['Cl'] = ds1['Cl'].astype(str).str.extract('([-+]?\d*\.\d+|\d+)').astype(float)
ds2['DP'] = ds1['DP'].astype(str).str.extract('([-+]?\d*\.\d+|\d+)').astype(float)
ds2['TN'] = ds1['TN'].astype(str).str.extract('([-+]?\d*\.\d+|\d+)').astype(float)
ds2['TempC'] = ds1['TempC'].astype(str).str.extract('([-+]?\d*\.\d+|\d+)').astype(float)
ds2['Chla'] = ds1['Chla'].astype(str).str.extract('([-+]?\d*\.\d+|\d+)').astype(float)
ds2['Chla'] = ds1['Chla'].astype(str).str.extract('([-+]?\d*\.\d+|\d+)').astype(float)
ds2['Secchi'] = ds1['Secchi'].astype(str).str.extract('([-+]?\d*\.\d+|\d+)').astype(float)
ds2['date'] = ds1['Date'].astype(str).str.extract('(\d)').astype(int) # This is just the month number
ds2 = ds2.drop(['NP_Cya_bio'], axis=1)
y = np.array(ds2['target'])
X = np.array(ds2.drop(['target'], axis=1))

if show_positive_cases:
    ds1.iloc[t]
if print_data:
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        print(ds2)
print(ds2['date'])        

1       0
2       0
4       0
5       0
7       0
       ..
3629    0
3631    0
3632    0
3634    1
3636    1
Name: date, Length: 1298, dtype: int64


### Simple SVM Testing

In [None]:
# change from normalize to scale 
from sklearn import svm
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import cross_val_score, cross_val_predict

X_normalized = preprocessing.normalize(X) # normalize X for processing
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
print('C=5\n')
for kernel in kernels:
    svm_classifer = svm.SVC(C=5, kernel=kernel)
    y_predicted = cross_val_predict(svm_classifer, X, y, cv=20)
    print(f'KERNEL: {kernel}')
    print(metrics.classification_report(y, y_predicted))
    print(metrics.confusion_matrix(y, y_predicted))
    print('\n')
print('C=50\n')
for kernel in kernels:
    svm_classifer = svm.SVC(C=50, kernel=kernel)
    y_predicted = cross_val_predict(svm_classifer, X, y, cv=5)
    print(f'KERNEL: {kernel}')
    print(metrics.classification_report(y, y_predicted))
    print(metrics.confusion_matrix(y, y_predicted))
    print('\n')

In [None]:
# change from normalize to scale 
# eval with ROC AUC stuff
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
from sklearn import svm
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import recall_score, make_scorer
from sklearn.model_selection import cross_val_score, cross_val_predict

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True)
y_train_pos = np.where(y_train == 1)
y_test_pos= np.where(y_test == 1)
X_normalized = preprocessing.normalize(X) # normalize X for processing

assert len(y_test_pos[0]) >= 5, "Need at least 5 positive samples in training set"

loo = LeaveOneOut()
X_train_normalized = preprocessing.normalize(X_train) # normalize X for processing
svm_classifer = svm.SVC()
kernels = ['linear', 'rbf', 'sigmoid']
scorer = make_scorer(recall_score, zero_division=0)
final_params = []
for kernel in kernels:
    print(f'KERNEL: {kernel}')
    svm_classifer = svm.SVC(kernel=kernel)
    distros = dict(C=np.logspace(-2, 10, 10), gamma=np.logspace(-9, 3, 10))
    print('optimizing hyperparams...')
    search = RandomizedSearchCV(svm_classifer, distros, scoring=scorer, n_iter=50, verbose=10, cv=len(y_train_pos[0]), n_jobs=4)
    search = search.fit(X_train_normalized, y_train)
    final_params.append(search.best_params_)
    params = search.best_params_

for params in final_params:
    print(f'hyperparamters: {params}')
    print('testing optimized hyperparams...')
    X_test_normalized = preprocessing.normalize(X_test)
    svm_classifer = svm.SVC(**params)
    y_predicted = cross_val_predict(svm_classifer, X_normalized, y, cv=loo, verbose=10, n_jobs=4, pre_dispatch='2*n_jobs')
    print(metrics.classification_report(y, y_predicted))
    print(metrics.confusion_matrix(y, y_predicted))
    print('\n')
print(final_params)

KERNEL: linear
optimizing hyperparams...
Fitting 12 folds for each of 50 candidates, totalling 600 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1935s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Batch computation too fast (0.0165s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done  28 tasks      | elapsed:    4.5s
[Parallel(n_jobs=4)]: Batch computation too slow (5.2331s.) Setting batch_size=1.
[Parallel(n_jobs=4)]: Done  56 tasks      | elapsed:    9.8s
[Parallel(n_jobs=4)]: Batch computation too fast (0.0628s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done  66 tasks      | elapsed:    9.9s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1466s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Batch computation too slow (55.5749s.) Setting batch_size=1.
[Parallel(n_jobs=4)]: Done  98 tasks      | elapsed:  1.7min
[Para

In [19]:
for params in final_params:
    print(f'hyperparamters: {params}')
    print('testing optimized hyperparams...')
    X_test_normalized = preprocessing.normalize(X_test)
    svm_classifer = svm.SVC(**params)
    svm_classifer.fit(X_train_normalized, y_train)
#     = cross_val_predict(svm_classifer, X_test_normalized, y_test, cv=len(y_test_pos[0]), verbose=10, n_jobs=4, pre_dispatch='2*n_jobs')
    y_predicted = svm_classifer.predict(X_test)
    print(metrics.classification_report(y_test, y_predicted))
    print(metrics.confusion_matrix(y_test, y_predicted))
    print('\n')
print(final_params)

hyperparamters: {'gamma': 0.1, 'C': 464158883.3612773}
testing optimized hyperparams...


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99       255
         1.0       0.00      0.00      0.00         5

    accuracy                           0.98       260
   macro avg       0.49      0.50      0.50       260
weighted avg       0.96      0.98      0.97       260

[[255   0]
 [  5   0]]


hyperparamters: {'gamma': 2.1544346900318822e-08, 'C': 10000000000.0}
testing optimized hyperparams...
              precision    recall  f1-score   support

         0.0       1.00      0.95      0.97       255
         1.0       0.22      0.80      0.35         5

    accuracy                           0.94       260
   macro avg       0.61      0.87      0.66       260
weighted avg       0.98      0.94      0.96       260

[[241  14]
 [  1   4]]


hyperparamters: {'gamma': 0.1, 'C': 464158883.3612773}
testing optimized hyperparams...
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99       

  _warn_prf(average, modifier, msg_start, len(result))


### Perform PCA to view data


In [None]:
from sklearn.decomposition import PCA
from sklearn import preprocessing

X_normalized = preprocessing.normalize(X) # normalize X for processing
principal_components = PCA(n_components=2)
principal_components.fit(X_normalized)
X_reduced = principal_components.fit_transform(X_normalized)
X_reduced = preprocessing.scale(X_reduced)
fig, ax = plt.subplots(figsize=(10,6))
scatter_plot = ax.scatter(X_reduced[:,0], X_reduced[:,1], c=np.array(ds1['NP_cya_bio'])), cmap=plt.cm.coolwarm )
fig.colorbar(scatter_plot, ax=ax )
ax.set_xlabel('X1')
ax.set_ylabel('X2')
# for ii, xx in enumerate(X_reduced):
#     if y[ii] == 0:
#         ax[0].scatter(xx[0], xx[1], c='r')
#     else:
#         ax[1].scatter(xx[0], xx[1], c='b')
# plt.setp(ax, xlim=(-1,1.6), ylim=(-4,4))

## 