In [183]:
#Imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn

from data_manager import get_data


from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression

from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RandomizedSearchCV


In [184]:
ds2, ds3 = get_data()
print(ds3)
print(len(ds3[ds3['target'] == 1]))

        TP    DP    Cl    TN      TempC   Chla  Secchi   NP_Cya_bio  target  \
0     39.2  16.2  13.0  0.61   6.494521   1.41     0.5          0.0     0.0   
1     36.8  14.8  17.5  0.45  13.700000   9.67     1.1          0.0     0.0   
2     50.1  27.4  12.1  0.55  14.500000   2.04     0.7          0.0     0.0   
4     59.6  32.6  12.0  0.65  17.700000   4.13     0.6          0.0     0.0   
5     77.3  47.9  10.5  0.62  22.500000   1.74     0.6          0.0     0.0   
...    ...   ...   ...   ...        ...    ...     ...          ...     ...   
3629  53.4  16.8   8.0  0.69  25.600000  27.50     1.1  389000000.0     0.0   
3631  83.4  33.9   8.3  0.71  23.700000  23.94     1.0  133000000.0     0.0   
3632  94.2  40.7   8.7  0.90  22.300000  50.16     1.0  443000000.0     1.0   
3634  68.8  42.6   9.6  0.74  13.400000  10.22     1.4    9460000.0     0.0   
3636  79.4  49.7   9.2  0.76   9.200000  11.81     1.3    6510000.0     0.0   

            N:P  Month  
0     34.410892      4  
1

In [185]:
#Create X and y
X = np.array(ds3.drop(['target', 'NP_Cya_bio'], axis=1))
y = np.array(ds3['target'])
y_reg = np.array(ds3['NP_Cya_bio']) #for regression


In [186]:
#Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X, y_reg, test_size=0.20)

In [187]:
#Scale the X's
X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test)

In [188]:
#Linear regression
model = linear_model.LinearRegression()
model.fit(X_reg_train, y_reg_train)
#Testing
print(model.score(X_reg_test, y_reg_test))
print(model.coef_) # get theta coefficients (model params)

#CV - doesn't really work, too much variation
#r2 = cross_val_score(model, X, y_reg, scoring = 'r2', cv = 5, )
#print(r2)

0.37390644585703214
[ -336404.23193671  -399140.50445966 -1061222.48136934 46768878.00151783
   143567.13433896 12134061.64218822  9344216.85477284    98500.58594958
  1300100.67109838]


In [189]:
#Try again using other method? Yes, more info, p-values give feature significance!
import statsmodels.api as sm
# Fit regression model
model = sm.OLS(y_reg,X)
result = model.fit()
print(result.summary2())

                        Results: Ordinary least squares
Model:                  OLS              Adj. R-squared (uncentered): 0.440     
Dependent Variable:     y                AIC:                         56672.0654
Date:                   2020-05-07 00:31 BIC:                         56719.4605
No. Observations:       1431             Log-Likelihood:              -28327.   
Df Model:               9                F-statistic:                 125.7     
Df Residuals:           1422             Prob (F-statistic):          8.98e-174 
R-squared (uncentered): 0.443            Scale:                       9.2091e+15
---------------------------------------------------------------------------------
         Coef.         Std.Err.       t     P>|t|       [0.025          0.975]   
---------------------------------------------------------------------------------
x1      36588.6634    429892.5307   0.0851  0.9322    -806702.9886    879880.3155
x2    -962534.7944    660042.2441  -1.4583  0.145

In [190]:
#Basic logistic Regression
model = LogisticRegression(solver = "liblinear", penalty = 'l1', C = 1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

recall = metrics.recall_score(y_test, y_pred)
auc = metrics.roc_auc_score(y_test, y_pred)
print('Recall:', recall)
print('ROC AUC:', auc)

scores = metrics.classification_report(y_test, y_pred)
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(scores)
print(confusion_matrix)


Recall: 0.3333333333333333
ROC AUC: 0.6666666666666666
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       284
         1.0       1.00      0.33      0.50         3

    accuracy                           0.99       287
   macro avg       1.00      0.67      0.75       287
weighted avg       0.99      0.99      0.99       287

[[284   0]
 [  2   1]]


In [193]:
#ADD POLYNOMIAL FEATURES

In [180]:
#Logistic regression 
# - tune hyperparams to find the best model! Check out hw03
# - ADD POLYNOMIAL FEATURES - see LectureProject_1

#Ridge and Lasso
model = LogisticRegression(solver = "liblinear", max_iter = 1000)

#Use CV to find best parameters: 
best_estimators = []
distros = dict(C = np.logspace(-2, 12, 15), 
               class_weight = ['balanced', None], 
               penalty = ['l1', 'l2'])

search = RandomizedSearchCV(model, distros, scoring='roc_auc', refit='AUC', verbose=5, cv=5, n_iter=800, n_jobs=4, pre_dispatch='2*n_jobs')
search = search.fit(X_train, y_train)
best_estimators.append(search.best_estimator_)
best_estimators.append(search.best_score_)

#Elasticnet
model = LogisticRegression(solver = "saga", penalty = 'elasticnet', max_iter = 1000)
distros = dict(C = np.logspace(-2, 12, 15), 
               class_weight = ['balanced', None],
              l1_ratio = np.linspace(0, 1, 15))
search = RandomizedSearchCV(model, distros, scoring='roc_auc', refit='AUC', verbose=5, cv=5, n_iter=800, n_jobs=4, pre_dispatch='2*n_jobs')
search = search.fit(X_train, y_train)
best_estimators.append(search.best_estimator_)
best_estimators.append(search.best_score_)
best_estimators

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=4)]: Done  72 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    2.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=4)]: Done  40 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 370 tasks      | elapsed:   26.5s
[Parallel(n_jobs=4)]: Done 460 tasks      | elapsed:   33.8s
[Parallel(n_jobs=4)]: Done 586 tasks      | elapsed:   48.4s
[Parallel(n_jobs=4)]: Done 748 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 946 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done 1180 tasks      | elapsed:  2.1min
[Parallel(n_jobs=4)]: Done 1450 tasks      | elapsed:  2.6min
[Parallel(n_jobs=4)]: Done 1756 tasks      | elapsed:  3.2min
[Parallel(n_jobs=4)]: Done 2098 tasks      | elapsed:  3.9min
[Parallel(n_jobs=4)]: Done 2250 out of 2250 | elapsed:  4.2min finished


[LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000,
                    multi_class='warn', n_jobs=None, penalty='l1',
                    random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                    warm_start=False),
 0.9593686181075561,
 LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=0.5714285714285714,
                    max_iter=1000, multi_class='warn', n_jobs=None,
                    penalty='elasticnet', random_state=None, solver='saga',
                    tol=0.0001, verbose=0, warm_start=False),
 0.9596638612125338]

In [192]:
#They do equally well
model1 = LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000,
                    multi_class='warn', n_jobs=None, penalty='l1',
                    random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                    warm_start=False)
model2 = LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=0.5714285714285714,
                    max_iter=1000, multi_class='warn', n_jobs=None,
                    penalty='elasticnet', random_state=None, solver='saga',
                    tol=0.0001, verbose=0, warm_start=False)

model.fit(X_train, y_train)

#Testing
y_pred = model.predict(X_test)
recall = metrics.recall_score(y_test, y_pred)
auc = metrics.roc_auc_score(y_test, y_pred)
print('Recall:', recall)
print('ROC AUC:', auc)
scores = metrics.classification_report(y_test, y_pred)
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(scores)
print(confusion_matrix)

Recall: 0.3333333333333333
ROC AUC: 0.6649061032863849
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99       284
         1.0       0.50      0.33      0.40         3

    accuracy                           0.99       287
   macro avg       0.75      0.66      0.70       287
weighted avg       0.99      0.99      0.99       287

[[283   1]
 [  2   1]]




#### Things to try for logistic regression:
- ds2 vs ds3
- features scaled vs not
- use cross validation
- get performance summary with recall and ROC AUC and confusion matrix
- **add polynomial features**
- add regularization: try elastic net and/or Lasso (=L1?)
- use random search and cross validation to tune hyperparameters, find best model (specify scoring metric as recall/F1/AUC?):
    - C (inverse of regularization strength --> smaller value = more regularization)
    - solver (choice depends on choice of regularization)
    - penalty (form of regularization?)
    - l1_ratio: ratio between 0 and 1 passed for elasticnet