## DATA COLLECTION


In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install xgboost
!pip install imbalanced-learn

import os
import pandas as pd
import xgboost as xgb
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### Loading the datasets

In [None]:

# Google Drive folder that holds the raw CSV exports.
# If Colab reports that this folder does not exist, point dataset_path at a copy
# that contains only the datasets instead, e.g.
# '/content/drive/MyDrive/MLProjectLocal/Datasets'.
dataset_path = '/content/drive/MyDrive/MLProject/Datasets'

# List of all .csv files currently within Datasets.
# Sorted on purpose: os.listdir() returns entries in arbitrary order, and the
# row order of the combined frame determines which rows the seeded
# train/test split picks later on.
csv_files = sorted(
    os.path.join(dataset_path, f)
    for f in os.listdir(dataset_path)
    if f.endswith('.csv')
)

# Combine all of the station files as the starting point.
# The filter looks at the file name only, so a directory whose name happens to
# contain "station" cannot make every CSV match.
df_list = []
for file in csv_files:
    if 'station' in os.path.basename(file):
        df_station = pd.read_csv(file)
        df_list.append(df_station)

# pd.concat([]) fails with "No objects to concatenate"; say what is actually wrong.
if not df_list:
    raise ValueError(f"No station CSV files found in {dataset_path}")

df = pd.concat(df_list, ignore_index=True)
print(df.head())


    id  internal_station_id              collected_at  temperature_C  \
0  863                    6  2006-08-30T13:34:00.000Z           16.7   
1  864                    6  2006-09-27T16:02:00.000Z           17.8   
2  865                    6  2007-01-14T14:00:00.000Z            6.8   
3  866                    6  2007-02-27T12:10:00.000Z            1.6   
4  867                    6  2007-03-27T14:30:00.000Z            4.0   

   salinity  dissolved_oxygen_mg/L  chlorophyll_ug/L  pheophytin_ug/L  \
0      31.1                   8.73               NaN              NaN   
1      31.4                   9.84               NaN              NaN   
2      31.6                   9.79               NaN              NaN   
3      31.5                  12.08              6.59             0.01   
4      31.9                  11.49              1.35             0.15   

   turbidty_NTU  nitrate_nitrite_uM  ammonium_uM  ortho_phosphate_uM  \
0           NaN                0.14          NaN        

## BASELINE MODEL FOR PRELIM RESULTS


#### Model Development


In [None]:
# Baseline XGBoost classifier for hypoxia.
# NOTE: this cell rebinds df (columns are dropped), so it must be run exactly
# once after the loading cell; re-running it without reloading raises KeyError.

# Drop the identifier and timestamp columns so they are not used as features.
df = df.drop(columns=["id", "internal_station_id", "collected_at"])

TARGET_COL = "dissolved_oxygen_mg/L"

# Fail with a readable message if the oxygen column is absent.
if TARGET_COL not in df.columns:
    raise KeyError(f"Expected column {TARGET_COL!r} in the combined station data")

# Drop rows where the oxygen measurement is missing: no label can be derived.
df = df.dropna(subset=[TARGET_COL])

# Label cutoff in mg/L. Rows strictly below it are labelled hypoxic (1).
# NOTE(review): hypoxia is commonly quoted as about 2 mg/L (2 ppm), but the
# cutoff in use here is 7. An earlier note recorded only 3 hypoxic vs 1764
# non-hypoxic rows -- presumably from a 2 mg/L cutoff; the recorded output for
# the current cutoff is 625 positives. TODO confirm the intended threshold.
HYPOXIA_THRESHOLD_MG_L = 7
df["hypoxic"] = (df[TARGET_COL] < HYPOXIA_THRESHOLD_MG_L).astype(int)

# The label is derived from this column, so it must not remain as a feature.
df = df.drop(columns=[TARGET_COL])

X = df.drop(columns=["hypoxic"])
y = df["hypoxic"]

# Number of positive (hypoxic) rows.
print(int((y == 1).sum()))

# Split BEFORE imputing so the test rows do not contribute to the column means
# that fill the training data. With the same random_state and row count this
# selects the same rows as splitting the imputed matrix would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Missing feature values are replaced by the training-set column mean.
# The outputs are plain arrays (no column names), which later cells rely on.
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Kept for any later cell that still expects the fully imputed feature matrix.
X_imputed = imputer.transform(X)

# Train. use_label_encoder is no longer passed: the installed xgboost ignores it
# and only emits a 'Parameters: { "use_label_encoder" } are not used.' warning.
xgb_model = XGBClassifier(eval_metric="logloss")
xgb_model.fit(X_train, y_train)

y_test_pred = xgb_model.predict(X_test)

print()
print("Classification Report for XGBoost")
print(classification_report(y_test, y_test_pred))


625


Parameters: { "use_label_encoder" } are not used.




Classification Report for XGBoost
              precision    recall  f1-score   support

           0       0.89      0.93      0.91       357
           1       0.80      0.71      0.75       140

    accuracy                           0.87       497
   macro avg       0.85      0.82      0.83       497
weighted avg       0.87      0.87      0.87       497



#### Logistic Regression

In [None]:

# Logistic regression fit on the same X_train / y_train used by the XGBoost cell.
# max_iter is raised to 5000 because of convergence.
lr_model = LogisticRegression(max_iter=5000).fit(X_train, y_train)

# Predictions and ground truth for the held-out split.
lr_pred = lr_model.predict(X_test)
lr_expected = y_test

lr_report = classification_report(lr_expected, lr_pred)
print("\nClassification Report for Logistic Regression")
print(lr_report)


Classification Report for Logistic Regression
              precision    recall  f1-score   support

           0       0.80      0.96      0.87       357
           1       0.77      0.39      0.51       140

    accuracy                           0.79       497
   macro avg       0.79      0.67      0.69       497
weighted avg       0.79      0.79      0.77       497



#### Comparing feature relevance between models

In [None]:
# XGBoost: rank features by gain

# Per-feature gain as reported by the trained booster (dict: feature key -> score).
xgb_importance = xgb_model.get_booster().get_score(importance_type='gain')
xgb_df = pd.DataFrame(
    list(xgb_importance.items()),
    columns=['Feature', 'XGB_Importance'],
)

# The score keys are expected to be positional names (f0, f1, ...) because the
# model was fit on an array without column names; translate them back to the
# column names of X by position.
feature_map = {f"f{idx}": name for idx, name in enumerate(X.columns)}
xgb_df['Feature'] = xgb_df['Feature'].map(feature_map)

# Five features with the highest gain.
top5_xgb = xgb_df.sort_values(by='XGB_Importance', ascending=False).iloc[:5]

In [None]:
# Logistic Regression: rank features by absolute coefficient size, then print
# the top five next to the XGBoost ranking.
# NOTE(review): no feature scaling is visible before the fit, so coefficient
# magnitudes depend on each feature's units -- confirm before reading this as a
# relevance ranking.

# Coefficients of the single decision function (binary target).
lr_coefficients = lr_model.coef_[0]

lr_df = pd.DataFrame({'Feature': X.columns, 'Coefficient': lr_coefficients})
lr_df['Absolute_Coefficient'] = lr_df['Coefficient'].abs()

# Sort on magnitude, but report the signed coefficient.
ranked_lr = lr_df.sort_values(by='Absolute_Coefficient', ascending=False)
top5_lr = ranked_lr.loc[:, ['Feature', 'Coefficient']].head(5)

print("Logistic Regression", top5_lr, "", sep="\n")

print("XGBoost")
print(top5_xgb)


Logistic Regression
                            Feature  Coefficient
11      total_dissolved_nitrogen_uM     0.644311
9   particulate_organic_nitrogen_uM     0.509669
15           total_nitrogen_TDN_PON    -0.508414
12    total_dissolved_phosphorus_uM     0.384148
0                     temperature_C     0.149011

XGBoost
                            Feature  XGB_Importance
11      total_dissolved_nitrogen_uM       14.042201
0                     temperature_C        3.103285
6                       ammonium_uM        2.124978
9   particulate_organic_nitrogen_uM        1.701625
10    particulate_organic_carbon_uM        1.538726


#### Hyperparameter tuning (starting with SMOTE oversampling of the training set)


In [None]:

# Re-train both models on a SMOTE-resampled training set and evaluate them on
# the original, untouched test split.

sm = SMOTE(random_state=42)

# Apply SMOTE to the training data only; X_test / y_test are not resampled.
X_res, y_res = sm.fit_resample(X_train, y_train)

# Fresh, untrained models for the resampled data.
# use_label_encoder is no longer passed: the installed xgboost ignores it and
# only emits a 'Parameters: { "use_label_encoder" } are not used.' warning.
# NOTE: lr_model is rebound here, so re-running the coefficient cell after this
# one would report the coefficients of the SMOTE-trained model.
model = XGBClassifier(eval_metric="logloss")
lr_model = LogisticRegression(max_iter=5000) # max_iter set to 5000 because of convergence

# Fit on the resampled training data.
model.fit(X_res, y_res)
lr_model.fit(X_res, y_res)

y_pred = model.predict(X_test)

lr_expected = y_test
lr_pred = lr_model.predict(X_test)

print()
print("Classification Report for XGBoost")
print(classification_report(y_test, y_pred))

print()
print("Classification Report for Logistic Regression")
print(classification_report(lr_expected, lr_pred))



Parameters: { "use_label_encoder" } are not used.




Classification Report for XGBoost
              precision    recall  f1-score   support

           0       0.90      0.91      0.91       357
           1       0.77      0.75      0.76       140

    accuracy                           0.87       497
   macro avg       0.83      0.83      0.83       497
weighted avg       0.86      0.87      0.86       497


Classification Report for Logistic Regression
              precision    recall  f1-score   support

           0       0.90      0.80      0.84       357
           1       0.59      0.76      0.67       140

    accuracy                           0.79       497
   macro avg       0.75      0.78      0.76       497
weighted avg       0.81      0.79      0.79       497

