In [1]:
import pandas as pd
import numpy as np

In [163]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib


## Data Cleaning

### Top Data

In [3]:
top_df = pd.read_csv('raw_data_topen_full.csv')
top_df.head()

Unnamed: 0,pond,date,sampling_time,total_coliform,temp_celsius(air),ORP_mV,turbidity_FAU,e_coli_MPN_100_mL,enterococci,temp_celsius(water),conductivity_mu_S,pH
0,1,2/5/2014,9:40 AM,1.0,23.9,192.0,11.0,1.0,37.2,20.7,141.0,7.04
1,1,2/12/2014,9:15 AM,26.6,17.4,179.0,10.0,1.0,1046.2,17.9,154.0,7.18
2,1,2/13/2014,10:00 AM,111.9,14.9,96.0,7.0,2.0,1299.7,17.5,124.0,6.93
3,1,2/19/2014,9:00 AM,4640.0,19.3,123.0,12.0,2.0,50.5,18.1,138.0,7.38
4,1,2/26/2014,9:15 AM,2.0,16.8,195.0,11.0,1.0,478.6,21.1,161.0,8.09


In [4]:
top_df.columns = ["pond", "date", "sampling_time", "total_coliform", "air_temp", "ORP_mv", "turbidity", "ecoli_mpn", "enterococci", "water_temp", "conductivity", "ph"]
top_df.head()

Unnamed: 0,pond,date,sampling_time,total_coliform,air_temp,ORP_mv,turbidity,ecoli_mpn,enterococci,water_temp,conductivity,ph
0,1,2/5/2014,9:40 AM,1.0,23.9,192.0,11.0,1.0,37.2,20.7,141.0,7.04
1,1,2/12/2014,9:15 AM,26.6,17.4,179.0,10.0,1.0,1046.2,17.9,154.0,7.18
2,1,2/13/2014,10:00 AM,111.9,14.9,96.0,7.0,2.0,1299.7,17.5,124.0,6.93
3,1,2/19/2014,9:00 AM,4640.0,19.3,123.0,12.0,2.0,50.5,18.1,138.0,7.38
4,1,2/26/2014,9:15 AM,2.0,16.8,195.0,11.0,1.0,478.6,21.1,161.0,8.09


In [5]:
top_psysico_df = top_df[["ecoli_mpn", "ph", "conductivity", "turbidity", "water_temp"]]

In [6]:
top_psysico_df.head()

Unnamed: 0,ecoli_mpn,ph,conductivity,turbidity,water_temp
0,1.0,7.04,141.0,11.0,20.7
1,1.0,7.18,154.0,10.0,17.9
2,2.0,6.93,124.0,7.0,17.5
3,2.0,7.38,138.0,12.0,18.1
4,1.0,8.09,161.0,11.0,21.1


In [8]:
top_psysico_df.loc[:, ('conductivity_log_us_cm')] = np.log10(top_psysico_df['conductivity'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_psysico_df.loc[:, ('conductivity_log_us_cm')] = np.log10(top_psysico_df['conductivity'])


In [10]:
top_psysico_df.loc[:, ('turbidity_log')] = np.log10(top_psysico_df['turbidity'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_psysico_df.loc[:, ('turbidity_log')] = np.log10(top_psysico_df['turbidity'])


In [12]:
top_psysico_df.loc[:, ('ecoli_log_mpn')] = np.log10(top_psysico_df['ecoli_mpn'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_psysico_df.loc[:, ('ecoli_log_mpn')] = np.log10(top_psysico_df['ecoli_mpn'])


In [13]:
top_psysico_df.head()

Unnamed: 0,ecoli_mpn,ph,conductivity,turbidity,water_temp,conductivity_log_us_cm,turbidity_log,ecoli_log_mpn
0,1.0,7.04,141.0,11.0,20.7,2.149219,1.041393,0.0
1,1.0,7.18,154.0,10.0,17.9,2.187521,1.0,0.0
2,2.0,6.93,124.0,7.0,17.5,2.093422,0.845098,0.30103
3,2.0,7.38,138.0,12.0,18.1,2.139879,1.079181,0.30103
4,1.0,8.09,161.0,11.0,21.1,2.206826,1.041393,0.0


In [14]:
top_final_psysico_df = top_psysico_df[["ecoli_log_mpn", "ph", "conductivity_log_us_cm", "turbidity_log", "water_temp"]]
top_final_psysico_df.head()

Unnamed: 0,ecoli_log_mpn,ph,conductivity_log_us_cm,turbidity_log,water_temp
0,0.0,7.04,2.149219,1.041393,20.7
1,0.0,7.18,2.187521,1.0,17.9
2,0.30103,6.93,2.093422,0.845098,17.5
3,0.30103,7.38,2.139879,1.079181,18.1
4,0.0,8.09,2.206826,1.041393,21.1


### Weller Data

In [15]:
weller_df = pd.read_csv('weller_mstdata.csv')
weller_final_psysico_df = weller_df[["ecoli", "ph", "cond", "turb", "w_t"]]
weller_final_psysico_df.head()

Unnamed: 0,ecoli,ph,cond,turb,w_t
0,2.078819,7.83,2.815578,0.906335,19.2
1,3.39794,,2.781755,1.075547,21.1
2,3.39794,7.54,2.716003,1.025306,18.1
3,2.588047,8.49,2.83123,0.64836,23.0
4,1.779596,8.52,2.745855,0.598791,23.7


In [16]:
weller_final_psysico_df.columns = ["ecoli_log_mpn", "ph", "conductivity_log_us_cm", "turbidity_log", "water_temp"]

In [17]:
weller_final_psysico_df.head()

Unnamed: 0,ecoli_log_mpn,ph,conductivity_log_us_cm,turbidity_log,water_temp
0,2.078819,7.83,2.815578,0.906335,19.2
1,3.39794,,2.781755,1.075547,21.1
2,3.39794,7.54,2.716003,1.025306,18.1
3,2.588047,8.49,2.83123,0.64836,23.0
4,1.779596,8.52,2.745855,0.598791,23.7


### Wiley Data

In [18]:
wiley_df = pd.read_csv('raw_data_wiley.csv')
wiley_df.head()

Unnamed: 0,e_coli,contamination(%),water_temperature(◦C),conductivity(µS),pH,turbidity(NTU),e_coli_MG1655_load(LogCFU/mL)
0,0,0,24.82,70.0,8.77,7.65,0.52
1,0,0,23.32,8.1,7.11,9.47,0.52
2,0,0,23.14,96.1,8.31,8.24,0.52
3,0,0,25.8,26.3,7.65,11.67,0.52
4,0,0,23.07,7.9,7.36,15.92,0.52


In [20]:
wiley_df.columns = ["ecoli_presence", "contamination_percent", "water_temp", "conductivity", "ph", "turbidity", "ecoli_log_mpn"]
wiley_df.head()

Unnamed: 0,ecoli_presence,contamination_percent,water_temp,conductivity,ph,turbidity,ecoli_log_mpn
0,0,0,24.82,70.0,8.77,7.65,0.52
1,0,0,23.32,8.1,7.11,9.47,0.52
2,0,0,23.14,96.1,8.31,8.24,0.52
3,0,0,25.8,26.3,7.65,11.67,0.52
4,0,0,23.07,7.9,7.36,15.92,0.52


In [27]:
wiley_psysico_df = wiley_df[["ecoli_presence", "ecoli_log_mpn", "ph", "conductivity", "turbidity", "water_temp"]]

In [28]:
wiley_psysico_df.dtypes

ecoli_presence      int64
ecoli_log_mpn     float64
ph                float64
conductivity      float64
turbidity         float64
water_temp        float64
dtype: object

In [29]:
wiley_psysico_df.loc[:, ('conductivity_log_us_cm')] = np.log10(wiley_psysico_df['conductivity'])

In [30]:
wiley_psysico_df.loc[:, ('turbidity_log')] = np.log10(wiley_psysico_df['turbidity'])

In [31]:
wiley_psysico_df.head()

Unnamed: 0,ecoli_presence,ecoli_log_mpn,ph,conductivity,turbidity,water_temp,conductivity_log_us_cm,turbidity_log
0,0,0.52,8.77,70.0,7.65,24.82,1.845098,0.883661
1,0,0.52,7.11,8.1,9.47,23.32,0.908485,0.97635
2,0,0.52,8.31,96.1,8.24,23.14,1.982723,0.915927
3,0,0.52,7.65,26.3,11.67,25.8,1.419956,1.067071
4,0,0.52,7.36,7.9,15.92,23.07,0.897627,1.201943


## Training Data

In [32]:
training_data = pd.concat([top_final_psysico_df, weller_final_psysico_df], ignore_index=True)

# Handle missing values by replacing with feature-specific means
# combined_data.fillna(combined_data.mean(), inplace=True)

In [33]:
training_data.head()

Unnamed: 0,ecoli_log_mpn,ph,conductivity_log_us_cm,turbidity_log,water_temp
0,0.0,7.04,2.149219,1.041393,20.7
1,0.0,7.18,2.187521,1.0,17.9
2,0.30103,6.93,2.093422,0.845098,17.5
3,0.30103,7.38,2.139879,1.079181,18.1
4,0.0,8.09,2.206826,1.041393,21.1


In [34]:
null_counts = training_data.isnull().sum()

In [35]:
null_counts

ecoli_log_mpn             2
ph                        3
conductivity_log_us_cm    0
turbidity_log             1
water_temp                0
dtype: int64

In [36]:
final_training_data = training_data.dropna()

In [37]:
final_training_data.count()

ecoli_log_mpn             730
ph                        730
conductivity_log_us_cm    730
turbidity_log             730
water_temp                730
dtype: int64

In [38]:
final_training_data.describe()

Unnamed: 0,ecoli_log_mpn,ph,conductivity_log_us_cm,turbidity_log,water_temp
count,730.0,730.0,730.0,730.0,730.0
mean,1.247784,8.318918,2.455539,1.246792,21.250493
std,1.023423,0.873886,0.291052,0.427037,4.329668
min,0.0,5.08,0.863323,-0.167491,4.95
25%,0.30103,7.69,2.232996,1.025306,18.425
50%,1.08636,8.23,2.460145,1.342423,21.3
75%,2.10619,8.91,2.584331,1.531479,24.075
max,4.175222,10.71,3.514548,2.164353,31.0


In [41]:
# final_training_data['ecoli_presence'] = final_training_data['ecoli_log_mpn'].apply(lambda x: 0 if x < 1 else 1)
final_training_data.loc[:, 'ecoli_presence'] = final_training_data['ecoli_log_mpn'].apply(lambda x: 0 if x < 1 else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_training_data.loc[:, 'ecoli_presence'] = final_training_data['ecoli_log_mpn'].apply(lambda x: 0 if x < 1 else 1)


In [44]:
final_training_data

Unnamed: 0,ecoli_log_mpn,ph,conductivity_log_us_cm,turbidity_log,water_temp,ecoli_presence
0,0.000000,7.04,2.149219,1.041393,20.7,0
1,0.000000,7.18,2.187521,1.000000,17.9,0
2,0.301030,6.93,2.093422,0.845098,17.5,0
3,0.301030,7.38,2.139879,1.079181,18.1,0
4,0.000000,8.09,2.206826,1.041393,21.1,0
...,...,...,...,...,...,...
731,2.304059,7.77,2.969416,0.510545,13.6,1
732,2.310906,7.88,2.937518,0.700704,18.8,1
733,1.843233,7.61,3.075547,0.064458,13.2,1
734,2.415474,7.33,3.107888,0.334454,12.3,1


## Testing Data

In [158]:
testing_data =  wiley_psysico_df[["ecoli_log_mpn", "ph", "conductivity_log_us_cm", "turbidity_log", "water_temp", "ecoli_presence"]]


In [159]:
testing_data.head()

Unnamed: 0,ecoli_log_mpn,ph,conductivity_log_us_cm,turbidity_log,water_temp,ecoli_presence
0,0.52,8.77,1.845098,0.883661,24.82,0
1,0.52,7.11,0.908485,0.97635,23.32,0
2,0.52,8.31,1.982723,0.915927,23.14,0
3,0.52,7.65,1.419956,1.067071,25.8,0
4,0.52,7.36,0.897627,1.201943,23.07,0


In [160]:
X_test = testing_data[["ph", "conductivity_log_us_cm", "turbidity_log"]]
y_test = testing_data[["ecoli_presence"]]

In [161]:
X_test_df = pd.DataFrame(X_test, columns=["ph", "conductivity_log_us_cm", "turbidity_log"])

## Data Processing

In [141]:
X = final_training_data[["ph", "conductivity_log_us_cm", "turbidity_log"]]
y = final_training_data["ecoli_presence"]

# # Train-test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [142]:
scaler = StandardScaler()
sc_X = scaler.fit_transform(X)

## Synthetic Minority Oversampling Technique (SMOTE) && Random Forest Regressor Model

In [143]:
# Define oversampling techniques
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)

In [144]:
# Apply SMOTE to the training data
X_train, y_train = smote.fit_resample(sc_X, y)

In [145]:
X_train_df = pd.DataFrame(X_train, columns=["ph", "conductivity_log_us_cm", "turbidity_log"])

In [146]:
rf = RandomForestClassifier(random_state=42, class_weight={0: 1, 1: 2})

In [147]:
pipe_rf = Pipeline([('rf', rf)])


In [148]:
param_grid_rf = {'rf__n_estimators': [300, 500, 600, 700, 1000], 'rf__max_features': [1,2,3,4]}


In [149]:
grid_rf = GridSearchCV(pipe_rf, param_grid_rf, cv=10, scoring='accuracy')

In [150]:
grid_rf.fit(X_train_df, y_train)

In [151]:
# Evaluate models on test set
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    print(f"Accuracy: {acc:.2f}")
    print(f"AUC: {roc_auc:.2f}")
    print(classification_report(y_test, y_pred))

In [162]:
print("Random Forest:")
evaluate_model(grid_rf, X_test_df, y_test)

Random Forest:
Accuracy: 0.80
AUC: 0.82
              precision    recall  f1-score   support

           0       0.94      0.71      0.81        24
           1       0.68      0.94      0.79        16

    accuracy                           0.80        40
   macro avg       0.81      0.82      0.80        40
weighted avg       0.84      0.80      0.80        40



In [164]:
joblib.dump(grid_rf, 'random_forest_model.pkl')

['random_forest_model.pkl']

## Msx

In [119]:
# Define classifiers
# rf = RandomForestRegressor(random_state=42)
# svm = SVC(random_state=42)
# blr = LogisticRegression(random_state=42)
# blr_2 = LogisticRegression(random_state=42)

In [118]:
# Define pipelines for oversampling + model
# pipe_srf = Pipeline([('smote', smote), ('scaler', StandardScaler()), ('rf_2', rf_2)])  # Example with scaling for RF

In [120]:
# Define pipelines for oversampling + model
# pipe_blr = Pipeline([('smote', smote), ('blr', blr)])
# pipe_sblr = Pipeline([('smote', smote), ('scaler', StandardScaler()), ('blr_2', blr_2)])  # Example with scaling for BLR

In [128]:
# Define parameter grids for hyperparameter tuning


In [123]:
# Define parameter grids for hyperparameter tuning
param_grid_blr = {...}  # Define hyperparameters for BLR


In [129]:
# Perform GridSearchCV for each model
grid_srf = GridSearchCV(pipe_srf, param_grid_rf, cv=10, scoring='accuracy')


In [None]:
grid_blr = GridSearchCV(pipe_blr, param_grid_blr, cv=10, scoring='accuracy')
grid_sblr = GridSearchCV(pipe_sblr, param_grid_blr, cv=10, scoring='accuracy')

In [None]:
# Fit the models
grid_srf.fit(X_train, y_train)


In [None]:
grid_blr.fit(X_train, y_train)
grid_sblr.fit(X_train, y_train)

In [None]:
print("Random Forest (Scaled):")
evaluate_model(grid_srf, X_test, y_test)

In [None]:
print("Binary Linear Regressor:")
evaluate_model(grid_blr, X_test, y_test)

In [None]:
print("Random Forest:")
evaluate_model(grid_sblr, X_test, y_test)