In [29]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from common.Database import Database
import joblib

In [30]:
db = Database()
# pick only one region
query = """
   SELECT *
    FROM habsos_j
    WHERE LATITUDE IS NOT NULL 
    AND LONGITUDE IS NOT NULL
    AND CATEGORY IS NOT NULL
    AND SALINITY IS NOT NULL
    AND WATER_TEMP IS NOT NULL
    AND MONTH(SAMPLE_DATETIME) >= 8 
    AND MONTH(SAMPLE_DATETIME) <= 12
    AND YEAR(SAMPLE_DATETIME) >= 1955
    AND YEAR(SAMPLE_DATETIME) <= 2022
    AND LATITUDE BETWEEN 27.6 AND 27.95
    AND LONGITUDE BETWEEN -83.48 AND -82.11
"""
records, columns = db.execute_query(query)
db.close()
df = pd.DataFrame(records, columns=columns)
df['date'] = pd.to_datetime(df['SAMPLE_DATETIME'])
df['month'] = df['date'].dt.month

class_mapping = {
    'not observed': 0,
    'very low': 1,
    'low': 2,
    'medium': 3,
    'high': 4
}

df['category_encoded'] = df['CATEGORY'].map(class_mapping)

In [31]:
features = ['LATITUDE', 'LONGITUDE', 'SALINITY', 'WATER_TEMP']
X = df[features]
y = df['category_encoded']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the Random Forest classifier
# 'balanced' mode uses the values of y to automatically adjust weights inversely proportional to class frequencies
rf = RandomForestClassifier(n_estimators=1000, class_weight='balanced', random_state=42)

# Fit the model on the training data
rf.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred = rf.predict(X_test_scaled)



# Print the accuracy and classification report
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=list(class_mapping.keys())))

Accuracy: 0.794392523364486
              precision    recall  f1-score   support

not observed       0.85      0.96      0.90       916
    very low       0.33      0.20      0.25       106
         low       0.50      0.27      0.35        81
      medium       0.44      0.24      0.31        62
        high       0.25      0.17      0.20        12

    accuracy                           0.79      1177
   macro avg       0.48      0.37      0.40      1177
weighted avg       0.75      0.79      0.77      1177



In [32]:
# Save the scaler
scaler_filename = 'scaler-0425_region9-02.joblib'
joblib.dump(scaler, scaler_filename)
print("scaler is save")
# Save the model to a file
model_filename = 'random_forest_model-0425_region9-02.joblib'
joblib.dump(rf, model_filename)

print(f"Model saved to {model_filename}")

scaler is save
Model saved to random_forest_model-0425_region9-02.joblib


## use 2023 data to do another test

In [33]:
model_filename = model_filename
model = joblib.load(model_filename)
scaler_filename = scaler_filename
scaler = joblib.load(scaler_filename)

In [34]:

db = Database()

query = """
    select * 
    FROM habsos_j
    WHERE LATITUDE IS NOT NULL and LONGITUDE IS NOT NULL
    and CATEGORY  is not NULL
    and SALINITY  is not NULL
    and WATER_TEMP is not null
    and SAMPLE_DATETIME > '2023-01-01 00:00:00'
    and SAMPLE_DATETIME < '2023-12-31 23:59:59'
    and LATITUDE BETWEEN 27.6 AND 27.95
    and LONGITUDE BETWEEN -83.48 AND -82.11;
"""
records, columns = db.execute_query(query)
db.close()
df = pd.DataFrame(records, columns=columns)
df['date'] = pd.to_datetime(df['SAMPLE_DATETIME'])
df['month'] = df['date'].dt.month

class_mapping = {
    'not observed': 0,
    'very low': 1,
    'low': 2,
    'medium': 3,
    'high': 4
}

df['category_encoded'] = df['CATEGORY'].map(class_mapping)

In [35]:
features = ['LATITUDE', 'LONGITUDE', 'SALINITY', 'WATER_TEMP']
X = df[features]
y = df['category_encoded']

# Standardize the features
X_scaled = scaler.fit_transform(X)


# Make predictions on the test data
predictions = model.predict(X_scaled)
# print(y)
# print(y_pred)
# Print the accuracy and classification report
print("Accuracy:", accuracy_score(y, predictions))
print(classification_report(y, predictions, target_names=list(class_mapping.keys())))

Accuracy: 0.828752642706131
              precision    recall  f1-score   support

not observed       0.84      0.98      0.91       398
    very low       0.00      0.00      0.00        27
         low       0.00      0.00      0.00        27
      medium       0.00      0.00      0.00        20
        high       0.00      0.00      0.00         1

    accuracy                           0.83       473
   macro avg       0.17      0.20      0.18       473
weighted avg       0.71      0.83      0.76       473



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
