In [15]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from common.Database import Database
import joblib

In [16]:
db = Database()
query = """
    select * 
    FROM habsos_j
    WHERE LATITUDE IS NOT NULL and LONGITUDE IS NOT NULL
    and CATEGORY  is not NULL
    and SALINITY  is not NULL
    and WATER_TEMP is not null
    and SAMPLE_DATETIME < '2023-01-01 00:00:00';
"""
records, columns = db.execute_query(query)
db.close()
df = pd.DataFrame(records, columns=columns)
df['date'] = pd.to_datetime(df['SAMPLE_DATETIME'])
df['month'] = df['date'].dt.month

class_mapping = {
    'not observed': 0,
    'very low': 1,
    'low': 2,
    'medium': 3,
    'high': 4
}

df['category_encoded'] = df['CATEGORY'].map(class_mapping)

In [17]:
features = ['LATITUDE', 'LONGITUDE', 'SALINITY', 'WATER_TEMP']
X = df[features]
y = df['category_encoded']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the Random Forest classifier
# 'balanced' mode uses the values of y to automatically adjust weights inversely proportional to class frequencies
rf = RandomForestClassifier(n_estimators=800, class_weight='balanced', random_state=42)

# Fit the model on the training data
rf.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred = rf.predict(X_test_scaled)



# Print the accuracy and classification report
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=list(class_mapping.keys())))

Accuracy: 0.8084755090809026
              precision    recall  f1-score   support

not observed       0.86      0.95      0.90     14779
    very low       0.33      0.14      0.19      1457
         low       0.33      0.17      0.23       975
      medium       0.34      0.23      0.28       743
        high       0.35      0.16      0.22       216

    accuracy                           0.81     18170
   macro avg       0.44      0.33      0.36     18170
weighted avg       0.76      0.81      0.78     18170



In [18]:
# Save the scaler
scaler_filename = 'scaler-0424.joblib'
joblib.dump(scaler, scaler_filename)
print("scaler is save")
# Save the model to a file
model_filename = 'random_forest_model-0424.joblib'
joblib.dump(rf, model_filename)

print(f"Model saved to {model_filename}")

scaler is save
Model saved to random_forest_model-0424.joblib


## use 2023 data to do another test

In [19]:
model_filename = 'random_forest_model-0424.joblib'
model = joblib.load(model_filename)
scaler_filename = 'scaler-0424.joblib'
scaler = joblib.load(scaler_filename)

In [20]:

db = Database()

query = """
    select * 
    FROM habsos_j
    WHERE LATITUDE IS NOT NULL and LONGITUDE IS NOT NULL
    and CATEGORY  is not NULL
    and SALINITY  is not NULL
    and WATER_TEMP is not null
    and SAMPLE_DATETIME > '2023-01-01 00:00:00'
    and SAMPLE_DATETIME < '2023-12-31 23:59:59';
"""
records, columns = db.execute_query(query)
db.close()
df = pd.DataFrame(records, columns=columns)
df['date'] = pd.to_datetime(df['SAMPLE_DATETIME'])
df['month'] = df['date'].dt.month

class_mapping = {
    'not observed': 0,
    'very low': 1,
    'low': 2,
    'medium': 3,
    'high': 4
}

df['category_encoded'] = df['CATEGORY'].map(class_mapping)

In [21]:
features = ['LATITUDE', 'LONGITUDE', 'SALINITY', 'WATER_TEMP']
X = df[features]
y = df['category_encoded']

# Standardize the features
X_scaled = scaler.fit_transform(X)


# Make predictions on the test data
predictions = model.predict(X_scaled)
# print(y)
# print(y_pred)
# Print the accuracy and classification report
print("Accuracy:", accuracy_score(y, predictions))
print(classification_report(y, predictions, target_names=list(class_mapping.keys())))

Accuracy: 0.8331409727947239
              precision    recall  f1-score   support

not observed       0.85      0.98      0.91      5152
    very low       0.15      0.01      0.02       441
         low       0.00      0.00      0.00       243
      medium       0.07      0.02      0.03       185
        high       0.00      0.00      0.00        44

    accuracy                           0.83      6065
   macro avg       0.21      0.20      0.19      6065
weighted avg       0.73      0.83      0.77      6065

