In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from common.Database import Database
import joblib



In [14]:
db = Database()
query = """
    select * 
    FROM habsos_j
    WHERE LATITUDE IS NOT NULL and LONGITUDE IS NOT NULL
    AND SAMPLE_DATE  IS NOT NULL
    and CATEGORY  is not NULL
    and SALINITY  is not NULL
    and WATER_TEMP is not null
    and WIND_DIR is not null 
    and WIND_SPEED is not null;
"""
records, columns = db.execute_query(query)
db.close()
df = pd.DataFrame(records, columns=columns)
df['date'] = pd.to_datetime(df['SAMPLE_DATETIME'])
df['month'] = df['date'].dt.month

class_mapping = {
    'not observed': 0,
    'very low': 1,
    'low': 2,
    'medium': 3,
    'high': 4
}

df['category_encoded'] = df['CATEGORY'].map(class_mapping)


In [15]:
features = ['LATITUDE', 'LONGITUDE', 'SALINITY', 'WATER_TEMP', 'WIND_DIR', 'WIND_SPEED']
X = df[features]
y = df['category_encoded']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the Random Forest classifier
# 'balanced' mode uses the values of y to automatically adjust weights inversely proportional to class frequencies
rf = RandomForestClassifier(n_estimators=500, class_weight='balanced', random_state=42)

# Fit the model on the training data
rf.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred = rf.predict(X_test_scaled)



# Print the accuracy and classification report
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=list(class_mapping.keys())))


Accuracy: 0.8661784287616512
              precision    recall  f1-score   support

not observed       0.89      0.99      0.93      1274
    very low       0.27      0.07      0.12       107
         low       0.43      0.24      0.31        50
      medium       0.85      0.37      0.51        60
        high       0.75      0.27      0.40        11

    accuracy                           0.87      1502
   macro avg       0.64      0.39      0.45      1502
weighted avg       0.83      0.87      0.83      1502



In [16]:
# Save the scaler
scaler_filename = 'scaler.joblib'
joblib.dump(scaler, scaler_filename)
print("scaler is save")
# Save the model to a file
model_filename = 'random_forest_model.joblib'
joblib.dump(rf, model_filename)

print(f"Model saved to {model_filename}")

scaler is save
Model saved to random_forest_model.joblib
