In [19]:
import sys
import os


from common.Database import Database

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from sklearn.preprocessing import StandardScaler


In [20]:
db = Database()
query = """
    select * 
    FROM habsos_j
    WHERE LATITUDE IS NOT NULL 
    AND LONGITUDE IS NOT NULL
    AND CATEGORY IS NOT NULL
    AND SALINITY IS NOT NULL
    AND WATER_TEMP IS NOT NULL
    AND MONTH(SAMPLE_DATETIME) >= 8 
    AND MONTH(SAMPLE_DATETIME) <= 12
    AND YEAR(SAMPLE_DATETIME) >= 1955
    AND YEAR(SAMPLE_DATETIME) <= 2022
    AND LATITUDE BETWEEN 27.6 AND 27.95
    AND LONGITUDE BETWEEN -83.48 AND -82.11
    and WIND_DIR is not null 
    and WIND_SPEED is not null;
"""
records, columns = db.execute_query(query)
db.close()
df = pd.DataFrame(records, columns=columns)
df['date'] = pd.to_datetime(df['SAMPLE_DATETIME'])
df['month'] = df['date'].dt.month
class_mapping = {
    'not observed': 0,
    'very low': 1,
    'low': 2,
    'medium': 3,
    'high': 4
}

df['category_encoded'] = df['CATEGORY'].map(class_mapping)







features = ['LATITUDE', 'LONGITUDE', 'SALINITY', 'WATER_TEMP', 'WIND_DIR', 'WIND_SPEED']
'''
  example data:
  SALINITY: 32.00
  WATER_TEMP: 28
  WIN_DIR: 135
  WIND_SPEED: 5
'''


X = df[features]
y = df['category_encoded']



features_to_scale = ['SALINITY', 'WATER_TEMP', 'WIND_DIR', 'WIND_SPEED']
X_to_scale = X[features_to_scale]



X_train, X_test, y_train, y_test = train_test_split(X[features], y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

# model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=8000, class_weight='balanced') #this seem good



model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Print accuracy and classification report using original labels
print("Accuracy:", accuracy_score(y_test, y_pred))
# Assuming y_test and y_pred are your test labels and predictions, respectively
print("Unique classes in y_test:", np.unique(y_test))
print("Unique classes in y_pred:", np.unique(y_pred))
print("Provided target names:", list(class_mapping.keys()))

class_mapping_inverse = {v: k for k, v in class_mapping.items()}

# Ensure all potential classes are accounted for in the report
all_classes = np.unique(np.concatenate((y_test, y_pred)))
class_names = [class_mapping_inverse[c] for c in all_classes]  # Use the reverse mapping to get class names

print("Adjusted Classification Report:")
print(classification_report(y_test, y_pred, labels=all_classes, target_names=class_names))
# print(classification_report(y_test, y_pred, target_names=list(class_mapping.keys())))



Accuracy: 0.15555555555555556
Unique classes in y_test: [0 1 2 3 4]
Unique classes in y_pred: [0 1 2 3 4]
Provided target names: ['not observed', 'very low', 'low', 'medium', 'high']
Adjusted Classification Report:
              precision    recall  f1-score   support

not observed       0.93      0.12      0.21       110
    very low       0.00      0.00      0.00         8
         low       0.10      0.33      0.16         9
      medium       0.08      0.50      0.13         6
        high       0.05      1.00      0.10         2

    accuracy                           0.16       135
   macro avg       0.23      0.39      0.12       135
weighted avg       0.77      0.16      0.19       135

