In [1]:
# import libs
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the preprocessed dataset from disk into a DataFrame
df = pd.read_csv('../Dataset/Processed_data.csv')

In [3]:
# Show the column labels of the loaded DataFrame
df.columns

Index(['Timestamp', 'Electricity_Consumed', 'Temperature', 'Humidity',
       'Wind_Speed', 'Anomaly_Label', 'Avg_of_past12'],
      dtype='object')

In [11]:
# Build the feature matrix and the target vector.
# The label column is the target; Timestamp is dropped because it is not used
# as a model input. Every remaining column becomes a feature.
X = df.drop(columns=["Anomaly_Label", "Timestamp"])
y = df["Anomaly_Label"]

# Always check imbalance
print("Class distribution:")
print(y.value_counts())

# Sequential (unshuffled) hold-out split: the first 80% of rows are used for
# training and the last 20% for testing.
# This split is NOT stratified: train_test_split does not accept `stratify`
# together with shuffle=False, so the class ratio of the two parts may differ
# from the overall ratio printed above.
# NOTE(review): presumably the rows are in Timestamp order, which would make
# this a chronological split that keeps later rows out of training -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)


# Random Forest configured for imbalanced data:
# class_weight="balanced" reweights classes inversely to their frequency in
# the training labels; random_state is fixed so re-runs give the same forest.
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    class_weight="balanced",
    random_state=42
)

# Train the model on the training part only
model.fit(X_train, y_train)

# Hard class predictions for the held-out rows (default decision rule)
y_pred = model.predict(X_test)

# Evaluate
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Optional: probabilities for threshold tuning.
# Column index 1 is taken as the anomaly class; this assumes
# model.classes_ is [0, 1] -- TODO confirm if the labels ever change.
y_prob = model.predict_proba(X_test)[:, 1]


Class distribution:
Anomaly_Label
0    4750
1     250
Name: count, dtype: int64

Confusion Matrix:
[[934  11]
 [ 28  27]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       945
           1       0.71      0.49      0.58        55

    accuracy                           0.96      1000
   macro avg       0.84      0.74      0.78      1000
weighted avg       0.96      0.96      0.96      1000

