In [1]:
# import libs
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [2]:
# Reading the data
df=pd.read_csv('../Dataset/Processed_data.csv')

In [3]:
df.columns

Index(['Timestamp', 'Electricity_Consumed', 'Temperature', 'Humidity',
       'Wind_Speed', 'Anomaly_Label', 'Avg_of_past12'],
      dtype='object')

In [4]:
# --- 2. Prepare features (X) and target (y) ---
# Drop the 'Timestamp' column and target label for training
X = df.drop(["Anomaly_Label", "Timestamp"], axis=1)
y = df["Anomaly_Label"]

# --- 3. Check the class distribution (imbalance) ---
print("Class distribution before SMOTE:")
print(y.value_counts())

# --- 4. Scale features ---
# Scaling is important for SMOTE and improves model performance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- 5. Handle class imbalance using SMOTE ---
# SMOTE generates synthetic samples for the minority class (anomalies)
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_scaled, y)

print("Class distribution after SMOTE:")
print(pd.Series(y_res).value_counts())

# --- 6. Split the resampled data into training and testing sets ---
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, shuffle=True, random_state=42
)

# --- 7. Initialize and train the Random Forest classifier ---
# class_weight='balanced' helps the model handle any remaining imbalance
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    class_weight="balanced",
    random_state=42
)
model.fit(X_train, y_train)

# --- 8. Predict probabilities on the test set ---
y_prob = model.predict_proba(X_test)[:, 1]  # probability of being anomaly

# --- 9. Apply threshold to determine final predictions ---
# Default threshold = 0.5; can adjust (e.g., 0.3) to increase anomaly recall
threshold = 0.5
y_pred = (y_prob >= threshold).astype(int)

# --- 10. Save the trained model and scaler for future use ---
joblib.dump(model, "../Models/rf_model.pkl")
joblib.dump(scaler, "../Models/scaler.pkl")
print("Model and scaler saved successfully!")

# --- 11. Evaluate the model ---
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Class distribution before SMOTE:
Anomaly_Label
0    4750
1     250
Name: count, dtype: int64
Class distribution after SMOTE:
Anomaly_Label
0    4750
1    4750
Name: count, dtype: int64
Model and scaler saved successfully!

Confusion Matrix:
[[923  44]
 [ 15 918]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.97       967
           1       0.95      0.98      0.97       933

    accuracy                           0.97      1900
   macro avg       0.97      0.97      0.97      1900
weighted avg       0.97      0.97      0.97      1900

