In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, precision_score

In [2]:
# Step 1: Load dataset
file_path = "voltage_data electric water heater.csv"  # Update with correct path if needed
df = pd.read_csv(file_path)

In [3]:
print(df.head())
print(df.info())


   Voltage
0   234.07
1   232.16
2   232.16
3   232.16
4   232.43
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Voltage  1000 non-null   float64
dtypes: float64(1)
memory usage: 7.9 KB
None


In [7]:
print(df.head())  # Should show both Voltage & Status columns
print(df["Status"].value_counts())

   Voltage  Status
0   234.07       1
1   232.16       1
2   232.16       1
3   232.16       1
4   232.43       1
1    566
0    434
Name: Status, dtype: int64


In [5]:
df["Status"] = df["Voltage"].apply(lambda x: 1 if x >= 230 else 0)
df.rename(columns={"Label": "Status"}, inplace=True)
print("Label Distribution Before Augmentation:\n", df['Status'].value_counts())

Label Distribution Before Augmentation:
 1    566
0    434
Name: Status, dtype: int64


In [6]:
# Step 4: Handle missing class (create synthetic high-voltage samples)
low_count = len(df[df["Status"] == 0])
high_count = len(df[df["Status"] == 1])

if high_count == 0:
    print("\u26A0 No high-voltage samples found. Generating synthetic data...")
    high_voltage_values = np.linspace(df["Voltage"].max() * 1.1, df["Voltage"].max() * 1.2, num=low_count)
    high_voltage_df = pd.DataFrame({'Voltage': high_voltage_values, 'Status': 1})
    df = pd.concat([df, high_voltage_df], ignore_index=True)

In [9]:
print("Label Distribution After Augmentation:\n", df['Status'].value_counts())

# Step 6: Prepare features (X) and labels (y)
X = df[['Voltage']].values  # Using Voltage as input feature
y = df['Status'].values  # Target variable

# Step 7: Normalize features
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X)

# Step 8: Reduce Noise Injection (Balanced Noise)
np.random.seed(42)
X_noisy = X_scaled + np.random.normal(0, 0.005, X_scaled.shape)  # Lower noise impact

# Step 9: Split data into train & test sets
X_train, X_test, y_train, y_test = train_test_split(X_noisy, y, test_size=0.2, random_state=42, stratify=y)

# 🔹 Step 10: Train an Optimized Decision Tree Model
model = DecisionTreeClassifier(
    max_depth=8,              # Prevent overfitting by limiting depth
    min_samples_split=4,      # Ensure at least 4 samples to split
    min_samples_leaf=1,       # Allow single samples in leaves
    class_weight='balanced',  # Adjust weights for label imbalance
    random_state=42
)

model.fit(X_train, y_train)

# Step 11: Perform Cross-Validation
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Step 12: Make predictions
y_pred = model.predict(X_test)

# Step 13: Evaluate model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, zero_division=1)
precision = precision_score(y_test, y_pred, zero_division=1)

# Step 14: Display evaluation metrics
print("\nOptimized Decision Tree Model")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Accuracy: {accuracy:.4f}")  # Should improve over 75%+
print(f"F1-Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")

Label Distribution After Augmentation:
 1    566
0    434
Name: Status, dtype: int64
Cross-Validation Accuracy: 0.8150 ± 0.0094

Optimized Decision Tree Model
Mean Squared Error (MSE): 0.2300
Root Mean Squared Error (RMSE): 0.4796
Accuracy: 0.7700
F1-Score: 0.8000
Precision: 0.7863
