In [25]:
from pathlib import Path

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

In [26]:
# Read the processed dataset from disk into a DataFrame.
processed_csv = "../data/processed_data.csv"
df = pd.read_csv(processed_csv)

In [27]:
# 1. Separate the target column from the features
# `y` is the "Need_Maintenance" column; `X` is every other column of `df`.
y = df["Need_Maintenance"]
X = df.drop("Need_Maintenance", axis=1)

In [28]:
# 2. Standardize the features
# The scaler is fitted on the full feature frame and then applied to that
# same frame; the scaled values are wrapped back into a DataFrame that
# carries the original column names.
# NOTE(review): the fitted `scaler` is not persisted in the cells visible
# here — confirm it is saved elsewhere if it is needed at inference time.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X.columns)

In [29]:
# Persist the feature column names, one per line with no header or index,
# in the same order as the columns of the scaled feature frame.
feature_columns_path = Path("../models/feature_columns.csv")
# to_csv does not create missing parent directories, so make sure the
# target directory exists before writing (no-op when it is already there).
feature_columns_path.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(X_scaled_df.columns).to_csv(feature_columns_path, index=False, header=False)


In [30]:
# 3. Resample with SMOTE to balance the target variable
# The fixed random_state keeps the resampling reproducible between runs.
# NOTE(review): resampling happens here on the whole scaled dataset; no
# train/test split is visible in this section — confirm that evaluation
# data is held out appropriately downstream.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_scaled_df, y)
X_balanced, y_balanced = resampled

In [31]:
# 4. Join the resampled features and target into a single frame
# The features keep the scaled frame's column names and the target is
# appended as the "Need_Maintenance" column.
features_part = pd.DataFrame(X_balanced, columns=X_scaled_df.columns)
target_part = pd.Series(y_balanced, name="Need_Maintenance")
balanced_df = pd.concat([features_part, target_part], axis=1)

In [32]:
# 5. Write the balanced dataset to CSV (row index omitted) and confirm
# NOTE(review): the confirmation message omits the leading "../" of the
# actual output path — presumably it is project-root-relative; confirm.
balanced_csv = "../data/balanced_data.csv"
balanced_df.to_csv(balanced_csv, index=False)
print("Balanced data saved to 'data/balanced_data.csv'")

Balanced data saved to 'data/balanced_data.csv'
