In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error, r2_score

# Load datasets
file_paths = {
    "train_ammonia": "train_ammonia.csv",
    "train_BOD": "train_BOD.csv",
    "train_DOP": "train_DOP.csv",
    "train_phosphates": "train_phosphates.csv",
    "train_SS": "train_SS.csv",
    "test_ammonium": "test_ammonium.csv",
    "test_BOD": "test_BOD.csv",
    "test_DOP": "test_DOP.csv",
    "test_phosphates": "test_phosphates.csv",
    "test_SS": "test_SS.csv",
}

datasets = {name: pd.read_csv(path) for name, path in file_paths.items()}

# Merge all training datasets
train_data = []
for name in ["train_ammonia", "train_BOD", "train_DOP", "train_phosphates", "train_SS"]:
    train_df = datasets[name].drop(columns=["Id"], errors='ignore')
    train_data.append(train_df)

train_merged = pd.concat(train_data, axis=1)
train_merged = train_merged.loc[:, ~train_merged.columns.duplicated()]  # Remove duplicates

# Compute WQI using a weighted sum approach
weights = {
    "pH": 0.2, "BOD": 0.3, "DOP": 0.1, "phosphates": 0.2, "SS": 0.2
}  # Example weights, modify as needed

# Normalize individual parameters (assuming higher values = worse quality)
def normalize(series, ideal_range):
    return np.clip((series - ideal_range[0]) / (ideal_range[1] - ideal_range[0]) * 100, 0, 100)

train_merged["WQI"] = (
    normalize(train_merged["pH"], (6.5, 8.5)) * weights["pH"] +
    normalize(train_merged["BOD"], (0, 5)) * weights["BOD"] +
    normalize(train_merged["DOP"], (0, 0.1)) * weights["DOP"] +
    normalize(train_merged["phosphates"], (0, 0.5)) * weights["phosphates"] +
    normalize(train_merged["SS"], (0, 50)) * weights["SS"]
)

# Define features and target
y = train_merged["WQI"]
X = train_merged.drop(columns=["WQI"])

# Handle missing values using KNN imputation
imputer = KNNImputer(n_neighbors=5)
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Feature Scaling
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Gradient Boosting Model on WQI
model = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_val)

# Model Evaluation
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"Gradient Boosting Model Performance on WQI:")
print(f"MAE: {mae:.4f}")
print(f"R² Score: {r2:.4f}")

# Plot Actual vs Predicted WQI
plt.figure(figsize=(8, 6))
plt.scatter(y_val, y_pred, alpha=0.7, color="blue", label="Predicted vs Actual")
plt.plot([min(y_val), max(y_val)], [min(y_val), max(y_val)], color="red", linestyle="--", label="Perfect Fit")
plt.xlabel("Actual WQI")
plt.ylabel("Predicted WQI")
plt.title("Gradient Boosting Model - Water Quality Index Prediction")
plt.legend()
plt.grid(True)
plt.show()



KeyError: 'pH'