In [71]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.preprocessing import PolynomialFeatures

In [72]:
import pandas as pd

file_path = r"C:\Users\vishnuvarthan\Desktop\project\water\balanced_water_potability1.csv"
data = pd.read_csv(file_path, encoding='utf-8')

# Show the first 5 rows
print(data.head())

# Show all column names
print("\nColumns in dataset:", list(data.columns))



         ph    Hardness       Solids  Chloramines     Sulfate  Conductivity  \
0  7.080795  204.890456  20791.31898     7.300212  368.516441    564.308654   
1  8.099124  224.236259  19909.54173     9.275884  333.775777    418.606213   
2  8.316766  214.373394  22018.41744     8.059332  356.886136    363.266516   
3  9.092223  181.101509  17978.98634     6.546600  310.135738    398.410813   
4  5.584087  188.313324  28748.68774     7.544869  326.678363    280.467916   

   Organic_carbon  Trihalomethanes  Turbidity  Potability  
0       10.379783        86.990970   2.963135           0  
1       16.868637        66.420093   3.055934           0  
2       18.436525       100.341674   4.628771           0  
3       11.558279        31.997993   4.075075           0  
4        8.399735        54.917862   2.559708           0  

Columns in dataset: ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability']


In [73]:
data.shape

(3342, 10)

In [74]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3342 entries, 0 to 3341
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               3342 non-null   float64
 1   Hardness         3342 non-null   float64
 2   Solids           3342 non-null   float64
 3   Chloramines      3342 non-null   float64
 4   Sulfate          3342 non-null   float64
 5   Conductivity     3342 non-null   float64
 6   Organic_carbon   3342 non-null   float64
 7   Trihalomethanes  3342 non-null   float64
 8   Turbidity        3342 non-null   float64
 9   Potability       3342 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 261.2 KB


In [75]:
data.isnull().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [76]:
count_target = data[next((col for col in data.columns if 'potability' in col.lower()), 'Potability')].value_counts()
print(count_target)


0    1671
1    1671
Name: Potability, dtype: int64


In [77]:
data['Log_Hardness'] = np.log1p(data['Hardness'])
X = data.drop('Potability', axis=1)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly_features = poly.fit_transform(X)
poly_feature_names = poly.get_feature_names_out(X.columns)
poly_df = pd.DataFrame(poly_features, columns=poly_feature_names)


In [78]:
y = data['Potability']

In [79]:
X_train, X_test, y_train, y_test = train_test_split(poly_df, y, test_size=0.3, random_state=42)

In [80]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [81]:
estimators = [
    ('rf', RandomForestClassifier(n_estimators=200, random_state=42)),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ('svc', SVC(probability=True, class_weight='balanced', random_state=42))
]
stacking_model = StackingClassifier(estimators=estimators, final_estimator=GradientBoostingClassifier())

In [82]:
stacking_model.fit(X_train_resampled, y_train_resampled)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



StackingClassifier(estimators=[('rf',
                                RandomForestClassifier(n_estimators=200,
                                                       random_state=42)),
                               ('xgb',
                                XGBClassifier(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric='logloss',
                                              feature_types=None, gamma=None,
                                              grow_po...
                    

In [83]:
y_pred_stacking = stacking_model.predict(X_test)


In [84]:
print("Stacking Model Accuracy:", accuracy_score(y_test, y_pred_stacking))
print("Confusion Matrix (Stacking):\n", confusion_matrix(y_test, y_pred_stacking))
print("Classification Report (Stacking):\n", classification_report(y_test, y_pred_stacking))

Stacking Model Accuracy: 0.6949152542372882
Confusion Matrix (Stacking):
 [[399 110]
 [196 298]]
Classification Report (Stacking):
               precision    recall  f1-score   support

           0       0.67      0.78      0.72       509
           1       0.73      0.60      0.66       494

    accuracy                           0.69      1003
   macro avg       0.70      0.69      0.69      1003
weighted avg       0.70      0.69      0.69      1003

