In [None]:

import numpy as np
import pandas as pd


import os
# List the files that sit next to the dataset.
# os.walk() only descends into directories: handed the CSV path itself it
# yields nothing and the loop never prints, so walk the containing directory.
for dirname, _, filenames in os.walk(os.path.dirname('/content/water_potability (1).csv')):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV

# Load the raw data. `df` stays a fully mean-imputed copy because later
# exploratory cells read it; the model inputs are imputed separately below.
raw_df = pd.read_csv('/content/water_potability (1).csv')
df = raw_df.fillna(raw_df.mean())

X = df.drop('Potability', axis=1)
Y = df['Potability']

# Split the *un-imputed* features. The row partition depends only on the number
# of rows and random_state, so it is the same split as before.
X_train, X_test, Y_train, Y_test = train_test_split(
    raw_df.drop('Potability', axis=1), Y, test_size=0.2, random_state=42)

# Fill gaps with statistics from the training rows only, so nothing about the
# held-out test rows leaks into the features the model is trained on.
# (The CV folds inside GridSearchCV still share these training means; a
# Pipeline with an imputer step would remove that as well.)
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# define models and parameters
# A fixed seed keeps splitter="random" and split tie-breaking repeatable.
model = DecisionTreeClassifier(random_state=42)
criterion = ["gini", "entropy"]
splitter = ["best", "random"]
min_samples_split = [2,4,6,8,10]

# define grid search
grid = dict(splitter=splitter, criterion=criterion, min_samples_split=min_samples_split)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search_dt = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='accuracy',error_score=0)

grid_search_dt.fit(X_train, Y_train)

# summarize results
print(f"Best: {grid_search_dt.best_score_:.3f} using {grid_search_dt.best_params_}")
means = grid_search_dt.cv_results_['mean_test_score']
stds = grid_search_dt.cv_results_['std_test_score']
params = grid_search_dt.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:.3f} ({stdev:.3f}) with: {param}")

# Accuracy (in percent) of the refit best estimator on both splits.
print("Training Score:",grid_search_dt.score(X_train, Y_train)*100)
print("Testing Score:", grid_search_dt.score(X_test, Y_test)*100)

Best: 0.592 using {'criterion': 'gini', 'min_samples_split': 10, 'splitter': 'random'}
0.587 (0.029) with: {'criterion': 'gini', 'min_samples_split': 2, 'splitter': 'best'}
0.579 (0.026) with: {'criterion': 'gini', 'min_samples_split': 2, 'splitter': 'random'}
0.587 (0.030) with: {'criterion': 'gini', 'min_samples_split': 4, 'splitter': 'best'}
0.580 (0.028) with: {'criterion': 'gini', 'min_samples_split': 4, 'splitter': 'random'}
0.589 (0.033) with: {'criterion': 'gini', 'min_samples_split': 6, 'splitter': 'best'}
0.584 (0.030) with: {'criterion': 'gini', 'min_samples_split': 6, 'splitter': 'random'}
0.588 (0.033) with: {'criterion': 'gini', 'min_samples_split': 8, 'splitter': 'best'}
0.587 (0.031) with: {'criterion': 'gini', 'min_samples_split': 8, 'splitter': 'random'}
0.585 (0.032) with: {'criterion': 'gini', 'min_samples_split': 10, 'splitter': 'best'}
0.592 (0.030) with: {'criterion': 'gini', 'min_samples_split': 10, 'splitter': 'random'}
0.586 (0.028) with: {'criterion': 'entrop

In [None]:

from sklearn.metrics import classification_report, accuracy_score

from sklearn.metrics import  make_scorer
from sklearn.model_selection import cross_val_score
def classification_report_with_accuracy_score(Y_test, y_pred2):
    """Print a classification report for one scored fold and return its accuracy.

    Passed to make_scorer below, so every fold that gets scored also prints
    its own report as a side effect.

    Args:
        Y_test: true labels of the fold being scored.
        y_pred2: labels predicted for that fold.

    Returns:
        accuracy_score of y_pred2 against Y_test.
    """
    print (classification_report(Y_test, y_pred2)) # print classification report
    return accuracy_score(Y_test, y_pred2) # return accuracy score
# Nested CV: the GridSearchCV object itself is the estimator, so the
# hyperparameter search is repeated on every outer split produced by `cv`.
nested_score = cross_val_score(grid_search_dt, X=X_train, y=Y_train, cv=cv,
               scoring=make_scorer(classification_report_with_accuracy_score))
nested_score

              precision    recall  f1-score   support

           0       0.69      0.73      0.71       159
           1       0.54      0.49      0.51       103

    accuracy                           0.63       262
   macro avg       0.61      0.61      0.61       262
weighted avg       0.63      0.63      0.63       262

              precision    recall  f1-score   support

           0       0.66      0.67      0.66       159
           1       0.48      0.47      0.47       103

    accuracy                           0.59       262
   macro avg       0.57      0.57      0.57       262
weighted avg       0.59      0.59      0.59       262

              precision    recall  f1-score   support

           0       0.67      0.64      0.65       159
           1       0.48      0.50      0.49       103

    accuracy                           0.59       262
   macro avg       0.57      0.57      0.57       262
weighted avg       0.59      0.59      0.59       262

              preci

array([0.63358779, 0.58778626, 0.58778626, 0.58778626, 0.60305344,
       0.59160305, 0.53435115, 0.59923664, 0.54961832, 0.5648855 ])

In [None]:
# Predict test-set labels with the fitted grid search and display them.
hpt_y_predicted = grid_search_dt.predict(X_test)
hpt_y_predicted

array([0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,

In [None]:


# Hold-out accuracy (a fraction, not a percentage) plus the per-class report.
hpt_accuracy = accuracy_score(Y_test, hpt_y_predicted)
print("Accuracy:", hpt_accuracy)
print("Classification Report:")
print(classification_report(Y_test, hpt_y_predicted))

Accuracy: 0.6189024390243902
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.69      0.69       412
           1       0.49      0.50      0.50       244

    accuracy                           0.62       656
   macro avg       0.59      0.60      0.59       656
weighted avg       0.62      0.62      0.62       656



In [None]:

# Consolidated decision-tree report. Static headings are grouped into single
# print calls (one item per line); computed values keep their own statements.
print(
    "\n--- Hyperparameter Tuning Results ---",
    f"Best accuracy achieved during CV: {grid_search_dt.best_score_:.3f}",
    f"Best parameters found: {grid_search_dt.best_params_}",
    "\n--- Performance on Training and Test Sets using Best Model ---",
    sep="\n",
)

# Scores are scaled to percentages here; the last line prints the raw fraction.
print("Training Accuracy:", grid_search_dt.score(X_train, Y_train)*100)
print("Testing Accuracy:", grid_search_dt.score(X_test, Y_test)*100)

print("\n--- Classification Report on Test Set ---")
print(classification_report(Y_test, hpt_y_predicted))

print("\n--- Accuracy Score on Test Set ---")
print("Accuracy:", hpt_accuracy)


--- Hyperparameter Tuning Results ---
Best accuracy achieved during CV: 0.592
Best parameters found: {'criterion': 'gini', 'min_samples_split': 10, 'splitter': 'random'}

--- Performance on Training and Test Sets using Best Model ---
Training Accuracy: 85.34351145038168
Testing Accuracy: 61.890243902439025

--- Classification Report on Test Set ---
              precision    recall  f1-score   support

           0       0.70      0.69      0.69       412
           1       0.49      0.50      0.50       244

    accuracy                           0.62       656
   macro avg       0.59      0.60      0.59       656
weighted avg       0.62      0.62      0.62       656


--- Accuracy Score on Test Set ---
Accuracy: 0.6189024390243902


In [None]:

import pandas as pd

# A single hand-written water sample. Each feature maps to a one-element list
# so pandas builds a one-row frame with the columns in this order.
new_data_point = dict(
    ph=[7.0],
    Hardness=[180.0],
    Solids=[20000.0],
    Chloramines=[7.0],
    Sulfate=[350.0],
    Conductivity=[400.0],
    Organic_carbon=[15.0],
    Trihalomethanes=[70.0],
    Turbidity=[4.0],
)
new_data = pd.DataFrame(data=new_data_point)

# Classify the hand-written sample with the fitted decision-tree grid search.
predictions = grid_search_dt.predict(new_data)

print("\n--- Prediction on new data ---")
print("Input Data:")
print(new_data)
print("\nPredicted Potability (0: Not Potable, 1: Potable):")
# Bare last expression so the notebook displays the predicted label array.
predictions



--- Prediction on new data ---
Input Data:
    ph  Hardness   Solids  Chloramines  Sulfate  Conductivity  Organic_carbon  \
0  7.0     180.0  20000.0          7.0    350.0         400.0            15.0   

   Trihalomethanes  Turbidity  
0             70.0        4.0  

Predicted Potability (0: Not Potable, 1: Potable):


array([0])

In [None]:
from sklearn.metrics import classification_report, accuracy_score

from sklearn.metrics import  make_scorer
from sklearn.model_selection import cross_val_score
# NOTE(review): this helper is defined again here with the same body as in
# other cells of this notebook; the latest executed definition wins.
def classification_report_with_accuracy_score(Y_test, y_pred2):
    """Print a classification report for one scored fold and return its accuracy.

    Args:
        Y_test: true labels of the fold being scored.
        y_pred2: labels predicted for that fold.

    Returns:
        accuracy_score of y_pred2 against Y_test.
    """
    print (classification_report(Y_test, y_pred2)) # print classification report
    return accuracy_score(Y_test, y_pred2) # return accuracy score
# Nested CV: the grid search is re-run on every outer split produced by `cv`.
nested_score = cross_val_score(grid_search_dt, X=X_train, y=Y_train, cv=cv,
               scoring=make_scorer(classification_report_with_accuracy_score))
print (nested_score)

              precision    recall  f1-score   support

           0       0.68      0.64      0.66       159
           1       0.49      0.52      0.50       103

    accuracy                           0.60       262
   macro avg       0.58      0.58      0.58       262
weighted avg       0.60      0.60      0.60       262

              precision    recall  f1-score   support

           0       0.68      0.72      0.70       159
           1       0.53      0.48      0.50       103

    accuracy                           0.63       262
   macro avg       0.60      0.60      0.60       262
weighted avg       0.62      0.63      0.62       262

              precision    recall  f1-score   support

           0       0.68      0.67      0.67       159
           1       0.50      0.50      0.50       103

    accuracy                           0.60       262
   macro avg       0.59      0.59      0.59       262
weighted avg       0.60      0.60      0.60       262

              preci

In [None]:

# Decision tree with a wider hyperparameter grid.
# A fixed seed keeps splitter="random" and split tie-breaking repeatable.
model = DecisionTreeClassifier(random_state=42)
criterion = ["gini", "entropy"]
splitter = ["best", "random"]
max_depth = [2, 4, 6, 8, 10, 12]  # Adding max_depth for tuning
min_samples_split = [2, 4, 6, 8, 10]
min_samples_leaf = [1, 2, 3, 4, 5] # Adding min_samples_leaf for tuning

# define grid search
grid = dict(splitter=splitter, criterion=criterion, max_depth=max_depth,
            min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search_dt = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='accuracy',error_score=0)

grid_search_dt.fit(X_train, Y_train)

# summarize results
print(f"Best: {grid_search_dt.best_score_:.3f} using {grid_search_dt.best_params_}")
means = grid_search_dt.cv_results_['mean_test_score']
stds = grid_search_dt.cv_results_['std_test_score']
params = grid_search_dt.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:.3f} ({stdev:.3f}) with: {param}")

print("Training Score:",grid_search_dt.score(X_train, Y_train)*100)
print("Testing Score:", grid_search_dt.score(X_test, Y_test)*100)


def classification_report_with_accuracy_score(Y_test, y_pred2):
    """Print a classification report for one scored fold and return its accuracy.

    Args:
        Y_test: true labels of the fold being scored.
        y_pred2: labels predicted for that fold.

    Returns:
        accuracy_score of y_pred2 against Y_test.
    """
    print (classification_report(Y_test, y_pred2)) # print classification report
    return accuracy_score(Y_test, y_pred2) # return accuracy score

best_model = grid_search_dt.best_estimator_
# Plain cross-validation of the already selected hyperparameters. This is NOT
# nested CV: the same folds were used to pick these parameters, so the scores
# are optimistic. Pass `grid_search_dt` itself to cross_val_score for a nested
# estimate. The run used to be repeated verbatim at the end of this cell; it is
# executed once now.
nested_score = cross_val_score(best_model, X=X_train, y=Y_train, cv=cv,
               scoring=make_scorer(classification_report_with_accuracy_score))
print("Cross-Validation Scores of the selected model (not nested):", nested_score)


hpt_y_predicted = grid_search_dt.predict(X_test)


hpt_accuracy = accuracy_score(Y_test, hpt_y_predicted)
print("Accuracy:", hpt_accuracy)
print("Classification Report:")
print(classification_report(Y_test, hpt_y_predicted))

print("\n--- Hyperparameter Tuning Results ---")
print(f"Best accuracy achieved during CV: {grid_search_dt.best_score_:.3f}")
print(f"Best parameters found: {grid_search_dt.best_params_}")
print("\n--- Performance on Training and Test Sets using Best Model ---")
print("Training Accuracy:", grid_search_dt.score(X_train, Y_train)*100)
print("Testing Accuracy:", grid_search_dt.score(X_test, Y_test)*100)
print("\n--- Classification Report on Test Set ---")
print(classification_report(Y_test, hpt_y_predicted))
print("\n--- Accuracy Score on Test Set ---")
print("Accuracy:", hpt_accuracy)



Best: 0.642 using {'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 8, 'splitter': 'best'}
0.614 (0.008) with: {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}
0.608 (0.010) with: {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'}
0.614 (0.008) with: {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 4, 'splitter': 'best'}
0.610 (0.012) with: {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 4, 'splitter': 'random'}
0.614 (0.008) with: {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 6, 'splitter': 'best'}
0.608 (0.011) with: {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 6, 'splitter': 'random'}
0.614 (0.008) with: {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 8, 'splitter': 'bes

In [None]:
from sklearn.metrics import  make_scorer
from sklearn.model_selection import cross_val_score

# NOTE(review): same helper body as in other cells of this notebook.
def classification_report_with_accuracy_score(Y_test, y_pred2):
    """Print a classification report for one scored fold and return its accuracy.

    Args:
        Y_test: true labels of the fold being scored.
        y_pred2: labels predicted for that fold.

    Returns:
        accuracy_score of y_pred2 against Y_test.
    """
    print (classification_report(Y_test, y_pred2)) # print classification report
    return accuracy_score(Y_test, y_pred2) # return accuracy score


# Nested CV: the whole grid search is the estimator, so the search is repeated
# on every outer split of `cv`.
# NOTE(review): with the larger grid this is likely very slow; confirm it is
# intended to run on every execution.
nested_score = cross_val_score(grid_search_dt, X=X_train, y=Y_train, cv=cv,
               scoring=make_scorer(classification_report_with_accuracy_score))
print (nested_score)

              precision    recall  f1-score   support

           0       0.69      0.66      0.67       159
           1       0.50      0.53      0.52       103

    accuracy                           0.61       262
   macro avg       0.60      0.60      0.60       262
weighted avg       0.61      0.61      0.61       262

              precision    recall  f1-score   support

           0       0.64      0.71      0.67       159
           1       0.47      0.39      0.42       103

    accuracy                           0.58       262
   macro avg       0.55      0.55      0.55       262
weighted avg       0.57      0.58      0.58       262

              precision    recall  f1-score   support

           0       0.68      0.68      0.68       159
           1       0.51      0.51      0.51       103

    accuracy                           0.61       262
   macro avg       0.60      0.60      0.60       262
weighted avg       0.62      0.61      0.61       262

              preci

In [None]:

# Refit estimator carrying the best hyperparameters from the grid search.
best_model = grid_search_dt.best_estimator_

# Evaluate on the test set
hpt_y_predicted = best_model.predict(X_test)

# Calculate and print evaluation metrics
hpt_accuracy = accuracy_score(Y_test, hpt_y_predicted)

print("\n--- Final Model Evaluation ---")
print(f"Best parameters found during tuning: {grid_search_dt.best_params_}")
print("\n--- Performance on Training and Test Sets using Best Model ---")
print("Training Accuracy:", best_model.score(X_train, Y_train)*100)
print("Testing Accuracy:", hpt_accuracy*100) # Use calculated test accuracy
print("\n--- Classification Report on Test Set ---")
print(classification_report(Y_test, hpt_y_predicted))
print("\n--- Accuracy Score on Test Set ---")
print("Accuracy:", hpt_accuracy)

# Cross-validate the selected model on the training split.
# This is plain CV of fixed hyperparameters, not nested CV: the parameters were
# chosen on this same training data, so these scores are optimistic.
def classification_report_with_accuracy_score(Y_test, y_pred2):
    """Print a classification report for one scored fold and return its accuracy.

    Args:
        Y_test: true labels of the fold being scored.
        y_pred2: labels predicted for that fold.

    Returns:
        accuracy_score of y_pred2 against Y_test.
    """
    print (classification_report(Y_test, y_pred2)) # print classification report
    return accuracy_score(Y_test, y_pred2) # return accuracy score

nested_score = cross_val_score(best_model, X=X_train, y=Y_train, cv=cv,
               scoring=make_scorer(classification_report_with_accuracy_score))
# The heading used to call these "nested" scores, which they are not.
print("\n--- Cross-Validation Scores of the Selected Model (not nested) ---")
nested_score




--- Final Model Evaluation ---
Best parameters found during tuning: {'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 8, 'splitter': 'best'}

--- Performance on Training and Test Sets using Best Model ---
Training Accuracy: 73.81679389312977
Testing Accuracy: 64.17682926829268

--- Classification Report on Test Set ---
              precision    recall  f1-score   support

           0       0.68      0.83      0.74       412
           1       0.53      0.33      0.41       244

    accuracy                           0.64       656
   macro avg       0.60      0.58      0.57       656
weighted avg       0.62      0.64      0.62       656


--- Accuracy Score on Test Set ---
Accuracy: 0.6417682926829268
              precision    recall  f1-score   support

           0       0.66      0.87      0.75       159
           1       0.60      0.29      0.39       103

    accuracy                           0.65       262
   macro avg       0.63      0.58   

array([0.64503817, 0.65648855, 0.67175573, 0.6259542 , 0.65648855,
       0.66030534, 0.66412214, 0.66030534, 0.61450382, 0.6259542 ,
       0.61068702, 0.63358779, 0.60687023, 0.60687023, 0.67175573,
       0.64122137, 0.69465649, 0.66412214, 0.61832061, 0.63740458,
       0.66030534, 0.6221374 , 0.65648855, 0.64503817, 0.64122137,
       0.61832061, 0.6259542 , 0.66030534, 0.61450382, 0.6259542 ])

In [None]:

# Test-set predictions from whichever grid_search_dt is currently in the kernel.
# NOTE(review): the saved output of the next cell shows the 3-parameter grid's
# best_params_, so these cells appear to have been run out of order; re-run top to bottom.
dt_y_predicted = grid_search_dt.predict(X_test)
dt_y_predicted

array([0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,

In [None]:
# Display the winning hyperparameter combination of the fitted grid search.
grid_search_dt.best_params_

{'criterion': 'gini', 'min_samples_split': 10, 'splitter': 'random'}

In [None]:

# Hold-out accuracy (fraction) of the decision-tree predictions.
dt_grid_score=accuracy_score(Y_test, dt_y_predicted)
dt_grid_score

0.6189024390243902

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the test set: rows are true classes, columns are predicted classes.
confusion_matrix(Y_test, dt_y_predicted)

array([[283, 129],
       [121, 123]])

KNN hyperparameter tuning (HPT)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# define models and parameters
model = KNeighborsClassifier()
n_neighbors = range(1, 31)
weights = ['uniform', 'distance']
# 'minkowski' with the default p=2 is the Euclidean distance, so listing it as
# well only repeated every 'euclidean' candidate and added a third more fits.
metric = ['euclidean', 'manhattan']

# define grid search
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
# This rebinds the notebook-wide `cv` name to a single-repeat splitter, replacing
# the 3-repeat one set up for the decision tree.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1)
grid_search_knn = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='accuracy',error_score=0)
# The features are used unscaled here; a later cell repeats the search on
# min-max normalized inputs.
grid_search_knn.fit(X_train, Y_train)

# summarize results
print(f"Best: {grid_search_knn.best_score_:.3f} using {grid_search_knn.best_params_}")
means = grid_search_knn.cv_results_['mean_test_score']
stds = grid_search_knn.cv_results_['std_test_score']
params = grid_search_knn.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:.3f} ({stdev:.3f}) with: {param}")

Best: 0.595 using {'metric': 'manhattan', 'n_neighbors': 16, 'weights': 'uniform'}
0.534 (0.019) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.534 (0.019) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.587 (0.026) with: {'metric': 'euclidean', 'n_neighbors': 2, 'weights': 'uniform'}
0.534 (0.019) with: {'metric': 'euclidean', 'n_neighbors': 2, 'weights': 'distance'}
0.549 (0.029) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.547 (0.027) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.582 (0.022) with: {'metric': 'euclidean', 'n_neighbors': 4, 'weights': 'uniform'}
0.555 (0.024) with: {'metric': 'euclidean', 'n_neighbors': 4, 'weights': 'distance'}
0.564 (0.031) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.566 (0.036) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.584 (0.024) with: {'metric': 'euclidean', 'n_neighbors': 6, 'weights':

In [None]:
from sklearn.metrics import  make_scorer
from sklearn.model_selection import cross_val_score

# NOTE(review): same helper body as in other cells of this notebook.
def classification_report_with_accuracy_score(Y_test, y_pred2):
    """Print a classification report for one scored fold and return its accuracy.

    Args:
        Y_test: true labels of the fold being scored.
        y_pred2: labels predicted for that fold.

    Returns:
        accuracy_score of y_pred2 against Y_test.
    """
    print (classification_report(Y_test, y_pred2)) # print classification report
    return accuracy_score(Y_test, y_pred2) # return accuracy score


# Nested CV for KNN: the grid search is repeated on every outer split of `cv`.
nested_score = cross_val_score(grid_search_knn, X=X_train, y=Y_train, cv=cv,
               scoring=make_scorer(classification_report_with_accuracy_score))
print (nested_score)

              precision    recall  f1-score   support

           0       0.60      0.84      0.70       159
           1       0.36      0.14      0.20       103

    accuracy                           0.56       262
   macro avg       0.48      0.49      0.45       262
weighted avg       0.51      0.56      0.50       262

              precision    recall  f1-score   support

           0       0.60      0.84      0.70       159
           1       0.38      0.16      0.22       103

    accuracy                           0.57       262
   macro avg       0.49      0.50      0.46       262
weighted avg       0.52      0.57      0.51       262

              precision    recall  f1-score   support

           0       0.61      0.92      0.73       159
           1       0.43      0.09      0.15       103

    accuracy                           0.60       262
   macro avg       0.52      0.51      0.44       262
weighted avg       0.54      0.60      0.50       262

              preci

In [None]:
# Test-set predictions from the fitted KNN grid search.
knn_y_predicted = grid_search_knn.predict(X_test)

In [None]:
# Display the full prediction array (one label per test row).
knn_y_predicted

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,

In [None]:
# Hold-out accuracy (fraction) of the KNN predictions.
knn_grid_score=accuracy_score(Y_test, knn_y_predicted)

In [None]:
# Display the KNN test accuracy computed in the previous cell.
knn_grid_score

0.600609756097561

In [None]:
# Display the winning KNN hyperparameter combination.
grid_search_knn.best_params_


{'metric': 'manhattan', 'n_neighbors': 16, 'weights': 'uniform'}

In [None]:
# Confusion matrix on the test set: rows are true classes, columns are predicted classes.
confusion_matrix(Y_test, knn_y_predicted)

array([[371,  41],
       [221,  23]])

Prediction on a single sample

In [None]:
# Classify one hand-entered sample. The search was fitted on a DataFrame, so the
# values are wrapped in a one-row frame carrying the training column names; a
# bare nested list makes scikit-learn warn that X lacks the feature names seen
# during fit and silently relies on positional order.
X_KNN=grid_search_knn.predict(pd.DataFrame(
    [[5.735724, 158.318741,25363.016594,7.728601,377.543291,568.304671,13.626624,75.952337,4.732954]],
    columns=X_train.columns))



In [None]:
# Display the predicted label for the single sample.
X_KNN

array([0])

In [None]:


# Consolidated KNN report. Static headings are grouped into single print calls
# (one item per line); every computed value keeps its own statement.
print(
    "\n--- KNN Hyperparameter Tuning Results ---",
    f"Best accuracy achieved during CV: {grid_search_knn.best_score_:.3f}",
    f"Best parameters found: {grid_search_knn.best_params_}",
    "\n--- Performance on Training and Test Sets using Best Model ---",
    sep="\n",
)

# Scores are scaled to percentages here; knn_grid_score below is the raw fraction.
print("Training Accuracy:", grid_search_knn.score(X_train, Y_train)*100)
print("Testing Accuracy:", grid_search_knn.score(X_test, Y_test)*100)

print("\n--- Classification Report on Test Set ---")
print(classification_report(Y_test, knn_y_predicted))

print("\n--- Accuracy Score on Test Set ---")
print("Accuracy:", knn_grid_score)

print("\n--- Confusion Matrix on Test Set ---")
print(confusion_matrix(Y_test, knn_y_predicted))

print("\n--- Prediction on new data using KNN ---", "Input Data:", sep="\n")
print(new_data)  # relies on new_data built in an earlier cell
print("\nPredicted Potability (0: Not Potable, 1: Potable):")
knn_predictions = grid_search_knn.predict(new_data)
knn_predictions



--- KNN Hyperparameter Tuning Results ---
Best accuracy achieved during CV: 0.595
Best parameters found: {'metric': 'manhattan', 'n_neighbors': 16, 'weights': 'uniform'}

--- Performance on Training and Test Sets using Best Model ---
Training Accuracy: 63.93129770992366
Testing Accuracy: 60.0609756097561

--- Classification Report on Test Set ---
              precision    recall  f1-score   support

           0       0.63      0.90      0.74       412
           1       0.36      0.09      0.15       244

    accuracy                           0.60       656
   macro avg       0.49      0.50      0.44       656
weighted avg       0.53      0.60      0.52       656


--- Accuracy Score on Test Set ---
Accuracy: 0.600609756097561

--- Confusion Matrix on Test Set ---
[[371  41]
 [221  23]]

--- Prediction on new data using KNN ---
Input Data:
    ph  Hardness   Solids  Chloramines  Sulfate  Conductivity  Organic_carbon  \
0  7.0     180.0  20000.0          7.0    350.0         400.0  

array([0])

In [None]:


from sklearn.preprocessing import MinMaxScaler


# Min-max scale the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split and the hand-written sample.
# NOTE(review): the scaler sees all training rows before the grid search splits
# them, so each CV validation fold still influences the scaling. Putting the
# scaler and KNN in a Pipeline would fix that, but the cells below expect these
# pre-scaled arrays and a bare KNN estimator, so it is left as is here.
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
new_data_normalized = scaler.transform(new_data)


model_knn_norm = KNeighborsClassifier()
n_neighbors = range(1, 31)
weights = ['uniform', 'distance']
# 'minkowski' with the default p=2 is the Euclidean distance, so listing it as
# well only repeated every 'euclidean' candidate and added a third more fits.
metric = ['euclidean', 'manhattan']


grid_knn_norm = dict(n_neighbors=n_neighbors, weights=weights, metric=metric)
cv_knn_norm = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1)
grid_search_knn_norm = GridSearchCV(estimator=model_knn_norm, param_grid=grid_knn_norm, n_jobs=-1, cv=cv_knn_norm,
                           scoring='accuracy', error_score=0)
grid_search_knn_norm.fit(X_train_normalized, Y_train)


print("\n--- KNN Hyperparameter Tuning Results with Normalization ---")
print(f"Best: {grid_search_knn_norm.best_score_:.3f} using {grid_search_knn_norm.best_params_}")
means_knn_norm = grid_search_knn_norm.cv_results_['mean_test_score']
stds_knn_norm = grid_search_knn_norm.cv_results_['std_test_score']
params_knn_norm = grid_search_knn_norm.cv_results_['params']

for mean, stdev, param in zip(means_knn_norm, stds_knn_norm, params_knn_norm):
    print(f"{mean:.3f} ({stdev:.3f}) with: {param}")

# Evaluate the best model on the test set
best_model_knn_norm = grid_search_knn_norm.best_estimator_
knn_norm_y_predicted = best_model_knn_norm.predict(X_test_normalized)

# Calculate and print evaluation metrics
knn_norm_accuracy = accuracy_score(Y_test, knn_norm_y_predicted)

print("\n--- Performance on Training and Test Sets using Best Normalized KNN Model ---")
print("Training Accuracy:", best_model_knn_norm.score(X_train_normalized, Y_train)*100)
print("Testing Accuracy:", knn_norm_accuracy*100)
print("\n--- Classification Report on Test Set (Normalized Data) ---")
print(classification_report(Y_test, knn_norm_y_predicted))
print("\n--- Accuracy Score on Test Set (Normalized Data) ---")
print("Accuracy:", knn_norm_accuracy)
print("\n--- Confusion Matrix on Test Set (Normalized Data) ---")
print(confusion_matrix(Y_test, knn_norm_y_predicted))



# The estimator was trained on scaled inputs, so it must be given the scaled sample.
knn_norm_predictions_new_data = best_model_knn_norm.predict(new_data_normalized)

print("\n--- Prediction on new data using Normalized KNN ---")
print("Input Data (Normalized):")
print(new_data_normalized)
print("\nPredicted Potability (0: Not Potable, 1: Potable):")
knn_norm_predictions_new_data




--- KNN Hyperparameter Tuning Results with Normalization ---
Best: 0.648 using {'metric': 'manhattan', 'n_neighbors': 28, 'weights': 'distance'}
0.591 (0.036) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.591 (0.036) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.626 (0.019) with: {'metric': 'euclidean', 'n_neighbors': 2, 'weights': 'uniform'}
0.591 (0.036) with: {'metric': 'euclidean', 'n_neighbors': 2, 'weights': 'distance'}
0.606 (0.029) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.606 (0.029) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.624 (0.024) with: {'metric': 'euclidean', 'n_neighbors': 4, 'weights': 'uniform'}
0.611 (0.032) with: {'metric': 'euclidean', 'n_neighbors': 4, 'weights': 'distance'}
0.625 (0.037) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.624 (0.037) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.632 (0.

array([0])

In [None]:


# Side-by-side final report: tuned decision tree first, then the normalized KNN.
# Everything here reads objects created by earlier cells (grid_search_dt,
# grid_search_knn_norm, best_model_knn_norm, the *_normalized arrays, ...).
best_model_dt = grid_search_dt.best_estimator_


# The decision tree works on the unscaled feature frames.
dt_y_predicted_final = best_model_dt.predict(X_test)

dt_accuracy_final = accuracy_score(Y_test, dt_y_predicted_final)

print("\n--- Final Decision Tree Model Evaluation ---")
print(f"Best parameters found during tuning: {grid_search_dt.best_params_}")
print("\n--- Performance on Training and Test Sets using Best Decision Tree Model ---")
print("Training Accuracy:", best_model_dt.score(X_train, Y_train)*100)
print("Testing Accuracy:", dt_accuracy_final*100)
print("\n--- Classification Report on Test Set (Decision Tree) ---")
print(classification_report(Y_test, dt_y_predicted_final))
print("\n--- Accuracy Score on Test Set (Decision Tree) ---")
print("Accuracy:", dt_accuracy_final)
print("\n--- Confusion Matrix on Test Set (Decision Tree) ---")
print(confusion_matrix(Y_test, dt_y_predicted_final))


# The normalized KNN search was already run in an earlier cell; this section
# only re-prints the evaluation of its best estimator, reusing
# knn_norm_y_predicted and knn_norm_accuracy computed there.

print("\n--- Final Normalized KNN Model Evaluation ---")
print(f"Best parameters found during tuning: {grid_search_knn_norm.best_params_}")
print("\n--- Performance on Training and Test Sets using Best Normalized KNN Model ---")
print("Training Accuracy:", best_model_knn_norm.score(X_train_normalized, Y_train)*100)
print("Testing Accuracy:", knn_norm_accuracy*100)
print("\n--- Classification Report on Test Set (Normalized KNN) ---")
print(classification_report(Y_test, knn_norm_y_predicted))
print("\n--- Accuracy Score on Test Set (Normalized KNN) ---")
print("Accuracy:", knn_norm_accuracy)
print("\n--- Confusion Matrix on Test Set (Normalized KNN) ---")
print(confusion_matrix(Y_test, knn_norm_y_predicted))


# KNN expects the scaled representation of the sample, not the raw frame.
knn_norm_predictions_new_data_final = best_model_knn_norm.predict(new_data_normalized)
print("\n--- Prediction on new data using Best Normalized KNN ---")
print("Input Data (Normalized):")
print(new_data_normalized)
print("\nPredicted Potability (0: Not Potable, 1: Potable):")
knn_norm_predictions_new_data_final




--- Final Decision Tree Model Evaluation ---
Best parameters found during tuning: {'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 8, 'splitter': 'best'}

--- Performance on Training and Test Sets using Best Decision Tree Model ---
Training Accuracy: 73.81679389312977
Testing Accuracy: 64.17682926829268

--- Classification Report on Test Set (Decision Tree) ---
              precision    recall  f1-score   support

           0       0.68      0.83      0.74       412
           1       0.53      0.33      0.41       244

    accuracy                           0.64       656
   macro avg       0.60      0.58      0.57       656
weighted avg       0.62      0.64      0.62       656


--- Accuracy Score on Test Set (Decision Tree) ---
Accuracy: 0.6417682926829268

--- Confusion Matrix on Test Set (Decision Tree) ---
[[341  71]
 [164  80]]

--- Final Normalized KNN Model Evaluation ---
Best parameters found during tuning: {'metric': 'manhattan', 'n_neighb

array([0])

In [None]:

# Dataset overview.
# NOTE(review): `df` was mean-imputed right after loading, so the null counts
# and statistics below describe the imputed data, not the raw file.
print("Dataset Information:")
df.info()

# Display the first 5 rows
print("\nFirst 5 rows of the dataset:")
print(df.head())

# Display descriptive statistics
print("\nDescriptive Statistics:")
print(df.describe())

# Check for missing values (expected to be all zero after the earlier fillna)
print("\nMissing Values per Column:")
print(df.isnull().sum())

# Display the distribution of the target variable 'Potability'
print("\nDistribution of Potability:")
print(df['Potability'].value_counts())

# Display the percentage distribution of the target variable 'Potability'
print("\nPercentage Distribution of Potability:")
print(df['Potability'].value_counts(normalize=True) * 100)

# Display the correlation matrix (pairwise correlation of all columns, target included)
print("\nCorrelation Matrix:")
print(df.corr())

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               3276 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          3276 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3276 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB

First 5 rows of the dataset:
         ph    Hardness        Solids  Chloramines     Sulfate  Conductivity  \
0  7.080795  204.890455  20791.318981     7.300212  368.516441    564.308654   
1  3.716080  129.422921  18630.057858     6.635246  333.775777    592.885359  