In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset
data = pd.read_csv('water_potability.csv')

# Impute missing values
# Mean imputation for numerical columns
numerical_cols = data.select_dtypes(include=['number']).columns
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].mean())

# Separate features and target variable
X = data.drop('Potability', axis=1)  # Features
y = data['Potability']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
knn_accuracy = knn_model.score(X_test, y_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

report = classification_report(y_test, predictions)
print(f'Classification Report:\n{report}')

confusion_mat = confusion_matrix(y_test, predictions)
print(f'Confusion Matrix:\n{confusion_mat}')



Accuracy: 0.7875
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.94      0.85        53
           1       0.81      0.48      0.60        27

    accuracy                           0.79        80
   macro avg       0.80      0.71      0.73        80
weighted avg       0.79      0.79      0.77        80

Confusion Matrix:
[[50  3]
 [14 13]]


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the dataset
data = pd.read_csv('water_potability.csv')

# Impute missing values
# Mean imputation for numerical columns
numerical_cols = data.select_dtypes(include=['number']).columns
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].mean())


# Separate features and target variable
X = data.drop('Potability', axis=1)  # Features
y = data['Potability']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
predictions = rf_classifier.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

report = classification_report(y_test, predictions)
print(f'Classification Report:\n{report}')

confusion_mat = confusion_matrix(y_test, predictions)
print(f'Confusion Matrix:\n{confusion_mat}')



Accuracy: 0.7875
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.94      0.85        53
           1       0.81      0.48      0.60        27

    accuracy                           0.79        80
   macro avg       0.80      0.71      0.73        80
weighted avg       0.79      0.79      0.77        80

Confusion Matrix:
[[50  3]
 [14 13]]


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB  # Import Gaussian Naive Bayes
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
data = pd.read_csv('water_potability.csv')

# Impute missing values
# Mean imputation for numerical columns
numerical_cols = data.select_dtypes(include=['number']).columns
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].mean())

# Separate features and target variable
X = data.drop('Potability', axis=1)  # Features
y = data['Potability']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train a Naive Bayes classifier (Gaussian Naive Bayes)
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

# Make predictions on the test set
predictions = nb_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

report = classification_report(y_test, predictions)
print(f'Classification Report:\n{report}')

confusion_mat = confusion_matrix(y_test, predictions)
print(f'Confusion Matrix:\n{confusion_mat}')


Accuracy: 0.65
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.83      0.76        53
           1       0.47      0.30      0.36        27

    accuracy                           0.65        80
   macro avg       0.58      0.56      0.56        80
weighted avg       0.62      0.65      0.63        80

Confusion Matrix:
[[44  9]
 [19  8]]


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
# Load your custom dataset (replace 'water_potability.csv' with your dataset file)
df = pd.read_csv('water_potability.csv')

# Separate features and target variable
X = df.drop('Potability', axis=1)
y = df['Potability']

# Encode the target variable if it's categorical (e.g., 'good', 'fair', 'poor')
le = LabelEncoder()
y = le.fit_transform(y)

# Check for missing values in X
missing_values = X.isnull().sum()
print("Missing Values:\n", missing_values)

# Impute missing values for numerical columns using mean imputation
numerical_cols = X.select_dtypes(include=['number']).columns
for col in numerical_cols:
    X[col].fillna(X[col].mean(), inplace=True)

# Check again for missing values
missing_values_after_imputation = X.isnull().sum()
print("Missing Values After Imputation:\n", missing_values_after_imputation)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize/Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


Missing Values:
 ph                 60
Hardness            0
Solids              0
Chloramines         0
Sulfate            99
Conductivity        0
Organic_carbon      0
Trihalomethanes    18
Turbidity           0
dtype: int64
Missing Values After Imputation:
 ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
dtype: int64


In [15]:
svm_classifier = SVC(kernel='linear', C=1)  # You can change the kernel and hyperparameters
svm_classifier.fit(X_train, y_train)
svm_predictions = svm_classifier.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
print("SVM Accuracy:", svm_accuracy)
print(classification_report(y_test, svm_predictions))


SVM Accuracy: 0.7125
              precision    recall  f1-score   support

           0       0.70      1.00      0.82        53
           1       1.00      0.15      0.26        27

    accuracy                           0.71        80
   macro avg       0.85      0.57      0.54        80
weighted avg       0.80      0.71      0.63        80



In [12]:
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
nb_predictions = nb_classifier.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)
print(classification_report(y_test, nb_predictions))


Naive Bayes Accuracy: 0.65
              precision    recall  f1-score   support

           0       0.70      0.83      0.76        53
           1       0.47      0.30      0.36        27

    accuracy                           0.65        80
   macro avg       0.58      0.56      0.56        80
weighted avg       0.62      0.65      0.63        80



In [13]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust hyperparameters
rf_classifier.fit(X_train, y_train)
rf_predictions = rf_classifier.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print(classification_report(y_test, rf_predictions))


Random Forest Accuracy: 0.7875
              precision    recall  f1-score   support

           0       0.78      0.94      0.85        53
           1       0.81      0.48      0.60        27

    accuracy                           0.79        80
   macro avg       0.80      0.71      0.73        80
weighted avg       0.79      0.79      0.77        80

