In [2]:
import pandas as pd

## Import Data

In [None]:
# Load the water-potability CSV; the path is relative, so it is resolved
# against the working directory the notebook kernel was started in.
data = pd.read_csv('MLproject/datasets/water_potability.csv')

## Basic Data Analysis

In [None]:
# Preview the first rows to sanity-check that the columns parsed as expected.
data.head()

In [None]:
# Column dtypes and non-null counts (first hint of which columns have gaps).
data.info()

In [None]:
# Number of missing values per column, before any imputation.
data.isna().sum()

In [None]:
# (rows, columns) of the raw table.
data.shape

In [None]:
# Summary statistics of the numeric columns (count, mean, std, quartiles).
data.describe()

## Data Preprocessing

Handling missing data

In [None]:
from sklearn.impute import KNNImputer

# Impute the feature columns only. Passing the whole frame to the imputer lets
# the 'Potability' label take part in the neighbour distances, which leaks the
# target into the imputed feature values, and it would also fill a missing
# label with a neighbour average that need not be a valid class.
feature_columns = [column for column in data.columns if column != 'Potability']

# Rows without a label cannot be used for supervised training, so they are
# dropped rather than given an invented label.
labelled = data.dropna(subset=['Potability'])

knn_imputer = KNNImputer(n_neighbors=3)  # each gap is filled from the 3 nearest rows
clean_data = knn_imputer.fit_transform(labelled[feature_columns])

# Rebuild the frame with the original column order and the untouched label.
# NOTE(review): the imputer is still fitted on every row before the train/test
# split; fitting it on the training rows only would also avoid test-set leakage.
imputed_features = pd.DataFrame(clean_data, columns=feature_columns, index=labelled.index)
data = (
    imputed_features
    .assign(Potability=labelled['Potability'])[list(data.columns)]
    .reset_index(drop=True)
)
data_copy = data.copy()

In [None]:
# Re-check the per-column missing counts now that imputation has run.
data.isna().sum()

In [None]:
# Preview the first rows of the imputed table.
data.head()

## Machine Learning Algorithms

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Feature matrix: every column except the 'Potability' target.
X = data.drop(columns='Potability')
X.head()

In [None]:
# Target vector: the 'Potability' column on its own.
y = data.loc[:, 'Potability']
y.head()

In [None]:
# NOTE(review): disabled, loop-based draft of the model comparison. It is kept
# commented out; as written it uses X_train / X_test / y_train / y_test, which
# are only created by the train_test_split call in the following cell, so it
# could not run from this position on a fresh kernel.
# models = [
#     ('CART', DecisionTreeClassifier()),
#     ('Gaussian Naive Bayes', GaussianNB()),
#     ('Gradient Boosting', GradientBoostingClassifier()),
#     ('KNN', KNeighborsClassifier()),
#     ('Logistic Regression', LogisticRegression()),
#     ('MLP', MLPClassifier()),
#     ('Perceptron', Perceptron()),
#     ('Random Forest', RandomForestClassifier())
# ]

# # Train and evaluate models
# accuracy_scores = []
# model_names = []

# for name, model in models:
#     model.fit(X_train, y_train)  # Train the model
#     y_pred = model.predict(X_test)  # Make predictions
#     acc = accuracy_score(y_test, y_pred)  # Calculate accuracy
#     accuracy_percentage = acc * 100
#     accuracy_scores.append(accuracy_percentage)
#     model_names.append(name)
#     print(f'{name} Accuracy: {accuracy_percentage:.2f}%')


In [None]:
# Local imports so this cell stays runnable on its own: the scale-sensitive
# models below are wrapped in a StandardScaler pipeline.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

size = 0.10  # fraction of rows held out as the test split
seed = 123   # shared random_state for the split and every stochastic model

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size, random_state=seed)


def _fit_and_report(label, model):
    """Fit ``model`` on the training split and report its test-split score.

    Parameters
    ----------
    label : str
        Name printed in front of the accuracy figure.
    model : estimator
        Unfitted object exposing ``fit(X, y)`` and ``score(X, y)``; it is
        fitted in place.

    Returns
    -------
    float
        The value of ``model.score(X_test, y_test)``. It is multiplied by 100
        only for printing; the returned value is left unscaled.
    """
    model.fit(X_train, y_train)
    result = model.score(X_test, y_test)
    print("%s Accuracy: %.3f%%" % (label, result * 100.0))
    return result


# CART (seeded: tie-breaking between equally good splits is otherwise random)
cart_model = DecisionTreeClassifier(random_state=seed)
cart_model_result = _fit_and_report("CART", cart_model)

# Gaussian Naive Bayes
gnb_model = GaussianNB()
gnb_model_result = _fit_and_report("Gaussian Naive Bayes", gnb_model)

# Gradient Boosting (this is GradientBoostingClassifier, not AdaBoost)
gb_model = GradientBoostingClassifier(random_state=seed)
gb_model_result = _fit_and_report("Gradient Boosting", gb_model)

# KNN: distances are dominated by large-valued columns unless the features
# are standardised. The pipeline fits the scaler on the training split only.
knn_model = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto'),
)
knn_model_result = _fit_and_report("KNN", knn_model)

# Logistic Regression: lbfgs converges far more reliably on standardised input.
lr_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=200, solver='lbfgs', C=1.0),
)
lr_model_result = _fit_and_report("Logistic Regression", lr_model)

# MLP: gradient-based training is sensitive to feature scale.
mlp_model = make_pipeline(
    StandardScaler(),
    MLPClassifier(hidden_layer_sizes=(65, 32),
                  activation='relu',
                  solver='adam', max_iter=200, random_state=seed),
)
mlp_model_result = _fit_and_report("MLP", mlp_model)

# Perceptron: same scale sensitivity as the other linear / gradient models.
p_model = make_pipeline(
    StandardScaler(),
    Perceptron(max_iter=200, random_state=seed, eta0=1.0, tol=1e-3),
)
pmodel_result = _fit_and_report("Perceptron", p_model)

# Random Forest (kept as a bare estimator: trees do not need scaling, and this
# is the object that later cells persist with joblib)
rfmodel = RandomForestClassifier(n_estimators=100,
                                 random_state=seed,
                                 max_depth=None,
                                 min_samples_split=2,
                                 min_samples_leaf=1)
rfmodel_result = _fit_and_report("Random Forest", rfmodel)



In [None]:

# Display names for the chart, in the same order as the scores below.
# The third entry is a GradientBoostingClassifier, so it is not labelled AdaBoost.
model_names = [
    "CART",
    "Gaussian Naive Bayes",
    "Gradient Boosting Machines",
    "K-Nearest Neighbors (K-NN)",
    "Logistic Regression",
    "Multi-Layer Perceptron (MLP)",
    "Perceptron",
    "Random Forest"
]

# Raw score() results for each model, index-aligned with model_names.
accuracy_scores = [
    cart_model_result,
    gnb_model_result,
    gb_model_result,
    knn_model_result,
    lr_model_result,
    mlp_model_result,
    pmodel_result,
    rfmodel_result
]

# The stored results are the unscaled score() values (they were multiplied by
# 100 only when printed), so convert them here to match the 'Accuracy (%)' axis.
accuracy_percentages = [score * 100.0 for score in accuracy_scores]

sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=accuracy_percentages,
    y=model_names,
    hue=model_names,   # one colour per bar; legend is redundant with the y labels
    palette="viridis",
    dodge=False,
    legend=False,
    ax=ax,
)
ax.set_title('Accuracy Comparison of Different Models')
ax.set_xlabel('Accuracy (%)')
ax.set_ylabel('Model')
plt.show()

In [None]:
import joblib

In [None]:
from pathlib import Path

model_filename = 'MLproject/datasets/models/random_forest_model.joblib'

# joblib.dump opens the target path for writing and does not create missing
# parent directories, so make sure the output folder exists first.
Path(model_filename).parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted Random Forest; the call's return value is the cell output.
joblib.dump(rfmodel, model_filename)

In [None]:
# Load the saved Random Forest model.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded_model = joblib.load(model_filename)

# One hand-written sample to classify.
# NOTE(review): the column names are presumably the training feature columns
# in the same order -- confirm against the dataset header.
feature_names = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']
X_with_features = pd.DataFrame([[7.0, 200, 300, 10.0, 50.0, 400, 5.0, 60.0, 4.0]], columns=feature_names)

# Make predictions using the loaded model
predicted_potability = loaded_model.predict(X_with_features)

# predict() returns an array with one entry per input row. Using the whole
# array in an `if` only works when it holds exactly one element (and raises
# for more), so take the single sample's label explicitly.
predicted_label = predicted_potability[0]

# Define messages based on the predicted class
if predicted_label == 0:
    message = "The water sample is predicted to be non-potable."
elif predicted_label == 1:
    message = "The water sample is predicted to be potable."
else:
    message = "Invalid prediction."

print(message)