Load Libraries

In [314]:
import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import warnings
warnings.filterwarnings("ignore")




Load data

In [316]:
data=pd.read_csv("water_potability.csv")
data.head(2)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0


In [319]:
data.isnull().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [320]:
#data.fillna(0, inplace=True)
#data.dropna(axis = 0, how ='any')
data = data.fillna(data.mean())



In [321]:
X=data.iloc[:,:-1]
y=data['Potability']

In [322]:
x_train, x_test, y_train, y_test =train_test_split(X,y,test_size=0.20,random_state=21)
print('Shape of Training Xs:{}'.format(x_train.shape))
print('Shape of Test Xs:{}'.format(x_test.shape))
print('Shape of Training y:{}'.format(y_train.shape))
print('Shape of Test y:{}'.format(y_test.shape))


Shape of Training Xs:(2620, 9)
Shape of Test Xs:(656, 9)
Shape of Training y:(2620,)
Shape of Test y:(656,)


In [323]:
scaler=StandardScaler()

In [324]:
x_train.columns

Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity'],
      dtype='object')

In [325]:
x_train[['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity']]=scaler.fit_transform(x_train[['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
      'Organic_carbon', 'Trihalomethanes', 'Turbidity']])

In [326]:
x_test[['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity']]=scaler.fit_transform(x_test[['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity']])

In [327]:
models=[("LR",LogisticRegression()),
        ("RM",RandomForestClassifier(max_depth=3)),
        ("svm",SVC(kernel="linear",random_state=0)),
        ("nb",GaussianNB())]

In [328]:
final_result=[]
for name,model in models:
    model.fit(x_train,y_train) # train the model
    score=model.score(x_test,y_test)
    
    final_result.append((name,score))
final_result    

[('LR', 0.6051829268292683),
 ('RM', 0.625),
 ('svm', 0.6051829268292683),
 ('nb', 0.6234756097560976)]

Conclusion: After applying all 6 classification algorithm, Naïve Bayes Classifier Algorithm has high accuracy of 62.04% than other algorithm



Naïve Bayes Classifier Algorithm

In [329]:
from sklearn.naive_bayes import GaussianNB  
nb = GaussianNB()  
nb.fit(x_train, y_train)  
y_predict = nb.predict(x_test)
y_predict[100:105]


array([0, 0, 0, 0, 1], dtype=int64)

In [331]:
import joblib

In [332]:
joblib.dump(nb,open("water_classification_v1.0.model","wb"))
joblib.dump(scaler,open("water_classification_v1.0.scaler","wb"))



In [333]:
model1= joblib.load(open("water_classification_v1.0.model","rb"))
sacler=joblib.load(open("water_classification_v1.0.scaler","rb"))

In [334]:
X.columns

Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity'],
      dtype='object')

In [335]:
X.iloc[252]

ph                     7.080795
Hardness             169.974849
Solids             23403.637304
Chloramines            8.519730
Sulfate              333.775777
Conductivity         475.573562
Organic_carbon        12.924107
Trihalomethanes       50.861913
Turbidity              2.747313
Name: 252, dtype: float64

In [336]:
new_data={'ph':7.080795, 'Hardness':169.974849, 'Solids':23403.637304, 
                'Chloramines':8.519730,'Sulfate':333.775777, 'Conductivity':475.573562,
                'Organic_carbon':12.924107,'Trihalomethanes':50.861913, 'Turbidity':2.747313}
index=[1] # serial number

my_data=pd.DataFrame(new_data,index)


In [337]:
my_data

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
1,7.080795,169.974849,23403.637304,8.51973,333.775777,475.573562,12.924107,50.861913,2.747313


In [338]:
sc_data = scaler.transform(my_data)


In [339]:
model1.predict(sc_data)

array([0], dtype=int64)

In [340]:
data.iloc[252]

ph                     7.080795
Hardness             169.974849
Solids             23403.637304
Chloramines            8.519730
Sulfate              333.775777
Conductivity         475.573562
Organic_carbon        12.924107
Trihalomethanes       50.861913
Turbidity              2.747313
Potability             1.000000
Name: 252, dtype: float64

In [341]:
pred_data=data.iloc[252:253,:-1].values

In [342]:
scaler_pred_data=scaler.transform(pred_data)

In [343]:
model1.predict(scaler_pred_data)

array([0], dtype=int64)

In [347]:
data.iloc[270:271]

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
270,7.291888,127.544297,27784.048484,9.754476,247.335412,439.649329,17.285042,59.55633,5.328713,1
