In [2]:
# import packages
import pandas as pd
import numpy as np

from sklearn.utils import resample
from sklearn.model_selection import train_test_split

from sklearn.impute import KNNImputer
from sklearn.preprocessing import PowerTransformer
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [54]:
# get the data
data= pd.read_csv('waterQuality1.csv')

In [56]:
# EDA

In [74]:
data.head()

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
0,1.65,9.08,0.04,2.85,0.007,0.35,0.83,0.17,0.05,0.2,...,0.054,16.08,1.13,0.007,37.75,6.78,0.08,0.34,0.02,1
1,2.32,21.16,0.01,3.31,0.002,5.28,0.68,0.66,0.9,0.65,...,0.1,2.01,1.93,0.003,32.26,3.21,0.08,0.27,0.05,1
2,1.01,14.02,0.04,0.58,0.008,4.24,0.53,0.02,0.99,0.05,...,0.078,14.16,1.11,0.006,50.28,7.07,0.07,0.44,0.01,0
3,1.36,11.33,0.04,2.96,0.001,7.23,0.03,1.66,1.08,0.71,...,0.016,1.41,1.29,0.004,9.12,1.72,0.02,0.45,0.05,1
4,0.92,24.33,0.03,0.2,0.006,2.67,0.69,0.57,0.61,0.13,...,0.117,6.74,1.11,0.003,16.9,2.41,0.02,0.06,0.02,1


In [76]:
data.shape

(7999, 21)

In [78]:
data.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 7999 entries, 0 to 7998

Data columns (total 21 columns):

 #   Column       Non-Null Count  Dtype  

---  ------       --------------  -----  

 0   aluminium    7999 non-null   float64

 1   ammonia      7999 non-null   float64

 2   arsenic      7999 non-null   float64

 3   barium       7999 non-null   float64

 4   cadmium      7999 non-null   float64

 5   chloramine   7999 non-null   float64

 6   chromium     7999 non-null   float64

 7   copper       7999 non-null   float64

 8   flouride     7999 non-null   float64

 9   bacteria     7999 non-null   float64

 10  viruses      7999 non-null   float64

 11  lead         7999 non-null   float64

 12  nitrates     7999 non-null   float64

 13  nitrites     7999 non-null   float64

 14  mercury      7999 non-null   float64

 15  perchlorate  7999 non-null   float64

 16  radium       7999 non-null   float64

 17  selenium     7999 non-null   float64

 18  silver       7999 non-nul

In [80]:
data.isna().sum()

aluminium      0
ammonia        0
arsenic        0
barium         0
cadmium        0
chloramine     0
chromium       0
copper         0
flouride       0
bacteria       0
viruses        0
lead           0
nitrates       0
nitrites       0
mercury        0
perchlorate    0
radium         0
selenium       0
silver         0
uranium        0
is_safe        0
dtype: int64

In [66]:
# make ammonia -> numeric
data.loc[data['ammonia'] == '#NUM!', 'ammonia']= data['ammonia'].mode()[0]

In [68]:
data['ammonia']= data['ammonia'].astype(float)

In [72]:
# make is_safe -> numeric
data.loc[data['is_safe'] == '#NUM!', 'is_safe']= data['is_safe'].mode()[0]
data['is_safe']= data['is_safe'].astype(int)

In [82]:
data['is_safe'].value_counts()

is_safe
0    7087
1     912
Name: count, dtype: int64

In [86]:
# clearly its unbalanced, so upsample the minority
majority_data= data[ data['is_safe'] == 0 ]
minority_data= data[ data['is_safe'] == 1 ]

majority_data.shape, minority_data.shape

((7087, 21), (912, 21))

In [88]:
upsampled_minority_data = resample(minority_data, replace=True, n_samples=len(majority_data), random_state=42)

# Reset the index of the upsampled minority data
upsampled_minority_data = upsampled_minority_data.reset_index(drop=True)

# Reset index for majority data too (if necessary)
majority_data = majority_data.reset_index(drop=True)

# Combine the datasets into balanced_data
balanced_data = pd.concat([majority_data, upsampled_minority_data]).reset_index(drop=True)

In [90]:
balanced_data= balanced_data.sample(frac=1)

In [92]:
balanced_data.shape

(14174, 21)

In [96]:
balanced_data['is_safe'].value_counts()

is_safe
0    7087
1    7087
Name: count, dtype: int64

In [98]:
# balanced_data :- EDA
balanced_data.head()

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
3230,0.05,22.53,0.02,0.62,0.06,0.35,0.05,0.07,0.87,0.0,...,0.079,8.64,0.6,0.007,2.32,5.26,0.06,0.01,0.05,0
12384,0.31,1.91,0.04,0.48,0.004,4.59,0.54,1.31,0.33,0.0,...,0.113,8.5,1.6,0.01,25.36,1.84,0.02,0.11,0.0,1
13008,0.04,5.62,0.03,0.58,0.08,0.01,0.01,0.07,0.25,0.04,...,0.135,2.4,0.47,0.01,7.45,0.1,0.09,0.07,0.02,1
9754,0.07,14.0,0.01,0.49,0.05,0.3,0.08,0.07,0.52,0.0,...,0.146,7.72,0.27,0.003,4.12,1.05,0.07,0.01,0.01,1
10851,2.33,29.23,0.04,2.76,0.006,6.06,0.46,0.14,0.75,0.51,...,0.093,4.2,1.88,0.009,12.48,4.54,0.07,0.38,0.08,1


In [100]:
balanced_data.shape

(14174, 21)

In [102]:
balanced_data.info()

<class 'pandas.core.frame.DataFrame'>

Index: 14174 entries, 3230 to 12681

Data columns (total 21 columns):

 #   Column       Non-Null Count  Dtype  

---  ------       --------------  -----  

 0   aluminium    14174 non-null  float64

 1   ammonia      14174 non-null  float64

 2   arsenic      14174 non-null  float64

 3   barium       14174 non-null  float64

 4   cadmium      14174 non-null  float64

 5   chloramine   14174 non-null  float64

 6   chromium     14174 non-null  float64

 7   copper       14174 non-null  float64

 8   flouride     14174 non-null  float64

 9   bacteria     14174 non-null  float64

 10  viruses      14174 non-null  float64

 11  lead         14174 non-null  float64

 12  nitrates     14174 non-null  float64

 13  nitrites     14174 non-null  float64

 14  mercury      14174 non-null  float64

 15  perchlorate  14174 non-null  float64

 16  radium       14174 non-null  float64

 17  selenium     14174 non-null  float64

 18  silver       14174 non-nu

In [104]:
balanced_data.isna().sum()

aluminium      0
ammonia        0
arsenic        0
barium         0
cadmium        0
chloramine     0
chromium       0
copper         0
flouride       0
bacteria       0
viruses        0
lead           0
nitrates       0
nitrites       0
mercury        0
perchlorate    0
radium         0
selenium       0
silver         0
uranium        0
is_safe        0
dtype: int64

In [106]:
# features and labels

X= balanced_data.drop('is_safe', axis= 1)

Y= balanced_data['is_safe']

In [108]:
# train test
X_train, X_test, y_train, y_test= train_test_split(X, Y, test_size=0.2)

Overview:-

~ Numeric Imputer := KNN Iterator

~ Power transformer

~ PCA

~ logistic regression

In [114]:
# numeric transformer := imputer + power transformer
numeric_transformer= Pipeline(
    [
        ('numeric_imputer', KNNImputer(weights='distance')),
        ('power_transformer', PowerTransformer())
    ]
)

In [116]:
# preprocessor
preprocessor= ColumnTransformer([
    ('numeric_transformer', numeric_transformer, slice(0, 20))
])

In [118]:
# model
model= LogisticRegression()

In [142]:
# pipeline
pipe= Pipeline([
    ('preprocessor', preprocessor),
    ('PCA', PCA(n_components=17)),
    ('model', model),
])

In [144]:
# fit it
pipe.fit(X_train, y_train)

In [146]:
# predict
y_preds= pipe.predict(X_test)

In [148]:
# evaluate

In [150]:
accuracy_score(y_test, y_preds)

0.818342151675485

In [152]:
precision_score(y_test, y_preds)

0.8317483540599854

In [154]:
recall_score(y_test, y_preds)

0.79957805907173

In [156]:
f1_score(y_test, y_preds)

0.8153460021513087

In [140]:
# find the best params for PCA
max_acc= 0
best_comps= 0

for i in range(1, 20):

    pipe= Pipeline([
        ('preprocessor', preprocessor),
        ('PCA', PCA(n_components=i)),
        ('model', model),
    ])

    pipe.fit(X_train, y_train)
    
    y_preds= pipe.predict(X_test)
    
    acc= accuracy_score(y_test, y_preds)

    if acc > max_acc:
        max_acc= acc
        best_comps= i

    print(f"{i} comps := {acc}")

print(f"\n\nBest comps: {best_comps}, Accuracy: {max_acc}")

1 comps := 0.6878306878306878

2 comps := 0.7562610229276896

3 comps := 0.7442680776014109

4 comps := 0.7446208112874779

5 comps := 0.7485008818342151

6 comps := 0.7626102292768959

7 comps := 0.7717813051146385

8 comps := 0.7746031746031746

9 comps := 0.7724867724867724

10 comps := 0.7728395061728395

11 comps := 0.7784832451499119

12 comps := 0.7841269841269841

13 comps := 0.7897707231040564

14 comps := 0.7865961199294532

15 comps := 0.7908289241622575

16 comps := 0.8155202821869488

17 comps := 0.818342151675485

18 comps := 0.8130511463844797

19 comps := 0.8134038800705468





Best comps: 17, Accuracy: 0.818342151675485
