# Get The Dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/water_potability.csv')
df.tail()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.80216,8.061362,,392.44958,19.903225,,2.798243,1
3273,9.41951,175.762646,33155.578218,7.350233,,432.044783,11.03907,69.8454,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658,1
3275,7.874671,195.102299,17404.177061,7.509306,,327.45976,16.140368,78.698446,2.309149,1


# Dependent and Independent Variables

In [3]:
dependentvariable = df['Potability']
df = df.drop('Potability', axis=1)

# Get the Missing Values (Part A)

**Mask all the values as booleans**

In [4]:
missing_mask = df.isna()
print(missing_mask)

         ph  Hardness  Solids  Chloramines  Sulfate  Conductivity  \
0      True     False   False        False    False         False   
1     False     False   False        False     True         False   
2     False     False   False        False     True         False   
3     False     False   False        False    False         False   
4     False     False   False        False    False         False   
...     ...       ...     ...          ...      ...           ...   
3271  False     False   False        False    False         False   
3272  False     False   False        False     True         False   
3273  False     False   False        False     True         False   
3274  False     False   False        False     True         False   
3275  False     False   False        False     True         False   

      Organic_carbon  Trihalomethanes  Turbidity  
0              False            False      False  
1              False            False      False  
2              Fal

**Get the Missing Values by Features**

In [5]:
missing_by_feature = pd.Series({
    col: df.index[df[col].isna()].tolist()
    for col in df.columns
})
print(missing_by_feature)

ph                 [0, 8, 13, 20, 22, 27, 28, 29, 34, 38, 40, 48,...
Hardness                                                          []
Solids                                                            []
Chloramines                                                       []
Sulfate            [1, 2, 11, 14, 16, 18, 20, 23, 31, 34, 37, 38,...
Conductivity                                                      []
Organic_carbon                                                    []
Trihalomethanes    [62, 81, 110, 118, 119, 141, 158, 185, 201, 23...
Turbidity                                                         []
dtype: object


This line creates a pandas Series that maps each feature (that is, each column) to the list of row indexes where that feature has missing values (NaN).

It works by looping through every column name in the DataFrame. For each column, df[col].isna() produces a Boolean mask that marks which rows contain missing values.

Then, df.index[...] uses that mask to select the corresponding row indexes. Finally, .tolist() turns those indexes into a list.

The dictionary comprehension collects all these lists, one per column, and pd.Series() converts it into a Series where each key is a feature name.

In [6]:
MissingFeatures = missing_by_feature.apply(len)
print(MissingFeatures)

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
dtype: int64


**Get the Missing Features per Variable**

In [7]:
missing_by_variable = pd.Series({
  i: missing_cols
    for i in df.index
    if (missing_cols := df.columns[df.iloc[i].isna()].tolist())
})
print(missing_by_variable)

0                             [ph]
1                        [Sulfate]
2                        [Sulfate]
8                             [ph]
11                       [Sulfate]
                   ...            
3266                     [Sulfate]
3272    [Sulfate, Trihalomethanes]
3273                     [Sulfate]
3274                     [Sulfate]
3275                     [Sulfate]
Length: 1265, dtype: object


This line creates another pandas Series that maps each variable (that is, each row) to the list of feature names where that row has missing values.

It loops through each row index in the DataFrame.

For each one, df.iloc[i].isna() gives a Boolean mask showing which columns have missing data in that row.

Then, df.columns[...] uses that mask to filter out the column names with missing values, and .tolist() converts those column names into a list.

The walrus operator := temporarily stores this list as missing_cols. The if condition keeps only those rows where missing_cols is not empty, meaning the row actually has missing values.

The dictionary is finally turned into a Series with each key being a row index and each value being the list of features missing in that row.

# Splitting The Data (Part B)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    df, dependentvariable, test_size=0.2, random_state=42
)

In [10]:
print(X_train.head())

            ph    Hardness        Solids  Chloramines     Sulfate  \
233   6.623614  203.030141  17167.301297     6.049601  311.726288   
831   6.684700  193.840931  34157.184474     9.876574  344.535407   
2658  6.836060  205.667718  18321.327502     6.712854  297.837188   
2495       NaN  183.488839  12675.938962     9.777807  319.870584   
2603  6.406798  182.885137  17851.064021     7.462758  332.486731   

      Conductivity  Organic_carbon  Trihalomethanes  Turbidity  
233     410.243247       15.914500        65.021229   2.915166  
831     498.063996        8.818757        66.659352   4.030660  
2658    494.484249       13.808923        70.714225   4.952508  
2495    482.445026       13.309723        46.853410   3.240419  
2603    398.779746       17.301617        64.070236   4.573968  


# Apply Imputation (Part C)

In [10]:
mean_imputer = SimpleImputer(strategy='mean')
X_train_mean = mean_imputer.fit_transform(X_train)
X_test_mean = mean_imputer.transform(X_test)

In [11]:
knn_imputer = KNNImputer(n_neighbors=5)
X_train_knn = knn_imputer.fit_transform(X_train)
X_test_knn = knn_imputer.transform(X_test)

In [12]:
Train_by_mean_table = pd.DataFrame(X_train_mean, columns=X_train.columns)
Test_by_mean_table = pd.DataFrame(X_test_mean, columns=X_test.columns)


In [13]:
Train_by_KNN_table = pd.DataFrame(X_train_knn, columns=X_train.columns)
Test_by_KNN_table = pd.DataFrame(X_test_knn, columns=X_test.columns)

In [14]:
print(Train_by_KNN_table.head())
print(Train_by_mean_table.head())

         ph    Hardness        Solids  Chloramines     Sulfate  Conductivity  \
0  6.623614  203.030141  17167.301297     6.049601  311.726288    410.243247   
1  6.684700  193.840931  34157.184474     9.876574  344.535407    498.063996   
2  6.836060  205.667718  18321.327502     6.712854  297.837188    494.484249   
3  7.598431  183.488839  12675.938962     9.777807  319.870584    482.445026   
4  6.406798  182.885137  17851.064021     7.462758  332.486731    398.779746   

   Organic_carbon  Trihalomethanes  Turbidity  
0       15.914500        65.021229   2.915166  
1        8.818757        66.659352   4.030660  
2       13.808923        70.714225   4.952508  
3       13.309723        46.853410   3.240419  
4       17.301617        64.070236   4.573968  
         ph    Hardness        Solids  Chloramines     Sulfate  Conductivity  \
0  6.623614  203.030141  17167.301297     6.049601  311.726288    410.243247   
1  6.684700  193.840931  34157.184474     9.876574  344.535407    498.0

# Apply KNN Clasification (Part D)

In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [19]:
knn_mean = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn_mean.fit(X_train_mean, y_train)

In [20]:
knn_knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn_knn.fit(X_train_knn, y_train)

In [21]:
y_pred_mean = knn_mean.predict(X_test_mean)
y_pred_knn = knn_knn.predict(X_test_knn)

In [22]:
acc_mean = accuracy_score(y_test, y_pred_mean)
acc_knn = accuracy_score(y_test, y_pred_knn)
print("Accuracy (Mean Imputer):", acc_mean)
print("Accuracy (KNN Imputer):", acc_knn)

Accuracy (Mean Imputer): 0.5411585365853658
Accuracy (KNN Imputer): 0.5472560975609756


# Scale The Data (Experiment)

In [40]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [41]:
scaled_train_mean = mean_imputer.fit_transform(X_train)
scaled_test_mean = mean_imputer.transform(X_test)
scaled_train_mean=scaler.fit_transform(scaled_train_mean)
scaled_test_mean=scaler.fit_transform(scaled_test_mean)

In [42]:
scaled_train_knn = knn_imputer.fit_transform(X_train)
scaled_test_knn = knn_imputer.transform(X_test)
scaled_train_knn=scaler.fit_transform(scaled_train_knn)
scaled_test_knn=scaler.fit_transform(scaled_test_knn)


In [43]:
knn_mean_scaled = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn_mean_scaled.fit(scaled_train_mean, y_train)

In [44]:
knn_knn_scaled = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn_knn_scaled.fit(scaled_train_knn, y_train)

In [45]:
y_pred_mean_scaled = knn_mean_scaled.predict(scaled_test_mean)
y_pred_knn_scaled = knn_knn_scaled.predict(scaled_test_knn)
print("Accuracy (Mean Imputer):", acc_mean)
print("Accuracy (KNN Imputer):", acc_knn)

Accuracy (Mean Imputer): 0.5411585365853658
Accuracy (KNN Imputer): 0.5472560975609756
