# **Note proses pengerjaan coding:**
**I. Data-Preprocessing**
1. Baca Dataset
2. Cek jumlah data kosong
3. Cek tipe data
4. Drop kolom yang memiliki fitur yang gak mempengaruhi label (misalnya nama orang, id, dll)
5. Handle missing value
6. Lakukan encoding untuk data categorical
7. Handle Outlier
8. Lakukan scaling data (hanya untuk data X, para fitur)

**II. Model Selection**
1. Train test split
2. Handle imbalance class untuk data training (X_train,y_train)

**III. Creating machine learning model**
1. Import necessary sklearn libraries
2. train model (pake sklearn.algoritmanya)

In [26]:
import pandas as pd
import numpy as np

In [27]:
df = pd.read_csv('water_potability.csv')
df.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [28]:
df.info()
# Alhamdulillah udah aman data typenya

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB


In [29]:
df.isna().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [30]:
def print_null_report(df):
    for i, j in enumerate(df.isna().sum()):
        print(f'{df.columns[i]:15} | {j*100/len(df):.2f}%')

print_null_report(df)

ph              | 14.99%
Hardness        | 0.00%
Solids          | 0.00%
Chloramines     | 0.00%
Sulfate         | 23.84%
Conductivity    | 0.00%
Organic_carbon  | 0.00%
Trihalomethanes | 4.95%
Turbidity       | 0.00%
Potability      | 0.00%


In [31]:
# Handling missing value (gunakan imputasi untuk semuanya) berdasarkan class
df['ph'] = df.groupby('Potability')['ph'].transform(lambda x: x.fillna(x.mean()))
df['Sulfate'] = df.groupby('Potability')['Sulfate'].transform(lambda x: x.fillna(x.mean()))
df['Trihalomethanes'] = df.groupby('Potability')['Trihalomethanes'].transform(lambda x: x.fillna(x.mean()))

print_null_report(df)

ph              | 0.00%
Hardness        | 0.00%
Solids          | 0.00%
Chloramines     | 0.00%
Sulfate         | 0.00%
Conductivity    | 0.00%
Organic_carbon  | 0.00%
Trihalomethanes | 0.00%
Turbidity       | 0.00%
Potability      | 0.00%


In [32]:
# Handling Outlier

def check_outlier(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3-Q1

    outliers = df[(df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))]
    return outliers
    
def print_outlier(df):
    for i in df.columns:
        outliers = check_outlier(df[i])
        print(f'Number of Outliers for {i:16}:  {len(outliers)}')

def handle_outlier(df):
    for i in df.columns:
            Q1 = df[i].quantile(0.25)
            Q3 = df[i].quantile(0.75)
            IQR = Q3-Q1

            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            # Ku gunain teknik winsorizing: jadi nilai outlier akan diubah menjadi nilai lower bound atau upper bound tergantung dengan lokasi outliernya dimana
            df[i] = np.where(df[i] < lower_bound, lower_bound, df[i])
            df[i] = np.where(df[i] > upper_bound, upper_bound, df[i])
        
            print(f'Column {i} has been processed')

print_outlier(df)

Number of Outliers for ph              :  142
Number of Outliers for Hardness        :  83
Number of Outliers for Solids          :  47
Number of Outliers for Chloramines     :  61
Number of Outliers for Sulfate         :  264
Number of Outliers for Conductivity    :  11
Number of Outliers for Organic_carbon  :  25
Number of Outliers for Trihalomethanes :  54
Number of Outliers for Turbidity       :  19
Number of Outliers for Potability      :  0


In [33]:
print(handle_outlier(df))

Column ph has been processed
Column Hardness has been processed
Column Solids has been processed
Column Chloramines has been processed
Column Sulfate has been processed
Column Conductivity has been processed
Column Organic_carbon has been processed
Column Trihalomethanes has been processed
Column Turbidity has been processed
Column Potability has been processed
None


In [34]:
df['Potability'].value_counts()

Potability
0.0    1998
1.0    1278
Name: count, dtype: int64

In [35]:
# Data Scaling
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()

X = df.drop(['Potability'], axis = 1)
y = df['Potability']

X = sc.fit_transform(X)

In [36]:
# Model Selection
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 42, test_size= 0.2)

# Smote (only oversample the training data)
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=0)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [37]:
# Creating the model (KNN)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)

KFold = StratifiedKFold(n_splits=5)
scores = cross_val_score(KNN, X, y, cv = KFold)

print('Cross Validation Score:', scores)
print('Average Cross Validation Score:', scores.mean())


Cross Validation Score: [0.57926829 0.60763359 0.60152672 0.55877863 0.61221374]
Average Cross Validation Score: 0.5918841928877304


In [38]:
# Prediction
from sklearn.metrics import accuracy_score, classification_report
y_pred_KNN = KNN.predict(X_test)

print('The Model accuracy is:', accuracy_score(y_pred_KNN, y_test))
print(classification_report(y_pred_KNN, y_test))


The Model accuracy is: 0.614329268292683
              precision    recall  f1-score   support

         0.0       0.62      0.73      0.67       353
         1.0       0.60      0.49      0.54       303

    accuracy                           0.61       656
   macro avg       0.61      0.61      0.60       656
weighted avg       0.61      0.61      0.61       656



In [39]:
# Creating naive bayes model
from sklearn.naive_bayes import BernoulliNB, GaussianNB, CategoricalNB
NaiveBayes = BernoulliNB()
NaiveBayes.fit(X_train, y_train)

KFold2 = StratifiedKFold(n_splits=5)
scores2 = cross_val_score(NaiveBayes, X, y, cv = KFold2)
print('Cross Validation Score:', scores2)
print('Average Cross Validation Score:', scores2.mean())

Cross Validation Score: [0.63109756 0.63664122 0.60305344 0.60305344 0.62137405]
Average Cross Validation Score: 0.6190439396760379


In [40]:
# Prediction
y_pred_NAIVE = NaiveBayes.predict(X_test)
print('The Model accuracy is:', accuracy_score(y_pred_NAIVE, y_test))
print(classification_report(y_pred_NAIVE, y_test))



The Model accuracy is: 0.6295731707317073
              precision    recall  f1-score   support

         0.0       0.95      0.64      0.76       615
         1.0       0.09      0.51      0.15        41

    accuracy                           0.63       656
   macro avg       0.52      0.57      0.46       656
weighted avg       0.90      0.63      0.72       656



In [41]:
# SVM Model plus regularisasi
from sklearn.svm import LinearSVC
l1 = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol = 1e-3)
l1.fit(X_train, y_train)

KFold3 = StratifiedKFold(n_splits=5)
scores3 = cross_val_score(l1, X, y, cv = KFold3)

print('Cross Validation Score:', scores3)
print('Average Cross Validation Score:', scores3.mean())

Cross Validation Score: [0.6097561  0.61068702 0.61068702 0.61068702 0.61068702]
Average Cross Validation Score: 0.6105008378328058


In [44]:
# Prediction
y_pred_SVM = l1.predict(X_test)
print('The Model accuracy is:', accuracy_score(y_pred_SVM, y_test))
print(classification_report(y_pred_SVM, y_test))

The Model accuracy is: 0.5060975609756098
              precision    recall  f1-score   support

         0.0       0.52      0.63      0.57       338
         1.0       0.49      0.37      0.42       318

    accuracy                           0.51       656
   macro avg       0.50      0.50      0.50       656
weighted avg       0.50      0.51      0.50       656

