In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import scipy as scipy
import matplotlib as matplot
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.impute import SimpleImputer

from sklearn.tree import _tree
from sklearn import tree

from scipy.stats import pearsonr

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from
# pandas / scikit-learn; consider a narrower filter.
warnings.filterwarnings("ignore")

# Display floats with 4 decimal places in pandas output.
pd.set_option("display.precision", 4)
# Seed NumPy's global random number generator.
np.random.seed(2021)

# NOTE(review): repeats the import and the filter installed a few lines above.
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load the dataset; the path is relative to the current working directory.
df=pd.read_csv("water_potability.csv")

In [3]:
def numeric_type_conversion(arr, frame=None):
    """Cast the listed columns to ``float64`` in place.

    Parameters
    ----------
    arr : iterable of str
        Labels of the columns to convert.
    frame : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df`` is used, so
        existing one-argument calls behave exactly as before.

    Returns
    -------
    None
        The frame is modified in place.
    """
    # Resolve the global lazily so passing ``frame`` never requires ``df`` to exist.
    target = df if frame is None else frame
    for col in arr:
        target[col] = target[col].astype('float64')

In [4]:
# Feature columns: every column except the last one.
arr = list(df.columns[:-1])
numeric_type_conversion(arr)
# The target column is stored as a categorical.
df["Potability"] = df["Potability"].astype('category')

In [5]:
# Display the list of feature column names.
arr

['ph',
 'Hardness',
 'Solids',
 'Chloramines',
 'Sulfate',
 'Conductivity',
 'Organic_carbon',
 'Trihalomethanes',
 'Turbidity']

In [6]:
# Print per-column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   ph               2785 non-null   float64 
 1   Hardness         3276 non-null   float64 
 2   Solids           3276 non-null   float64 
 3   Chloramines      3276 non-null   float64 
 4   Sulfate          2495 non-null   float64 
 5   Conductivity     3276 non-null   float64 
 6   Organic_carbon   3276 non-null   float64 
 7   Trihalomethanes  3114 non-null   float64 
 8   Turbidity        3276 non-null   float64 
 9   Potability       3276 non-null   category
dtypes: category(1), float64(9)
memory usage: 233.8 KB


In [7]:
# Select the column labels whose "has any null" flag is True.
cols_with_missing = df.columns[df.isnull().any()].tolist()
len(cols_with_missing), cols_with_missing

(3, ['ph', 'Sulfate', 'Trihalomethanes'])

In [8]:
# Count the missing values of every column in one pass, then print them.
for col, n_missing in df.isnull().sum().items():
    print(col, " : ", n_missing)

ph  :  491
Hardness  :  0
Solids  :  0
Chloramines  :  0
Sulfate  :  781
Conductivity  :  0
Organic_carbon  :  0
Trihalomethanes  :  162
Turbidity  :  0
Potability  :  0


In [9]:
# Replace missing feature values with the column mean. "mean" is
# SimpleImputer's default strategy; it is spelled out here for readers.
my_imputer = SimpleImputer(strategy="mean")
# Assign the imputed array directly. Wrapping it in a new DataFrame would give
# it a fresh 0..n-1 index, and pandas would then align rows on that index,
# silently producing NaN whenever `df` does not have a default RangeIndex.
df[arr] = my_imputer.fit_transform(df[arr])

In [10]:
# Re-check the missing-value count of every column after imputation.
for col, n_missing in df.isnull().sum().items():
    print(col, " : ", n_missing)

ph  :  0
Hardness  :  0
Solids  :  0
Chloramines  :  0
Sulfate  :  0
Conductivity  :  0
Organic_carbon  :  0
Trihalomethanes  :  0
Turbidity  :  0
Potability  :  0


In [11]:
# Preview the first rows of the frame after imputation.
df.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,7.0808,204.8905,20791.319,7.3002,368.5164,564.3087,10.3798,86.991,2.9631,0
1,3.7161,129.4229,18630.0579,6.6352,333.7758,592.8854,15.18,56.3291,4.5007,0
2,8.0991,224.2363,19909.5417,9.2759,333.7758,418.6062,16.8686,66.4201,3.0559,0
3,8.3168,214.3734,22018.4174,8.0593,356.8861,363.2665,18.4365,100.3417,4.6288,0
4,9.0922,181.1015,17978.9863,6.5466,310.1357,398.4108,11.5583,31.998,4.0751,0


In [12]:
# Features are all columns but the last; the target is the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [13]:
# Fit a depth-3 decision tree with a fixed random_state.
clf = DecisionTreeClassifier(random_state=1234, max_depth=3)
clf.fit(X, y)
# `fit` returns the estimator itself, so `model` is the same object as `clf`.
model = clf

In [14]:
# Render the fitted tree as indented text, labelling splits with the feature names.
text_representation = tree.export_text(decision_tree=clf, feature_names=arr)
print(text_representation)

|--- Sulfate <= 258.97
|   |--- Solids <= 21161.41
|   |   |--- ph <= 7.95
|   |   |   |--- class: 0
|   |   |--- ph >  7.95
|   |   |   |--- class: 1
|   |--- Solids >  21161.41
|   |   |--- ph <= 5.83
|   |   |   |--- class: 0
|   |   |--- ph >  5.83
|   |   |   |--- class: 1
|--- Sulfate >  258.97
|   |--- Sulfate <= 387.80
|   |   |--- ph <= 4.64
|   |   |   |--- class: 0
|   |   |--- ph >  4.64
|   |   |   |--- class: 0
|   |--- Sulfate >  387.80
|   |   |--- ph <= 7.61
|   |   |   |--- class: 1
|   |   |--- ph >  7.61
|   |   |   |--- class: 0



In [15]:
def get_rules(tree, feature_names=None, class_names=None):
    """Convert a fitted decision tree into a list of human-readable rules.

    One rule is produced per leaf, of the form
    ``if <cond> and <cond> ... then <prediction> | based on N samples``.
    Rules are ordered from the leaf with the most samples to the one with
    the fewest.

    Parameters
    ----------
    tree : fitted estimator
        Object exposing ``tree_`` with the arrays ``feature``, ``threshold``,
        ``children_left``, ``children_right``, ``value`` and
        ``n_node_samples``.
    feature_names : sequence of str, optional
        Name of each feature, indexed by feature position. When ``None``,
        names of the form ``feature_<index>`` are generated.
    class_names : sequence of str, optional
        Label for each class, indexed by class position. When ``None`` the
        tree is treated as a regressor and the leaf value is reported as
        ``response``.

    Returns
    -------
    list of str
        The rules, largest leaf first.

    Notes
    -----
    Only index 0 of each node's ``value`` entry is read, so for a
    multi-output tree only the first output is described.
    """
    tree_ = tree.tree_

    def split_name(feature_idx):
        # Fall back to positional names when the caller supplied none.
        if feature_names is None:
            return f"feature_{feature_idx}"
        return feature_names[feature_idx]

    # One (conditions, node value, sample count) triple per leaf.
    leaves = []

    def recurse(node, conditions):
        feature_idx = tree_.feature[node]
        if feature_idx == _tree.TREE_UNDEFINED:
            # Leaf: no split feature is recorded for this node.
            leaves.append((conditions, tree_.value[node], tree_.n_node_samples[node]))
            return
        name = split_name(feature_idx)
        threshold = np.round(tree_.threshold[node], 3)
        # Each branch extends its own copy of the condition list.
        recurse(tree_.children_left[node], conditions + [f"({name} <= {threshold})"])
        recurse(tree_.children_right[node], conditions + [f"({name} > {threshold})"])

    recurse(0, [])

    # Sort by sample count, largest first (ascending argsort, then reversed).
    order = np.argsort([n_samples for _, _, n_samples in leaves])[::-1]

    rules = []
    for idx in order:
        conditions, value, n_samples = leaves[idx]
        # A tree that is a single leaf has no conditions; emit "if True"
        # rather than the malformed "if  then".
        antecedent = " and ".join(conditions) if conditions else "True"
        if class_names is None:
            consequent = "response: " + str(np.round(value[0][0], 3))
        else:
            class_weights = value[0]
            best = np.argmax(class_weights)
            # Normalising by the sum works whether the entries are counts
            # or fractions.
            proba = np.round(100.0 * class_weights[best] / np.sum(class_weights), 2)
            consequent = f"class: {class_names[best]} (proba: {proba}%)"
        rules.append(f"if {antecedent} then {consequent} | based on {n_samples:,} samples")

    return rules


In [16]:
# Extract one rule per leaf and print them, one per line.
rules = get_rules(tree=clf, feature_names=arr, class_names=["Non drinkable", "Drinkable"])
for rule_text in rules:
    print(rule_text)


if (Sulfate > 258.97) and (Sulfate <= 387.796) and (ph > 4.636) then class: Non drinkable (proba: 62.19%) | based on 2,809 samples
if (Sulfate > 258.97) and (Sulfate > 387.796) and (ph <= 7.61) then class: Drinkable (proba: 66.04%) | based on 159 samples
if (Sulfate > 258.97) and (Sulfate <= 387.796) and (ph <= 4.636) then class: Non drinkable (proba: 79.45%) | based on 146 samples
if (Sulfate > 258.97) and (Sulfate > 387.796) and (ph > 7.61) then class: Non drinkable (proba: 78.08%) | based on 73 samples
if (Sulfate <= 258.97) and (Solids > 21161.408) and (ph > 5.825) then class: Drinkable (proba: 94.34%) | based on 53 samples
if (Sulfate <= 258.97) and (Solids <= 21161.408) and (ph <= 7.947) then class: Non drinkable (proba: 73.91%) | based on 23 samples
if (Sulfate <= 258.97) and (Solids > 21161.408) and (ph <= 5.825) then class: Non drinkable (proba: 50.0%) | based on 8 samples
if (Sulfate <= 258.97) and (Solids <= 21161.408) and (ph > 7.947) then class: Drinkable (proba: 100.0%) |