In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
sns.set(style="ticks")
from sklearn.model_selection import train_test_split
from operator import itemgetter
import matplotlib.ticker as ticker
import math
from enum import Enum
from sklearn.impute import SimpleImputer
from sklearn.impute import MissingIndicator
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from typing import Dict, Tuple
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, fbeta_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import KFold, RepeatedKFold, LeaveOneOut, LeavePOut, ShuffleSplit, StratifiedKFold, StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import learning_curve, validation_curve

In [3]:
filename = 'water_potability.csv'
data = pd.read_csv(filename)

In [4]:
data.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [5]:
data.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,2785.0,3276.0,3276.0,3276.0,2495.0,3276.0,3276.0,3114.0,3276.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786,0.39011
std,1.59432,32.879761,8768.570828,1.583085,41.41684,80.824064,3.308162,16.175008,0.780382,0.487849
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0
25%,6.093092,176.850538,15666.690297,6.127421,307.699498,365.734414,12.065801,55.844536,3.439711,0.0
50%,7.036752,196.967627,20927.833607,7.130299,333.073546,421.884968,14.218338,66.622485,3.955028,0.0
75%,8.062066,216.667456,27332.762127,8.114887,359.95017,481.792304,16.557652,77.337473,4.50032,1.0
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0


In [6]:
data.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [7]:
# проверим пропущенные значения
data.isnull().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [8]:
strategies=['mean', 'median', 'most_frequent']

In [9]:
# импьютация нужной колонки с помощью нужной стратегии
def func_impute_col(dataset, column, strategy_param):
    temp_data = dataset[[column]]
    
    imp_num = SimpleImputer(strategy=strategy_param)
    data_num_imp = imp_num.fit_transform(temp_data)
    
    return data_num_imp

In [10]:
# замена медианой pH
col_imp = func_impute_col(data, 'ph', strategies[1])
data[['ph']] = col_imp

In [11]:
# замена медианой Sulfate
col_imp = func_impute_col(data, 'Sulfate', strategies[1])
data[['Sulfate']] = col_imp

In [12]:
# замена медианой Sulfate
col_imp = func_impute_col(data, 'Trihalomethanes', strategies[1])
data[['Trihalomethanes']] = col_imp

In [13]:
# проверим пропущенные значения
data.isnull().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [14]:
# целевой признак - Potability - указывает, безопасна ли вода для потребления человеком 
target = data['Potability']

In [15]:
data.corr()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
ph,1.0,0.07576,-0.082004,-0.031741,0.014178,0.017466,0.04024,0.003145,-0.036107,-0.003014
Hardness,0.07576,1.0,-0.046899,-0.030054,-0.092833,-0.023915,0.00361,-0.012707,-0.014449,-0.013837
Solids,-0.082004,-0.046899,1.0,-0.070148,-0.149747,0.013831,0.010242,-0.008799,0.019546,0.033743
Chloramines,-0.031741,-0.030054,-0.070148,1.0,0.023762,-0.020486,-0.012653,0.016614,0.002363,0.023779
Sulfate,0.014178,-0.092833,-0.149747,0.023762,1.0,-0.014182,0.027102,-0.025657,-0.009767,-0.020476
Conductivity,0.017466,-0.023915,0.013831,-0.020486,-0.014182,1.0,0.020966,0.001184,0.005798,-0.008128
Organic_carbon,0.04024,0.00361,0.010242,-0.012653,0.027102,0.020966,1.0,-0.012958,-0.027308,-0.030001
Trihalomethanes,0.003145,-0.012707,-0.008799,0.016614,-0.025657,0.001184,-0.012958,1.0,-0.021487,0.006887
Turbidity,-0.036107,-0.014449,0.019546,0.002363,-0.009767,0.005798,-0.027308,-0.021487,1.0,0.001581
Potability,-0.003014,-0.013837,0.033743,0.023779,-0.020476,-0.008128,-0.030001,0.006887,0.001581,1.0


In [16]:
# уберем столбцы, слабо коррелирующие с целевым признаком
data_clean = data
data_clean = data_clean.drop(columns = ['ph'], axis = 1)
data_clean = data_clean.drop(columns = ['Conductivity'], axis = 1)
data_clean = data_clean.drop(columns = ['Trihalomethanes'], axis = 1)
data_clean = data_clean.drop(columns = ['Turbidity'], axis = 1)
data_clean = data_clean.drop(columns = ['Hardness'], axis = 1)

In [17]:
x_train,x_test,y_train,y_test=train_test_split(data_clean,target,test_size=0.2,random_state=1)


In [18]:
def regr_to_class(y: int) -> str:
    if y==1:
        return 'Is_safe'
    else:
        return 'Not_safe'

In [28]:
# формирование целевого признака для классификации
x_train['Potability_class'] = \
x_train.apply(lambda row: regr_to_class(row['Potability']),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train['Potability_class'] = \


In [29]:
x_train

Unnamed: 0,Solids,Chloramines,Sulfate,Organic_carbon,Potability,Potability_class
675,20225.642923,7.569518,352.685843,19.621923,1,Is_safe
1359,6626.376426,5.117364,372.625257,23.135952,0,Not_safe
1391,16795.318674,6.708068,326.952617,12.510887,0,Not_safe
1727,18535.046452,5.497159,339.838977,14.979000,0,Not_safe
1677,30210.250367,9.508299,340.245766,23.234326,0,Not_safe
...,...,...,...,...,...,...
2763,23347.172710,9.000395,333.073546,14.173906,1,Is_safe
905,34160.925144,8.963156,363.472798,15.905270,0,Not_safe
1096,45141.686036,6.030640,240.198505,20.605552,1,Is_safe
235,17650.405049,8.121080,350.487939,10.999416,0,Not_safe


In [30]:
class PredictionType(Enum):
    CLASSIFICATION = 1
    REGRESSION = 2

In [31]:
class SimpleKNN:
    
    def fit(self, X_train: np.matrix, y_train: np.ndarray):
        """
        Метод обучения, который фактически не учится, 
        а только запоминает обучающую выборку.
        Входные параметры:
        X_train - обучающая выборка (матрица объект-признак)
        y_train - обучающая выборка (вектор целевого признака)
        Возвращаемое значение: нет
        """
        #Сохраняем параметры в переменных класса
        self._X_train = X_train
        self._y_train = y_train
          
    def eucl_dist(self, p: np.ndarray, q: np.ndarray) -> float:
        """
        Вычисление Евклидова расстояния - https://en.wikipedia.org/wiki/Euclidean_distance 
        Входные параметры:
        p, q - вектора в n-мерном пространстве признаков
        """
        return math.sqrt(sum([(pi - qi) ** 2 for pi, qi in zip (p, q)]))
            
            
    def predict_for_single_object(self, K: int, \
                prediction_type: PredictionType, \
                X_o: np.ndarray, \
                verbose = True) -> np.ndarray:
        """
        Метод предсказания для одного объекта.
        Входные параметры:
        K - гиперпараметр, количество соседей 
        prediction_type - классификация или регрессия 
        X_o - строка матрицы объект-признак, соответствующая объекту
        verbose - флаг детального вывода
        Возвращаемое значение: предсказанное значение целевого признака
        """
        # список соседей
        neighbors_list = []
        # *** Находим ближайшие точки ***
        # Перебираем все точки обучающей выборки
        for i in range(self._X_train.shape[0]):
            # получаем текущую точку
            data_train_current_x = [x for x in self._X_train[['Solids','Chloramines', 'Sulfate', 'Organic_carbon']].iloc[i]]
            # и значение ее y
            data_train_current_y = self._y_train.values[i]
            # вычисляем расстояние
            dist = self.eucl_dist(X_o, data_train_current_x)
            # сохраняем в список соседей
            temp_res = (data_train_current_y, dist, data_train_current_x)
            neighbors_list.append(temp_res)
        # *** сортируем список соседей по возрастанию расстояния *** 
        # в кортеже элементы следуют в порядке (0,1,2), сортируем по первому элементу 
        neighbors_list_sorted = sorted(neighbors_list, key=itemgetter(1))
        if verbose:
            print()
            print('**************************************')
            print('Проверяемая точка: ', X_o)
            print('**************************************')
            print('Вывод отсортированного списка соседей:')    
            dist_list = []
            for cur_y, cur_dist, temp_x_1_2 in neighbors_list_sorted:
                temp_x1, temp_x2 = temp_x_1_2
                print('X1={0}, X2={1}, y={2}, расстояние={3:.2f}'.format(temp_x1, temp_x2, cur_y, cur_dist))
                dist_list.append(cur_dist)
            print()
            print('Вывод расстояния для отсортированного списка соседей:')    
            plt.plot(dist_list)
            plt.show()
        # Оставим только K ближайших соседей
        K_neighbors_list_sorted = neighbors_list_sorted[:K]
        if verbose:
            print('Вывод К ближайших соседей:')
            x1_list = []
            x2_list = []
            for cur_y, cur_dist, temp_x_1_2 in K_neighbors_list_sorted:
                temp_x1, temp_x2 = temp_x_1_2
                x1_list.append(temp_x1)
                x2_list.append(temp_x2)
                print('X1={0}, X2={1}, y={2}, расстояние={3:.2f}'.format(temp_x1, temp_x2, cur_y, cur_dist))
            print()
            print('Визуализация К ближайших соседей:')
            plt.plot(self._X_train['x1'], self._X_train['x2'], 'b.', \
                     x1_list, x2_list,  'g*', \
                    [X_o[0]], [X_o[1]], 'ro')
            plt.show()   
        # Результат - классификация или регрессия
        if prediction_type == PredictionType.REGRESSION:
            # используем numpy для вычисления среднего значения
            arr = np.array([x for x,_,_ in K_neighbors_list_sorted])
            # возвращаем среднее значение
            return np.mean(arr)          
        elif prediction_type == PredictionType.CLASSIFICATION:
            k_y_list = [y for y,_,_ in K_neighbors_list_sorted]
            # группируем с количеством метки классов,
            # соответствующие K ближайшим соседям
            k_y_list_grouped_temp = np.unique(k_y_list, return_counts=True)
            k_y_list_grouped = [[key, cnt] for key, cnt in zip(k_y_list_grouped_temp[0], k_y_list_grouped_temp[1])]
            # сортируем по количеству по убыванию
            k_y_list_grouped_sorted = sorted(k_y_list_grouped, key=itemgetter(1), reverse=True)
            if verbose:
                print('Классы, соответствующие К ближайшим соседям:')
                for i in k_y_list_grouped_sorted:
                    print('класс={0}, количество элементов={1}'.format(i[0], i[1]))
            # возвращаеv метку класса из первой строки отсортированного массива
            # то есть того класса, к которому принадлежит наибольшее количество соседей
            return k_y_list_grouped_sorted[0][0]
        else:
            raise Exception('Неизвестный тип предсказания')
                   
    
    def predict(self, K: int, \
                prediction_type: PredictionType, \
                X_test: np.matrix, 
                verbose = True) -> np.ndarray:
        """
        Метод предсказания.
        Входные параметры:
        K - гиперпараметр, количество соседей 
        prediction_type - классификация или регрессия 
        X_test - тестовая выборка (матрица объект-признак)
        Возвращаемое значение: предсказанный вектор целевого признака
        """
        # Перебираем все точки тестовой выборки
        test_data_temp = []
        for i in range(X_test.shape[0]):
            # получаем текущую точку
            data_test_current_x = [x for x in X_test.iloc[i]]
            test_data_temp.append(data_test_current_x)    
   
        return [self.predict_for_single_object(K=K, \
                prediction_type=prediction_type, \
                X_o=i, verbose=verbose) for i in test_data_temp]

In [32]:
x_train.head()

Unnamed: 0,Solids,Chloramines,Sulfate,Organic_carbon,Potability,Potability_class
675,20225.642923,7.569518,352.685843,19.621923,1,Is_safe
1359,6626.376426,5.117364,372.625257,23.135952,0,Not_safe
1391,16795.318674,6.708068,326.952617,12.510887,0,Not_safe
1727,18535.046452,5.497159,339.838977,14.979,0,Not_safe
1677,30210.250367,9.508299,340.245766,23.234326,0,Not_safe


In [33]:
# классификация
simple_knn_class = SimpleKNN()
simple_knn_class.fit(x_train[['Solids','Chloramines', 'Sulfate', 'Organic_carbon']], x_train['Potability_class'])

In [34]:
simple_knn_class._X_train


Unnamed: 0,Solids,Chloramines,Sulfate,Organic_carbon
675,20225.642923,7.569518,352.685843,19.621923
1359,6626.376426,5.117364,372.625257,23.135952
1391,16795.318674,6.708068,326.952617,12.510887
1727,18535.046452,5.497159,339.838977,14.979000
1677,30210.250367,9.508299,340.245766,23.234326
...,...,...,...,...
2763,23347.172710,9.000395,333.073546,14.173906
905,34160.925144,8.963156,363.472798,15.905270
1096,45141.686036,6.030640,240.198505,20.605552
235,17650.405049,8.121080,350.487939,10.999416


In [35]:
simple_knn_class._y_train

675      Is_safe
1359    Not_safe
1391    Not_safe
1727    Not_safe
1677    Not_safe
          ...   
2763     Is_safe
905     Not_safe
1096     Is_safe
235     Not_safe
1061    Not_safe
Name: Potability_class, Length: 2620, dtype: object

In [36]:
# первая строка тестовой выборки
data_test_0 = [x for x in x_test[['Solids','Chloramines', 'Sulfate', 'Organic_carbon']].iloc[0]]
data_test_0

[14775.14559575291, 7.484103646670236, 305.82855327870027, 12.309015955365226]

In [37]:
# построим предсказание для одного объекта (классификация)
simple_knn_class_0 = simple_knn_class.predict_for_single_object(K=5, \
                          prediction_type=PredictionType.CLASSIFICATION, \
                          X_o=data_test_0, verbose=False)
simple_knn_class_0

'Not_safe'

In [38]:
model = KNeighborsClassifier(n_neighbors=3)
model

KNeighborsClassifier(n_neighbors=3)

In [39]:
model.fit(x_train[['Solids','Chloramines', 'Sulfate', 'Organic_carbon']], y_train)
m_predict = model.predict(x_test[['Solids','Chloramines', 'Sulfate', 'Organic_carbon']])
m_predict_train = model.predict(x_train[['Solids','Chloramines', 'Sulfate', 'Organic_carbon']])

In [40]:
model2 = KNeighborsClassifier(n_neighbors=10)
model2.fit(x_train[['Solids','Chloramines', 'Sulfate', 'Organic_carbon']], y_train)
m2_predict = model2.predict(x_test[['Solids','Chloramines', 'Sulfate', 'Organic_carbon']])

In [41]:
# 1 параметр - эталонное значение классов из исходной (тестовой) выборки
# 2 параметр - предсказанное значение классов

# 3 ближайших соседа
accuracy_score(y_test, m_predict)

0.5274390243902439

In [42]:
# 10 ближайших соседей
accuracy_score(y_test, m2_predict)

0.5685975609756098

In [43]:
# 3 ближайших соседа
balanced_accuracy_score(y_test, m_predict)

0.5000426301878571

In [44]:
# 10 ближайших соседей
balanced_accuracy_score(y_test, m2_predict)

0.5170520751428112

In [45]:
def accuracy_score_for_classes(
    y_true: np.ndarray, 
    y_pred: np.ndarray) -> Dict[int, float]:
    # Для удобства фильтрации сформируем Pandas DataFrame 
    d = {'t': y_true, 'p': y_pred}
    df = pd.DataFrame(data=d)
    # Метки классов
    classes = np.unique(y_true)
    # Результирующий словарь
    res = dict()
    # Перебор меток классов
    for c in classes:
        # отфильтруем данные, которые соответствуют 
        # текущей метке класса в истинных значениях
        temp_data_flt = df[df['t']==c]
        # расчет accuracy для заданной метки класса
        temp_acc = accuracy_score(
            temp_data_flt['t'].values, 
            temp_data_flt['p'].values)
        # сохранение результата в словарь
        res[c] = temp_acc
    return res

def print_accuracy_score_for_classes(
    y_true: np.ndarray, 
    y_pred: np.ndarray):
    accs = accuracy_score_for_classes(y_true, y_pred)
    if len(accs)>0:
        print('Метка \t Accuracy')
    for i in accs:
        print('{} \t {}'.format(i, accs[i]))

In [46]:
# 3 ближайших соседа
print_accuracy_score_for_classes(y_test, m_predict)

Метка 	 Accuracy
0 	 0.6997319034852547
1 	 0.3003533568904594


In [47]:
# 10 ближайших соседей
print_accuracy_score_for_classes(y_test, m2_predict)

Метка 	 Accuracy
0 	 0.8927613941018767
1 	 0.1413427561837456
