In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import NearMiss 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import matplotlib


from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [None]:
# Load the dataset from ./bases/water_mod.csv; decimal=',' makes pandas read a comma as the decimal separator.
df = pd.read_csv('./bases/water_mod.csv', decimal=',')

In [None]:
# Show the loaded frame as the cell output.
df

In [None]:
# Build a copy of the frame without the 'Solids' and 'Conductivity' columns.
# NOTE(review): `dfdrop` is not referenced by any other cell visible here, and `df`
# itself still carries both columns -- confirm whether the drop was meant to apply to `df`.
dfdrop = df.drop(columns=['Solids', 'Conductivity'])

In [None]:
# Count the missing (NaN) values in each column.
df.isnull().sum()

In [None]:
# Drop, in place, every row that has at least one missing value.
df.dropna(inplace = True)

In [None]:
# Coerce every column to a numeric dtype (pd.to_numeric is applied column by column; no rounding is done).
df = df.apply(pd.to_numeric)

In [None]:
# Box plot of 'ph' to look for outliers.
df['ph'].plot(kind = 'box')

In [None]:
# Treat outliers in 'Turbidity' by clipping them to the Tukey fences
# (first/third quartile -/+ 1.5 * IQR). Outliers were handled one column at a time.
# NOTE(review): on a clean top-to-bottom run only 'Turbidity' is clipped -- confirm
# whether the other columns that were treated by hand should be clipped here too.

# Work on a copy so `df` is left untouched and the cell runs on a fresh kernel:
# with this line commented out `dfTeste` was never defined and the cell raised NameError.
dfTeste = df.copy()

q1 = dfTeste['Turbidity'].quantile(0.25)
q3 = dfTeste['Turbidity'].quantile(0.75)
iqr = q3 - q1
lLim = q1 - 1.5 * iqr
hLim = q3 + 1.5 * iqr
dfTeste.loc[dfTeste['Turbidity'] < lLim, 'Turbidity'] = lLim  # values below the lower fence become the lower fence
dfTeste.loc[dfTeste['Turbidity'] > hLim, 'Turbidity'] = hLim  # values above the upper fence become the upper fence

print(dfTeste)

In [None]:
# Box plot of 'ph' taken from the treated frame.
dfTeste['ph'].plot(kind = 'box')
# Rebind `df` to the treated frame (same object; no copy is made).
df = dfTeste

In [None]:
# Check the class balance: number of rows per 'Potability' value.
df.Potability.value_counts()

In [None]:
# Clip 'Turbidity' to the Tukey fences: anything further than 1.5 * IQR from the
# quartiles is replaced by the fence itself (outliers were handled one column at a time).


#dfTeste = df.copy()

q1, q3 = (dfTeste['Turbidity'].quantile(q) for q in (0.25, 0.75))
iqr = q3 - q1
lLim, hLim = q1 - 1.5 * iqr, q3 + 1.5 * iqr
# Rows under the lower fence are raised to it ...
dfTeste.loc[dfTeste['Turbidity'].lt(lLim), 'Turbidity'] = lLim
# ... and rows over the upper fence are lowered to it.
dfTeste.loc[dfTeste['Turbidity'].gt(hLim), 'Turbidity'] = hLim

print(dfTeste)

In [None]:
# Box plot of 'Turbidity' after the outlier treatment.
dfTeste['Turbidity'].plot(kind = 'box')

In [None]:
# Balance the rows with respect to the target column: NearMiss resamples the
# features (everything except 'Potability') against the 'Potability' target.
from imblearn.under_sampling import NearMiss 

X, y = NearMiss().fit_resample(df.drop(columns=['Potability']), df['Potability'])

In [None]:
# Put the resampled features and the resampled target back into a single frame,
# with 'Potability' as the last column.
dfTeste = pd.DataFrame(X, columns=df.columns.drop('Potability')).assign(Potability=y)
print(dfTeste)

In [None]:
# Row count per class after resampling.
dfTeste['Potability'].value_counts()

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

class runmodel:
  '''
    Repeatedly train and test a binary classifier, keeping the metrics of every run.

    Parameters:

    X: input features. The cross-validation path slices with .iloc, so a
       pandas DataFrame is required there.
    y: target attribute. Its two classes must appear as '0' and '1' in
       sklearn.metrics.classification_report(..., output_dict=True).
    model: estimator with fit/predict, used only as a template: every run fits a
       fresh copy, so the object handed in is never refitted.
    cv: None runs n stratified holdouts (test_size=0.33); an int runs n rounds of
       shuffled, stratified cross-validation with that many folds.
    n: how many times the sampling is repeated.
    random_state: None (default), an int seed or a numpy RandomState. It drives
       every split drawn by this object; None keeps the unseeded behaviour.

    All the work happens in the constructor. Afterwards `resultados` exposes the
    per-run metrics and `mostraresultadomedio` prints mean(std) of each of them.

    For datasets with more than two classes the stored keys must be adapted.
  '''

  # (key inside a classification-report row, prefix used in `resultados`)
  __METRICAS = (
    ('precision', 'precision'),
    ('recall', 'recall'),
    ('f1-score', 'f1'),
    ('support', 'support'),
  )
  # (row of the classification report, suffix used in `resultados`)
  __LINHAS = (('0', '0'), ('1', '1'), ('weighted avg', 'wavg'))

  def __init__(self, X, y, model, cv = None, n = 1, random_state = None):
    # Imported locally so the class does not depend on imports made in another cell.
    from sklearn.utils import check_random_state

    self.__resultados = {
      'precision_0': [],
      'recall_0': [],
      'f1_0': [],
      'support_0': [],
      'precision_1': [],
      'recall_1': [],
      'f1_1': [],
      'support_1' : [],
      'accuracy' : [],
      'precision_wavg' : [],
      'recall_wavg' : [],
      'f1_wavg' : [],
      'support_wavg' : []
    }

    # A single generator is shared by all repetitions: successive splits differ from
    # one another, yet the whole sequence is reproducible when a seed is given.
    self.__rng = check_random_state(random_state)

    if cv is None:
      print(f'{n} holdouts')
      for _ in range(n):
        self.__avaliamodelo(X, y, model)

    else:
      print(f'{n} validação cruzada com {cv} folds')
      for _ in range(n):
        self.__avaliamodelo_cv(X, y, model, cv)

  @property
  def resultados(self):
    '''Dictionary mapping each metric name to the list of values collected, one per run/fold.'''
    return self.__resultados

  @staticmethod
  def __clona(model):
    '''Return an unfitted copy of `model` so the caller's estimator is left as it was.'''
    from sklearn.base import clone
    # safe=False falls back to a deep copy for objects that are not sklearn estimators.
    return clone(model, safe=False)

  def __avaliamodelo(self, X, y, model):
    '''Run one stratified holdout (test_size=0.33) and store its metrics.'''
    X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.33, stratify=y, random_state=self.__rng)
    # Fit a copy: fitting `model` itself would overwrite whatever the caller trained.
    estimador = self.__clona(model)
    estimador.fit(X_train, y_train)
    pred = estimador.predict(X_test)
    self.__armazenametricas(metrics.classification_report(y_test, pred, output_dict=True))

  def __avaliamodelo_cv(self, X, y, model, cv):
    '''Run one shuffled, stratified cross-validation with `cv` folds, storing metrics per fold.'''
    # Shuffling is what makes repeated cross-validations produce different folds.
    skf = StratifiedKFold(shuffle=True, n_splits=cv, random_state=self.__rng)
    for train_index, test_index in skf.split(X, y):
      estimador = self.__clona(model)
      estimador.fit(X.iloc[train_index], y.iloc[train_index])
      pred = estimador.predict(X.iloc[test_index])
      self.__armazenametricas(
        metrics.classification_report(y.iloc[test_index], pred, output_dict=True))

  def __armazenametricas(self, d):
    '''Append the values of one classification-report dictionary `d` to `resultados`.'''
    for linha, sufixo in self.__LINHAS:
      valores = d[linha]
      for metrica, prefixo in self.__METRICAS:
        self.__resultados[f'{prefixo}_{sufixo}'].append(valores[metrica])

    self.__resultados['accuracy'].append(d['accuracy'])

  def __media_desvio(self, chave):
    '''Format one stored metric as "mean(std)", both rounded to two decimals.'''
    valores = self.__resultados[chave]
    return f"{round(np.mean(valores), 2)}({round(np.std(valores), 2)})"

  def __linha(self, sufixo):
    '''Join precision, recall, f1 and support of one report row into a table line.'''
    return " \t ".join(
      self.__media_desvio(f'{prefixo}_{sufixo}') for _, prefixo in self.__METRICAS)

  def mostraresultadomedio(self):
    '''Print a classification-report-like table with mean(std) of every stored metric.'''
    print("\t\t precision \t recall \t f1-score \t support\n")
    print(f"0 \t\t {self.__linha('0')}\n")
    print(f"1 \t\t {self.__linha('1')}\n")
    print(f"accuracy \t\t  \t\t  \t {self.__media_desvio('accuracy')} \t\t \n")
    print(f"weighted avg \t {self.__linha('wavg')}\n")

Aplicando KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Stratified holdout split of the balanced frame (33% of the rows for testing).
# random_state pins the shuffle so a Restart & Run All reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(dfTeste.drop(columns=['Potability']), dfTeste['Potability'], test_size=0.33, stratify=dfTeste['Potability'], random_state=42)

In [None]:
# Hyper-parameters tried for KNN: odd neighbour counts from 1 to 39,
# both weighting schemes and p in {1, 2, 3}.
param_grid_knn = dict(
    n_neighbors=range(1, 40, 2),
    weights=['uniform', 'distance'],
    p=[1, 2, 3],
)
# Grid search scored by weighted F1 with cv=10.
gridknn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid_knn,
    scoring='f1_weighted',
    cv=10,
    verbose=1,
)
# Run the search on the training split.
gridknn.fit(X_train, y_train)

In [None]:
# Estimator selected by the KNN grid search.
gridknn.best_estimator_

In [None]:
# Best cross-validated score found by the KNN grid search (scoring='f1_weighted').
gridknn.best_score_

In [None]:
# Predict the held-out rows with the tuned KNN and print the classification report.
grid_predictions = gridknn.predict(X_test)
print(metrics.classification_report(y_test, grid_predictions))

In [None]:
# Evaluate the tuned KNN with one round of 10-fold cross-validation over the whole balanced frame.
knn = gridknn.best_estimator_
cross_validation_knn = runmodel(dfTeste.drop(columns=['Potability']), dfTeste['Potability'], knn, cv = 10, n = 1 )

In [None]:
# Print mean (standard deviation) of the KNN cross-validation metrics.
print('\tMédia e desvio padrão do KNN ~ 10 fold cross validation\n')
cross_validation_knn.mostraresultadomedio()

In [None]:
# Pairwise plots of the columns, coloured by class.
# seaborn is imported in this cell because the notebook's only other `import seaborn`
# sits in a later cell, so on a fresh top-to-bottom run `sns` was undefined here.
import seaborn as sns

sns.pairplot(data=dfTeste, hue='Potability')

Aplicando Naive Bayes

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import seaborn as sns

In [None]:
# Stratified holdout split of the balanced frame (33% of the rows for testing).
# random_state pins the shuffle so a Restart & Run All reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(dfTeste.drop(columns=['Potability']), dfTeste['Potability'], test_size=0.33, stratify=dfTeste['Potability'], random_state=42)

In [None]:
# Fit Gaussian Naive Bayes on the training split, then predict the held-out rows.
model = GaussianNB().fit(X_train, y_train)
pred = model.predict(X_test)

# Per-class precision / recall / F1 on the hold-out split.
print(metrics.classification_report(y_test, pred))

In [None]:
# Show the predicted labels for the held-out rows.
pred

Teste com validação cruzada

In [None]:
# Evaluate Gaussian Naive Bayes with one round of 10-fold cross-validation over the whole balanced frame.
cross_validation_naive = runmodel(dfTeste.drop(columns=['Potability']), dfTeste['Potability'], GaussianNB(), cv = 10, n = 1 )

In [None]:
# Print mean (standard deviation) of the Naive Bayes cross-validation metrics.
cross_validation_naive.mostraresultadomedio()

Aplicando SVM

In [None]:
import numpy as np
from sklearn.svm import SVC

In [None]:
# Only the sigmoid kernel is searched; the full candidate list is kept for reference:
# 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[0.1, 0.01, 0.001, 'auto', 'scale'],
    kernel=['sigmoid'],
)
# No scoring or cv argument is passed, so GridSearchCV uses its defaults for both.
gridsvm = GridSearchCV(estimator=SVC(), param_grid=param_grid, verbose=3)
gridsvm.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the SVM grid search.
gridsvm.best_params_

Teste de validação cruzada

In [None]:
svm = gridsvm.best_estimator_
# Hold-out report for the tuned SVM. The predictions are computed here, from gridsvm:
# the report used to read `grid_predictions` while it still held the KNN predictions,
# which were made on a different split. They are taken before the cross-validation
# so the report reflects only the model fitted on X_train.
grid_predictions = gridsvm.predict(X_test)
print(metrics.classification_report(y_test, grid_predictions))
# One round of 10-fold cross-validation over the whole balanced frame.
cross_validation_svm = runmodel(dfTeste.drop(columns=['Potability']), dfTeste['Potability'], svm, cv = 10, n = 1 )

In [None]:
# Predict the held-out rows with the tuned SVM.
grid_predictions = gridsvm.predict(X_test)
# Print mean (standard deviation) of the SVM cross-validation metrics.
cross_validation_svm.mostraresultadomedio()

Rede neural - MLP

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# Stratified holdout split of the balanced frame (33% of the rows for testing).
# random_state pins the shuffle so a Restart & Run All reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(dfTeste.drop(columns=['Potability']), dfTeste['Potability'], test_size=0.33, stratify=dfTeste['Potability'], random_state=42)

In [None]:
# Baseline MLP with the default configuration. random_state fixes the random
# initialisation so rerunning the notebook trains the same network.
mlp = MLPClassifier(random_state=42)
mlp.fit(X_train, y_train)

In [None]:
from sklearn import metrics
# Predict the held-out rows with the baseline MLP and print the classification report.
pred = mlp.predict(X_test)
print(metrics.classification_report(y_test, pred))

In [None]:
# Hyper-parameter grid explored for the MLP.
param_grid_mlp = dict(
    hidden_layer_sizes=[(100,), (7,)],  # default and heuristic
    activation=['identity', 'logistic', 'tanh', 'relu'],
    learning_rate_init=[0.001, 0.01, 0.1],
    max_iter=[200, 1000, 5000, 10000],
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

# Grid search over the MLP settings; no scoring or cv argument is passed, so the
# GridSearchCV defaults apply. A fixed random_state on the estimator makes every
# candidate, and therefore the selected model, reproducible across reruns.
gridmlp = GridSearchCV(MLPClassifier(random_state=42), param_grid_mlp, verbose = 1)

gridmlp.fit(X_train,y_train)

In [None]:
# Parameter combination selected by the MLP grid search.
gridmlp.best_params_

In [None]:
# Estimator selected by the MLP grid search.
gridmlp.best_estimator_

In [None]:
# Best cross-validated score found by the MLP grid search.
gridmlp.best_score_

Teste de validação cruzada

In [None]:
# Evaluate the tuned MLP with one round of 10-fold cross-validation over the whole balanced frame.
mlp = gridmlp.best_estimator_
cross_validation_mlp = runmodel(dfTeste.drop(columns=['Potability']), dfTeste['Potability'], mlp, cv = 10, n = 1 )

In [None]:
# Print mean (standard deviation) of the MLP cross-validation metrics.
print('\tMédia e desvio padrão do MLP com 10 fold cross validation\n')
cross_validation_mlp.mostraresultadomedio()