## Tratamento e análise exploratória dos dados usando a API do pandas no Spark

In [0]:
import pyspark.pandas as ps

In [0]:
# Path of the raw cancer CSV inside the mounted bronze layer.
base_cancer = '/mnt/Databricks_Dados/bronze/Cancer_Data.csv'

# Load the file through the pandas API on Spark.
cancer_df = ps.read_csv(base_cancer)

In [0]:
cancer_df.head()

In [0]:
cancer_df.info()

In [0]:
cancer_df.columns

In [0]:
# Discard the '_c32' column (presumably the empty trailing CSV column — confirm).
cancer_df = cancer_df.drop(columns=['_c32'])

In [0]:
cancer_df.columns

In [0]:
# Discard the 'id' column so it is not fed to the models as a feature.
cancer_df = cancer_df.drop(columns=['id'])

In [0]:
cancer_df.columns

In [0]:
type(cancer_df)

In [0]:
# Feature matrix: every column except the one at position 0 (position 0 is
# used as the target further down). to_numpy() collects the data into a
# NumPy array on the driver, so this only suits datasets that fit in memory.
X_cancer = cancer_df.iloc[:, 1:].to_numpy()

In [0]:
X_cancer

In [0]:
# Target vector: the column at position 0
# (presumably 'diagnosis' now that 'id' has been dropped — confirm).
y_cancer = cancer_df.iloc[:, 0].to_numpy()

In [0]:
y_cancer

In [0]:
from sklearn.preprocessing import StandardScaler
scaler_cancer = StandardScaler()

In [0]:
# Standardise every feature using statistics learned from the full matrix.
# NOTE(review): the scaler is fitted before the train/test split performed
# later in this notebook, so test rows contribute to the scaling statistics.
# Consider fitting on the training split only to avoid leakage.
X_cancer = scaler_cancer.fit_transform(X_cancer)

In [0]:
from sklearn.preprocessing import LabelEncoder
label_encoder_diagnosis = LabelEncoder()

In [0]:
# Replace the class labels with integer codes 0..n_classes-1.
# LabelEncoder assigns codes following the sorted order of the labels
# (presumably 'B' -> 0 and 'M' -> 1 for this dataset — confirm).
y_cancer = label_encoder_diagnosis.fit_transform(y_cancer)

In [0]:
y_cancer

In [0]:
import numpy as np

In [0]:
unique_diagnosis = np.unique(cancer_df['diagnosis'].to_numpy())
unique_diagnosis

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
# Hold out 15% of the rows for testing; the fixed seed makes the split repeatable.
(X_cancer_treinamento,
 X_cancer_teste,
 y_cancer_treinamento,
 y_cancer_teste) = train_test_split(
    X_cancer,
    y_cancer,
    test_size=0.15,
    random_state=0,
)

In [0]:
X_cancer_treinamento.shape, y_cancer_treinamento.shape

In [0]:
X_cancer_teste.shape, y_cancer_teste.shape

# Etapa da aplicação de técnicas de Machine Learning

In [0]:
import pickle

In [0]:
# Persist the split so the model sections can reload exactly the same data.
# Stored order: X train, X test, y train, y test.
with open('cancer.pkl', 'wb') as f:
    pickle.dump(
        [
            X_cancer_treinamento,
            X_cancer_teste,
            y_cancer_treinamento,
            y_cancer_teste,
        ],
        f,
    )

### Naïve Bayes

In [0]:
from sklearn.naive_bayes import GaussianNB

In [0]:
# Reload the persisted split. The unpack order must match the order used when
# the list was pickled: X train, X test, y train, y test.
with open('cancer.pkl', 'rb') as f:
  X_cancer_treinamento, X_cancer_teste, y_cancer_treinamento, y_cancer_teste = pickle.load(f)

In [0]:
X_cancer_treinamento.shape, y_cancer_treinamento.shape

In [0]:
X_cancer_teste.shape, y_cancer_teste.shape

In [0]:
# Fit a Gaussian Naive Bayes classifier on the training split
# (fit() returns the estimator itself, so the calls can be chained).
naive_cancer = GaussianNB().fit(X_cancer_treinamento, y_cancer_treinamento)

# Predict the held-out rows and echo the predictions as the cell output.
previsoes = naive_cancer.predict(X_cancer_teste)
previsoes

In [0]:
y_cancer_teste

In [0]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [0]:
accuracy_score(y_cancer_teste, previsoes)

In [0]:
confusion_matrix(y_cancer_teste, previsoes)

In [0]:
from yellowbrick.classifier import ConfusionMatrix

In [0]:
# Wrap the Naive Bayes model in Yellowbrick's confusion-matrix visualizer.
cm = ConfusionMatrix(naive_cancer) 
# NOTE(review): depending on the Yellowbrick version, fit() may refit the
# wrapped estimator on the training split — confirm.
cm.fit(X_cancer_treinamento, y_cancer_treinamento)
# Score on the held-out split; the value is echoed as the cell output.
cm.score(X_cancer_teste, y_cancer_teste)

In [0]:
print(classification_report(y_cancer_teste, previsoes))

### Árvore de decisão

In [0]:
from sklearn.tree import DecisionTreeClassifier

In [0]:
arvore_cancer = DecisionTreeClassifier(criterion='entropy', random_state=0)

In [0]:
arvore_cancer.fit(X_cancer_treinamento, y_cancer_treinamento)

In [0]:
previsoes = arvore_cancer.predict(X_cancer_teste)
previsoes

In [0]:
y_cancer_teste

In [0]:
accuracy_score(y_cancer_teste, previsoes)

In [0]:
from yellowbrick.classifier import ConfusionMatrix

In [0]:
cm = ConfusionMatrix(arvore_cancer)
cm.fit(X_cancer_treinamento, y_cancer_treinamento)
cm.score(X_cancer_teste, y_cancer_teste)

In [0]:
print(classification_report(y_cancer_teste, previsoes))

### Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier

In [0]:
random_forest_cancer = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state = 0)

In [0]:
random_forest_cancer.fit(X_cancer_treinamento, y_cancer_treinamento)

In [0]:
previsoes = random_forest_cancer.predict(X_cancer_teste) 
previsoes

In [0]:
y_cancer_teste

In [0]:
accuracy_score(y_cancer_teste, previsoes)

In [0]:
cm = ConfusionMatrix(random_forest_cancer) 
cm.fit(X_cancer_treinamento, y_cancer_treinamento)
cm.score(X_cancer_teste, y_cancer_teste)

In [0]:
print(classification_report(y_cancer_teste, previsoes))

### kNN

In [0]:
from sklearn.neighbors import KNeighborsClassifier

In [0]:
with open('cancer.pkl', 'rb') as f:
  X_cancer_treinamento, X_cancer_teste, y_cancer_treinamento, y_cancer_teste = pickle.load(f)

In [0]:
X_cancer_treinamento.shape, y_cancer_treinamento.shape

In [0]:
X_cancer_teste.shape, y_cancer_teste.shape

In [0]:
knn_cancer = KNeighborsClassifier(n_neighbors=10)

In [0]:
knn_cancer.fit(X_cancer_treinamento, y_cancer_treinamento)

In [0]:
previsoes = knn_cancer.predict(X_cancer_teste)
previsoes

In [0]:
y_cancer_teste

In [0]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_cancer_teste, previsoes)

In [0]:
from yellowbrick.classifier import ConfusionMatrix 
cm = ConfusionMatrix(knn_cancer) 
cm.fit(X_cancer_treinamento, y_cancer_treinamento)
cm.score(X_cancer_teste, y_cancer_teste)

In [0]:
print(classification_report(y_cancer_teste, previsoes))

### SVM

In [0]:
from sklearn.svm import SVC

In [0]:
with open('cancer.pkl', 'rb') as f:
  X_cancer_treinamento, X_cancer_teste, y_cancer_treinamento, y_cancer_teste = pickle.load(f)

In [0]:
svm_cancer = SVC(kernel='linear', random_state=1)

In [0]:
svm_cancer.fit(X_cancer_treinamento, y_cancer_treinamento)

In [0]:
previsoes = svm_cancer.predict(X_cancer_teste)  
previsoes

In [0]:
y_cancer_teste

In [0]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_cancer_teste, previsoes)

In [0]:
from yellowbrick.classifier import ConfusionMatrix 
cm = ConfusionMatrix(svm_cancer) 
cm.fit(X_cancer_treinamento, y_cancer_treinamento)  
cm.score(X_cancer_teste, y_cancer_teste)

### Redes Neurais

In [0]:
from sklearn.neural_network import MLPClassifier

In [0]:
with open('cancer.pkl', 'rb') as f:
  X_cancer_treinamento, X_cancer_teste, y_cancer_treinamento, y_cancer_teste = pickle.load(f)

In [0]:
# Multi-layer perceptron with two hidden layers of 20 units each, trained with
# Adam; verbose=True prints the training progress.
rede_neural_cancer = MLPClassifier(
    hidden_layer_sizes=(20, 20),
    activation='relu',
    solver='adam',
    tol=1e-5,
    max_iter=1500,
    verbose=True,
)
rede_neural_cancer.fit(X_cancer_treinamento, y_cancer_treinamento)

In [0]:
previsoes = rede_neural_cancer.predict(X_cancer_teste) 
previsoes

In [0]:
y_cancer_teste

In [0]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_cancer_teste, previsoes)

In [0]:
from yellowbrick.classifier import ConfusionMatrix 
cm = ConfusionMatrix(rede_neural_cancer)
cm.fit(X_cancer_treinamento, y_cancer_treinamento)
cm.score(X_cancer_teste, y_cancer_teste)

### Tuning dos parâmetros com GridSearch

In [0]:
from sklearn.model_selection import GridSearchCV

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [0]:
import pickle

# Reload the persisted split. The list was pickled in the order
# [X train, X test, y train, y test], so it must be unpacked in that same
# order; swapping the two middle names would bind the test features to the
# training-target name and break the concatenations that rebuild the full
# dataset below.
with open('cancer.pkl', 'rb') as f:
    (X_cancer_treinamento,
     X_cancer_teste,
     y_cancer_treinamento,
     y_cancer_teste) = pickle.load(f)

In [0]:
X_cancer_treinamento.shape, y_cancer_treinamento.shape

In [0]:
X_cancer_teste.shape, y_cancer_teste.shape

### Na validação cruzada utilizamos a base de dados inteira, pois o próprio processo de validação cruzada se encarrega de dividi-la em partes de treinamento e de teste

In [0]:
# Stack the training and test features row-wise (axis 0 is the default) to
# rebuild the complete feature matrix for cross-validation.
X_cancer = np.concatenate([X_cancer_treinamento, X_cancer_teste])
X_cancer.shape

In [0]:
X_cancer

In [0]:
# Join the training and test targets along axis 0 (the default) to rebuild
# the complete target vector for cross-validation.
y_cancer = np.concatenate([y_cancer_treinamento, y_cancer_teste])
y_cancer.shape

In [0]:
y_cancer

### Árvore de decisão

In [0]:
# Hyper-parameter grid searched for the decision tree.
parametros = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 5, 10],
)

In [0]:
# Exhaustive search over the decision-tree grid with GridSearchCV's default
# cross-validation. The estimator is seeded (as the standalone tree earlier in
# this notebook is) so that the reported best parameters and score are
# reproducible between runs; without a seed the tree's internal randomness
# (notably with splitter='random') changes the result on every run.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=parametros,
)
grid_search.fit(X_cancer, y_cancer)

# Best parameter combination and its mean cross-validated score.
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

### Random Forest

In [0]:
# Hyper-parameter grid searched for the random forest.
parametros = dict(
    criterion=['gini', 'entropy'],
    n_estimators=[10, 40, 100, 150],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 5, 10],
)

In [0]:
# Exhaustive search over the random-forest grid with GridSearchCV's default
# cross-validation. The estimator is seeded (as the standalone forest earlier
# in this notebook is) so that the reported best parameters and score are
# reproducible between runs; an unseeded forest bootstraps differently on
# every run.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=parametros,
)
grid_search.fit(X_cancer, y_cancer)

# Best parameter combination and its mean cross-validated score.
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

### kNN

In [0]:
# Hyper-parameter grid searched for k-nearest neighbours:
# neighbourhood size and the Minkowski power parameter p.
parametros = dict(
    n_neighbors=[3, 5, 10, 20],
    p=[1, 2],
)

In [0]:
# Exhaustive search over the kNN grid with GridSearchCV's default cross-validation.
grid_search = GridSearchCV(KNeighborsClassifier(), parametros)
grid_search.fit(X_cancer, y_cancer)

# Keep the winning combination and its mean cross-validated score, then show both.
melhores_parametros, melhor_resultado = (
    grid_search.best_params_,
    grid_search.best_score_,
)
print(melhores_parametros)
print(melhor_resultado)

### SVM

In [0]:
# Hyper-parameter grid searched for the SVM:
# stopping tolerance, regularisation strength C and kernel type.
parametros = dict(
    tol=[1e-3, 1e-4, 1e-5],
    C=[1.0, 1.5, 2.0],
    kernel=['rbf', 'linear', 'poly', 'sigmoid'],
)

In [0]:
# Exhaustive search over the SVM grid with GridSearchCV's default cross-validation.
grid_search = GridSearchCV(SVC(), parametros)
grid_search.fit(X_cancer, y_cancer)

# Keep the winning combination and its mean cross-validated score, then show both.
melhores_parametros, melhor_resultado = (
    grid_search.best_params_,
    grid_search.best_score_,
)
print(melhores_parametros)
print(melhor_resultado)

### Redes neurais

In [0]:
# Hyper-parameter grid searched for the neural network.
# The hyperbolic-tangent activation is spelled 'tanh'; the previous 'tahn'
# is not an activation MLPClassifier accepts, so every candidate using it
# failed instead of being evaluated.
parametros = {'activation': ['relu', 'logistic', 'tanh'],
              'solver': ['adam', 'sgd'],
              'batch_size': [10, 56]}

In [0]:
# Exhaustive search over the neural-network grid with GridSearchCV's default
# cross-validation.
grid_search = GridSearchCV(MLPClassifier(), parametros)
grid_search.fit(X_cancer, y_cancer)

# Keep the winning combination and its mean cross-validated score.
melhores_parametros, melhor_resultado = (
    grid_search.best_params_,
    grid_search.best_score_,
)

In [0]:
# Show the best parameter combination and its score, one per line.
print(melhores_parametros, melhor_resultado, sep='\n')