# Detecção de câncer de pele utilizando imagens 

**Load Libraries**

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
import time
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from google.colab import files
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.datasets import load_digits
from sklearn.neural_network import MLPClassifier

**Load Data**

In [0]:
# Load the image feature table and the diagnosis labels.
# pandas is already imported in the notebook's import cell, so nothing is
# re-imported here.

dataSkin = pd.read_csv("DadosImg.txt", sep=' ')
label = pd.read_csv("label.txt")

# Remove the 'Unnamed: 8550' column from the feature table.
# NOTE(review): presumably an empty column created by a trailing separator on
# each row of DadosImg.txt - confirm against the file.
dataSkin = dataSkin.drop('Unnamed: 8550', axis=1)

# Encode the diagnosis strings as integer category codes
# (pd.Categorical assigns -1 to missing values).
col = pd.Categorical(label["clinical_diagnosis"])

# Collapse the codes into a binary integer target: codes below 2 (including -1
# for missing values) become 0, codes >= 2 become 1. For up to three
# categories this gives the same classes as halving the codes and zeroing
# everything below 1, but the target is stored as integers and can never
# contain fractional labels (e.g. 1.5), which classifiers reject as a
# continuous target.
# NOTE(review): which diagnoses fall into each class depends on the category
# order of the strings in label.txt - confirm this is the intended grouping.
label["clinical_diagnosis"] = (col.codes >= 2).astype(int)

In [0]:
# Show the first 200 rows of the encoded label table.
label.iloc[:200]

**Split data into train and test**

In [0]:
# Split dataset into train (80%) and test (20%).
# stratify=label keeps the class proportions of `label` the same in both
# partitions, so the hold-out accuracy is measured on a test set with the same
# class balance as the training data (consistent with the stratified folds
# used for cross-validation later in the notebook).
X_train, X_test, y_train, y_test = train_test_split(dataSkin, 
                                                    label,
                                                    test_size=0.20, 
                                                    random_state=42,
                                                    stratify=label)

In [0]:
# Shared settings used by the cross-validation / grid-search cells.
seed, num_folds = 42, 10  # random seed and number of CV folds
# Metrics handed to the grid search, keyed by the name used in its results.
scoring = dict(Accuracy=make_scorer(accuracy_score))

**Training using a Pipeline and GridSearchCV**

In [0]:
# A single-step Pipeline; the grid below tunes the classifier through the
# "clf__" parameter prefix.
# random_state=seed fixes the weight initialisation, the SGD batch sampling and
# the internal early-stopping validation split, so repeated runs of this cell
# produce the same scores.
pipe = Pipeline(steps = [("clf",MLPClassifier(random_state=seed))])

# create a dictionary with the hyperparameters
search_space = [
                {"clf":[MLPClassifier(random_state=seed)],
                 "clf__hidden_layer_sizes": [(120,240),(120,480,120),(480,800,480)],
                 "clf__activation": ["logistic","relu"],
                 "clf__solver": ["sgd"],
                 "clf__max_iter": [50000],
                 "clf__early_stopping":[True],
                 "clf__n_iter_no_change":[20],
                 "clf__validation_fraction":[0.20], 
                 }
                ]

# create the cross-validation splitter
# shuffle=True is required for random_state to have any effect; recent
# scikit-learn versions raise a ValueError when random_state is set while
# shuffle is left at its default of False.
kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)

# return_train_score=True
# official documentation: "computing the scores on the training set can be
# computationally expensive and is not strictly required to
# select the parameters that yield the best generalization performance".
# refit="Accuracy" retrains the best configuration (by the "Accuracy" scorer)
# on the whole training set once the search is finished.
grid = GridSearchCV(estimator=pipe, 
                    param_grid=search_space,
                    cv=kfold,
                    scoring=scoring,
                    return_train_score=True,
                    n_jobs=-1,
                    refit="Accuracy")

# wall-clock start of the search
tmp = time.time()

# fit grid search (fit returns the fitted GridSearchCV object itself)
# NOTE(review): y_train is a DataFrame; scikit-learn may emit a
# DataConversionWarning for column-vector targets - consider passing a 1-D array.
best_model = grid.fit(X_train,y_train)

print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))

In [0]:
# Report the best cross-validated score and the hyperparameters that achieved it.
best_summary = (best_model.best_score_, best_model.best_params_)
print("Best: %f using %s" % best_summary)

In [0]:
# Tabulate the per-candidate cross-validation results of the grid search.
result = pd.DataFrame(data=best_model.cv_results_)

In [0]:
# Keep only the accuracy statistics and the hidden-layer setting of each candidate.
accuracy_columns = [
    'mean_train_Accuracy', 'std_train_Accuracy',
    'mean_test_Accuracy', 'std_test_Accuracy', 'rank_test_Accuracy',
    "param_clf__hidden_layer_sizes",
]
result_acc = result.loc[:, accuracy_columns].copy()
# Ratio of the test-fold to the train-fold standard deviation of the accuracy.
result_acc["std_ratio"] = result_acc["std_test_Accuracy"].div(result_acc["std_train_Accuracy"])
# Best-ranked candidates first.
result_acc.sort_values(by="rank_test_Accuracy")

In [0]:
# Evaluate the refitted best estimator on the held-out test set.
predict = best_model.best_estimator_.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=predict))