In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from prettytable import PrettyTable

## Data set

In [2]:
# Load the breast cancer data set from the UCI repository
url_data  = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"

# Column labels are supplied here rather than read from the file: two leading
# columns, then the same ten measurements repeated once per statistic.
measurements = ['Radius', 'Texture', 'Perimeter', 'Area', 'Smoothness',
                'Compactness', 'Concavity', 'Concave Points', 'Symmetry',
                'Fractal Dimension']
statistics = ['Mean', 'SE', 'Worst']

names = ['ID', 'Diagnosis'] + ['%s %s' % (measurement, statistic)
                               for statistic in statistics
                               for measurement in measurements]

dataset = pd.read_csv(url_data, names=names)
dataset.head()

Unnamed: 0,ID,Diagnosis,Radius Mean,Texture Mean,Perimeter Mean,Area Mean,Smoothness Mean,Compactness Mean,Concavity Mean,Concave Points Mean,...,Radius Worst,Texture Worst,Perimeter Worst,Area Worst,Smoothness Worst,Compactness Worst,Concavity Worst,Concave Points Worst,Symmetry Worst,Fractal Dimension Worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
# Feature matrix: every column after the first two (ID and Diagnosis)
X = dataset.iloc[:, 2:].to_numpy()

# Target vector: encode the Diagnosis column as 1 for "M" and 0 for anything else
diagnosis = dataset.iloc[:, 1].to_numpy()
y = np.where(diagnosis == "M", 1, 0)

## Experimento com PCA

In [4]:
# Project the feature matrix onto its first 10 principal components.
# NOTE(review): PCA is fitted here on the features exactly as loaded (no
# scaling step) and on the full data set, before the train/test split done
# further down. Consider standardizing the inputs and fitting on the training
# portion only; confirm whether the current setup is intentional.
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X)

# Preview the projected data. The column labels are derived from the actual
# number of components so they cannot fall out of sync with n_components.
component_names = ['Componente %d' % i for i in range(1, X_pca.shape[1] + 1)]
pd.DataFrame(X_pca, columns=component_names).head()

Unnamed: 0,Componente 1,Componente 2,Componente 3,Componente 4,Componente 5,Componente 6,Componente 7,Componente 8,Componente 9,Componente 10
0,1160.142574,-293.917544,48.578398,-8.711975,32.000486,1.265415,0.931337,0.148167,0.745463,0.589359
1,1269.122443,15.630182,-35.394534,17.861283,-4.334874,-0.225872,-0.046037,0.200804,-0.485828,-0.084035
2,995.793889,39.156743,-1.709753,4.19934,-0.466529,-2.652811,-0.779745,-0.274026,-0.173874,-0.186994
3,-407.180803,-67.38032,8.672848,-11.759867,7.115461,1.299436,-1.267304,-0.060555,-0.330639,-0.144155
4,930.34118,189.340742,1.374801,8.499183,7.613289,1.02116,-0.335522,0.289109,0.036087,-0.138502


In [5]:
# Hold out 20% of the PCA-projected samples for evaluation (fixed seed).
X_train_PCA, X_test_PCA, y_train_PCA, y_test_PCA = train_test_split(X_pca, y, test_size=0.2, random_state=0)

# perf_counter() is a monotonic, high-resolution clock intended for timing
# short intervals; time.time() is wall-clock time and can be too coarse here.
startPCA = time.perf_counter()

# Seed the tree: scikit-learn permutes the candidate features at each split,
# so an unseeded tree can give different scores on every run.
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_train_PCA, y_train_PCA)
y_pred_PCA = classifier.predict(X_test_PCA)

endPCA = time.perf_counter()

accuracy_PCA = accuracy_score(y_test_PCA, y_pred_PCA)
f1_score_PCA = f1_score(y_test_PCA, y_pred_PCA, average='macro')
tempo_PCA    = endPCA - startPCA  # elapsed fit + predict time, in seconds

## Experimento sem PCA

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

start = time.time()

classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

end = time.time()

accuracy = accuracy_score(y_test, y_pred)
f1_score = f1_score(y_test, y_pred, average='macro')
tempo    = end - start

## Comparação

In [7]:
table = PrettyTable(['','Components','Classifier' ,'Accuracy', 'F1 Macro', 'Time(s)'])
table.add_row(['Com PCA', X_pca.shape[1], 'Decision Tree', "%.4f" % accuracy_PCA, "%.4f" % f1_score_PCA, "%.4f" % tempo_PCA])
table.add_row(['Sem PCA', X.shape[1], 'Decision Tree', "%.4f" % accuracy, "%.4f" % f1_score, "%.4f" % tempo])
print(table)

+---------+------------+---------------+----------+----------+---------+
|         | Components |   Classifier  | Accuracy | F1 Macro | Time(s) |
+---------+------------+---------------+----------+----------+---------+
| Com PCA |     10     | Decision Tree |  0.9474  |  0.9457  |  0.0030 |
| Sem PCA |     30     | Decision Tree |  0.9035  |  0.9018  |  0.0080 |
+---------+------------+---------------+----------+----------+---------+
