# Loading the data


In [45]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, model_selection, neighbors, svm
import warnings
warnings.filterwarnings('ignore')

### 7. Attribute Information: (class attribute has been moved to last column)

   ###  Attribute                     Domain
   -- -----------------------------------------
   1. Sample code number            id number
   2. Clump Thickness               1 - 10
   3. Uniformity of Cell Size       1 - 10
   4. Uniformity of Cell Shape      1 - 10
   5. Marginal Adhesion             1 - 10
   6. Single Epithelial Cell Size   1 - 10
   7. Bare Nuclei                   1 - 10
   8. Bland Chromatin               1 - 10
   9. Normal Nucleoli               1 - 10
  10. Mitoses                       1 - 10
  11. Class:                        (2 for benign, 4 for malignant)

### 8. Missing attribute values: 16

   There are 16 instances in Groups 1 to 6 that contain a single missing 
   (i.e., unavailable) attribute value, now denoted by "?".  

### 9. Class distribution:
 
   Benign: 458 (65.5%)
   Malignant: 241 (34.5%)

In [46]:
df = pd.read_csv('data/breast-cancer-wisconsin.data')
df.head()

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_ephith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


# Cleaning the data

In [47]:
df.replace('?', -99999, inplace=True)
df.drop(['id'], 1, inplace=True)

In [48]:
x = np.array(df.drop(['class'], 1))
y = np.array(df['class'])

# Spliting the Train data and Test data

In [49]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=0.2)

In [50]:
x_train

array([[1, 1, 1, ..., 2, 1, 1],
       [8, 10, 10, ..., 7, 8, 1],
       [5, 1, 2, ..., 3, 1, 1],
       ...,
       [4, 1, 1, ..., 2, 1, 1],
       [10, 10, 10, ..., 7, 10, 1],
       [4, 10, 4, ..., 9, 10, 1]], dtype=object)

In [51]:
y_train

array([2, 4, 2, 2, 2, 4, 2, 2, 2, 4, 4, 2, 4, 2, 2, 4, 2, 2, 4, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       4, 2, 4, 2, 2, 4, 2, 4, 4, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2,
       4, 2, 2, 2, 4, 2, 4, 2, 4, 2, 4, 2, 2, 4, 4, 2, 4, 4, 2, 2, 2, 2,
       2, 2, 4, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 4, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 2, 2, 2, 2,
       4, 2, 2, 2, 2, 4, 2, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 2, 2, 2, 2, 2,
       4, 2, 2, 2, 4, 2, 4, 2, 4, 4, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 4,
       2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 2, 2, 2, 2, 2, 4, 2, 4, 2, 2, 2, 2,
       2, 4, 2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 4, 4, 4, 2, 2,
       4, 4, 4, 2, 2, 2, 2, 4, 2, 2, 4, 2, 4, 2, 4, 2, 2, 2, 4, 4, 2, 2,
       2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 4, 4, 4, 4, 4, 2, 2, 4,
       2, 4, 2, 4, 2, 4, 2, 2, 4, 2, 4, 2, 4, 4, 2, 4, 2, 2, 2, 2, 2, 2,
       4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 2, 2, 2, 4, 2,

# Creating and training the classifier

In [52]:
clf = svm.SVC()
clf.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

# Checking model accuracy

In [53]:
accuracy = clf.score(x_test, y_test)
accuracy

0.9714285714285714

# Tests

In [54]:
# Generating test data

In [55]:
example_measures = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 1], [4, 2, 1, 2, 2, 2, 3, 2, 1]])
example_measures = example_measures.reshape(len(example_measures), -1)

In [56]:
tipos_de_cancer = {4 : "maligo", 2 : "benigno"}

In [57]:
prediction = clf.predict(example_measures)
print('Tipo de cancer é: {}'.format(tipos_de_cancer[prediction.item(0)]))

Tipo de cancer é: benigno
