In [173]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import manifold

# If you'd like to try this lab with PCA instead of Isomap as the dimensionality reduction technique:
Test_PCA = False


def plotDecisionBoundary(model, X, y):
  print "Plotting..."
  import matplotlib.pyplot as plt
  import matplotlib
  matplotlib.style.use('ggplot') # Look Pretty

  fig = plt.figure()
  ax = fig.add_subplot(111)

  padding = 0.1
  resolution = 0.1

  from matplotlib.colors import ListedColormap
  cmap_light = ListedColormap(['#AAFFAA', '#AAAAFF'])
  cmap_bold  = ListedColormap(['#00AA00', '#0000AA'])
  
  # Calculate the boundaries
  x_min, x_max = X[:, 0].min(), X[:, 0].max()
  y_min, y_max = X[:, 1].min(), X[:, 1].max()
  x_range = x_max - x_min
  y_range = y_max - y_min
  x_min -= x_range * padding
  y_min -= y_range * padding
  x_max += x_range * padding
  y_max += y_range * padding

  # Create a 2D Grid Matrix. The values stored in the matrix are the predictions of the class at said location
  import numpy as np
  xx, yy = np.meshgrid(np.arange(x_min, x_max, resolution),
                       np.arange(y_min, y_max, resolution))

  # What class does the classifier say?
  Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
  Z = Z.reshape(xx.shape)

  # Plot the contour map
  plt.contourf(xx, yy, Z, cmap=cmap_light)
  plt.axis('tight')

  # Plot our original points as well...
  for label in np.unique(y):
    #label: (2 for benign, 4 for malignant)
    c = 0 if label==2 else 1
    indices = np.where(y == label)
    plt.scatter(X[indices, 0], X[indices, 1], c=cmap_bold(c), alpha=0.8)

  p = model.get_params()
  plt.title('K = ' + str(p['n_neighbors']))
  plt.show()


In [174]:
# Load in the dataset into a variable 'X'
X = pd.read_csv('breast_cancer.csv', names = ['sample', 'thickness', 'size', 'shape', 'adhesion', 'epithelial', 'nuclei', 'chromatin', 'nucleoli', 'mitoses', 'status'])
X.head()

Unnamed: 0,sample,thickness,size,shape,adhesion,epithelial,nuclei,chromatin,nucleoli,mitoses,status
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [175]:
X.dtypes

sample         int64
thickness      int64
size           int64
shape          int64
adhesion       int64
epithelial     int64
nuclei        object
chromatin      int64
nucleoli       int64
mitoses        int64
status         int64
dtype: object

In [176]:
X.nuclei = pd.to_numeric(X.nuclei, errors = 'coerce')
X.isnull().sum()

sample         0
thickness      0
size           0
shape          0
adhesion       0
epithelial     0
nuclei        16
chromatin      0
nucleoli       0
mitoses        0
status         0
dtype: int64

In [177]:
X.nuclei.unique()

array([  1.,  10.,   2.,   4.,   3.,   9.,   7.,  nan,   5.,   8.,   6.])

In [178]:
X = X.fillna(method = 'backfill', axis = 1)

In [179]:
# TODO: Copy out the status column into a slice, then drop it from the main dataframe. You can also drop the sample column, since that
# doesn't provide us with any machine learning power.
y = X.status
X.drop(labels = ['sample','status'], axis = 1, inplace = True)

In [180]:
# TODO: Experiment with the basic SKLearn preprocessing scalers. We know that the features consist of different units mixed
# in together, so it's reasonable to assume feature scaling is necessary. Print out a description of the dataset, post transformation.
#

from sklearn import preprocessing
T = preprocessing.RobustScaler(X)

In [181]:
model = None
if Test_PCA:
  print "Computing 2D Principal Components"
  #
  # TODO: Implement PCA here. save your model into the variable 'model1'.
  # You should reduce down to two dimensions.
  #
  # .. your code here ..
  model = PCA(n_components = 2)   

else:
  print "Computing 2D Isomap Manifold"
  #
  # TODO: Implement Isomap here. save your model into the variable 'model1'
  # Experiment with K values from 5-10.
  # You should reduce down to two dimensions.
  #
  # .. your code here ..
  model = manifold.Isomap(n_neighbors = 4, n_components = 2)

Computing 2D Isomap Manifold


In [182]:
from sklearn.cross_validation import train_test_split
data_train, data_test, label_train, label_test = train_test_split(X, y, test_size = 0.33, random_state = 7)


In [183]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 4, weights = 'distance')
knn.fit(data_train, label_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='distance')

In [184]:
print knn.score(data_test, label_test)

0.95670995671


In [185]:
plotDecisionBoundary(knn, data_test, label_test)


Plotting...


TypeError: unhashable type