In [1]:
import pandas as pd                                     # data frame
from sklearn.preprocessing   import MinMaxScaler        # normalizing our data
from sklearn.impute          import SimpleImputer       # replace NaN values with the column mean
from sklearn.model_selection import train_test_split    # splitting our data into training/testing
from sklearn.linear_model    import Perceptron          # Perceptron

1. Grab our data from the internet

In [2]:
url = 'https://github.com/vt-ai-ml/fall2019-meetings/raw/master/data/cancer.csv'
data = pd.read_csv(url)

data.drop(['Sample code'],axis = 1, inplace = True)    # drop the 'Sample code' column
data.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


2. Do some preprocessing
    * replacing bad values (eg. '?')
    * replacing missing values with the mean
    * scaling our data to [0, 1]

In [3]:
print(data.isin(['?']).any())
data.replace('?', 0, inplace=True)   # replace '?' with 0

Clump Thickness             False
Uniformity of Cell Size     False
Uniformity of Cell Shape    False
Marginal Adhesion           False
Single Epithelial           False
Bare Nuclei                  True
Bland Chromatin             False
Normal Nucleoli             False
Mitoses                     False
Class                       False
dtype: bool


In [4]:
# Convert the DataFrame object into NumPy array otherwise you will not be able to impute the mean
values = data.values

# replace missing values with mean
imputer = SimpleImputer()
imputedData = imputer.fit_transform(values)

# normalize our data so our values are now between [0, 1]
scaler = MinMaxScaler(feature_range=(0, 1))
normalizedData = scaler.fit_transform(imputedData)

print(normalizedData[0])

[0.44444444 0.         0.         0.         0.11111111 0.1
 0.22222222 0.         0.         0.        ]


3. Prepare our data for training and save data for testing

In [5]:
# Get our input values (X) and output values (Y)
X = normalizedData[:,0:9]
Y = normalizedData[:,9]

# split data into training data & testing data
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=0)
print("# of training points:", x_train.size, "\n# of testing points:", x_test.size)

# of training points: 5346 
# of testing points: 945


4. Create our Perceptron model and test it's accuracy

In [6]:
perceptron = Perceptron()
perceptron.fit(x_train, y_train)
print(perceptron.score(x_test, y_test))

0.9619047619047619


Let's compare to other models
* Decision Tree
* Logistic Regression
* K Nearest Neighbor

In [7]:
from sklearn.tree         import DecisionTreeClassifier    # decision trees
from sklearn.linear_model import LogisticRegression        # logistic regression
from sklearn.neighbors    import KNeighborsClassifier      # logistic regression

tree    = DecisionTreeClassifier(random_state = 0)
log_reg = LogisticRegression(solver = 'newton-cg')
knn     = KNeighborsClassifier(n_neighbors = 21)

tree.fit(x_train, y_train)
log_reg.fit(x_train, y_train)
knn.fit(x_train, y_train)

print('Decision Tree score:\t  ', tree.score(x_test, y_test))
print('Logistic Regression score:', log_reg.score(x_test, y_test))
print('K Nearest Neighbor score: ', knn.score(x_test, y_test))

Decision Tree score:	   0.9523809523809523
Logistic Regression score: 0.9714285714285714
K Nearest Neighbor score:  0.9619047619047619
