In [1]:
!pip install tensorflow-gpu==2.0.0-rc1 --quiet

[K     |████████████████████████████████| 380.5MB 44kB/s 
[K     |████████████████████████████████| 51kB 7.9MB/s 
[K     |████████████████████████████████| 4.3MB 44.2MB/s 
[K     |████████████████████████████████| 501kB 56.4MB/s 
[?25h

## Keras and TensorFlow

In this assignment, we will learn about Keras and TensorFlow. We will create a neural network and measure the model's performance.

In [2]:
import numpy as np
import pandas as pd


In [3]:
cancer = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/breast_cancer.csv')

In [4]:
print(cancer.info)
cancer.head()

<bound method DataFrame.info of            id diagnosis  ...  symmetry_worst  fractal_dimension_worst
0      842302         M  ...          0.4601                  0.11890
1      842517         M  ...          0.2750                  0.08902
2    84300903         M  ...          0.3613                  0.08758
3    84348301         M  ...          0.6638                  0.17300
4    84358402         M  ...          0.2364                  0.07678
..        ...       ...  ...             ...                      ...
564    926424         M  ...          0.2060                  0.07115
565    926682         M  ...          0.2572                  0.06637
566    926954         M  ...          0.2218                  0.07820
567    927241         M  ...          0.4087                  0.12400
568     92751         B  ...          0.2871                  0.07039

[569 rows x 32 columns]>


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


We'll start with some data cleaning. First remove the id column since we have no use for it. Then check for missing data and remove all rows containing missing data.

In [5]:
# Answer below:
cancer.drop(columns='id', inplace=True)
cancer.dropna(inplace=True)
cancer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

The diagnosis columnn is our target variable. How many possible values are in this variable?

In [6]:
# Answer below:
nb_classes = cancer.diagnosis.value_counts().count()
nb_classes
# Two possible values for the diagnosis variables 'B': Benign or 'M': Malignant

2

In [7]:
diag = cancer.diagnosis.values

Create one or multiple dummy variables for this column and store this encoded data in a numpy array called `target`

In [8]:
# Answer below:
from sklearn.preprocessing import OneHotEncoder

target = OneHotEncoder(sparse=False).fit_transform(diag.reshape(-1,1))

When looking at the columm names, you may have noticed that for each attribute we have a mean of the attribute, a standard error, and a worst value. Let's create a list of column names for only the means columns. Use this list to create a new dataframe containing only the means columns.

In [9]:
cancer.columns

Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [10]:
# Answer below:
cols = ['radius_mean','texture_mean','perimeter_mean','area_mean',
        'smoothness_mean','compactness_mean','concavity_mean', 
        'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean']
df = cancer[cols]

Split the data into train and test samples. The test sample should contain 20% of the data.

In [11]:
# Answer below
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=.2)


In [12]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(455, 10)
(114, 10)
(455, 2)
(114, 2)


Now we'll use the minmax scaler to ensure that all columns have values between zero and 1. Apply this transformations for all columns to be used in the prediction.

In [13]:
# Answer below:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0,1))
xtrain_scale = scaler.fit_transform(X_train)
ytrain_scale = scaler.fit_transform(y_train)
xtest_scale = scaler.fit_transform(X_test)
ytest_scale = scaler.fit_transform(y_test)

Now we can proceed with our model. In the cell below, import the `Sequential`, and `Dense` functions.

In [14]:
# Answer below:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


create a model that takes the training data and predicts the target variable. The model will have 3 layers. The input layer should be of unit size 12 and the hidden layer will be of size 10. The output layer size will be determined by the size of the target variable. Use the sigmoid activation function for the last layer

In [15]:
# Answer below
model = Sequential()
model.add(Dense(132, input_dim=xtrain_scale.shape[1], activation='relu'))
model.add(Dense(130, activation='relu'))
model.add(Dense(nb_classes, activation='sigmoid'))


In [16]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 132)               1452      
_________________________________________________________________
dense_1 (Dense)              (None, 130)               17290     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 262       
Total params: 19,004
Trainable params: 19,004
Non-trainable params: 0
_________________________________________________________________


Compile and fit the model. Use the adam optimizer and the binary crossentropy loss function. Train the model for 200 epochs with a batch size of 80. Make sure touse the test data into the model as well.

In [17]:
# Answer below:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

his = model.fit(xtrain_scale, ytrain_scale, 
                validation_data=(xtest_scale, ytest_scale), 
                batch_size=80, epochs=200, verbose=1)

Train on 455 samples, validate on 114 samples
Epoch 1/200
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/20

Report the accuracy of the test sample.

In [18]:
scoredf = pd.DataFrame(his.history)
#scoredf.head()

In [19]:
# Answer below:
score = model.evaluate(xtest_scale, ytest_scale, verbose=0)
print('Test loss     : ', score[0])
print('Test accuracy : ', score[1])


Test loss     :  0.10877920412703564
Test accuracy :  0.95614034


In [20]:
print('Avg. Test loss     : ', scoredf.val_loss.mean())
print('Avg. Test accuracy : ', scoredf.val_accuracy.mean())


Avg. Test loss     :  0.13525318755327087
Avg. Test accuracy :  0.9518640345335007
