# Objective: Linear Classification for Breast Cancer
In this notebook, I build a linear classification model with PyTorch to distinguish malignant from benign breast cancer cases.

## Import Libraries

In [1]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

## Load Data
This section loads the dataset and then inspects it to decide how to build a model from it.

In [2]:
from sklearn.datasets import load_breast_cancer

# load the breast cancer dataset that ships with scikit-learn
data = load_breast_cancer()

# check the type of the returned object (shown as the cell output)
type(data)

sklearn.utils._bunch.Bunch

In [3]:
# inspect the dataset: load_breast_cancer() returned a Bunch object,
# a dictionary-like container whose keys can also be read as attributes
# (e.g. data.data, data.target)
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [4]:
# input data X: shape is (number of samples, number of features)
data.data.shape

(569, 30)

In [5]:
# target y: one binary class label (0 or 1) per sample
# NOTE(review): this displays the full label array; a slice or class counts
# would keep the output shorter
data.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [6]:
# human-readable class names
# presumably indexed by label value (0 -> 'malignant', 1 -> 'benign') — confirm against data.DESCR
data.target_names

array(['malignant', 'benign'], dtype='<U9')

In [7]:
# one label per sample: the length should equal the first dimension of data.data.shape
data.target.shape

(569,)

In [8]:
# names of the input features
data.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

## Preprocess Data
This section splits the data into a training set and a test set, and then normalizes the features.

In [9]:
# import library for splitting
from sklearn.model_selection import train_test_split

# split the data into training and test sets
# training data: 70%
# test data: 30%
# random_state pins the shuffle so that "Restart Kernel -> Run All" always
# produces the same split (without it every run trains and evaluates on
# different samples and the reported losses cannot be reproduced)
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.3,
    random_state=42,
)

# N: number of training samples, D: number of input features
N, D = X_train.shape

In [10]:
# import library for normalization
from sklearn.preprocessing import StandardScaler

# standardize the features for the model:
# the scaler statistics are learned from the training set only and the same
# fitted scaler is then applied to both sets, so the test set never
# influences the preprocessing
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# more detail about the difference between fit_transform vs transform
# ref: https://towardsdatascience.com/what-and-why-behind-fit-transform-vs-transform-in-scikit-learn-78f915cf96fe

In [11]:
# convert data into float32 torch tensors
# torch.as_tensor accepts NumPy arrays as well as tensors, so this cell can be
# re-run safely; the previous `.astype(...)` form raised AttributeError on a
# second run because the names then already held tensors
X_train = torch.as_tensor(X_train, dtype=torch.float32)
X_test = torch.as_tensor(X_test, dtype=torch.float32)
# targets are reshaped to column vectors of shape (n_samples, 1)
y_train = torch.as_tensor(y_train, dtype=torch.float32).reshape(-1, 1)
y_test = torch.as_tensor(y_test, dtype=torch.float32).reshape(-1, 1)

## Build Model
This section builds a linear classification model that distinguishes positive and negative cases of breast cancer, and chooses a loss function and an optimizer for it.

In [12]:
# seed PyTorch's global RNG so the random weight initialization below (and
# therefore the training curve) is identical on every "Restart & Run All"
torch.manual_seed(42)

# build the model: a single linear layer mapping the D input features to one
# score, followed by a sigmoid that squashes that score into (0, 1)
model = nn.Sequential(
    nn.Linear(D, 1),
    nn.Sigmoid()
)
# activation function: sigmoid

In [13]:
# optimizer: Adam over all model parameters, with its default hyperparameters
optimizer = torch.optim.Adam(params=model.parameters())
# loss function: binary cross-entropy, applied to the sigmoid outputs of the model
criterion = nn.BCELoss()

## Training
This covers how to train the linear classification model with the input data that we normalized.

In [14]:
# the number of epochs
epochs = 1000

# losses will be stored to plot the results at the end
train_losses = np.zeros(epochs)
test_losses = np.zeros(epochs)

for epoch in range(epochs):
    # zero the parameter gradients so they do not accumulate across epochs
    optimizer.zero_grad()

    # forward pass on the whole training set (one full-batch step per epoch)
    outputs_train = model(X_train)
    loss_train = criterion(outputs_train, y_train)

    # backward and optimize
    loss_train.backward()
    optimizer.step()

    # get test loss (measured after this epoch's parameter update)
    # no_grad: this is evaluation only, so skip building an autograd graph
    # that would never be back-propagated through
    with torch.no_grad():
        outputs_test = model(X_test)
        loss_test = criterion(outputs_test, y_test)

    # save the losses
    train_losses[epoch] = loss_train.item()
    test_losses[epoch] = loss_test.item()

    # print out the progress every 50 epochs
    if (epoch + 1) % 50 == 0:
        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {loss_train.item():.4f}, Test Loss: {loss_test.item():.4f}')

Epoch 50/1000, Train Loss: 0.3513, Test Loss: 0.3890
Epoch 100/1000, Train Loss: 0.2611, Test Loss: 0.3009
Epoch 150/1000, Train Loss: 0.2117, Test Loss: 0.2559
Epoch 200/1000, Train Loss: 0.1798, Test Loss: 0.2282
Epoch 250/1000, Train Loss: 0.1575, Test Loss: 0.2092
Epoch 300/1000, Train Loss: 0.1410, Test Loss: 0.1953
Epoch 350/1000, Train Loss: 0.1284, Test Loss: 0.1848
Epoch 400/1000, Train Loss: 0.1185, Test Loss: 0.1765
Epoch 450/1000, Train Loss: 0.1105, Test Loss: 0.1698
Epoch 500/1000, Train Loss: 0.1038, Test Loss: 0.1643
Epoch 550/1000, Train Loss: 0.0982, Test Loss: 0.1596
Epoch 600/1000, Train Loss: 0.0934, Test Loss: 0.1557
Epoch 650/1000, Train Loss: 0.0893, Test Loss: 0.1523
Epoch 700/1000, Train Loss: 0.0856, Test Loss: 0.1494
Epoch 750/1000, Train Loss: 0.0824, Test Loss: 0.1468
Epoch 800/1000, Train Loss: 0.0795, Test Loss: 0.1445
Epoch 850/1000, Train Loss: 0.0769, Test Loss: 0.1424
Epoch 900/1000, Train Loss: 0.0745, Test Loss: 0.1406
Epoch 950/1000, Train Loss: 0