# Binary Classification

## Load Dataset from sklearn

In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [9]:
type(cancer)

sklearn.utils._bunch.Bunch

In [7]:
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['class'] = cancer.target

df.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,class
564,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,...,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115,0
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,0
566,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,0.05648,...,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782,0
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124,0
568,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,0.05884,...,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039,1


In [15]:
for i, c in enumerate(list(df.columns)):
    print(i, c)

0 mean radius
1 mean texture
2 mean perimeter
3 mean area
4 mean smoothness
5 mean compactness
6 mean concavity
7 mean concave points
8 mean symmetry
9 mean fractal dimension
10 radius error
11 texture error
12 perimeter error
13 area error
14 smoothness error
15 compactness error
16 concavity error
17 concave points error
18 symmetry error
19 fractal dimension error
20 worst radius
21 worst texture
22 worst perimeter
23 worst area
24 worst smoothness
25 worst compactness
26 worst concavity
27 worst concave points
28 worst symmetry
29 worst fractal dimension
30 class


## Convert to PyTorch Tensor

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [11]:
data = torch.from_numpy(df.values).float() #torch.from_numpy(data) 까지만 하면 data type이 double이 된다. float()로 변환을 하면 메모리 효율을 올릴 수 있겠죠?
data.shape

torch.Size([569, 31])

In [16]:
x = data[:, :10]
y = data[:, -1:]

print(x.shape, y.shape)

torch.Size([569, 10]) torch.Size([569, 1])


In [17]:
# Train / Valid / Test
ratio = [.6, .2, .2]

In [19]:
train_cnt = int(data.size(0) * ratio[0])
valid_cnt = int(data.size(0) * ratio[1])
test_cnt = int(data.size(0)) - train_cnt - valid_cnt
cnts = [train_cnt, valid_cnt, test_cnt]

print(train_cnt, valid_cnt, test_cnt)

341 113 115


In [20]:
indices =  torch.randperm(data.size(0))

x = torch.index_select(x, dim=0, index=indices)
y = torch.index_select(y, dim=0, index=indices)

x = x.split(cnts, dim=0)
y = y.split(cnts, dim=0)

for x_i, y_i in zip(x, y):
    print(x_i.size(), y_i.size())

torch.Size([341, 10]) torch.Size([341, 1])
torch.Size([113, 10]) torch.Size([113, 1])
torch.Size([115, 10]) torch.Size([115, 1])


### You don't need to shuffle them, if you have fixed train, valid, test set.
### e.g) MNIST

# Set Hyper Parameters

In [21]:
n_epochs = 10000
batch_size = 128
print_interval = 500
early_stop = 100

## Get DataLoaders

In [22]:
from torch.utils.data import Dataset, DataLoader

In [24]:
class CustomDataset(Dataset):
    # 3개의 함수만 over ride 해도 충분히 쓸 수 있다.
    
    def __init__(self, data, labels):
        # 데이터 읽어오기
        self.data = data
        self.labels = labels
        
        super().__init__()
    
    def __len__(self):
        # 데이터 크기 반환
        return len(self.data)
    
    def __getitem__(self, idx):
        # 전처리 및 minibatch를 위한 sample을 반환함
        # 실제 미니배치가 만들어지는 부분
        return self.data[idx], self.labels[idx]

In [25]:
# DataLoader 실 사용

train_loader = DataLoader(
    dataset = CustomDataset(x[0], y[0]),
    batch_size=batch_size,
    shuffle=True # Allow shuffling only for training set. # 여기는 꼭 True!
)

valid_loader = DataLoader(
    dataset = CustomDataset(x[1], y[1]),
    batch_size = batch_size,
    shuffle=False, # 여긴 왜 False? 
)

test_loader = DataLoader(
    dataset = CustomDataset(x[2], y[2]),
    batch_size=batch_size,
    shuffle=False, # 검증을 진행할 때 같은 테스트를 볼 수 있다는 부분에서 False를 추천
)

print("Train %d / Valid %d / Test %d sample." %(
    len(train_loader.dataset),
    len(valid_loader.dataset),
    len(test_loader.dataset),
))

Train 341 / Valid 113 / Test 115 sample.


# Build Model & Optimizer

In [26]:
model = nn.Sequential(
    nn.Linear(x[0].size(-1), 6),
    nn.LeakyReLU(),
    nn.Linear(6, 5),
    nn.LeakyReLU(),
    nn.Linear(5, 4),
    nn.LeakyReLU(),
    nn.Linear(4, 3),
    nn.LeakyReLU(),
    nn.Linear(3, y[0].size(-1)),
    nn.Sigmoid(),
)

model
    

Sequential(
  (0): Linear(in_features=10, out_features=6, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=6, out_features=5, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=5, out_features=4, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=4, out_features=3, bias=True)
  (7): LeakyReLU(negative_slope=0.01)
  (8): Linear(in_features=3, out_features=1, bias=True)
  (9): Sigmoid()
)

In [27]:
optimizer = optim.Adam(model.parameters())

# Train

In [28]:
from copy import deepcopy

lowest_loss = np.inf
best_model = None
lowest_epoch = np.inf

In [31]:
train_history, valid_history = [], []

for i in range(n_epochs):
    model.train() 
    
    # We can remove following code block
    # --------------------------------------------------
    # indices = torch.randperm(x[0].size(0))
    # x_ = torch.index_select(x[0], dim=0, index=indices)
    # y_ = torch.index_select(y[0], dim=0, index=indices)
    #
    # x_ = x_.split(batch_size, dim=0)
    # y_ = y_.split(batch_size, dim=0)
    # --------------------------------------------------
    
    train_loss, valid_loss = 0, 0
    y_hat = []
    
    for x_i, y_i in train_loader: # 과거에는 enumerate(x_, y_)를 했다면 이제는 train_loader만 하면 됨. 자동으로 mini batch를 만들어주기 때문이겠지.
        y_hat_i = model(x_i)
        loss = F.binary_cross_entropy(y_hat_i, y_i)
        
        optimizer.zero_grad()
        loss.backward()
        
        optimizer.step()
        train_loss += float(loss) # This is very important to prevent memory leak.
        
    train_loss = train_loss / len(train_loader)
    
    model.eval()
    with torch.no_grad():
        valid_loss = 0
        
        for x_i, y_i in valid_loader:
            y_hat_i = model(x_i)
            loss = F.binary_cross_entropy(y_hat_i, y_i)
            
            valid_loss += float(loss)
            
            y_hat += [y_hat_i]
            
    valid_loss = valid_loss / len(valid_loader)
    
    train_history += [train_loss]
    valid_history += [valid_loss]
    
    if (i+1) % print_interval==0:
        print(f"{i+1}: {train_loss} / {valid_loss} ")

500: 0.25979559620221454 / 0.2952679693698883 
1000: 0.22496080895264944 / 0.2588886618614197 
1500: 0.16895892471075058 / 0.20854058861732483 
2000: 0.1526962568362554 / 0.19410179555416107 
2500: 0.1566708485285441 / 0.19755379855632782 
3000: 0.15390855570634207 / 0.2000281810760498 
3500: 0.1564700777331988 / 0.19717812538146973 
4000: 0.15536450843016306 / 0.19199088215827942 
4500: 0.1530818591515223 / 0.19060355424880981 
5000: 0.14586085081100464 / 0.19241183996200562 
5500: 0.1469742183883985 / 0.19196219742298126 
6000: 0.14956123133500418 / 0.1992666870355606 
6500: 0.1642073392868042 / 0.18533983826637268 
7000: 0.1514857957760493 / 0.19731342792510986 
7500: 0.15329907337824503 / 0.2029128223657608 
8000: 0.14716694255669913 / 0.19569358229637146 
8500: 0.14518898725509644 / 0.18503481149673462 
9000: 0.1545379931728045 / 0.1821019947528839 
9500: 0.14288120716810226 / 0.18191632628440857 
10000: 0.14364910622437796 / 0.17703023552894592 
