In [2]:
import torch
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [3]:
X, y = make_classification(n_samples=10,
                           n_features=2,
                           n_informative=2,
                           n_redundant=0,
                           n_classes=2,
                           random_state=42
                           )

In [4]:
X

array([[ 1.06833894, -0.97007347],
       [-1.14021544, -0.83879234],
       [-2.8953973 ,  1.97686236],
       [-0.72063436, -0.96059253],
       [-1.96287438, -0.99225135],
       [-0.9382051 , -0.54304815],
       [ 1.72725924, -1.18582677],
       [ 1.77736657,  1.51157598],
       [ 1.89969252,  0.83444483],
       [-0.58723065, -1.97171753]])

In [5]:
y

array([1, 0, 0, 0, 0, 1, 1, 1, 1, 0])

In [6]:
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.long)

In [7]:
y 

tensor([1, 0, 0, 0, 0, 1, 1, 1, 1, 0])

In [8]:
from torch.utils.data import DataLoader, TensorDataset, Dataset

In [9]:
# create a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
        
    def __len__(self):
        return self.features.shape[0]
    
    def __getitem__(self, index):
        return self.features[index], self.labels[index]

In [10]:
dataset = CustomDataset(X, y)

In [11]:
len(dataset)

10

In [12]:
dataset[1]

(tensor([-1.1402, -0.8388]), tensor(0))

In [13]:
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [14]:
for batch in dataloader:
    features, labels = batch
    print(features)
    print(labels)
    print('**'*25)

tensor([[-0.9382, -0.5430],
        [-0.7206, -0.9606]])
tensor([1, 0])
**************************************************
tensor([[-1.1402, -0.8388],
        [ 1.7774,  1.5116]])
tensor([0, 1])
**************************************************
tensor([[-2.8954,  1.9769],
        [-0.5872, -1.9717]])
tensor([0, 0])
**************************************************
tensor([[ 1.7273, -1.1858],
        [ 1.0683, -0.9701]])
tensor([1, 1])
**************************************************
tensor([[ 1.8997,  0.8344],
        [-1.9629, -0.9923]])
tensor([1, 0])
**************************************************


In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [17]:
df = pd.read_csv('data/data.csv')
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [18]:
df.drop(columns=['Unnamed: 32', 'id'], inplace=True, axis=1)
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [19]:
x_features = df.drop(columns=['diagnosis'], axis=1)
y_labels = df['diagnosis']

In [20]:
x_features

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [22]:
x_train, x_test, y_train, y_test = train_test_split(x_features, y_labels, test_size=0.2, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((455, 30), (114, 30), (455,), (114,))

In [27]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

In [28]:
x_train = torch.tensor(x_train, dtype=torch.float32)
x_test = torch.tensor(x_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

In [29]:
train_dataset = CustomDataset(x_train, y_train)
test_dataset = CustomDataset(x_test, y_test)

In [30]:
train_dataset[1]

(tensor([ 1.9741,  1.7330,  2.0917,  1.8520,  1.3198,  3.4263,  2.0131,  2.6650,
          2.1270,  1.5584,  0.8053, -0.8127,  0.7520,  0.8772, -0.8961,  1.1812,
          0.1836,  0.6006, -0.3177,  0.5296,  2.1733,  1.3113,  2.0816,  2.1374,
          0.7619,  3.2656,  1.9286,  2.6989,  1.8912,  2.4978]),
 tensor(1))

In [51]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [32]:
import torch.nn as nn
import torch.nn.functional as F

In [33]:
class SimpleFCN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_size, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.sigmoid(x)
        return x 

In [42]:
learning_rate = 0.001
epochs = 100
model = SimpleFCN(x_train.shape[1], 8, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss = nn.BCELoss()

In [47]:
for epoch in range(epochs):
    for batch in train_dataloader:
        features, labels = batch
        optimizer.zero_grad()
        outputs = model(features)
        labels = labels.view(-1, 1)
        #print(features.shape, labels.shape, outputs.shape)
        loss_value = loss(outputs, labels.float())
        loss_value.backward()
        optimizer.step()
    print(f'epoch: {epoch}, loss: {loss_value.item()}')

epoch: 0, loss: 0.578216016292572
epoch: 1, loss: 0.5486416816711426
epoch: 2, loss: 0.5173016786575317
epoch: 3, loss: 0.3498241603374481
epoch: 4, loss: 0.49995702505111694
epoch: 5, loss: 0.27804359793663025
epoch: 6, loss: 0.18033120036125183
epoch: 7, loss: 0.15629258751869202
epoch: 8, loss: 0.2248399257659912
epoch: 9, loss: 0.21893854439258575
epoch: 10, loss: 0.0732455849647522
epoch: 11, loss: 0.1600208282470703
epoch: 12, loss: 0.1062139943242073
epoch: 13, loss: 0.11520258337259293
epoch: 14, loss: 0.091596819460392
epoch: 15, loss: 0.13797733187675476
epoch: 16, loss: 0.08190875500440598
epoch: 17, loss: 0.18442018330097198
epoch: 18, loss: 0.15048663318157196
epoch: 19, loss: 0.07459396868944168
epoch: 20, loss: 0.09932347387075424
epoch: 21, loss: 0.022265080362558365
epoch: 22, loss: 0.4856056272983551
epoch: 23, loss: 0.09899728000164032
epoch: 24, loss: 0.03959667310118675
epoch: 25, loss: 0.017285959795117378
epoch: 26, loss: 0.07125965505838394
epoch: 27, loss: 0.02

In [57]:
# evaluate the model
model.eval()
acc_list = []
tot_corr = 0
with torch.no_grad():
    for batch_features, batch_labels in test_dataloader:
        batch_pred = model(batch_features)
        batch_pred = (batch_pred > 0.5).float()
        batch_pred = batch_pred.view(-1)
        #print(y_pred.shape, y_test.shape)
        batch_acc = (batch_pred == batch_labels).sum().item()
        #print(batch_acc)
        tot_corr += batch_acc
    
print(f'Accuracy: {(1.0 * tot_corr) / len(test_dataset):.4f}')


Accuracy: 0.9825
