### Loading Dataset and Libraries

In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/gscdit/Breast-Cancer-Detection/refs/heads/master/data.csv')
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [3]:
df.shape

(569, 33)

In [4]:
df.drop(columns=['Unnamed: 32','id'],inplace=True)

In [5]:
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

In [7]:
df['diagnosis'].value_counts()

Unnamed: 0_level_0,count
diagnosis,Unnamed: 1_level_1
B,357
M,212


### Train Test Split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['diagnosis']), df['diagnosis'], test_size=0.2, random_state=42)

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((455, 30), (114, 30), (455,), (114,))

### Scaling

In [10]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
X_train

array([[-1.44075296, -0.43531947, -1.36208497, ...,  0.9320124 ,
         2.09724217,  1.88645014],
       [ 1.97409619,  1.73302577,  2.09167167, ...,  2.6989469 ,
         1.89116053,  2.49783848],
       [-1.39998202, -1.24962228, -1.34520926, ..., -0.97023893,
         0.59760192,  0.0578942 ],
       ...,
       [ 0.04880192, -0.55500086, -0.06512547, ..., -1.23903365,
        -0.70863864, -1.27145475],
       [-0.03896885,  0.10207345, -0.03137406, ...,  1.05001236,
         0.43432185,  1.21336207],
       [-0.54860557,  0.31327591, -0.60350155, ..., -0.61102866,
        -0.3345212 , -0.84628745]])

In [12]:
y_train.head()

Unnamed: 0,diagnosis
68,B
181,M
63,B
248,B
60,B


### Label Encoding

In [13]:
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

In [14]:
y_test

array([0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1])

### Numpy arrays to PyTorch tensors

In [15]:
X_train_tensor = torch.from_numpy(X_train).float()
X_test_tensor = torch.from_numpy(X_test).float()
y_train_tensor = torch.from_numpy(y_train).float()
y_test_tensor = torch.from_numpy(y_test).float()

In [16]:
X_train_tensor.shape, X_test_tensor.shape, y_train_tensor.shape, y_test_tensor.shape

(torch.Size([455, 30]),
 torch.Size([114, 30]),
 torch.Size([455]),
 torch.Size([114]))

In [17]:
X_train_tensor.dtype, X_test_tensor.dtype, y_train_tensor.dtype, y_test_tensor.dtype

(torch.float32, torch.float32, torch.float32, torch.float32)

### Loading Dataset

In [18]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):

  def __init__(self, features, labels):
    self.features = features
    self.labels = labels

  def __len__(self):
    return len(self.features)

  def __getitem__(self, idx):
    return self.features[idx], self.labels[idx]

In [19]:
train_dataset = CustomDataset(X_train_tensor, y_train_tensor)
test_dataset = CustomDataset(X_test_tensor, y_test_tensor)

In [20]:
train_dataset[5], test_dataset[5]

((tensor([ 0.1196,  1.9607,  0.1995,  0.0125,  1.3055,  1.0662,  0.9448,  0.6670,
           1.7919,  1.1127, -0.1132, -0.3136,  0.0100, -0.1594, -0.4526,  0.9020,
           0.4543, -0.1581, -0.2454,  0.5911,  0.2549,  1.9137,  0.5098,  0.1168,
           1.5730,  2.6153,  2.0462,  0.8738,  2.0814,  2.8176]),
  tensor(1.)),
 (tensor([ 1.8354,  2.3807,  1.9847,  1.7222,  1.5859,  3.3080,  3.3069,  2.7282,
           2.1343,  1.0280,  1.1454,  0.7250,  1.4083,  0.9781, -0.1530,  1.9339,
           1.1942,  0.7549,  0.3267,  0.8493,  1.9777,  2.2917,  2.3246,  1.6648,
           1.4515,  3.9741,  3.1750,  2.3112,  1.8737,  2.2510]),
  tensor(1.)))

In [21]:
train_loader = DataLoader(train_dataset, batch_size = 32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size = 32, shuffle=False)

In [22]:
batch_features, batch_labels = next(iter(train_loader))
print(batch_features.shape, batch_labels.shape)

torch.Size([32, 30]) torch.Size([32])


### Defining the model

In [23]:
import torch.nn as nn

class MySimpleNN(nn.Module):

  def __init__(self, num_features):

    super().__init__()
    self.linear = nn.Linear(num_features, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, features):
    out = self.linear(features)
    out = self.sigmoid(out)
    return out

## Important Parameters

In [24]:
learning_rate = 0.1
epochs = 25

In [25]:
loss_fn = nn.BCELoss()

### Training Pipeline

In [26]:
# create model
model = MySimpleNN(X_train_tensor.shape[1])

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# define loop

for epoch in range(epochs):

  for batch_features, batch_labels in train_loader:

    # forward pass
    y_pred = model(batch_features)

    # loss
    loss = loss_fn(y_pred, batch_labels.view(-1, 1))

    # clear gradients
    optimizer.zero_grad()

    # backward pass
    loss.backward()

    # parameters update
    optimizer.step()

    # print loss in each epoch
  print(f"Epoch: {epoch + 1}, Loss:{loss.item()}")

Epoch: 1, Loss:0.7804697751998901
Epoch: 2, Loss:0.06123194471001625
Epoch: 3, Loss:0.23492014408111572
Epoch: 4, Loss:0.0501449778676033
Epoch: 5, Loss:0.16357077658176422
Epoch: 6, Loss:0.0863676443696022
Epoch: 7, Loss:0.2893648147583008
Epoch: 8, Loss:0.09392312914133072
Epoch: 9, Loss:0.15618279576301575
Epoch: 10, Loss:0.2528960108757019
Epoch: 11, Loss:0.11353065818548203
Epoch: 12, Loss:0.026080088689923286
Epoch: 13, Loss:0.048504557460546494
Epoch: 14, Loss:0.0743228867650032
Epoch: 15, Loss:0.06906711310148239
Epoch: 16, Loss:0.1129840537905693
Epoch: 17, Loss:0.08335159718990326
Epoch: 18, Loss:0.04252905771136284
Epoch: 19, Loss:0.2751249074935913
Epoch: 20, Loss:0.054592031985521317
Epoch: 21, Loss:0.07987141609191895
Epoch: 22, Loss:0.2837713658809662
Epoch: 23, Loss:0.05731235817074776
Epoch: 24, Loss:0.3363071382045746
Epoch: 25, Loss:0.06824507564306259


In [27]:
!pip install torchinfo
from torchinfo import summary

summary(model, input_size=(1, 30))



Layer (type:depth-idx)                   Output Shape              Param #
MySimpleNN                               [1, 1]                    --
├─Linear: 1-1                            [1, 1]                    31
├─Sigmoid: 1-2                           [1, 1]                    --
Total params: 31
Trainable params: 31
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.00
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00

In [28]:
model.linear.weight

Parameter containing:
tensor([[ 0.4782,  0.6721,  0.5627,  0.6198,  0.0638,  0.0698,  0.3438,  0.4695,
         -0.0043, -0.1674,  0.5165, -0.0151,  0.4144,  0.6474,  0.1220, -0.4394,
         -0.1652,  0.1998, -0.1390, -0.4100,  0.6532,  0.6825,  0.6914,  0.5305,
          0.5864,  0.2771,  0.4418,  0.4509,  0.5959,  0.2042]],
       requires_grad=True)

In [29]:
model.linear.bias

Parameter containing:
tensor([-0.4622], requires_grad=True)

### Evaluation

In [30]:
# model evaluation using test_loader
model.eval()
accuracy_list = []

with torch.no_grad():
  for batch_features, batch_labels in test_loader:
      y_pred = model(batch_features)
      y_pred = (y_pred > 0.9).float()
      batch_accuracy = (y_pred.view(-1) == batch_labels).float().mean().item()
      accuracy_list.append(batch_accuracy)

# Calculate overall accuracy
overall_accuracy = sum(accuracy_list) / len(accuracy_list)
print(f'Accuracy: {overall_accuracy:.4f}')

Accuracy: 0.9531
