In [1]:
# creating batches to use it for training
total_data = 3200
batch_size = 32
epochs = 10

for epoch in range(epochs):
  print(f"\n-----------------------\nepoch-{epoch + 1} started")
  for strt_idx in range(0, total_data, batch_size):
    end_idx = strt_idx + batch_size

    print(f"data({strt_idx},{end_idx})")
  print(f"epoch-{epoch + 1} started\n-----------------------\n")


-----------------------
epoch-1 started
data(0,32)
data(32,64)
data(64,96)
data(96,128)
data(128,160)
data(160,192)
data(192,224)
data(224,256)
data(256,288)
data(288,320)
data(320,352)
data(352,384)
data(384,416)
data(416,448)
data(448,480)
data(480,512)
data(512,544)
data(544,576)
data(576,608)
data(608,640)
data(640,672)
data(672,704)
data(704,736)
data(736,768)
data(768,800)
data(800,832)
data(832,864)
data(864,896)
data(896,928)
data(928,960)
data(960,992)
data(992,1024)
data(1024,1056)
data(1056,1088)
data(1088,1120)
data(1120,1152)
data(1152,1184)
data(1184,1216)
data(1216,1248)
data(1248,1280)
data(1280,1312)
data(1312,1344)
data(1344,1376)
data(1376,1408)
data(1408,1440)
data(1440,1472)
data(1472,1504)
data(1504,1536)
data(1536,1568)
data(1568,1600)
data(1600,1632)
data(1632,1664)
data(1664,1696)
data(1696,1728)
data(1728,1760)
data(1760,1792)
data(1792,1824)
data(1824,1856)
data(1856,1888)
data(1888,1920)
data(1920,1952)
data(1952,1984)
data(1984,2016)
data(2016,2048)
data(2

# Dataset Class
**Used to represent and load your data efficiently.**
* Functions :
  + __len__() : which returns total number of samples
  + __init__(): which tells how data should be loaded
  + __getitem__(index) : which returns the data (and labels) at given index

# DataLoader class
**Wraps around the Dataset(instance of the Dataset class) to enable batching, shuffling, and parallel loading.**
* Functions :
  + Provides an iterable to load data in batches.
  + Adds options for multiprocessing (via num_workers) to speed up data
    loading.
  + Facilitates shuffling for randomness during training.


## DataLoader Controlflow
- At the start of each epoch, the DataLoader (if shuffle=True) shuffles indices(using a sampler).
- It divides the indices into chunks of batch size.
- for each index in the chunk, data samples are fetched from the Dataset object
- The samples are then collected and combined into a batch (using collate_fn)
- The batch is returned to the main training loop

In [2]:
from sklearn.datasets import make_classification
import torch

In [3]:
# creating a synthetic classification dataset using sklearn
X, y = make_classification(
    n_samples = 20,
    n_features= 2,
    n_informative=2,
    n_redundant=0,
    n_classes=2,
    random_state=101
)

In [4]:
print(X.shape)
print(X)

(20, 2)
[[-0.47308438  0.99613928]
 [ 1.30317105  1.44025169]
 [ 1.36644255 -1.11888076]
 [ 2.48697156 -0.84539142]
 [-1.6426339  -1.1170949 ]
 [ 0.9033174   0.370202  ]
 [-1.61071056 -1.32647161]
 [-3.37743562 -2.97136039]
 [ 0.15514829 -2.2377797 ]
 [ 0.83469243  0.92326897]
 [ 0.28108924 -0.46174271]
 [ 0.95432647 -0.44756608]
 [-1.85440434  1.06111845]
 [-1.75338318 -1.88378485]
 [ 0.97245713 -0.39322677]
 [-2.76972277 -0.41135523]
 [-0.13438714 -0.99176207]
 [-0.32123354  1.35168151]
 [ 1.14654208 -0.88201415]
 [ 0.34197528 -0.48660878]]


In [5]:
print(y.shape)
print(y)

(20,)
[1 1 0 0 0 1 1 0 0 1 1 0 1 0 0 1 0 1 0 1]


In [6]:
print(type(X))
print(type(y))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [7]:
# convert data to pytorch tensor
X = torch.tensor(X,dtype=torch.float32)
y = torch.tensor(y,dtype=torch.long)

In [8]:
X

tensor([[-0.4731,  0.9961],
        [ 1.3032,  1.4403],
        [ 1.3664, -1.1189],
        [ 2.4870, -0.8454],
        [-1.6426, -1.1171],
        [ 0.9033,  0.3702],
        [-1.6107, -1.3265],
        [-3.3774, -2.9714],
        [ 0.1551, -2.2378],
        [ 0.8347,  0.9233],
        [ 0.2811, -0.4617],
        [ 0.9543, -0.4476],
        [-1.8544,  1.0611],
        [-1.7534, -1.8838],
        [ 0.9725, -0.3932],
        [-2.7697, -0.4114],
        [-0.1344, -0.9918],
        [-0.3212,  1.3517],
        [ 1.1465, -0.8820],
        [ 0.3420, -0.4866]])

In [9]:
y

tensor([1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1])

In [10]:
print(type(X))
print(type(y))

<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [11]:
from torch.utils.data import Dataset, DataLoader

In [12]:
class customDataset(Dataset):
  def __init__(self,x,y):
    self.x = x
    self.y = y
  def __len__(self):
    return self.x.shape[0]
  def __getitem__(self,index):
    return self.x[index], self.y[index]

In [13]:
dataset = customDataset(X,y)

In [14]:
dataset

<__main__.customDataset at 0x7f3420d18df0>

In [15]:
len(dataset)

20

In [16]:
dataset[1]

(tensor([1.3032, 1.4403]), tensor(1))

In [17]:
print(dataset[1][0])
print(dataset[1][1])

tensor([1.3032, 1.4403])
tensor(1)


In [18]:
for data in dataset:
  print(f"X : {data[0]}, y: {data[1]}")

X : tensor([-0.4731,  0.9961]), y: 1
X : tensor([1.3032, 1.4403]), y: 1
X : tensor([ 1.3664, -1.1189]), y: 0
X : tensor([ 2.4870, -0.8454]), y: 0
X : tensor([-1.6426, -1.1171]), y: 0
X : tensor([0.9033, 0.3702]), y: 1
X : tensor([-1.6107, -1.3265]), y: 1
X : tensor([-3.3774, -2.9714]), y: 0
X : tensor([ 0.1551, -2.2378]), y: 0
X : tensor([0.8347, 0.9233]), y: 1
X : tensor([ 0.2811, -0.4617]), y: 1
X : tensor([ 0.9543, -0.4476]), y: 0
X : tensor([-1.8544,  1.0611]), y: 1
X : tensor([-1.7534, -1.8838]), y: 0
X : tensor([ 0.9725, -0.3932]), y: 0
X : tensor([-2.7697, -0.4114]), y: 1
X : tensor([-0.1344, -0.9918]), y: 0
X : tensor([-0.3212,  1.3517]), y: 1
X : tensor([ 1.1465, -0.8820]), y: 0
X : tensor([ 0.3420, -0.4866]), y: 1


In [19]:
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

In [20]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f3420d19900>

In [21]:
for x_batch, y_batch in dataloader:
  print(f"x_batch : {x_batch} \n y_batch : {y_batch}")
  print("="*50)

x_batch : tensor([[-1.6426, -1.1171],
        [-1.7534, -1.8838],
        [ 0.8347,  0.9233],
        [ 1.1465, -0.8820]]) 
 y_batch : tensor([0, 0, 1, 0])
x_batch : tensor([[ 1.3664, -1.1189],
        [ 0.3420, -0.4866],
        [ 0.9725, -0.3932],
        [-1.6107, -1.3265]]) 
 y_batch : tensor([0, 1, 0, 1])
x_batch : tensor([[-3.3774, -2.9714],
        [-0.4731,  0.9961],
        [ 2.4870, -0.8454],
        [-1.8544,  1.0611]]) 
 y_batch : tensor([0, 1, 0, 1])
x_batch : tensor([[ 0.9543, -0.4476],
        [ 1.3032,  1.4403],
        [-0.1344, -0.9918],
        [ 0.9033,  0.3702]]) 
 y_batch : tensor([0, 1, 0, 1])
x_batch : tensor([[ 0.1551, -2.2378],
        [-2.7697, -0.4114],
        [-0.3212,  1.3517],
        [ 0.2811, -0.4617]]) 
 y_batch : tensor([0, 1, 1, 1])


# Working On Real Dataset

In [22]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [23]:
df = pd.read_csv('https://raw.githubusercontent.com/gscdit/Breast-Cancer-Detection/refs/heads/master/data.csv')
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [24]:
# removing unnecessary columns
df.drop(['id', 'Unnamed: 32'], axis=1, inplace=True)

In [25]:
# train test split
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [26]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(455, 30)
(114, 30)
(455,)
(114,)


In [27]:
# Scalling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [28]:
# scaling factors
print("Means:", scaler.mean_)  # Mean of each column
print("Standard Deviations:", scaler.scale_)

Means: [1.41242879e+01 1.93075604e+01 9.19396703e+01 6.56671429e+02
 9.63678681e-02 1.04421538e-01 8.91330895e-02 4.88284242e-02
 1.81880440e-01 6.29729011e-02 4.01258022e-01 1.21594813e+00
 2.82998967e+00 4.02369121e+01 6.97023297e-03 2.53466484e-02
 3.20967684e-02 1.16982505e-02 2.03582593e-02 3.78691055e-03
 1.62633495e+01 2.57886154e+01 1.07198791e+02 8.83610110e+02
 1.32334022e-01 2.55078505e-01 2.73968246e-01 1.15016442e-01
 2.91142857e-01 8.44703736e-02]
Standard Deviations: [3.62225359e+00 4.27993499e+00 2.48947679e+01 3.63480250e+02
 1.41342510e-02 5.22528208e-02 8.03982305e-02 3.88980010e-02
 2.76230170e-02 7.20564635e-03 2.85064897e-01 5.48878225e-01
 2.05144684e+00 4.81209261e+01 2.91838330e-03 1.77698849e-02
 3.15201184e-02 5.99222487e-03 8.33049058e-03 2.60496267e-03
 4.95037665e+00 6.15500551e+00 3.42256559e+01 5.87464840e+02
 2.23921729e-02 1.54854811e-01 2.10139639e-01 6.57771234e-02
 6.29427445e-02 1.82444544e-02]


In [29]:
# converting labels into numbers using Label encoding
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [30]:
# converting numpy arrays into pytorch tensors
X_train_tensor = torch.from_numpy(X_train)
X_test_tensor = torch.from_numpy(X_test)
y_train_tensor = torch.from_numpy(y_train)
y_test_tensor = torch.from_numpy(y_test)

In [31]:
(X_train_tensor[0]).dtype

torch.float64

In [32]:
from torch.utils.data import Dataset, dataloader

In [33]:
class MyCustomDataset(Dataset):

  def __init__(self,x,y):
    self.x = x
    self.y = y

  def __len__(self):
    return self.x.shape[0]

  def __getitem__(self,index):
    return self.x[index], self.y[index]

In [34]:
train_dataset = MyCustomDataset(X_train_tensor, y_train_tensor)
test_dataset = MyCustomDataset(X_test_tensor, y_test_tensor)

In [35]:
train_dataset[0]

(tensor([ 0.0209,  0.2856,  0.0189, -0.1053, -0.4993,  0.1221, -0.4789, -0.4691,
         -1.1324, -0.4001, -0.1875, -0.3606, -0.0395, -0.1917, -0.9811,  0.4239,
         -0.1233,  0.3007, -0.5112, -0.4004,  0.0377,  0.2391,  0.1432, -0.0938,
         -0.7607,  0.5671, -0.1083,  0.2871, -0.5996, -0.3229],
        dtype=torch.float64),
 tensor(0))

In [36]:
test_dataset[0]

(tensor([-0.4871, -0.1793, -0.5194, -0.5226, -0.8206, -0.6942, -0.7799, -0.7614,
         -0.7849, -0.3210, -0.9870, -0.5858, -0.9659, -0.6444, -1.2038, -0.8372,
         -0.6481, -1.0247, -0.9421, -0.9332, -0.6006,  0.2764, -0.6322, -0.5779,
         -0.6223, -0.3796, -0.3820, -0.4652,  0.1137, -0.6917],
        dtype=torch.float64),
 tensor(0))

In [37]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [38]:
# data of first two batches
for i, (fe, la) in enumerate(train_dataloader):
  if i > 1:
    break
  print(f"batch no. : {i+1}")
  print(fe, la)
  print(len(la))
  print("="*50)

batch no. : 1
tensor([[-8.9527e-02, -8.2187e-01, -6.0642e-02, -1.9966e-01,  3.0650e-01,
          4.5124e-01, -1.3984e-01,  4.7858e-02, -5.6766e-01,  3.7292e-01,
         -4.2993e-01, -1.0848e+00, -4.2555e-01, -3.5093e-01, -7.7208e-01,
         -2.6430e-01, -4.6024e-01, -4.1591e-01, -9.7933e-01, -2.5218e-01,
          6.1945e-02, -8.0075e-01,  9.0611e-02, -1.2122e-01,  3.9148e-01,
          6.4009e-01,  1.8710e-02,  3.5398e-01, -5.1226e-01,  1.0156e+00],
        [-6.9688e-01,  1.2109e+00, -7.1138e-01, -6.5883e-01, -1.5302e+00,
         -9.0984e-01, -8.6312e-01, -9.1774e-01,  4.2065e-01, -5.8189e-01,
         -5.2640e-01,  1.0386e+00, -4.2360e-01, -4.5774e-01, -2.9065e-01,
         -1.1124e-01, -5.1195e-01, -7.8439e-01,  1.3903e+00, -6.0496e-01,
         -7.7234e-01,  9.4742e-01, -7.5408e-01, -6.9299e-01, -1.6472e+00,
         -7.6832e-01, -9.5926e-01, -1.0166e+00,  5.2837e-01, -9.3291e-01],
        [-1.2261e+00,  7.5795e-03, -1.2408e+00, -1.0206e+00, -8.1065e-01,
         -1.0358e+00, 

In [39]:
# defining model
import torch.nn as nn


class myDnn(nn.Module):

  def __init__(self, num_features):
    super().__init__()
    self.linear = nn.Linear(num_features,1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    out = self.linear(x)
    out = self.sigmoid(out)

    return out


In [40]:
learning_rate = 0.1
epochs = 30

In [41]:
# create model
model = myDnn(X_train_tensor.shape[1])

param = model.parameters()
Weights = param.__next__()
bias  = param.__next__()

In [42]:
print("weights : ",Weights)
print("bias : ", bias)

weights :  Parameter containing:
tensor([[-0.1116, -0.0676, -0.0502,  0.1039, -0.1040,  0.0873, -0.1026,  0.1086,
          0.1011,  0.1051,  0.1343, -0.0617, -0.0772, -0.0166, -0.1057, -0.0942,
          0.0535, -0.1177,  0.0442,  0.0196, -0.1585, -0.0691, -0.1784,  0.0682,
          0.0110, -0.0267,  0.1419, -0.0015, -0.0150,  0.0438]],
       requires_grad=True)
bias :  Parameter containing:
tensor([0.0474], requires_grad=True)


In [43]:
# define optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# define loss function
loss_function = nn.BCELoss()

In [44]:
for batch_features, batch_labels in train_dataloader:
  batch_features_1 = batch_features
  batch_labels_1 = batch_labels
  break

In [45]:
batch_labels_1.dtype

torch.int64

In [46]:
# training pipeline

for epoch in range(epochs):
  for batch_features, batch_labels in train_dataloader:

    # forward pass
    y_pred = model(batch_features.float())

    # loss calculation
    loss = loss_function(y_pred, batch_labels.unsqueeze(1).float())

    # backward pass
    optimizer.zero_grad()
    loss.backward()

    # update weights
    optimizer.step()

  # printing loss for each epoch
  print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

Epoch: 1, Loss: 0.16976599395275116
Epoch: 2, Loss: 0.13481369614601135
Epoch: 3, Loss: 0.09395307302474976
Epoch: 4, Loss: 0.30530959367752075
Epoch: 5, Loss: 0.027464356273412704
Epoch: 6, Loss: 0.11534486711025238
Epoch: 7, Loss: 0.3490719199180603
Epoch: 8, Loss: 0.052046310156583786
Epoch: 9, Loss: 0.32256120443344116
Epoch: 10, Loss: 0.02475101128220558
Epoch: 11, Loss: 0.06323488056659698
Epoch: 12, Loss: 0.01720363460481167
Epoch: 13, Loss: 0.11956066638231277
Epoch: 14, Loss: 0.02119363285601139
Epoch: 15, Loss: 0.044892605394124985
Epoch: 16, Loss: 0.0772765725851059
Epoch: 17, Loss: 0.09643629938364029
Epoch: 18, Loss: 0.010375335812568665
Epoch: 19, Loss: 0.04302886500954628
Epoch: 20, Loss: 0.005664885975420475
Epoch: 21, Loss: 0.006457589566707611
Epoch: 22, Loss: 0.14947600662708282
Epoch: 23, Loss: 0.013331848196685314
Epoch: 24, Loss: 0.07644462585449219
Epoch: 25, Loss: 0.05906921252608299
Epoch: 26, Loss: 0.013361802324652672
Epoch: 27, Loss: 0.02962912991642952
Epoc

In [68]:
# Model Evaluation

model.eval()
accuracy_list = []

with torch.no_grad():
  for batch_features, batch_labels in test_dataloader:
    y_pred1 = model(batch_features.float())
    y_pred = (y_pred1 > 0.6).float()
    batch_accuracy = (y_pred == batch_labels.unsqueeze(1).float()).float().mean()
    accuracy_list.append(batch_accuracy)

# overall accuracy
accuracy = torch.tensor(accuracy_list).mean()
print(f"Overall Accuracy: {accuracy}")

Overall Accuracy: 0.984375
