$${\color{yellow}{\text{Applied Linear Algebra: Bias trick}}}$$



---

Load essential libraries

---

In [12]:
import numpy as np
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
plt.style.use('dark_background')
%matplotlib inline
import sys
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
torch.set_printoptions(precision = 3, sci_mode = False)

---

Mount the Google Drive folder if running in Google Colab

---

In [13]:
## Resolve the data folder, mounting Google Drive first when running in Colab
if 'google.colab' not in sys.modules:
    # Not in Colab: use a relative path one level above the working directory
    DATA_DIR = '../Data/'
else:
    # In Colab: mount Drive (forcing a fresh mount) and point at the course folder
    from google.colab import drive
    drive.mount('/content/drive', force_remount = True)
    DIR = '/content/drive/MyDrive/Colab Notebooks/MAHE/MSIS Coursework/OddSem2025MAHE/Share/Applied Linear Algebra'
    DATA_DIR = DIR + '/Data/'

---

Setting up the patient data matrix, true output labels, and the initial weights matrix for the softmax classifier:

![data for softmax](https://1drv.ms/i/s!AjTcbXuSD3I3hspfrgklysOtJMOjaA?embed=1&width=800)

---

In [14]:
# Raw patient data: one row per sample, one column per feature
# (typically this would be read from a file)
X = np.array([[72, 120, 37.3, 104, 32.5],
              [85, 130, 37.0, 110, 14],
              [68, 110, 38.5, 125, 34],
              [90, 140, 38.0, 130, 26],
              [84, 132, 38.3, 146, 30],
              [78, 128, 37.2, 102, 12]])
print(f'Data matrix: \n{X}')

# Fit the scaler on X and standardize every feature column with it
sc = StandardScaler()
X_S = sc.fit(X).transform(X)
print(f'Standardized data matrix: \n{X_S}')

# From here on X_S is a float64 PyTorch tensor holding the standardized values
X_S = torch.tensor(X_S, dtype = torch.float64)

# Rows are samples, columns are features
num_samples = X_S.shape[0]
num_features = X_S.shape[1]

# True class label of each sample (typically also read from a file)
y = np.array(['non-diabetic',
              'diabetic',
              'non-diabetic',
              'pre-diabetic',
              'diabetic',
              'pre-diabetic'])

# One-hot encode the labels with scikit-learn (the encoder expects a column,
# hence the extra axis) and wrap the dense result in a PyTorch tensor
ohe = OneHotEncoder(sparse_output = False)
Y = torch.tensor(ohe.fit_transform(y[:, np.newaxis]))

# One column of Y per distinct label
num_labels = Y.shape[1]

# Initial weights matrix: one row per feature, one column per label
W = torch.tensor([[-0.1, 0.5, 0.3],
                  [0.9, 0.3, 0.5],
                  [-1.5, 0.4, 0.1],
                  [0.1, 0.1, -1.0],
                  [-1.2, 0.5, -0.8]], dtype = torch.float64)

print(Y)
print(W)

Data matrix: 
[[ 72.  120.   37.3 104.   32.5]
 [ 85.  130.   37.  110.   14. ]
 [ 68.  110.   38.5 125.   34. ]
 [ 90.  140.   38.  130.   26. ]
 [ 84.  132.   38.3 146.   30. ]
 [ 78.  128.   37.2 102.   12. ]]
Standardized data matrix: 
[[-0.979883   -0.70186241 -0.72380201 -0.98707429  0.89204786]
 [ 0.71858087  0.3509312  -1.24493946 -0.60498101 -1.2373567 ]
 [-1.50248727 -1.75465602  1.36074779  0.35025217  1.06470228]
 [ 1.3718362   1.40372481  0.49218537  0.66866323  0.14387869]
 [ 0.5879298   0.56148993  1.01332282  1.68757862  0.60429048]
 [-0.1959766   0.14037248 -0.8975145  -1.11443871 -1.4675626 ]]
tensor([[0., 1., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.]], dtype=torch.float64)
tensor([[-0.100,  0.500,  0.300],
        [ 0.900,  0.300,  0.500],
        [-1.500,  0.400,  0.100],
        [ 0.100,  0.100, -1.000],
        [-1.200,  0.500, -0.800]], dtype=torch.float64)


---

Bias trick to absorb the bias into the weights matrix with
- ${\color{yellow}6}$ samples,
- ${\color{green}5}$ features,
- ${\color{cyan}3}$ neurons.

$$\begin{align*}\mathbf{X} &=
\underbrace{\begin{bmatrix}
72 & 120 & 37.3 & 104 & 32.5 \\
85 & 130 & 37.0 & 110 & 14 \\
68 & 110 & 38.5 & 125 & 34 \\
90 & 140 & 38.0 & 130 & 26 \\
84 & 132 & 38.3 & 146 & 30 \\
78 & 128 & 37.2 & 102 & 12 \\
\end{bmatrix}}_{\text{original }{\color{yellow}6}\times{\color{green}5}\text{-data matrix}}\Rightarrow \mathbf{X}_B =
\underbrace{\begin{bmatrix}
72 & 120 & 37.3 & 104 & 32.5 & \color{magenta}{1} \\
85 & 130 & 37.0 & 110 & 14   & \color{magenta}{1} \\
68 & 110 & 38.5 & 125 & 34   & \color{magenta}{1} \\
90 & 140 & 38.0 & 130 & 26   & \color{magenta}{1} \\
84 & 132 & 38.3 & 146 & 30   & \color{magenta}{1} \\
78 & 128 & 37.2 & 102 & 12   & \color{magenta}{1} \\
\end{bmatrix}}_{\text{bias feature-added }{\color{yellow}6}\times({\color{green}5}+{\color{magenta}1})\text{-data matrix}}
\end{align*}$$

$$\begin{align*}\mathbf{W} &=
\underbrace{\begin{bmatrix}
-0.1 & 0.5 & 0.3 \\
0.9 & 0.3 & 0.5 \\
-1.5 & 0.4 & 0.1 \\
0.1 & 0.1 & -1.0 \\
-1.2 & 0.5 & -0.8 \\
\end{bmatrix}}_{\text{original }{\color{green}5}\times{\color{cyan}3}\text{-weights matrix}}
\Rightarrow \mathbf{W}_B=\underbrace{\begin{bmatrix}
-0.1 & 0.5 & 0.3 \\
0.9 & 0.3 & 0.5 \\
-1.5 & 0.4 & 0.1 \\
0.1 & 0.1 & -1.0 \\
-1.2 & 0.5 & -0.8 \\
\color{magenta}{b_0} & \color{magenta}{b_1} & \color{magenta}{b_2} \\
\end{bmatrix}}_{\text{bias row-added }({\color{green}5}+{\color{magenta}1})\times{\color{cyan}3}\text{-weights matrix}}\end{align*}$$


---

In [15]:
# Disabled scratch code: appends a column of ones to X_S and displays the result
# X_SB = torch.cat([X_S,torch.ones(num_samples,1)],dim=1)
# X_SB

In [16]:
# Preview of the bias-augmented weights: W with one extra row of 0.01 biases.
# Note: despite its name, `b` holds the whole augmented matrix, not just the
# bias row.
# The bias row is created directly in W's dtype; building it in torch.ones'
# default dtype (float32 unless changed) and letting torch.cat promote it
# would carry float32 rounding error in 0.01 into the float64 weights.
b = torch.cat([W, 0.01*torch.ones(1, num_labels, dtype = W.dtype)], dim = 0)
b

tensor([[-0.100,  0.500,  0.300],
        [ 0.900,  0.300,  0.500],
        [-1.500,  0.400,  0.100],
        [ 0.100,  0.100, -1.000],
        [-1.200,  0.500, -0.800],
        [ 0.010,  0.010,  0.010]], dtype=torch.float64)

In [17]:
## Bias trick to absorb the bias into the weights matrix
# Append a column of ones (the constant bias feature) to X_S. The dtype is
# passed explicitly so the new column matches X_S rather than relying on
# torch.cat to promote torch.ones' default dtype.
X_B = torch.cat([X_S, torch.ones(num_samples, 1, dtype = X_S.dtype)], dim = 1)

# Create the bias row: one bias per label, each initialised to 0.01.
# It is built directly in W's dtype; created in the default float32 and then
# promoted, 0.01 would carry float32 rounding error into the float64 weights.
bias_row = 0.01*torch.ones(1, num_labels, dtype = W.dtype)

# Concatenate the weights matrix `W` with the bias row
W_B = torch.cat([W, bias_row], dim = 0)

print(X_B)
print(W_B)

tensor([[-0.980, -0.702, -0.724, -0.987,  0.892,  1.000],
        [ 0.719,  0.351, -1.245, -0.605, -1.237,  1.000],
        [-1.502, -1.755,  1.361,  0.350,  1.065,  1.000],
        [ 1.372,  1.404,  0.492,  0.669,  0.144,  1.000],
        [ 0.588,  0.561,  1.013,  1.688,  0.604,  1.000],
        [-0.196,  0.140, -0.898, -1.114, -1.468,  1.000]], dtype=torch.float64)
tensor([[-0.100,  0.500,  0.300],
        [ 0.900,  0.300,  0.500],
        [-1.500,  0.400,  0.100],
        [ 0.100,  0.100, -1.000],
        [-1.200,  0.500, -0.800],
        [ 0.010,  0.010,  0.010]], dtype=torch.float64)


---

Forward propagation for the toy patient dataset: $$\textbf{bias-added input }\mathbf{X}_B\,{\color{yellow}\longrightarrow}\,\textbf{raw scores }\mathbf{Z}=\mathbf{X}_B\mathbf{W}_B\,{\color{yellow}\longrightarrow}\,\textbf{softmax-activated scores }\mathbf{A}=\text{softmax}(\mathbf{Z}).$$

---

In [18]:
# Quick look at the raw scores: bias-augmented inputs times bias-augmented weights
X_B @ W_B

tensor([[-0.607, -0.633, -0.434],
        [ 3.546, -0.703,  1.871],
        [-4.703, -0.156, -2.384],
        [ 0.292,  1.453,  0.389],
        [-1.620,  1.349, -1.603],
        [ 3.152, -1.250,  2.220]], dtype=torch.float64)

In [19]:
# Step 1: raw scores (also called logits) from the bias-augmented
# inputs and weights
Z = X_B @ W_B
print(Z)

# Step 2: softmax along dim 1 turns each row of raw scores into the
# predicted probabilities for that sample
A = Z.softmax(dim = 1)
print(A)

# One-hot true labels, printed for comparison with the predictions
print(Y)

tensor([[-0.607, -0.633, -0.434],
        [ 3.546, -0.703,  1.871],
        [-4.703, -0.156, -2.384],
        [ 0.292,  1.453,  0.389],
        [-1.620,  1.349, -1.603],
        [ 3.152, -1.250,  2.220]], dtype=torch.float64)
tensor([[0.316, 0.308, 0.376],
        [0.832, 0.012, 0.156],
        [0.009, 0.894, 0.096],
        [0.189, 0.603, 0.208],
        [0.047, 0.906, 0.047],
        [0.711, 0.009, 0.280]], dtype=torch.float64)
tensor([[0., 1., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.]], dtype=torch.float64)


---

Loss for each sample can be quantified using the categorical crossentropy (CCE) loss function, which is defined as $$\color{yellow}{-\log(\text{predicted probability that a sample belongs to its correct class})}$$

For example, consider a sample with

- true label = [$\color{yellow}{1}$, 0, 0]
- predicted probabilities = [$\color{yellow}{0.05}$, 0.90, 0.05]

$\Rightarrow$ categorical crossentropy loss = $-\log(\color{yellow}{0.05}).$

Here, we calculate the CCE loss for all the samples and average them out.

---

In [20]:
# Column index of the largest entry in each one-hot row of Y, i.e. each
# sample's true class index
Y.argmax(dim = 1)

tensor([1, 0, 1, 2, 0, 2])

In [21]:
## Average CCE loss, computed two ways
# By hand: the one-hot mask Y keeps only each sample's predicted probability
# for its true class; take -log of it and average over the samples
loss = (-torch.log((Y*A).sum(dim = 1))).mean()
print(loss)

# With PyTorch's in-built CCE loss, which is given the raw scores Z and the
# true class indices
loss_fn = torch.nn.CrossEntropyLoss()
loss_fn(Z, Y.argmax(dim = 1))

tensor(1.230, dtype=torch.float64)


tensor(1.230, dtype=torch.float64)

In [22]:
# Step-by-step breakdown of the average CCE loss
true_class_probs = (Y*A).sum(dim = 1)         # predicted probability of the true class, per sample
sample_losses = -torch.log(true_class_probs)  # CCE loss per sample

print(Y)                      # one-hot true labels
print(Y*A)                    # predictions masked by the true labels
print(true_class_probs)
print(sample_losses)
print(sample_losses.mean())   # average loss over all samples


tensor([[0., 1., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.]], dtype=torch.float64)
tensor([[0.000, 0.308, 0.000],
        [0.832, 0.000, 0.000],
        [0.000, 0.894, 0.000],
        [0.000, 0.000, 0.208],
        [0.047, 0.000, 0.000],
        [0.000, 0.000, 0.280]], dtype=torch.float64)
tensor([0.308, 0.832, 0.894, 0.208, 0.047, 0.280], dtype=torch.float64)
tensor([1.177, 0.184, 0.112, 1.570, 3.067, 1.273], dtype=torch.float64)
tensor(1.230, dtype=torch.float64)
