# Simple XOR with binary

### Model XOR

In [None]:
import torch
import torch.nn as nn

class xor_mlp(nn.Module):
    def __init__(self, input_size = 2, output_size = 1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = 80 # increasing the hidden_size helped to improve the performance. But removing the ReLU and increasing the number of layers not.
        self.output_size = output_size

        self.mlp_layers = nn.Sequential(
            # input_layer
            nn.Linear(self.input_size, self.hidden_size), nn.ReLU(), # %shortcut: click + option to | | | | 
            
            # hidden_layers
            nn.Linear(self.hidden_size, self.hidden_size),nn.ReLU(),
            nn.Linear(self.hidden_size, self.hidden_size),nn.ReLU(),
            nn.Linear(self.hidden_size, self.hidden_size),nn.ReLU(),

            # output_layer
            nn.Linear(self.hidden_size, self.output_size), # -> we use Sigmoid because XOR is binary classificaiton
            nn.Sigmoid() # convert the probability beteweeb 0 and 

        )

    def forward(self, x):
        out = self.mlp_layers(x)
        return out
    
# init the model
# model = xor_mlp()

# example of flow: input(tensor.shape(4))
# :::
# x = torch.tensor([0.1, 0.2])
# model(x)
# :::

### Dataset preparation

In [101]:
from torch.utils.data import Dataset, DataLoader

# XOR input pairs
td_x = torch.tensor([[0.0, 0.0],
                    [0.0, 1.0],
                    [1.0, 0.0],
                    [1.0, 1.0]])

# XOR outputs: 0^0=0, 0^1=1, 1^0=1, 1^1=0
td_y = torch.tensor([0.0, 1.0, 1.0, 0.0,])


class Toy_dataset_xor(Dataset):
    def __init__(self, x, y):
        self.input_f = x
        self.labels_y = y

    def __len__(self):
        return self.labels_y.shape[0]


    def __getitem__(self, index):
        x = self.input_f[index]
        y = self.labels_y[index]
        return x, y.unsqueeze(0)



dataset_1 = Toy_dataset_xor(td_x, td_y)

train_loader = DataLoader(
    dataset=dataset_1,
    batch_size=4,
    shuffle=True,
    num_workers=0
)

# for batch_idx, (features, labels) in enumerate(train_loader):
#     logits = model(features) 
#     print(f"{batch_idx} \n")    
#     print(f"{features} \n")
#     print(f"{labels} \n")

### Train_loop

In [None]:
# 1 Sample of train loop
# init
model = xor_mlp()
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params = model.parameters(), lr=0.01)

# train
model.train()
logits = model(torch.tensor([1., 0.]))
loss = criterion(logits, torch.tensor([0.]))
optimizer.zero_grad() # make sure to keep only necessary grads
loss.backward()
optimizer.step()
print(loss.item())
model.eval()

In [155]:
model = xor_mlp()
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params = model.parameters(), lr=0.01)

EPOCHS=20
model.train()
idx = 0
for epoch in  range(EPOCHS):
    for batch_idx, (features, labels) in enumerate(train_loader):
        logits = model(features)
        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        idx += 1
        print(idx)
        print(loss.item())
model.eval()

# loss = 0.5 is actually 100% probability 
# model.eval()
# x = torch.tensor([1.0, 0.0]) # so if we will specify [5.0, 0.0] -> it will also give us the correct answer if we imagint 5 = 1 and basically model learned this pattern
# model(x)

1
0.727141261100769
2
0.7214694023132324
3
0.7145631313323975
4
0.7032269835472107
5
0.6885955929756165
6
0.6801382899284363
7
0.6715609431266785
8
0.6466799974441528
9
0.6182271242141724
10
0.5990081429481506
11
0.5718968510627747
12
0.5591419339179993
13
0.5453067421913147
14
0.5305672883987427
15
0.5169280171394348
16
0.508049488067627
17
0.5043385028839111
18
0.5033983588218689
19
0.5032436847686768
20
0.5032377243041992


xor_mlp(
  (mlp_layers): Sequential(
    (0): Linear(in_features=2, out_features=80, bias=True)
    (1): ReLU()
    (2): Linear(in_features=80, out_features=80, bias=True)
    (3): ReLU()
    (4): Linear(in_features=80, out_features=80, bias=True)
    (5): ReLU()
    (6): Linear(in_features=80, out_features=80, bias=True)
    (7): ReLU()
    (8): Linear(in_features=80, out_features=1, bias=True)
    (9): Sigmoid()
  )
)

# Start of the Gemma training

In [2]:
from transformers import PretrainedConfig, PreTrainedModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
x = torch.tensor([0.5, 0.1])
nn.Softmax(x)

Softmax(dim=tensor([0.5000, 0.1000]))

In [57]:
# Data
#  input
x = dataset_1[0][0]
x
y = dataset_1[0][1]
y



tensor([0.])

In [45]:
optimizer = torch.optim.AdamW(model.parameters(), lr=0.5)
logits = model(x)
logits

tensor([0.4593], grad_fn=<SigmoidBackward0>)

In [43]:
logits

tensor([0.4593], grad_fn=<SigmoidBackward0>)

In [42]:
y

tensor(0.)

In [59]:
x.shape

torch.Size([2])

In [58]:
y.shape

torch.Size([1])

In [68]:
logits

tensor([0.], grad_fn=<SigmoidBackward0>)

In [69]:
y

tensor([0.])

0.6931471824645996


In [None]:

# # sample of loss

# import torch.nn.functional as F 
# optimizer = torch.optim.AdamW(model.parameters(), lr=0.5)
# logits = model(x)
# loss = F.cross_entropy(logits, y)
# optimizer.zero_grad() # make sure to keep only necessary grads
# loss.backward()
# optimizer.step()
# print(loss.item())  # ✅ безопасно: .item() извлекает значение

# # optimizer.zero_grad()
# # loss.backward() # that will only create the gradients
# # optimizer.step() # Wi = Wi - lr * grad ||| W1 = -0.6059 - 0.5 * 0.0027
# # grad defined based on the loss.
# # Wi = Wi - lr * grad
# # Bi = Bi - lr * grad


In [111]:
dataset_1[0][0]

tensor([0., 0.])