In [1]:
import torch 

tensor2d_1 = torch.tensor([[1,2,3], [4,5,6]])
print(tensor2d_1)
tensor2d_2 = torch.tensor([[1,2,3], [4,5,6]])
print(tensor2d_2)
print(tensor2d_1 @ tensor2d_2.T)

tensor([[1, 2, 3],
        [4, 5, 6]])
tensor([[1, 2, 3],
        [4, 5, 6]])
tensor([[14, 32],
        [32, 77]])


## Pytorch is your friend 🤗

1. Tensors -> data structure
2. Automatic differentiation engine -> backprop
3. Deep learning framework -> training and inference

### Answer! 🤔

? What is a tensor?

? What is autograd?

? Efficient data loading? это как?

### Tensors

We use them because they efficiently represent data and operations on it. 

And we can quickly move data from CPU to GPU.

### DL framework

1. Pre-trained models
2. Loss functions
3. Optimizers.

In [1]:
import torch

torch.__version__
torch.cuda.is_available()

True

In [2]:
# tensor0d = torch.tensor(1)
# print(tensor0d.dtype)
# torchvec =tensor0d.to(torch.bfloat16)
# print(torchvec.dtype)
# print(torchvec)

# tensor0d_float = torch.tensor(1.0)
# print(tensor0d_float.dtype)

# tensor1d = torch.tensor([1,2,3])
# print(tensor1d.dtype)

tensor2d = torch.tensor([[1,2,3], [4,5,6]])
# print(tensor2d)
# print(tensor2d.T)
# What is the point of .reshape? it will create a wrong order hmm
tensor2d = tensor2d.reshape(1,1,3,2)
print(tensor2d.shape)
# x = tensor2d.reshape(-1) 
# print(x)



# tensor3d = torch.tensor([[[1,2,3], [4,5,6], [7,8,9]]])
# print(tensor3d)

torch.Size([1, 1, 3, 2])


In [3]:
tensor2d_1 = torch.tensor([[1,2,3], [4,5,6]])

tensor2d_2 = torch.tensor([[1,2,3], [4,5,6]])

tensor2d_1 @ tensor2d_2.T # tran
# [1, 2, 3]
# [4, 5, 6]

# [1, 4]
# [2, 5]
# [3, 6]

# r[1, 2, 3] * c[1, 2, 3]
# r[1, 2, 3] * c[4, 5, 6]

# r[4, 5, 6] * c[1, 2, 3]
# r[4, 5, 6] * c[4, 5, 6]

# [14, 32]
# [32, 77]

tensor([[14, 32],
        [32, 77]])

In [4]:
tensor2d_2.T

tensor([[1, 4],
        [2, 5],
        [3, 6]])

In [5]:
import torch.nn.functional as F
from torch.autograd import grad

a = torch.tensor([0.9], requires_grad=True)
y = torch.tensor([1.0])

loss = F.binary_cross_entropy(a, y)
print(loss)
grad_L_a = grad(loss, a)[0]
print(grad_L_a)

# if gradient is > 0, then we need to ⬆️ increase weights
# if gradient is < 0, then we need to decrease ⬇️ weights

y = torch.tensor([1.0])
x1 = torch.tensor([1.1])
w1 = torch.tensor([2.2], requires_grad=True)
b = torch.tensor([0.0], requires_grad=True)

z = w1 * x1 + b
a = torch.sigmoid(z)
loss = F.binary_cross_entropy(a, y)

print(loss)

grad_L_w = grad(loss, w1, retain_graph=True)
grad_L_b = grad(loss, b, retain_graph=True)

print(grad_L_w)
print(grad_L_b)

loss.backward()

tensor(0.1054, grad_fn=<BinaryCrossEntropyBackward0>)
tensor([-1.1111])
tensor(0.0852, grad_fn=<BinaryCrossEntropyBackward0>)
(tensor([-0.0898]),)
(tensor([-0.0817]),)


### Multilayer perceptron (MLP)

In [6]:
class NeuralNetwork(torch.nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super().__init__()

        self.layers = torch.nn.Sequential(
            # 1st hidden layer
            torch.nn.Linear(num_inputs, 30),
            torch.nn.ReLU(),

            # 2nd hidden layer
            torch.nn.Linear(30, 20),
            torch.nn.ReLU(),

            # output layer
            torch.nn.Linear(20, num_outputs),
        )
    
    def forward(self, x):
        logits = self.layers(x)
        return logits

torch.manual_seed(77714791777)

model = NeuralNetwork(num_inputs=50, num_outputs=3)
model.to(torch.bfloat16)
# model.layers[0].weight
# model.layers[0].weight.dtype

NeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=50, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=3, bias=True)
  )
)

In [7]:
model.layers[0].weight.dtype

torch.bfloat16

In [8]:
some_shape = (1, 50)
x = torch.randn(some_shape)
x = x.to(torch.bfloat16)
out = model(x)
print(out)

# when we don't want to compute and save gradients
with torch.no_grad():
    out = model(x) 
print(out)

# apply softmax to the output
out = torch.softmax(out, dim=1)
print(out)


tensor([[-0.1621,  0.1758,  0.1777]], dtype=torch.bfloat16,
       grad_fn=<AddmmBackward0>)
tensor([[-0.1621,  0.1758,  0.1777]], dtype=torch.bfloat16)
tensor([[0.2617, 0.3691, 0.3691]], dtype=torch.bfloat16)


In [9]:
x.grad

In [10]:
model.layers[0]

Linear(in_features=50, out_features=30, bias=True)

### Data loader

In [11]:
X_train = torch.tensor([
    [-1.2, 3.1],
    [-0.9, 2.9],
    [-0.5, 2.6],
    [2.3, -1.1],
    [2.7, -1.5]
     ])

print(X_train.shape)
y_train = torch.tensor([0, 0, 0, 1, 1])

X_test = torch.tensor([
    [-0.8, 2.8],
    [2.6, -1.6]
])

y_test = torch.tensor([0, 1])

torch.Size([5, 2])


Sending request to the model...
✅ Model decided to call a tool.
Function to call: schedule_meeting
Arguments: {'participants': ['me', 'Vladimir'], 'title': 'Qwen3 Project Plan', 'time': '2023-09-25T14:00:00', 'location': 'Main Conference Room', 'duration_minutes': 45}
--- Calling schedule_meeting() ---

--- Result ---
Meeting Scheduled Successfully!
	Title: Qwen3 Project Plan
	Time: 2023-09-25T14:00:00
	Participants: me, Vladimir
	Location: Main Conference Room
	Duration: 45 minutes


In [12]:
y_test.weight

AttributeError: 'Tensor' object has no attribute 'weight'

In [13]:
x_temp = X_train[0]
x_temp 

tensor([-1.2000,  3.1000])

In [14]:
from torch.utils.data import Dataset 

class ToyDataset(Dataset):
    def __init__(self, X, y):
        self.features = X 
        self.labels = y
    
    def __getitem__(self, index):
        one_x = self.features[index]
        one_y = self.labels[index]
        return one_x, one_y
    
    def __len__(self):
        return self.labels.shape[0]

train_ds = ToyDataset(X_train, y_train)
test_ds = ToyDataset(X_test, y_test)
print(len(train_ds))

x0, y0 = train_ds[0]

# x0
x0
# y0

5


tensor([-1.2000,  3.1000])

In [17]:
from torch.utils.data import DataLoader

train_loader = DataLoader(
    dataset=train_ds,
    batch_size=2,
    shuffle=True,
    num_workers=0
)

test_loader = DataLoader(
    dataset=test_ds,
    batch_size=2,
    shuffle=False,
    num_workers=0
)
for idx, (x, y) in enumerate(train_loader):
    print(f"Batch {idx+1}:", x, y)

Batch 1: tensor([[-1.2000,  3.1000],
        [ 2.3000, -1.1000]]) tensor([0, 1])
Batch 2: tensor([[-0.5000,  2.6000],
        [-0.9000,  2.9000]]) tensor([0, 0])
Batch 3: tensor([[ 2.7000, -1.5000]]) tensor([1])


In [19]:
# In practice, having a substantially smaller batch as the 
# last batch in a training epoch can disturb the convergence 
# during training. To prevent this, it’s recommended to set drop_last=True,
# which will drop the last batch in each epoch, as shown below:
# drop_last=True
train_loader = DataLoader( 
    dataset=train_ds,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    drop_last=True
)

for idx, (x, y) in enumerate(train_loader):
    print(f"Batch {idx+1}:", x, y)

Batch 1: tensor([[-0.5000,  2.6000],
        [-1.2000,  3.1000]]) tensor([0, 0])
Batch 2: tensor([[ 2.7000, -1.5000],
        [-0.9000,  2.9000]]) tensor([1, 0])


### Actual training loop

In [None]:
from transformers import Gemma3ForCausalLM

In [1]:
import json
from openai import OpenAI

# It's good practice to use environment variables for API keys
# For this example, we'll use a placeholder.
# from dotenv import load_dotenv
# import os
# load_dotenv()
# OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key="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", # Replace with your key
)

# --- Step 1: Define your tool and the function it calls ---

# This is the actual Python function that will be executed.
def get_current_weather(location, unit="celsius"):
    """Get the current weather in a given location."""
    # In a real application, you would call a weather API here.
    # For this example, we'll return mock data.
    if "astana" in location.lower():
        weather_info = {
            "location": location,
            "temperature": "-15",
            "unit": unit,
            "forecast": ["snowy", "windy", "cold"],
        }
    else:
        weather_info = {
            "location": location,
            "temperature": "22",
            "unit": unit,
            "forecast": ["sunny", "mild"],
        }
    return json.dumps(weather_info)

# --- Step 2: Make the first API call with the tools defined ---

# The user's prompt that should trigger the tool.
user_prompt = "What's the weather like in Astana?"
print(f"👤 User: {user_prompt}\n")

messages = [{"role": "user", "content": user_prompt}]

# Describe your tool in the JSON schema format the model expects.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g., San Francisco, CA",
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"]
                    },
                },
                "required": ["location"],
            },
        },
    }
]

# First API call
completion = client.chat.completions.create(
  model="qwen/qwen3-235b-a22b-2507",
  messages=messages,
  tools=tools,
  tool_choice="auto", # 'auto' lets the model decide, or you can force a tool call.
)

response_message = completion.choices[0].message
tool_calls = response_message.tool_calls

# --- Step 3: Check if the model wants to call a tool and execute it ---

if tool_calls:
    print("🤖 Model wants to call a tool...")
    print(f"Tool calls: {tool_calls}\n")
    
    # Append the assistant's message with tool calls to the conversation history
    messages.append(response_message)
    
    # In this example, we'll use a mapping to find the correct function.
    available_functions = {
        "get_current_weather": get_current_weather,
    }
    
    # Loop through each tool call the model requested
    for tool_call in tool_calls:
        function_name = tool_call.function.name
        function_to_call = available_functions[function_name]
        function_args = json.loads(tool_call.function.arguments)
        
        print(f"Executing function: {function_name}({function_args})")
        
        # Call the actual function with the arguments provided by the model
        function_response = function_to_call(
            location=function_args.get("location"),
            unit=function_args.get("unit"),
        )
        
        print(f"Function response: {function_response}\n")

        # --- Step 4: Send the function's response back to the model in a second call ---
        
        # Append the tool's response to the conversation history
        messages.append(
            {
                "tool_call_id": tool_call.id,
                "role": "tool",
                "name": function_name,
                "content": function_response,
            }
        )

    print("📢 Sending tool response back to the model...")
    
    # Second API call
    second_response = client.chat.completions.create(
        model="qwen/qwen3-235b-a22b-2507",
        messages=messages, # Send the whole conversation history
    )
    
    final_message = second_response.choices[0].message.content
    print(f"\n✅ Final Model Response:\n{final_message}")

else:
    # If the model didn't call a tool, just print its response
    print(f"🤖 Model (no tool call):\n{response_message.content}")

👤 User: What's the weather like in Astana?

🤖 Model wants to call a tool...
Tool calls: [ChatCompletionMessageFunctionToolCall(id='call_db3de021fc434036988cfe', function=Function(arguments='{"location": "Astana", "unit": "celsius"}', name='get_current_weather'), type='function', index=0)]

Executing function: get_current_weather({'location': 'Astana', 'unit': 'celsius'})
Function response: {"location": "Astana", "temperature": "-15", "unit": "celsius", "forecast": ["snowy", "windy", "cold"]}

📢 Sending tool response back to the model...

✅ Final Model Response:
The current weather in Astana is cold, with a temperature of -15°C. It is snowy and windy, so make sure to dress warmly if you're planning to go outside.


In [None]:
import torch.nn.functional as F
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, Dataset

class MLP_neural_network(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super().__init__()

        self.layers = nn.Sequential(
            nn.Linear(num_inputs, 30), 
            nn.ReLU(),

            nn.Linear(30, 20),
            nn.ReLU(),

            nn.Linear(20, num_outputs)
        )
    
    def forward(self, x):
        logits = self.layers(x)
        return logits

torch.manual_seed(1477741)

class ToyDataset(Dataset):
    def __init__(self, X, y):
        self.features = X 
        self.labels = y

    def __getitem__(self, index):
        out_x = self.features[index]
        out_y = self.labels[index]
        return out_x, out_y 
    
    def __len__(self):
        return self.labels.shape[0]
    
# toy data 
# train_data
X_train = torch.tensor([
    [-1.2, 3.1],
    [-0.9, 2.9],
    [-0.6, 2.1],
    [2.1, -2.1],
    [3.1, -1.4]
])
y_train = torch.tensor([0, 0, 0, 1, 1])

# test_data  
# shift + option + arrow_up or arrow_down  
#  
#  
#  
X_test = torch.tensor([
    [-0.8, 2.8],
    [2.6, -1.6]
])
y_test = torch.tensor([0, 1])

train_ds = ToyDataset(X_train, y_train)
test_ds = ToyDataset(X_test, y_test)

train_loader = DataLoader(
    dataset=train_ds,
    batch_size=2,
    shuffle=True,
    num_workers=2,
    drop_last=True
)


test_loader = DataLoader(
    dataset=test_ds,
    batch_size=2,
    shuffle=False,
    num_workers=2
)


model = MLP_neural_network(num_inputs=2, num_outputs=2)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.5)

NUM_EPOCHS = 3
for epoch in range(NUM_EPOCHS):
    
    model.train() # set model to training mode
    for batch_idx, (features, labels) in enumerate(train_loader):
        logits = model(features)

        loss = F.cross_entropy(logits, labels) # loss function

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        ### LOGS
        print(
            f"Epoch {epoch+1:03d}/{NUM_EPOCHS:03d}"
            f" | Batch {batch_idx:03d}/{len(train_loader):03d}"
            f" | Train/Val Loss: {loss:.2f}"
        )

    model.eval()

Epoch 001/003 | Batch 000/002 | Train/Val Loss: 1.02
Epoch 001/003 | Batch 001/002 | Train/Val Loss: 1.85
Epoch 002/003 | Batch 000/002 | Train/Val Loss: 0.07
Epoch 002/003 | Batch 001/002 | Train/Val Loss: 0.03
Epoch 003/003 | Batch 000/002 | Train/Val Loss: 0.02
Epoch 003/003 | Batch 001/002 | Train/Val Loss: 0.00


In [4]:
import torch

temp_1 = torch.tensor([[ 3.2846, -3.1264],
        [ 2.9755, -2.8467],
        [ 2.0955, -2.1111],
        [-2.9865,  1.8815],
        [-2.8769,  1.7912]])
pred = torch.argmax(temp_1, dim=1)
print(pred)
# tensor([0, 0, 0, 1, 1])

tensor([0, 0, 0, 1, 1])


In [2]:
model.eval()

with torch.no_grad():
    outputs = model(X_train)

print(outputs)

predictions = torch.argmax(outputs, dim=1)
print(predictions)

# torch.sum(predictions == y_test)

tensor([[ 3.2846, -3.1264],
        [ 2.9755, -2.8467],
        [ 2.0955, -2.0311],
        [-2.9865,  1.8815],
        [-2.8769,  1.7912]])
tensor([0, 0, 0, 1, 1])


About softmax 51078.png

In [7]:
def compute_accuracy(model, dataloader):
    model = model.eval()
    correct = 0.0
    total_examples = 0 

    for idx, (features, labels) in enumerate(dataloader):

        with torch.no_grad():
            logits = model(features)
        
        predictions = torch.argmax(logits, dim=1)
        compare = labels == predictions
        correct += torch.sum(compare)
        total_examples += len(compare)
    
    return (correct / total_examples).item()

compute_accuracy(model, test_loader)






1.0

## Saving and loading models

In [9]:
# torch.save(model.state_dict(), "model.pth")
!du -sh /home/vladimir_albrekht/LLMs-from-scratch/pytorch_is_easy/model.pth

8,0K	/home/vladimir_albrekht/LLMs-from-scratch/pytorch_is_easy/model.pth


#### Restore model      

In [10]:
model = MLP_neural_network(num_inputs=2, num_outputs=2)
model.load_state_dict(torch.load("model.pth"))

<All keys matched successfully>

### GPUs


In [None]:
tensor_1 = torch.tensor([1, 2, 3])
tensor_1 = tensor_1.to("cuda")
tensor_1 = tensor_1.to(torch.bfloat16)
tensor_2 = torch.tensor([4, 5, 6])
tensor_2 = tensor_2.to("cuda:0")
tensor_2 = tensor_2.to(torch.bfloat16)

print(tensor_1 @ tensor_2) # 4 + 10 + 18 = 32

tensor(32., device='cuda:0', dtype=torch.bfloat16)


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

class Our_network(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super().__init__()
        
        self.layers = nn.Sequential(
            nn.Linear(num_inputs, 10),
            nn.ReLU(),

            nn.Linear(10, 20),
            nn.ReLU(),
            nn.Linear(20, num_outputs)
        )
    
    def forward(self, x):
        logits = self.layers(x)
        return logits

torch.manual_seed(1477741)

class Our_dataset(Dataset):
    def __init__(self, X, y):
        self.features = X
        self.labels = y

    def __getitem__(self, index):
        out_x = self.features[index]
        out_y = self.labels[index]
        return out_x, out_y 
    
    def __len__(self):
        return self.labels.shape[0]
    
# our data
X_train = torch.tensor([
    [-1.0, 4.2],
    [-2.0, 1.0],
    [-3.0, 2.0],
    [4.0, -3.0],
    [5.0, -4.0],
    [6.0, -5.0],
])

y_train = torch.tensor([0, 0, 0, 1, 1, 1])

X_test = torch.tensor([
    [-5.0, 4.2],
    [3.0, -1.2],
])

y_test = torch.tensor([0, 1])

train_dataset = Our_dataset(X_train, y_train)
test_dataset = Our_dataset(X_test, y_test)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=2,
    shuffle=True,
    num_workers=2,
    drop_last=True
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=2,
    shuffle=False,
    num_workers=2
)

model = Our_network(num_inputs=2, num_outputs=2)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# model.to(torch.bfloat16)
optimizer = torch.optim.SGD(model.parameters(), lr=0.5)
NUM_EPOCHS = 3 

for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_idx, (features, labels) in enumerate(train_loader):
        features, labels = features.to(device), labels.to(device)
        logits = model(features)

        loss = F.cross_entropy(logits, labels)
        
        optimizer.zero_grad() # we accumulate the grad so to make sure we will don't have grad_1 + grad_2
        loss.backward()
        optimizer.step() # update the weights
        # Wi = Wi - lr * grad
        
        print(
            f"Epoch {epoch+1:03d}/{NUM_EPOCHS:03d}"
            f" | Batch {batch_idx:03d}/{len(train_loader):03d}"
            f" | Train/Val Loss: {loss:.2f}"
        )

model.eval()

def compute_accuracy(model, dataloader):
    model = model.eval()
    correct = 0.0 
    total_examples = 0 
    
    for idx, (features, labels) in enumerate(dataloader):
        features, labels = features.to(device), labels.to(device)
        with torch.no_grad():
            logits = model(features)
        predictions = torch.argmax(logits, dim=1)
        compare = labels == predictions
        correct += torch.sum(compare)
        total_examples += len(compare)
    
    return (correct / total_examples).item()

compute_accuracy(model, train_loader)
compute_accuracy(model, test_loader)

X_train = X_train.to(device)
y_train = y_train.to(device)
with torch.no_grad():
    outputs = model(X_train)

print(outputs)

predictions = torch.argmax(outputs, dim=1)
print(predictions)

torch.sum(predictions == y_train)

Epoch 001/003 | Batch 000/003 | Train/Val Loss: 0.78
Epoch 001/003 | Batch 001/003 | Train/Val Loss: 0.58
Epoch 001/003 | Batch 002/003 | Train/Val Loss: 0.31
Epoch 002/003 | Batch 000/003 | Train/Val Loss: 0.03
Epoch 002/003 | Batch 001/003 | Train/Val Loss: 0.04
Epoch 002/003 | Batch 002/003 | Train/Val Loss: 0.04
Epoch 003/003 | Batch 000/003 | Train/Val Loss: 0.01
Epoch 003/003 | Batch 001/003 | Train/Val Loss: 0.00
Epoch 003/003 | Batch 002/003 | Train/Val Loss: 0.02
tensor([[ 4.1386, -2.9504],
        [ 2.1535, -1.6710],
        [ 3.5200, -2.5943],
        [-2.2951,  2.0023],
        [-2.8063,  2.4873],
        [-3.3176,  2.9723]], device='cuda:0')
tensor([0, 0, 0, 1, 1, 1], device='cuda:0')


tensor(6, device='cuda:0')

In [None]:
### Multi GPU training

import platform
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group # to initialize and destroy the distributed training mods

### 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


#@ model 
class SimpleNetwork_v2(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super().__init__()
        
        self.layers = nn.Sequential(
            nn.Linear(num_inputs, 10),
            nn.ReLU(),
            nn.Linear(10, 20),
            nn.ReLU(),
            nn.Linear(20, num_outputs)
        )
        # self.l1 = nn.Linear(num_inputs, 10)
        # self.r = nn.ReLU()
        # self.l2 = nn.Linear(10,20)
        # self.l3 = nn.Linear(20, num_outputs)

    
    def forward(self, x):
        # x = self.l1(x)
        # x = self.r(x)
        # x = self.l2(x)
        # x = self.r(x)
        # logits = self.l3(x)
        logits = self.layers(x)
        return logits

model = SimpleNetwork_v2(num_inputs=2, num_outputs=2)
optimizer = torch.optim.AdamW(model.parameters(), betas=(0.9, 0.95), lr=0.5, weight_decay=0.1) # it's params 2e-4 = 0.0002 2e-4 -> on the forth index after the comma
#optimizer = torch.optim.SGD(model.parameters(), lr=0.5) # interesting that lr is affects how fast model will reach the loss = 0
# that is because Wi = Wi - lr * grad | and if lr is large and let's say our gradient is -0.5 -> then we update weights like Wi = Wi + 0.5 * 0.5
#@ dataset_class specific for the model

class SimpleData_v2(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __getitem__(self, index):
        out_feature = self.features[index]
        out_label = self.labels[index]
        return out_feature, out_label
    
    def __len__(self):
        return self.labels.shape[0]

# training_data

features_train = torch.tensor([
    [-0.1, 5.4],
    [-0.6, 6.2],
    [-1.0, 2.4],
    [0.7, -0.9],
    [1.2, -0.6],
    [1.3, -0.7]
])

labels_train = torch.tensor([0, 0, 0, 1, 1, 1])

train_data = SimpleData_v2(features_train, labels_train)
test_data = SimpleData_v2(features_train, labels_train)

train_loader = DataLoader(
    dataset=train_data,
    batch_size=2,
    shuffle=True,
    num_workers=2,
    drop_last=True
)

EPOCHS=3

model.train()
for epoch in range(EPOCHS):

    for batch_idx, (features, labels) in enumerate(train_loader):
        logits = model(features)

        loss = F.cross_entropy(logits, labels)
        optimizer.zero_grad() # make sure to keep only necessary grads
        loss.backward() # calculate
        optimizer.step() # update

        print(
            f"Epoch {epoch:03d}/{EPOCHS:03d}"
            f"| Batch {batch_idx:03d}/{len(train_loader)} "
            f"| Loss {loss:.2f}"
        )

model.eval()

    
    
# x, y = train_data
# x

# features = torch.tensor([
#     [0.1, 0.5],
#     [0.1, 0.5]
# ])
# print(features.shape)
# logits = model(features)
# print(logits)

Epoch 000/003| Batch 000/3 | Loss 0.70
Epoch 000/003| Batch 001/3 | Loss 11.59
Epoch 000/003| Batch 002/3 | Loss 14.90
Epoch 001/003| Batch 000/3 | Loss 0.06
Epoch 001/003| Batch 001/3 | Loss 1.03
Epoch 001/003| Batch 002/3 | Loss 0.01
Epoch 002/003| Batch 000/3 | Loss 0.00
Epoch 002/003| Batch 001/3 | Loss 0.01
Epoch 002/003| Batch 002/3 | Loss 0.15


SimpleNetwork_v2(
  (layers): Sequential(
    (0): Linear(in_features=2, out_features=10, bias=True)
    (1): ReLU()
    (2): Linear(in_features=10, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=2, bias=True)
  )
)

In [None]:

model

SimpleNetwork_v2(
  (layers): Sequential(
    (0): Linear(in_features=2, out_features=10, bias=True)
    (1): ReLU()
    (2): Linear(in_features=10, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=2, bias=True)
  )
)

In [None]:
temp_tensor_1 = torch.tensor([0.2, 0.1])

second_tensor = torch.tensor([
    [ 0.3264, -0.3537],
    [ 1.8532, -2.0992],
    [-0.8408,  0.2854],
    [-0.6927,  1.6045],
    [-0.4624,  0.9186],
    [-0.2607, -1.0090],
    [-0.3740, -0.2591],
    [ 0.1091, -0.4227],
    [-0.7840, -0.4685],
    [-1.3080, -0.5980]
])

print(temp_tensor_1 @ second_tensor.T + bias)

tensor([ 0.0299,  0.1607, -0.1396,  0.0219, -0.0006, -0.1530, -0.1007, -0.0205,
        -0.2036, -0.3214])


In [32]:
print(l1.weight)

Parameter containing:
tensor([[ 0.3264, -0.3537],
        [ 1.8532, -2.0992],
        [-0.8408,  0.2854],
        [-0.6927,  1.6045],
        [-0.4624,  0.9186],
        [-0.2607, -1.0090],
        [-0.3740, -0.2591],
        [ 0.1091, -0.4227],
        [-0.7840, -0.4685],
        [-1.3080, -0.5980]], requires_grad=True)


In [29]:
model.layers[0].bias

Parameter containing:
tensor([-2.2059,  1.8140, -0.2335,  0.0586, -1.9925, -1.8767, -0.4438, -2.2289,
        -1.7078, -1.1381], requires_grad=True)

In [30]:
print(temp_tensor_1 @ second_tensor.T)

tensor([ 0.0299,  0.1607, -0.1396,  0.0219, -0.0006, -0.1530, -0.1007, -0.0205,
        -0.2036, -0.3214])


In [31]:
print(l1_out)

tensor([-2.1759,  1.9747, -0.3731,  0.0805, -1.9931, -2.0298, -0.5446, -2.2493,
        -1.9114, -1.4595], grad_fn=<ViewBackward0>)


In [33]:
l1

Linear(in_features=2, out_features=10, bias=True)

In [24]:
temp_tensor_1 = torch.tensor([0.2, 0.1])


l1 = model.layers[0]
print("1 Layer", l1)
print(l1.weight)
print('\n--------------------------\n')
l1_out = l1(temp_tensor_1)
print(l1_out)
relu = model.layers[1]
relu(l1_out)

1 Layer Linear(in_features=2, out_features=10, bias=True)
Parameter containing:
tensor([[ 0.3264, -0.3537],
        [ 1.8532, -2.0992],
        [-0.8408,  0.2854],
        [-0.6927,  1.6045],
        [-0.4624,  0.9186],
        [-0.2607, -1.0090],
        [-0.3740, -0.2591],
        [ 0.1091, -0.4227],
        [-0.7840, -0.4685],
        [-1.3080, -0.5980]], requires_grad=True)

--------------------------

tensor([-2.1759,  1.9747, -0.3731,  0.0805, -1.9931, -2.0298, -0.5446, -2.2493,
        -1.9114, -1.4595], grad_fn=<ViewBackward0>)


tensor([0.0000, 1.9747, 0.0000, 0.0805, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], grad_fn=<ReluBackward0>)

In [None]:
temp_1 = torch.tensor([0.2, -1.2])


logits = model()

SimpleNetwork_v2(
  (layers): Sequential(
    (0): Linear(in_features=2, out_features=10, bias=True)
    (1): ReLU()
    (2): Linear(in_features=10, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=2, bias=True)
  )
)

In [45]:
model_g.language_model.layers[2]

Gemma3DecoderLayer(
  (self_attn): Gemma3Attention(
    (q_proj): Linear(in_features=2560, out_features=2048, bias=False)
    (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
    (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
    (o_proj): Linear(in_features=2048, out_features=2560, bias=False)
    (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
    (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
  )
  (mlp): Gemma3MLP(
    (gate_proj): Linear(in_features=2560, out_features=10240, bias=False)
    (up_proj): Linear(in_features=2560, out_features=10240, bias=False)
    (down_proj): Linear(in_features=10240, out_features=2560, bias=False)
    (act_fn): PytorchGELUTanh()
  )
  (input_layernorm): Gemma3RMSNorm((2560,), eps=1e-06)
  (post_attention_layernorm): Gemma3RMSNorm((2560,), eps=1e-06)
  (pre_feedforward_layernorm): Gemma3RMSNorm((2560,), eps=1e-06)
  (post_feedforward_layernorm): Gemma3RMSNorm((2560,), eps=1e-06)
)

In [None]:
features()
labels
logits = model(features)

loss = F.cross_entropy(logits, labels)
optimizer.zero_grad() # make sure to keep only necessary grads
loss.backward() # calculate
optimizer.step() # update

In [None]:
from transformers import AutoModelForCausalLM

model_g = AutoModelForCausalLM.from_pretrained("google/gemma-3-4b-it")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards lol: 100%|██████████| 2/2 [00:01<00:00,  1.03it/s]


tensor([0.0000, 0.0000, 0.0000, 0.0000, 6.1021, 5.4498, 0.0000, 5.9007, 0.0000,
        0.0000], grad_fn=<ReluBackward0>)

In [None]:
ef = examples_features = torch.tensor([2.0, -0.5])
mL1 = model_weights_0L = model.layers[0].weight
wL1 = model.layers[0].bias
out_linear = ef @ mL1.T + wL1
print(out_linear) # -> Formula inside the model is (x @ w.T + b)
ra = torch.relu(out_linear)
print(ra)

# model.layers[1](out_linear)
# mL2 = model.layers[2]
# wL2 = 



tensor([-0.4227, -1.1018, -0.1979, -4.0171,  6.1021,  5.4498, -4.9131,  5.9007,
        -2.7470, -0.0115], grad_fn=<AddBackward0>)
tensor([0.0000, 0.0000, 0.0000, 0.0000, 6.1021, 5.4498, 0.0000, 5.9007, 0.0000,
        0.0000], grad_fn=<ReluBackward0>)


In [49]:
model

SimpleNetwork_v2(
  (layers): Sequential(
    (0): Linear(in_features=2, out_features=10, bias=True)
    (1): ReLU()
    (2): Linear(in_features=10, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=2, bias=True)
  )
)

In [48]:
# weights_before update
model.layers[0].weight

Parameter containing:
tensor([[ 0.4471, -0.4818],
        [ 1.8846, -2.1429],
        [-0.9112,  0.3652],
        [-0.7811,  1.7149],
        [-0.5624,  0.9905],
        [-0.3094, -1.0653],
        [-0.3553, -0.2462],
        [ 0.2469, -0.5475],
        [-0.8266, -0.5866],
        [-1.3562, -0.6384]], requires_grad=True)

In [53]:
print(logits)

tensor([[-0.1611, -0.0617]], grad_fn=<AddmmBackward0>)


In [None]:
logtis = [-0.1611, -0.0617]
label = 0
NLL = 0.1611 + log(exp())

In [60]:
logits[0][0]

tensor(-0.1611, grad_fn=<SelectBackward0>)

In [71]:
w_1 = torch.exp(logits)
print(w_1)

tensor([[0.8512, 0.9402]], grad_fn=<ExpBackward0>)


In [75]:
tens_t = torch.tensor([0.47515909])

In [77]:
neg_log = -torch.log(tens_t)
neg_log

tensor([0.7441])

In [72]:
sum

tensor(1.7914, grad_fn=<SumBackward0>)

In [63]:
w_1 = torch.exp(logits)
print(w_1)
sum = torch.sum(w_1)
sum

log_p = torch.log(sum)
NLL1 = (0.1611 + 0.5830)
NLL1

tensor([[0.8512, 0.9402]], grad_fn=<ExpBackward0>)


0.7441

In [51]:
model = SimpleNetwork_v2(num_inputs=2, num_outputs=2)

In [70]:
model.layers[0].weight

Parameter containing:
tensor([[-0.6059,  0.0071],
        [-0.3279, -0.4993],
        [-0.6038, -0.6656],
        [-0.1631, -0.1670],
        [ 0.0542,  0.1671],
        [-0.4651, -0.6978],
        [ 0.6613, -0.4623],
        [-0.7049,  0.0627],
        [-0.6422,  0.4389],
        [-0.6923,  0.4997]], requires_grad=True)

In [69]:
model.layers[0].weight.grad

tensor([[ 0.0027,  0.0137],
        [-0.0025, -0.0125],
        [ 0.0104,  0.0519],
        [ 0.0000,  0.0000],
        [-0.0013, -0.0066],
        [ 0.0027,  0.0135],
        [-0.0018, -0.0091],
        [-0.0025, -0.0126],
        [ 0.0000,  0.0000],
        [ 0.0000,  0.0000]])

In [None]:
x = torch.tensor([[0.1, 0.5]])
labels = torch.tensor([0])
logits = model(x)

loss = F.cross_entropy(logits, labels)
print(loss)
optimizer.zero_grad()
loss.backward() # that will only create the gradients
optimizer.step() # Wi = Wi - lr * grad ||| W1 = -0.6059 - 0.5 * 0.0027
# grad defined based on the loss.
# Wi = Wi - lr * grad
# Bi = Bi - lr * grad


tensor(0.7441, grad_fn=<NllLossBackward0>)


In [41]:
grads_of_layer_1 = model.layers[0].weight.grad
bias_grads_of_layer_1 = model.layers[0].bias.grad
print(bias_grads_of_layer_1)
model.layers[0].weight

tensor([ 0.0000,  0.0000,  0.0000,  0.0000, -0.0154,  0.0000,  0.0000, -0.0204,
         0.0000, -0.0110])


Parameter containing:
tensor([[-0.5161,  0.5740],
        [ 0.0646, -0.3645],
        [-0.3658, -0.0743],
        [ 0.4098, -0.0638],
        [ 0.4829,  0.0620],
        [ 0.0731, -0.5700],
        [-0.5840,  0.4129],
        [ 0.1945,  0.1713],
        [-0.1907, -0.0924],
        [ 0.3174, -0.3570]], requires_grad=True)

In [35]:
x = torch.tensor([[1.0, 2.0, 3.0],])
x_2 = torch.tensor([[1.0, 2.0, 3.0],
                  [1.0, 2.0, 3.0]])
# x_t = torch.tensor([[-0.5295,  0.6235, ])
print(x)
print(x.T)
print(x)
print(x @ x_2.T)

tensor([[1., 2., 3.]])
tensor([[1.],
        [2.],
        [3.]])
tensor([[1., 2., 3.]])
tensor([[14., 14.]])


In [None]:
x = torch.tensor([[0.1, 0.5]])
# значит что мы умноажем 0.1 и 0.5 на каждый из этих вектров.
x_t = torch.tensor([[-0.5295,  0.6235], # -0.05295 + 0.31175
        [-0.4754, -0.4820],
        [ 0.7037,  0.3126],
        [-0.5638,  0.6699],
        [-0.1481,  0.1003],
        [-0.1595, -0.3440],
        [-0.0048,  0.4915],
        [-0.4054,  0.5695],
        [-0.7043,  0.5585],
        [-0.5226,  0.4490]])
print(x)
# print(x_t)
print(x_t.T)
print(x @ x_t.T)

tensor([[0.1000, 0.5000]])
tensor([[-0.5295, -0.4754,  0.7037, -0.5638, -0.1481, -0.1595, -0.0048, -0.4054,
         -0.7043, -0.5226],
        [ 0.6235, -0.4820,  0.3126,  0.6699,  0.1003, -0.3440,  0.4915,  0.5695,
          0.5585,  0.4490]])
tensor([[ 0.2588, -0.2885,  0.2267,  0.2786,  0.0353, -0.1880,  0.2453,  0.2442,
          0.2088,  0.1722]])


In [28]:
x = torch.tensor([[0.1, 0.5]])
x_t = torch.tensor([[-0.5295,  0.6235],
        [-0.4754, -0.4820],
        [ 0.7037,  0.3126],
        [-0.5638,  0.6699],
        [-0.1481,  0.1003],
        [-0.1595, -0.3440],
        [-0.0048,  0.4915],
        [-0.4054,  0.5695],
        [-0.7043,  0.5585],
        [-0.5226,  0.4490]])
print(x)
print(x_t)
print(x_t.T)
print(x @ x_t.T)

tensor([[0.1000, 0.5000]])
tensor([[-0.5295,  0.6235],
        [-0.4754, -0.4820],
        [ 0.7037,  0.3126],
        [-0.5638,  0.6699],
        [-0.1481,  0.1003],
        [-0.1595, -0.3440],
        [-0.0048,  0.4915],
        [-0.4054,  0.5695],
        [-0.7043,  0.5585],
        [-0.5226,  0.4490]])
tensor([[-0.5295, -0.4754,  0.7037, -0.5638, -0.1481, -0.1595, -0.0048, -0.4054,
         -0.7043, -0.5226],
        [ 0.6235, -0.4820,  0.3126,  0.6699,  0.1003, -0.3440,  0.4915,  0.5695,
          0.5585,  0.4490]])
tensor([[ 0.2588, -0.2885,  0.2267,  0.2786,  0.0353, -0.1880,  0.2453,  0.2442,
          0.2088,  0.1722]])


In [36]:
x = torch.tensor([[0.1, 0.5]])  # (1, 2)
print(x)
weight = model.layers[0].weight  # (10, 2)
print(weight)
bias = model.layers[0].bias      # (10,)
print("bais", bias)

output = x @ weight.T + bias  # (1, 10)

print("out",output)

tensor([[0.1000, 0.5000]])
Parameter containing:
tensor([[-0.5295,  0.6235],
        [-0.4754, -0.4820],
        [ 0.7037,  0.3126],
        [-0.5638,  0.6699],
        [-0.1481,  0.1003],
        [-0.1595, -0.3440],
        [-0.0048,  0.4915],
        [-0.4054,  0.5695],
        [-0.7043,  0.5585],
        [-0.5226,  0.4490]], requires_grad=True)
bais Parameter containing:
tensor([ 0.2068, -0.7005,  0.6329,  0.2891,  0.6309,  0.5242, -0.6127,  0.3601,
        -0.0094, -0.5742], requires_grad=True)
out tensor([[ 0.4656, -0.9891,  0.8596,  0.5677,  0.6662,  0.3362, -0.3674,  0.6044,
          0.1994, -0.4019]], grad_fn=<AddBackward0>)


## LLM from scratch

In [1]:
from transformers import AutoModelForCausalLM

model_g = AutoModelForCausalLM.from_pretrained("google/gemma-3-4b-it")
model_g


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.35it/s]


Gemma3ForConditionalGeneration(
  (model): Gemma3Model(
    (vision_tower): SiglipVisionModel(
      (vision_model): SiglipVisionTransformer(
        (embeddings): SiglipVisionEmbeddings(
          (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
          (position_embedding): Embedding(4096, 1152)
        )
        (encoder): SiglipEncoder(
          (layers): ModuleList(
            (0-26): 27 x SiglipEncoderLayer(
              (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
              (self_attn): SiglipAttention(
                (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
                (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
                (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
                (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
              )
              (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwi

In [1]:
from transformers import AutoTokenizer 
import torch
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-4b-it")
tokenizer.vocab_size

data = [
    "Hello",
    "Anime",
    "World"
]

for i in range(len(data)):
    encoded = tokenizer.encode(data[i])
    print(encoded)
    decoded = tokenizer.decode(encoded)
    print(decoded)

print(data[1])

from torch.optim import Optimizer

# class MyCustomOptimizer(Optimizer):
#     def __init__(self, params, lr=1e-3):
#         defaults = dict(lr=lr)
#         super().__init__(params, defaults)

#     @torch.no_grad()
#     def step(self, closure=None):
#         for group in self.param_groups:
#             for p in group['params']:
#                 if p.grad is not None:
#                     p -= group['lr'] * p.grad

# optimizer_2 = MyCustomOptimizer

  from .autonotebook import tqdm as notebook_tqdm


[2, 9259]
<bos>Hello
[2, 175511]
<bos>Anime
[2, 14447]
<bos>World
Anime


In [3]:
tokenizer.vocab_size

262144

In [10]:
x = torch.randn(3, 4)
print(x)
def _norm(x):
    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)

out = _norm(x)
print(out)

tensor([[ 0.7060, -1.5624, -0.8536,  0.8454],
        [ 1.7088,  0.8094, -1.7990,  0.9332],
        [-0.2012, -0.4912, -1.3939,  1.2696]])
tensor([[ 0.6744, -1.4926, -0.8155,  0.8077],
        [ 1.2330,  0.5841, -1.2981,  0.6733],
        [-0.2054, -0.5016, -1.4232,  1.2964]])


In [None]:

import torch.nn as nn

weight = nn.Parameter(torch.zeros(256))

In [8]:
weight.shape

torch.Size([256])

## Main stuff 🩵

In [None]:
import torch 
import torch.nn as nn
import torch.nn.functional as F

from transformers import Gemma3ForCausalLM
from transformers import PreTrainedModel, PretrainedConfig

from torch.utils.data import Dataset, DataLoader

# any model is:
# PretrainedConfig
# PreTrainedModel
# Blocks:
#   - Attention
#   - MLP 
#   - RMSNorm
#   - Rotary Embedding


class KitanConfig(PretrainedConfig):
    model_type = "kitan"

    def __init__(
            self,
            vocab_size=262144,
            hidden_size=,
            num_hidden_layers=,
            num_attention_heads=,
            num_key_value_heads=,
            intermediate_size=,
            max_position_embeddings=131_072,
            # head_dim=256 none for the 4B model
            attention_bias=False
            **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings


class KitanEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)

    def forward(self, input_ids):
        return self.embed_tokens(input_ids)
    
class KitanRMSNorm(nn.Module):
        def __init__(self, dim: int, eps: float = 1e-6):
             super().__init__()
             self.eps = eps
             self.weight = nn.Parameter(torch.zeros(dim))
        
        def _norm(self, x):
             return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
    
class KitanAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.config = config
        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )
        #
        #

        self.q_norm = 
        self.k_norm = 


model_ki = KitanModel(
     
)

In [3]:
from transformers import Gemma3ForConditionalGeneration, Gemma3ForCausalLM

model_g = Gemma3ForCausalLM.from_pretrained("google/gemma-3-4b-it")
model_g

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.54it/s]


Gemma3ForCausalLM(
  (model): Gemma3TextModel(
    (embed_tokens): Gemma3TextScaledWordEmbedding(262208, 2560, padding_idx=0)
    (layers): ModuleList(
      (0-33): 34 x Gemma3DecoderLayer(
        (self_attn): Gemma3Attention(
          (q_proj): Linear(in_features=2560, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2560, bias=False)
          (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
          (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
        )
        (mlp): Gemma3MLP(
          (gate_proj): Linear(in_features=2560, out_features=10240, bias=False)
          (up_proj): Linear(in_features=2560, out_features=10240, bias=False)
          (down_proj): Linear(in_features=10240, out_features=2560, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma3RMSNor

In [4]:
from transformers import SmolLM3ForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
!pip show transformers

Name: transformers
Version: 4.53.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /home/vladimir_albrekht/miniconda3/lib/python3.13/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 


In [2]:
!pip install -U transformers



Collecting transformers
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
Downloading transformers-4.53.2-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.52.4
    Uninstalling transformers-4.52.4:
      Successfully uninstalled transformers-4.52.4
Successfully installed transformers-4.53.2


In [4]:
num_params = sum(p.numel() for p in model_g.parameters())
print(f"Total parameters: {num_params:,}")

Total parameters: 3,880,263,168


In [None]:
# (language_model): Gemma3TextModel(
#     (embed_tokens): Gemma3TextScaledWordEmbedding(262208, 2560, padding_idx=0)
#     (layers): ModuleList(
#     (0-33): 34 x Gemma3DecoderLayer(
#         (self_attn): Gemma3Attention(
#         (q_proj): Linear(in_features=2560, out_features=2048, bias=False)
#         (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
#         (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
#         (o_proj): Linear(in_features=2048, out_features=2560, bias=False)
#         (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
#         (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
#         )
#         (mlp): Gemma3MLP(
#         (gate_proj): Linear(in_features=2560, out_features=10240, bias=False)
#         (up_proj): Linear(in_features=2560, out_features=10240, bias=False)
#         (down_proj): Linear(in_features=10240, out_features=2560, bias=False)
#         (act_fn): PytorchGELUTanh()
#         )
#         (input_layernorm): Gemma3RMSNorm((2560,), eps=1e-06)
#         (post_attention_layernorm): Gemma3RMSNorm((2560,), eps=1e-06)
#         (pre_feedforward_layernorm): Gemma3RMSNorm((2560,), eps=1e-06)
#         (post_feedforward_layernorm): Gemma3RMSNorm((2560,), eps=1e-06)
#     )
#     )
#     (norm): Gemma3RMSNorm((2560,), eps=1e-06)
#     (rotary_emb): Gemma3RotaryEmbedding()
#     (rotary_emb_local): Gemma3RotaryEmbedding()
# )

### Your mom will be proud of you 🐈‍⬛

tensor([[1.0000, 4.2000],
        [2.0000, 1.2000]])
tensor([1, 0])
tensor([1, 0])


### Notes 📓 

In [None]:
# about crossentropy loss
import torch 
import torch.nn.functional as F
logits = torch.tensor([
    [3.0, 1.0, 0.2, 0.1, 1.5, 5.2, 5.5, 1.0],  # метка = 0
])
labels = torch.tensor([2])
loss = F.cross_entropy(logits, labels, reduction='mean')
print(loss)

# something argmax

x = torch.tensor([[1.0, 4.2],
                 [2.0, 1.2]
                 ])
print(x)
print(torch.argmax(x, dim=0)) # columns

print(torch.argmax(x, dim=1)) # rows

# count params

# for i, p in enumerate(model.parameters()):
#     print(model.layers[i])
#     print(p.numel())
#     if i == len(model.layers) - 1:
#         break

def count_params(model):
    num_params = 0
    # print(model)
    for p in model.parameters():
        if p.requires_grad:
            num_params += p.numel()
    
    for i, p in enumerate(model.parameters()):
        print(model.layers[i])
        print(p.numel())
        if i == len(model.layers) - 1:
            break
    

    print(num_params)
    

count_params(model)

for p in model.layers[0].parameters():
    p.requires_grad = False

print(f"\n\nAfter freezing{'=' * 50}")
count_params(model)

# how to use super()

class Parent:
    def __init__(self):
        print("Parent __init__ called")
        self.value = 42

class Child(Parent):
    def __init__(self):
        super().__init__()
        print("Child __init__ called")

c = Child()
print(c.value)

class Parent:
    def __init__(self):
        print("Parent __init__ called")
        self.value = 42

    def print_value(self):
        print(self.value)

class Child(Parent):
    def __init__(self):
        # No super().__init__()
        self.value = 100
        print("Child __init__ called")

c = Child()
# print(c.value)
c.print_value()


# generator expression
some_text = "hello, world!"
print(sum(1 for char in some_text if char == "l"))

num_params = sum(
    print(p.numel()) for p in model.parameters() if p.requires_grad# generator expression
    # -
)
print("Total number of trainable model parameters:", num_params)