# Torch Model

In [5]:
# From transformers.models.bert.modeling_bert.BertIntermediate
# Source: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py or https://huggingface.co/transformers/v2.5.0/_modules/transformers/modeling_bert.html (look for BertIntermediate there; the version below has been slightly modified)

import torch

class BertIntermediate(torch.nn.Module):
    """Feed-forward expansion layer: a linear projection followed by GELU.

    Args:
        config: Any object exposing ``hidden_size`` (input feature width) and
            ``intermediate_size`` (output feature width).
    """

    def __init__(self, config):
        super().__init__()
        in_features, out_features = config.hidden_size, config.intermediate_size
        # Keep the attribute name ``dense``: it determines the parameter names
        # ("dense.weight", "dense.bias") exposed by this module.
        self.dense = torch.nn.Linear(in_features, out_features)

    def forward(self, hidden_states):
        """Return ``gelu(dense(hidden_states))``."""
        projected = self.dense(hidden_states)
        return torch.nn.functional.gelu(projected)


# Finally, the model can be rewritten using functional torch APIs so that the test (shown further below) passes:

In [6]:
# torch_functional_bert.py

def bert_intermediate(hidden_states, *, parameters):
    """Functional form of the BERT intermediate layer: matmul, bias add, GELU.

    Args:
        hidden_states: Input tensor; its last dimension must match the first
            dimension of ``parameters.dense.weight``.
        parameters: Object exposing ``dense.weight`` and ``dense.bias``.
            The weight is used as the right-hand operand of ``@`` without
            transposition, so it has to be stored as (in_features, out_features).

    Returns:
        ``gelu(hidden_states @ weight + bias)``.
    """
    dense = parameters.dense
    projected = hidden_states @ dense.weight
    return torch.nn.functional.gelu(projected + dense.bias)


# Following TDD (Test-Driven Development), the first step is to write a test for the model:


In [7]:
def bert_intermediate(hidden_states, *, parameters):
    """Compute the BERT intermediate activation with plain tensor operators.

    Args:
        hidden_states: Input tensor multiplied on the left of the weight.
        parameters: Object exposing ``dense.weight`` (used un-transposed as
            the right operand of ``@``) and ``dense.bias``.

    Returns:
        The GELU of the biased projection.
    """
    weight, bias = parameters.dense.weight, parameters.dense.bias
    pre_activation = (hidden_states @ weight) + bias
    return torch.nn.functional.gelu(pre_activation)

In [8]:
import pytest
import torch
import transformers

import ttnn
import sys
#import os
#current_path = os.getcwd()
#print("the current path is: ", current_path)
sys.path.append("/home/dvartanians/tt-metal-v0.41.0/tt-metal/models/experimental/functional_bert/reference")
import torch_functional_bert # implemented here: https://github.com/tenstorrent-metal/tt-metal/blob/main/models/experimental/functional_bert/reference/torch_functional_bert.py

from models.utility_functions import torch_random
from tests.ttnn.utils_for_testing import assert_with_pcc

@pytest.mark.parametrize("model_name", ["phiyodr/bert-large-finetuned-squad2"])
@pytest.mark.parametrize("batch_size", [1])
@pytest.mark.parametrize("sequence_size", [384])
def test_bert_intermediate(model_name, batch_size, sequence_size):
    """Compare the functional torch ``bert_intermediate`` with the Hugging Face module.

    The Hugging Face ``BertIntermediate`` built from ``model_name``'s config
    provides the golden output; the functional implementation from
    ``torch_functional_bert`` must match it with a PCC of at least 0.9999.
    """
    torch.manual_seed(0)

    config = transformers.BertConfig.from_pretrained(model_name)
    reference_model = transformers.models.bert.modeling_bert.BertIntermediate(config).eval()

    input_shape = (batch_size, sequence_size, config.hidden_size)
    torch_hidden_states = torch_random(input_shape, -0.1, 0.1, dtype=torch.float32)
    golden_output = reference_model(torch_hidden_states)

    # preprocess_model_parameters lives in ttnn.model_preprocessing; it is
    # imported here, so the name is local to this test function.
    from ttnn.model_preprocessing import preprocess_model_parameters

    parameters = preprocess_model_parameters(
        initialize_model=lambda: reference_model,  # Function to initialize the model
        convert_to_ttnn=lambda *_: False,  # Keep the weights as torch tensors
    )

    functional_output = torch_functional_bert.bert_intermediate(
        torch_hidden_states,
        parameters=parameters,
    )

    assert_with_pcc(golden_output, functional_output, 0.9999)

# Step 2 - Switching to ttnn ops

In [9]:
import pytest
import torch
import transformers

import ttnn
sys.path.append("/home/dvartanians/tt-metal-v0.41.0/tt-metal/models/experimental/functional_bert/tt")
import ttnn_functional_bert

from models.utility_functions import torch_random
from tests.ttnn.utils_for_testing import assert_with_pcc

@pytest.mark.parametrize("model_name", ["phiyodr/bert-large-finetuned-squad2"])
@pytest.mark.parametrize("batch_size", [1])
@pytest.mark.parametrize("sequence_size", [384])
def test_bert_intermediate(device, model_name, batch_size, sequence_size):
    """Compare the ttnn ``bert_intermediate`` with the Hugging Face module.

    Args:
        device: Device handle the parameters and the input are moved to
            (expected to be supplied as a pytest fixture).
        model_name: Hugging Face model id whose config defines the layer sizes.
        batch_size: Batch dimension of the random input.
        sequence_size: Sequence dimension of the random input.

    The torch module produces the golden output; the ttnn implementation runs
    in bfloat16, so the PCC threshold (0.999) is looser than for the pure
    torch comparison.
    """
    # Import locally so this test does not depend on a module-level name that
    # no earlier cell defines (the earlier test only imports it inside its own
    # function body).
    from ttnn.model_preprocessing import preprocess_model_parameters

    torch.manual_seed(0)

    config = transformers.BertConfig.from_pretrained(model_name)
    model = transformers.models.bert.modeling_bert.BertIntermediate(config).eval()

    torch_hidden_states = torch_random((batch_size, sequence_size, config.hidden_size), -0.1, 0.1)
    # Golden output is computed before the model is cast to bfloat16 below.
    torch_output = model(torch_hidden_states)

    parameters = preprocess_model_parameters(
        initialize_model=lambda: model.to(torch.bfloat16),
        device=device,  # Device to put the parameters on
    )

    # Host torch tensor -> ttnn bfloat16 -> tile layout -> device.
    hidden_states = ttnn.from_torch(torch_hidden_states, dtype=ttnn.bfloat16)
    hidden_states = ttnn.to_layout(hidden_states, ttnn.TILE_LAYOUT)
    hidden_states = ttnn.to_device(hidden_states, device)
    output = ttnn_functional_bert.bert_intermediate(
        hidden_states,
        parameters=parameters,
    )

    # Device -> host -> row-major layout -> torch tensor for the comparison.
    output = ttnn.from_device(output)
    output = ttnn.to_layout(output, ttnn.ROW_MAJOR_LAYOUT)
    output = ttnn.to_torch(output)

    assert_with_pcc(torch_output, output.to(torch_output.dtype), 0.999)

# Then implement the function using ttnn operations:


In [10]:
# ttnn_functional_bert.py

import ttnn

def bert_intermediate(
    hidden_states,
    *,
    parameters,
):
    """BERT intermediate layer expressed with ttnn operations.

    Args:
        hidden_states: Input tensor, used as the left operand of ``@``.
        parameters: Object exposing ``dense.weight`` (right operand of ``@``,
            used without transposition) and ``dense.bias``.

    Returns:
        ``ttnn.gelu(hidden_states @ weight + bias)``.
    """
    dense = parameters.dense
    projected = hidden_states @ dense.weight
    biased = projected + dense.bias
    return ttnn.gelu(biased)


# Step 3 - Optimizing the model

In [11]:
# ttnn_optimized_functional_bert.py

import ttnn

def bert_intermediate(
    hidden_states,
    *,
    parameters,
    num_cores_x=12,
):
    """BERT intermediate layer as a single fused ``ttnn.linear`` call.

    Args:
        hidden_states: Input tensor; its first dimension is taken as the
            batch size and used as the first entry of the core grid.
        parameters: Object exposing ``dense.weight`` and ``dense.bias``.
        num_cores_x: Second entry of the manual core grid. Defaults to 12,
            the value this function previously hard-coded.

    Returns:
        The output of ``ttnn.linear`` with the GELU activation fused in.
    """
    batch_size, *_ = hidden_states.shape

    output = ttnn.linear(
        hidden_states,
        parameters.dense.weight,
        bias=parameters.dense.bias,
        memory_config=ttnn.L1_MEMORY_CONFIG,  # Put the output into local core memory
        core_grid=(batch_size, num_cores_x),  # Specify manual core grid to get the best possible performance
        activation="gelu",  # Fuse Gelu
    )
    return output

# More examples

# Conv example

In [12]:
import torch
import torch.nn.functional as F

In [13]:
# Conv functional in pytorch
def convolution_layer(input_tensor, weight, bias, kernel_size=3, stride=1, padding=0):
    """Apply a 2D convolution with bias using ``torch.nn.functional.conv2d``.

    Args:
        input_tensor: Input whose dimension 1 holds the input channels.
        weight: Convolution weight whose dimension 1 holds the input channels.
        bias: Bias passed straight to ``F.conv2d``.
        kernel_size: Accepted for interface symmetry but not used; the
            effective kernel size is whatever ``weight``'s shape dictates.
        stride: Stride forwarded to ``F.conv2d``.
        padding: Padding forwarded to ``F.conv2d``.

    Returns:
        The convolution result.

    Raises:
        ValueError: If the input and the weight disagree on the number of
            input channels.
    """
    channels_in_input = input_tensor.size(1)
    channels_in_weight = weight.size(1)
    if channels_in_input != channels_in_weight:
        raise ValueError("Input tensor and weight should have the same number of input channels")
    return F.conv2d(input_tensor, weight, bias=bias, stride=stride, padding=padding)

In [14]:
# Example usage: run the torch reference convolution on random data.

# Problem dimensions
batch_size, input_channels, output_channels = 4, 3, 16
input_height, input_width = 1056, 160

# Random input, 3x3 kernel weight and per-output-channel bias
# (drawn in this order: input, weight, bias).
input_tensor = torch.randn((batch_size, input_channels, input_height, input_width))
weight = torch.randn((output_channels, input_channels, 3, 3))
bias = torch.randn((output_channels,))

# The kernel size is implied by the weight's shape, so only stride and
# padding are passed explicitly.
conv_result = convolution_layer(input_tensor, weight, bias, stride=1, padding=1)

# Report the shape of the result
print("Convolution Result Shape:", conv_result.shape)


Convolution Result Shape: torch.Size([4, 16, 1056, 160])


In [15]:
# Switching to TTNN ops
import ttnn

# Open device 0 and keep the handle in `device`.
device_id = 0
device = ttnn.open(device_id)
# Will need to add the following for CNN model parameters too!
#from ttnn.model_preprocessing import preprocess_model_parameters

# Weight and bias are only converted to ttnn bfloat16 tensors here; the
# tile-layout / to-device steps for the weight are intentionally left disabled:
#weight = ttnn.to_layout(weight, ttnn.TILE_LAYOUT)
#weight = ttnn.to_device(weight, device)
weight = ttnn.from_torch(weight, dtype=ttnn.bfloat16)
bias = ttnn.from_torch(bias, dtype=ttnn.bfloat16)

# The input goes through the full pipeline: bfloat16 -> tile layout -> device.
input_tensor = ttnn.to_device(
    ttnn.to_layout(
        ttnn.from_torch(input_tensor, dtype=ttnn.bfloat16),
        ttnn.TILE_LAYOUT,
    ),
    device,
)




                  Metal | INFO     | Initializing device 0
                 Device | INFO     | Opening user mode device driver
2024-01-23 20:20:58.551 | INFO     | SiliconDriver   - Detected 1 PCI device : {0}
2024-01-23 20:20:58.596 | INFO     | SiliconDriver   - Using 1 Hugepages/NumHostMemChannels for TTDevice (logical_device_id: 0 pci_interface_id: 0 device_id: 0xfaca revision: 0)
2024-01-23 20:20:58.719 | INFO     | SiliconDriver   - Disable PCIE DMA
                  Metal | INFO     | AI CLK for device 0 is:   1202 MHz


In [19]:
# Define and apply the TTNN conv function, then compare it with the torch reference.
import sys
sys.path.append("/home/dvartanians/tt-metal-v0.41.0/tt-metal/models/experimental/functional_unet/reference")
from conv import Conv2D


def convolution_layer_ttnn(input_tensor, weight, bias, kernel_size=3, stride=1, padding=0):
    """Run a 2D convolution through the ``Conv2D`` helper imported above.

    The layer is built from the arguments of this call, so ``stride``,
    ``padding``, ``weight`` and ``bias`` are actually honoured instead of
    being taken from a module-level layer object.

    Args:
        input_tensor: Input whose dimension 1 holds the input channels.
        weight: Convolution weight; dimension 0 is taken as the number of
            output channels and dimension 1 as the number of input channels.
        bias: Bias handed to ``Conv2D``.
        kernel_size: Kernel size handed to ``Conv2D``.
        stride: Stride handed to ``Conv2D``.
        padding: Padding handed to ``Conv2D``.

    Returns:
        Whatever the ``Conv2D`` layer returns for ``input_tensor``.

    Raises:
        ValueError: If the input and the weight disagree on the number of
            input channels.
    """
    # `.shape[...]` is used rather than `.size(...)` so the check does not rely
    # on a torch-only tensor method.
    if input_tensor.shape[1] != weight.shape[1]:
        raise ValueError("Input tensor and weight should have the same number of input channels")
    # NOTE(review): the keyword names mirror the original Conv2D(...) call in
    # this cell; confirm them against conv.py.
    conv_layer = Conv2D(
        in_channels=weight.shape[1],
        out_channels=weight.shape[0],
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        bias=bias,
        weight=weight,
    )
    return conv_layer(input_tensor)


# Use the same stride/padding as the torch reference call (stride=1, padding=1)
# so both results have the same shape.
conv_result_ttnn = convolution_layer_ttnn(input_tensor, weight, bias, kernel_size=3, stride=1, padding=1)

# Device -> host -> row-major layout -> torch tensor for the comparison.
conv_result_ttnn = ttnn.from_device(conv_result_ttnn)
conv_result_ttnn = ttnn.to_layout(conv_result_ttnn, ttnn.ROW_MAJOR_LAYOUT)
conv_result_ttnn = ttnn.to_torch(conv_result_ttnn)

# Compare in the dtype of the torch reference result.
assert_with_pcc(conv_result, conv_result_ttnn.to(conv_result.dtype), 0.999)

ModuleNotFoundError: No module named 'tt_eager.tt_dnn.op_library.sliding_window_op_infra.tt_py_composite_conv'