In [13]:
import ncd
from ncd import shape

## Dot-Product Attention
Using *Neural Circuit Diagrams* we can represent dot-product attention by;

<img src="Graphics/attention.png" width="700">

Each vertical section of a diagram corresponds to a shape. Columns either represent the data type we are working with, or an operation between data types. Solid lines represent axes and dashed lines separating terms represent Cartesian products. Placing a solid line adjacent to an operation, without separation, lifts it. Wirings represent Einstein operations, which includes linear contractions and rearrangements.

In [14]:
def display_columns(target: ncd.Shape, name = None):
    columns = '\n'.join([
        f'Domain:   {target.dom}',
        *(f'Morphism: {x}\n' + 
          f'Object:   {x.cod}' 
        for x in ncd.Composed.get_content(target))
    ])
    if name:
        columns = name + '\n' + columns
    return columns

In [15]:
from ncd import Duplicate
from ncd.nn import Einops, Linear, Addition, SoftMax
from itertools import starmap

L = Linear

x = shape('x')
m = shape('m^')
# Copying is implicit
linears = x >> (m @ (L('q') + L('k') + L('v')) @ 'k^')
# The axis names for einops are simply used as tags, which are attached
# to configurations.
ein1 = Einops('y k, x k -> y x')
softmax = x >> SoftMax()
ein2 = Einops('y x, x k -> y k')
linOut = L('o') @ m

# Note, the printout will contain tagged axes. These are configured
# upon composition.
section_names = ['Linears', 'Einops', 'SoftMax', 'Einops', 'Linear']
sections = [linears, ein1, softmax, ein2, linOut]
print("Printout of Individual Sections;")
print('\n'.join(starmap(display_columns, zip(sections, section_names))))

Printout of Individual Sections;
Linears
Domain:   [[36m[4mx[0m [36m[4mm[0m→]
Morphism: [[36m[4mx[0m [36m[4mm[0m→Δ3]
Object:   ([[36m[4mx[0m [36m[4mm[0m→], [[36m[4mx[0m [36m[4mm[0m→], [[36m[4mx[0m [36m[4mm[0m→])
Morphism: ([[36m[4mx[0m→Lq], [[36m[4mx[0m→Lk], [[36m[4mx[0m→Lv])
Object:   ([[36m[4mx[0m [36m[4mk[0m→], [[36m[4mx[0m [36m[4mk[0m→], [[36m[4mx[0m [36m[4mk[0m→])
Einops
Domain:   ([[34my[0m=[33my.3B[0m [34mk[0m=[33mk.1B[0m→], [[34mx[0m=[33mx.14[0m [34mk[0m=[33mk.1B[0m→])
Morphism: [32m[4my k, x k -> y x[0m
Object:   [[34my[0m=[33my.3B[0m [34mx[0m=[33mx.14[0m→]
SoftMax
Domain:   [[36m[4mx[0m [34m*[0m=[33m*.85[0m→]
Morphism: [[36m[4mx[0m→[32m[4m◁[0m]
Object:   [[36m[4mx[0m [34m*[0m=[33m*.85[0m→]
Einops
Domain:   ([[34my[0m=[33my.88[0m [34mx[0m=[33mx.77[0m→], [[34mx[0m=[33mx.77[0m [34mk[0m=[33mk.E1[0m→])
Morphism: [32m[4my x, x k -> y k[0m
Object:   [[34my[0m=

In [16]:
attention = linears @ (ein1 @ softmax + '*') @ ein2 @ linOut
print("\nPrintout of Composed Expression;")
print(display_columns(attention))


Printout of Composed Expression;
Domain:   [[36m[4mx[0m [36m[4mm[0m→]
Morphism: [[36m[4mx[0m [36m[4mm[0m→Δ3]
Object:   ([[36m[4mx[0m [36m[4mm[0m→], [[36m[4mx[0m [36m[4mm[0m→], [[36m[4mx[0m [36m[4mm[0m→])
Morphism: ([[36m[4mx[0m→Lq], [[36m[4mx[0m→Lk], [[36m[4mx[0m→Lv])
Object:   ([[36m[4mx[0m [36m[4mk[0m→], [[36m[4mx[0m [36m[4mk[0m→], [[36m[4mx[0m [36m[4mk[0m→])
Morphism: ([32m[4my k, x k -> y x[0m, [[36m[4mx[0m [36m[4mk[0m→])
Object:   ([[36m[4mx[0m [36m[4mx[0m→], [[36m[4mx[0m [36m[4mk[0m→])
Morphism: ([[36m[4mx[0m→[32m[4m◁[0m], [[36m[4mx[0m [36m[4mk[0m→])
Object:   ([[36m[4mx[0m [36m[4mx[0m→], [[36m[4mx[0m [36m[4mk[0m→])
Morphism: [32m[4my x, x k -> y k[0m
Object:   [[36m[4mx[0m [36m[4mk[0m→]
Morphism: Lo
Object:   [[36m[4mm[0m→]


In [17]:
# We can use the marches package to disassemble an algebraic expression into
#   a graph, and to then compile it into code. Currently, PyTorch is supported.
import ncd.marches

# We use a functor which remaps objects to make them configurable.
make_configurable = ncd.DictFunctor({shape(a): ncd.Conf(a) for
    a in ['x', 'm', 'k']})

# See if our functor worked ie succesfully mapped set objects to
# configurable objects.
print(display_columns(make_configurable(attention)))

Domain:   [[34mx[0m=[33mx.60[0m [34mm[0m=[33mm.88[0m→]
Morphism: [[34mx[0m=[33mx.60[0m [34mm[0m=[33mm.88[0m→Δ3]
Object:   ([[34mx[0m=[33mx.60[0m [34mm[0m=[33mm.88[0m→], [[34mx[0m=[33mx.60[0m [34mm[0m=[33mm.88[0m→], [[34mx[0m=[33mx.60[0m [34mm[0m=[33mm.88[0m→])
Morphism: ([[34mx[0m=[33mx.60[0m→Lq], [[34mx[0m=[33mx.60[0m→Lk], [[34mx[0m=[33mx.60[0m→Lv])
Object:   ([[34mx[0m=[33mx.60[0m [34mk[0m=[33mk.20[0m→], [[34mx[0m=[33mx.60[0m [34mk[0m=[33mk.20[0m→], [[34mx[0m=[33mx.60[0m [34mk[0m=[33mk.20[0m→])
Morphism: ([32m[4my k, x k -> y x[0m, [[34mx[0m=[33mx.60[0m [34mk[0m=[33mk.20[0m→])
Object:   ([[34mx[0m=[33mx.60[0m [34mx[0m=[33mx.60[0m→], [[34mx[0m=[33mx.60[0m [34mk[0m=[33mk.20[0m→])
Morphism: ([[34mx[0m=[33mx.60[0m→[32m[4m◁[0m], [[34mx[0m=[33mx.60[0m [34mk[0m=[33mk.20[0m→])
Object:   ([[34mx[0m=[33mx.60[0m [34mx[0m=[33mx.60[0m→], [[34mx[0m=[33mx.60[0m [34mk[0

In [18]:
# It did! So we can compile it, with a correct __init__ function.
# 'Multilinear' is found in ncd.torch_utilities
print(ncd.marches.to_torch(make_configurable(attention), "Attention"))

class Attention(nn.Module):
    def __init__(self, k, m, x):
        self.Lq = Multilinear((m),(k))
        self.Lk = Multilinear((m),(k))
        self.Lv = Multilinear((m),(k))
        self.Lo = Multilinear((x, k),(m))
    def forward(self, a):
        a, b, c = a, a, a
        a = Lq(a)
        b = Lk(b)
        c = Lv(c)
        a = einops.einsum(a, b, "y k, x k -> y x")
        a = torch.softmax(a, dim=-1)
        a = einops.einsum(a, c, "y x, x k -> y k")
        a = Lo(a)
        return a


## Multi-Head Dot Product Attention
We represent the more intricate multi-head dot-product attention by;

<img src="Graphics/multihead.png" width="700">

Compared to single-head dot product attention above, this diagram has an additional $h$ axis. The linear layers output data of size ``k h``, there is additional wiring for the Einops, and the SoftMax is lifted below. We can implement these changes using our algebraic tools.

In [19]:
# We will first construct Multi-Head Attention without configurables
# (as above), then show a construction using configurables all the
# way through.

from ncd import Duplicate, shape
from ncd.nn import Einops, Linear, Addition, SoftMax

L = Linear
# The lower-most axis represents R^m
m = shape('m^')
# x represents the number of tokens. It is an integer.
x = shape('x')
# Addition assembles shapes into tuples.
linears = x >> (m @ (L('q') + L('k') + L('v')) @ 'k h')
# The first Einops contracts the k dimension.
# We tuple it with a generic shape '*' to leave the values
# tuple segment unchanged.
einops = Einops('q k h, x k h -> q x h')
softmax = x >> SoftMax() << 'h'
einops2 = Einops('q x h, x k h -> q k h')
linout = shape('k h') @ L('o') @ 'm'

multihead = linears @ (einops @ softmax + '*') @ einops2 @ linout
print(display_columns(multihead))

Domain:   [[36m[4mx[0m [36m[4mm[0m→]
Morphism: [[36m[4mx[0m [36m[4mm[0m→Δ3]
Object:   ([[36m[4mx[0m [36m[4mm[0m→], [[36m[4mx[0m [36m[4mm[0m→], [[36m[4mx[0m [36m[4mm[0m→])
Morphism: ([[36m[4mx[0m→Lq], [[36m[4mx[0m→Lk], [[36m[4mx[0m→Lv])
Object:   ([[36m[4mx[0m [36m[4mk[0m [36m[4mh[0m→], [[36m[4mx[0m [36m[4mk[0m [36m[4mh[0m→], [[36m[4mx[0m [36m[4mk[0m [36m[4mh[0m→])
Morphism: ([32m[4mq k h, x k h -> q x h[0m, [[36m[4mx[0m [36m[4mk[0m [36m[4mh[0m→])
Object:   ([[36m[4mx[0m [36m[4mx[0m [36m[4mh[0m→], [[36m[4mx[0m [36m[4mk[0m [36m[4mh[0m→])
Morphism: ([[36m[4mx[0m→[32m[4m◁[0m←[36m[4mh[0m], [[36m[4mx[0m [36m[4mk[0m [36m[4mh[0m→])
Object:   ([[36m[4mx[0m [36m[4mx[0m [36m[4mh[0m→], [[36m[4mx[0m [36m[4mk[0m [36m[4mh[0m→])
Morphism: [32m[4mq x h, x k h -> q k h[0m
Object:   [[36m[4mx[0m [36m[4mk[0m [36m[4mh[0m→]
Morphism: [[36m[4mx[0m→Lo]
Object:   [

In [20]:
make_configurable = ncd.DictFunctor({shape(a): ncd.Conf(a) for
    a in ['x', 'm', 'k', 'h']})
import ncd.marches
print(ncd.marches.to_torch(make_configurable(multihead), "MultiHeadAttention"))

class MultiHeadAttention(nn.Module):
    def __init__(self, h, k, m, x):
        self.Lq = Multilinear((m),(k, h))
        self.Lk = Multilinear((m),(k, h))
        self.Lv = Multilinear((m),(k, h))
        self.Lo = Multilinear((k, h),m)
    def forward(self, a):
        a, b, c = a, a, a
        a = Lq(a)
        b = Lk(b)
        c = Lv(c)
        a = einops.einsum(a, b, "q k h, x k h -> q x h")
        a = torch.softmax(a, dim=-2)
        a = einops.einsum(a, c, "q x h, x k h -> q k h")
        a = Lo(a)
        return a


In [21]:
# We can also use configurations all the way through, and analyse the individual
# sections. This shows how the expressions we use correspond with the diagram,
# and how the composition process automatically aligns the configurable axes.

from ncd import Duplicate, shape
from ncd.nn import Einops, Linear, Addition, SoftMax

# Multi-Headed Attention defined symbolically.
# We piece together individual sections.
# Upon composition, axes sizes are aligned!
L = Linear
m = shape('*m^')
x = shape('*x')

linears = x >> (m @ (L('q') + L('k') + L('v')) @ '*k *h')
einops = (Einops('q k h, x k h -> q x h') + '*')
softs = ((x >> SoftMax() << '*') + '*') @ Einops('q x h, x k h -> q k h')
linout = (shape('*k *h') @ L('o') @ m)

# Note, the printout will contain tagged axes. These are configured
# upon composition.
section_names = ['Linears', 'Einops', 'SoftMax + Einops', 'Linear']
sections = [linears, einops, softs, linout]
print("Printout of Individual Sections;")
print('\n'.join(starmap(display_columns, zip(sections, section_names))))

Printout of Individual Sections;
Linears
Domain:   [[34mx[0m=[33mx.4C[0m [34mm[0m=[33mm.A5[0m→]
Morphism: [[34mx[0m=[33mx.4C[0m [34mm[0m=[33mm.A5[0m→Δ3]
Object:   ([[34mx[0m=[33mx.4C[0m [34mm[0m=[33mm.A5[0m→], [[34mx[0m=[33mx.4C[0m [34mm[0m=[33mm.A5[0m→], [[34mx[0m=[33mx.4C[0m [34mm[0m=[33mm.A5[0m→])
Morphism: ([[34mx[0m=[33mx.4C[0m→Lq], [[34mx[0m=[33mx.4C[0m→Lk], [[34mx[0m=[33mx.4C[0m→Lv])
Object:   ([[34mx[0m=[33mx.4C[0m [34mk[0m=[33mk.2D[0m [34mh[0m=[33mh.36[0m→], [[34mx[0m=[33mx.4C[0m [34mk[0m=[33mk.2D[0m [34mh[0m=[33mh.36[0m→], [[34mx[0m=[33mx.4C[0m [34mk[0m=[33mk.2D[0m [34mh[0m=[33mh.36[0m→])
Einops
Domain:   ([[34mq[0m=[33mq.43[0m [34mk[0m=[33mk.63[0m [34mh[0m=[33mh.6C[0m→], [[34mx[0m=[33mx.E0[0m [34mk[0m=[33mk.63[0m [34mh[0m=[33mh.6C[0m→], [34m[0m=[33m.A8[0m)
Morphism: ([32m[4mq k h, x k h -> q x h[0m, [34m[0m=[33m.A8[0m)
Object:   ([[34mq[0m=[33mq.43[0

In [22]:
multihead = linears @ einops @ softs @ (shape('*k *h') @ L('o') @ m)

print(display_columns(multihead))

Domain:   [[34mx[0m=[33mx.E0[0m [34mm[0m=[33mm.A5[0m→]
Morphism: [[34mx[0m=[33mx.E0[0m [34mm[0m=[33mm.A5[0m→Δ3]
Object:   ([[34mx[0m=[33mx.E0[0m [34mm[0m=[33mm.A5[0m→], [[34mx[0m=[33mx.E0[0m [34mm[0m=[33mm.A5[0m→], [[34mx[0m=[33mx.E0[0m [34mm[0m=[33mm.A5[0m→])
Morphism: ([[34mx[0m=[33mx.E0[0m→Lq], [[34mx[0m=[33mx.E0[0m→Lk], [[34mx[0m=[33mx.E0[0m→Lv])
Object:   ([[34mx[0m=[33mx.E0[0m [34mk[0m=[33mk.D7[0m [34mh[0m=[33mh.6C[0m→], [[34mx[0m=[33mx.E0[0m [34mk[0m=[33mk.D7[0m [34mh[0m=[33mh.6C[0m→], [[34mx[0m=[33mx.E0[0m [34mk[0m=[33mk.D7[0m [34mh[0m=[33mh.6C[0m→])
Morphism: ([32m[4mq k h, x k h -> q x h[0m, [[34mx[0m=[33mx.E0[0m [34mk[0m=[33mk.D7[0m [34mh[0m=[33mh.6C[0m→])
Object:   ([[34mq[0m=[33mx.E0[0m [34mx[0m=[33mx.E0[0m [34mh[0m=[33mh.6C[0m→], [[34mx[0m=[33mx.E0[0m [34mk[0m=[33mk.D7[0m [34mh[0m=[33mh.6C[0m→])
Morphism: ([[34mx[0m=[33mx.E0[0m→[32m[4m◁[0m←

In [23]:
# We can use the "GetConfig" functor to accumulate the
# unassigned variables in its internal state. This allows
# us to quickly generate configuration parameters from an
# expression.
config = ncd.GetConfig()
config(multihead)
print(config.configs)

{[33mx.E0[0m, [33mh.6C[0m, [33mm.A5[0m, [33mk.D7[0m}


In [24]:
# Marches is a package for compiling code.
# Here, it generate code for multi-headed attention.
import ncd.marches

print(ncd.marches.to_torch(multihead, "MultiHeadAttention"))

class MultiHeadAttention(nn.Module):
    def __init__(self, x, h, m, k):
        self.Lq = Multilinear((m),(k, h))
        self.Lk = Multilinear((m),(k, h))
        self.Lv = Multilinear((m),(k, h))
        self.Lo = Multilinear((k, h),(m))
    def forward(self, a):
        a, b, c = a, a, a
        a = Lq(a)
        b = Lk(b)
        c = Lv(c)
        a = einops.einsum(a, b, "q k h, x k h -> q x h")
        a = torch.softmax(a, dim=-2)
        a = einops.einsum(a, c, "q x h, x k h -> q k h")
        a = Lo(a)
        return a
