In [None]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
import keras
from keras import layers
torch.set_printoptions(precision=8)

root_url = "https://raw.githubusercontent.com/hfawaz/cd-diagram/master/FordA/"

In [None]:
class FordDataset(Dataset):
  def __init__(self, split="train"):
    self.root_url = "https://raw.githubusercontent.com/hfawaz/cd-diagram/master/FordA/"
    self.data = torch.tensor(np.loadtxt(self.root_url + "FordA_TRAIN.tsv", delimiter="\t"), dtype=torch.float32) if split=="train" else torch.tensor(np.loadtxt(self.root_url + "FordA_TEST.tsv", delimiter="\t"), dtype=torch.float32)
    self.labels = self.data[:, 0] # get first element from each example
    self.sequences = self.data[:, 1:] # get all elements after first element
    self.labels[self.labels == -1] = 0 # change all -1 labels to 0
    self.num_classes = len(torch.unique(self.labels)) # count the number of unique labels

  def __len__(self):
      return self.data.shape[0]

  def __getitem__(self, idx):
    sequence = torch.reshape(self.sequences[idx], (-1, 1)) # dim: seq_len x num_features
    label = torch.reshape(self.labels[idx], (-1, )) # dim: 1 x 1

    return sequence, label

train_dataset = FordDataset("train")
test_dataset = FordDataset("test")

In [None]:
embed_size=256 # size of the embeddings
num_heads=4 # number of attention heads
ff_dim=4 # dimension of the feedforward layer in the encoder
num_transformer_blocks=4 # number of encoder blocks
mlp_units=[128] # the size of the feedforward layer used to make predictions
mlp_dropout=0.4 # dropout in the feedforward layer
dropout=0.25 # dropout in the encoder

In [None]:
def keras_encoder(inputs, embed_size, num_heads, ff_dim, dropout=0):
    # Attention and Normalization
    x = layers.MultiHeadAttention(
        key_dim=embed_size, num_heads=num_heads, dropout=dropout
    )(inputs, inputs)
    x = layers.Dropout(dropout)(x)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    res = x + inputs

    # Feed Forward Part
    x = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(res)
    x = layers.Dropout(dropout)(x)
    x = layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(x)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    return x + res



In [None]:
def pytorch_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
  embedding = nn.Linear(1, head_size)
  x = embedding(inputs)
  x = nn.MultiheadAttention(head_size, num_heads, dropout=0.0)(x, x, x)[0]
  x = nn.Linear(head_size, 1)(x)
  x = nn.Dropout(dropout)(x)
  x = nn.LayerNorm(normalized_shape=x.shape[2], eps=1e-6)(x)
  res = x + inputs

  x = nn.Conv1d(in_channels=x.shape[1], out_channels=ff_dim, kernel_size=1)(res)
  x = nn.ReLU()(x)
  x = nn.Dropout(dropout)(x)
  print(x.shape)
  x = nn.Conv1d(in_channels=x.shape[1], out_channels=inputs.shape[-1], kernel_size=1)(x)
  x = nn.LayerNorm(normalized_shape=x.shape[2], eps=1e-6)(x)


  return x + res

In [None]:
def pytorch_encoder2(inputs, head_size, num_heads, ff_dim, dropout=0):
  # input is 500x1 i.e. each element in the sequence has 1 feature. therefore in_features for nn.Linear is 1. we project this single feature to {head_size} dimensions
  embedding = nn.Linear(in_features=1, out_features=head_size)
  # input shape is (B x 500 x 1) where B is batch_size
  x = embedding(inputs) # embedding projects to (B x 500 x 256)
  x = nn.MultiheadAttention(embed_dim=head_size, num_heads=num_heads, dropout=0.0)(x, x, x)[0] # keeps (B x 500 x 256)
  x = nn.Linear(head_size, 1)(x) # projects back to (B x 500 x 1)
  x = nn.Dropout(dropout)(x) # (B x 500 x 1)
  x = nn.LayerNorm(normalized_shape=x.shape[2], eps=1e-6)(x) # (B x 500 x 1)
  res = x + inputs

  # feedforward layer projects to (B x ff_dim x 1)
  res = res.reshape(res.shape[0], res.shape[2], res.shape[1]) # (B x 1 x 500)

  x = nn.Conv1d(in_channels=res.shape[1], out_channels=ff_dim, kernel_size=1)(res) # (B x ff_dim x 500)
  x = nn.ReLU()(x) # (B x ff_dim x 500)
  x = nn.Dropout(dropout)(x) # (B x ff_dim x 500)
  x = nn.Conv1d(in_channels=x.shape[1], out_channels=inputs.shape[-1], kernel_size=1)(x) # (B x 1 x 500)
  x = nn.LayerNorm(normalized_shape=x.shape[2], eps=1e-6)(x) # (B x 1 x 500)
  x = x + res
  return x.reshape(inputs.shape[0], inputs.shape[1], inputs.shape[2]) # (B x 500 x 1)

# attention

In [None]:
import pandas as pd
import altair as alt
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR

In [None]:
def clones(module, N):
    "Produce N independent but identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [None]:
pt_attn = nn.MultiheadAttention(1, 1, dropout=0.0)

pt_attn(xp, xp, xp)[0].shape

torch.Size([1, 500, 1])

In [None]:
xp.shape

torch.Size([2, 500, 1])

In [None]:
xp_transform = nn.Linear(1, 256)(xp)

In [None]:
xp_transform.shape

torch.Size([64, 500, 256])

In [None]:
lin1 = nn.Linear(256, 256)

In [None]:
lin1(xp_transform).shape

torch.Size([2, 500, 256])

In [None]:
linear_layers = clones(lin1, 4)

In [None]:
linear_layers

ModuleList(
  (0-3): 4 x Linear(in_features=256, out_features=256, bias=True)
)

In [None]:
query, key, value = [
  lin(x)
  for lin, x in zip(linear_layers, (xp_transform, xp_transform, xp_transform))
]

In [None]:
query.shape

torch.Size([2, 500, 256])

In [None]:
key.shape

torch.Size([2, 500, 256])

In [None]:
value.shape

torch.Size([2, 500, 256])

In [None]:
def attention(query, key, value, mask=None, dropout=None):
  d_k = query.size(-1)
  scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
  if mask is not None:
      scores = scores.masked_fill(mask == 0, -1e9)
  p_attn = scores.softmax(dim=-1)
  if dropout is not None:
      p_attn = dropout(p_attn)
  return torch.matmul(p_attn, value), p_attn

In [None]:
att = attention(query, key, value)

In [None]:
att[0].shape

torch.Size([2, 500, 256])

In [None]:
final_layer = nn.Linear(4, 1)

In [None]:
linear_layers[-1](att[0]).shape

torch.Size([2, 500, 256])

In [None]:
query.shape

torch.Size([1, 500, 1])

In [None]:
key.shape

torch.Size([1, 500, 1])

In [None]:
key.transpose(-2, -1).shape

torch.Size([1, 1, 500])

In [None]:
scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(key.shape[-1])
scores.shape

torch.Size([1, 500, 500])

In [None]:
scores = scores.softmax(-1)

In [None]:
att = torch.matmul(scores, value)

In [None]:
att.shape

torch.Size([1, 500, 1])

In [None]:
scores[0][0:5]

tensor([[0.13987494, 0.16932230, 0.24462357,  ..., 0.71016347, 0.70146340,
         0.66761613],
        [0.16932230, 0.20496909, 0.29612327,  ..., 0.85967153, 0.84913993,
         0.80816686],
        [0.24462357, 0.29612327, 0.42781571,  ..., 1.24198604, 1.22677076,
         1.16757619],
        [0.36374766, 0.44032609, 0.63614863,  ..., 1.84679461, 1.82416999,
         1.73614943],
        [0.51167548, 0.61939669, 0.89485574,  ..., 2.59784389, 2.56601810,
         2.44220161]], grad_fn=<SliceBackward0>)

# remaining code

In [None]:
class PtMultiheadAttention(nn.Module):
  def __init__(self, head_size, num_heads, dropout=0.1):
    super(PtMultiheadAttention, self).__init__()
    assert head_size % num_heads == 0

    self.d_k = head_size // num_heads
    self.weight_matrices = clones(nn.Linear(head_size, head_size), 4)
    self.attn = None
    if dropout > 0:
      self.dropout = nn.Dropout(dropout)

  def _attention(self, query, key, value, mask=None, dropout=None):
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)
    # if mask is not None:
    #   scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = scores.softmax(dim=-1)
    # if dropout is not None:
    #   p_attn = self.dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

  def forward(self, query, key, value):

    # get q, k and v
    query, key, value = [
      weights(inputs)
      for weights, inputs in zip(self.weight_matrices, (query, key, value))
    ]

    # calculate attention
    x, self.attn = self._attention(query, key, value)

    return self.weight_matrices[-1](x)


In [None]:
m_att = PtMultiheadAttention(256, 4)

In [None]:
m_att(xp_transform).shape

TypeError: PtMultiheadAttention.forward() missing 2 required positional arguments: 'key' and 'value'

In [None]:
class PytorchEncoder(nn.Module):
  def __init__(self, inputs, head_size, num_heads, ff_dim, dropout=0):
    super(PytorchEncoder, self).__init__()
    # attention
    self.embedding = nn.Linear(in_features=inputs.shape[-1], out_features=head_size)
    self.attention = nn.MultiheadAttention(head_size, num_heads, dropout=0.0)
    self.linear1 = nn.Linear(head_size, 1)
    self.dropout1 = nn.Dropout(dropout)
    self.layer_norm1 = nn.LayerNorm(normalized_shape=inputs.shape[-1], eps=1e-6)

    # feedforward
    self.conv1 = nn.Conv1d(in_channels=inputs.shape[1], out_channels=ff_dim, kernel_size=1)
    self.relu1 = nn.ReLU()
    self.dropout2 = nn.Dropout(dropout)
    self.conv2 = nn.Conv1d(in_channels=ff_dim, out_channels=inputs.shape[-1], kernel_size=1)
    self.layer_norm2 = nn.LayerNorm(normalized_shape=inputs.shape[2], eps=1e-6)


  def forward(self, src):
    x = self.embedding(src)
    x = self.attention(x, x, x)[0]
    x = self.linear1(x)
    x = self.dropout1(x)
    x = self.layer_norm1(x)

    res = x + src

    # res = res.reshape(res.shape[0], res.shape[2], res.shape[1])
    x = self.conv1(res)
    x = self.relu1(x)
    x = self.dropout2(x)
    x = self.conv2(x)
    x = self.layer_norm2(x)

    return x + res


In [None]:
class PytorchEncoder2(nn.Module):
  def __init__(self, inputs, embed_size, num_heads, ff_dim, dropout=0):
    super(PytorchEncoder2, self).__init__()
    # attention
    self.embedding = nn.Linear(in_features=inputs.shape[-1], out_features=embed_size)
    self.attention = nn.MultiheadAttention(embed_size, num_heads, dropout=0.0)
    self.linear1 = nn.Linear(embed_size, 1)
    self.dropout1 = nn.Dropout(dropout)
    self.layer_norm1 = nn.LayerNorm(normalized_shape=inputs.shape[-1], eps=1e-6)

    # feedforward
    self.conv1 = nn.Conv1d(in_channels=inputs.shape[-1], out_channels=ff_dim, kernel_size=1)
    self.relu1 = nn.ReLU()
    self.dropout2 = nn.Dropout(dropout)
    self.conv2 = nn.Conv1d(in_channels=ff_dim, out_channels=inputs.shape[-1], kernel_size=1)
    self.layer_norm2 = nn.LayerNorm(normalized_shape=inputs.shape[1], eps=1e-6)


  def forward(self, src):
    x = self.embedding(src)
    x = self.attention(x, x, x)[0]
    x = self.linear1(x)
    x = self.dropout1(x)
    x = self.layer_norm1(x)

    res = x + src
    res = res.reshape(res.shape[0], res.shape[2], res.shape[1])

    x = self.conv1(res)
    x = self.relu1(x)
    x = self.dropout2(x)
    x = self.conv2(x)
    x = self.layer_norm2(x)
    x = x + res

    return x.reshape(x.shape[0], x.shape[-1], x.shape[1])


In [None]:
class PytorchEncoder3(nn.Module):
  def __init__(self, inputs, embed_size, num_heads, ff_dim, dropout=0):
    super(PytorchEncoder3, self).__init__()
    # attention
    self.embedding = nn.Linear(in_features=inputs.shape[-1], out_features=embed_size)
    self.attention = PtMultiheadAttention(embed_size, num_heads, dropout=0.0)
    self.linear1 = nn.Linear(embed_size, 1)
    self.dropout1 = nn.Dropout(dropout)
    self.layer_norm1 = nn.LayerNorm(normalized_shape=inputs.shape[-1], eps=1e-6)

    # feedforward
    self.conv1 = nn.Conv1d(in_channels=inputs.shape[-1], out_channels=ff_dim, kernel_size=1)
    self.relu1 = nn.ReLU()
    self.dropout2 = nn.Dropout(dropout)
    self.conv2 = nn.Conv1d(in_channels=ff_dim, out_channels=inputs.shape[-1], kernel_size=1)
    self.layer_norm2 = nn.LayerNorm(normalized_shape=inputs.shape[1], eps=1e-6)


  def forward(self, src):
    x = self.embedding(src)
    x = self.attention(x, x, x)[0]
    x = self.linear1(x)
    x = self.dropout1(x)
    x = self.layer_norm1(x)

    res = x + src
    res = res.reshape(res.shape[0], res.shape[2], res.shape[1])

    x = self.conv1(res)
    x = self.relu1(x)
    x = self.dropout2(x)
    x = self.conv2(x)
    x = self.layer_norm2(x)
    x = x + res

    return x.reshape(x.shape[0], x.shape[-1], x.shape[1])

In [None]:
g = pytorch_encoder(xp, head_size, num_heads, ff_dim, dropout)

In [None]:
h = keras_encoder(xt, head_size, num_heads, ff_dim, dropout)

In [None]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True, drop_last = True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=True, drop_last = True)

In [None]:
x,y = next(iter(train_dataloader))
xp = x
xt = np.array(x)

print(xt.shape)
print(xp.shape)

print(xt[0][0:3])
print(xp[0][0:3])

(64, 500, 1)
torch.Size([64, 500, 1])
[[-0.8186042]
 [-0.6492901]
 [-0.4733622]]
tensor([[-0.81860417],
        [-0.64929008],
        [-0.47336221]])


In [None]:
from collections import OrderedDict

class EncoderClassifier(nn.Module):
  def __init__(self, inputs, embed_size, num_heads, ff_dim, dropout=0, num_blocks=4):
    super(EncoderClassifier, self).__init__()
    encoder_layer = PytorchEncoder2(inputs=inputs, embed_size=embed_size, num_heads=num_heads, ff_dim=ff_dim, dropout=dropout)
    encoders = OrderedDict()
    for idx in range(num_blocks):
      encoders[f"encoder{idx}"] = encoder_layer
    self.encoder_block = nn.Sequential(encoders)
    self.avg = nn.AvgPool1d(kernel_size=1)
    self.dense1 = nn.Linear(500, mlp_units[0])
    self.relu1 = nn.ReLU()
    self.dropout1 = nn.Dropout(dropout)
    self.dense2 = nn.Linear(mlp_units[0], 2)
    self.softmax = nn.Softmax()

  def forward(self, x):
    x = self.encoder_block(x)
    x = torch.squeeze(self.avg(x), 2)
    x = self.dense1(x)
    x = self.relu1(x)
    x = self.dropout1(x)
    x = self.dense2(x)
    x = self.softmax(x)
    return x

In [None]:
cuda0 = torch.device('cuda:0')
model = EncoderClassifier(inputs=xp, embed_size=embed_size, num_heads=num_heads, ff_dim=ff_dim, dropout=dropout)
model.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [None]:
len(train_dataloader)

56

In [None]:
def train_one_epoch(epoch_index, model, criterion, optimizer):
    running_loss = 0.
    last_loss = 0.
    correct = 0
    iterations = 0
    for i, data in enumerate(train_dataloader):
      inputs, labels = data
      optimizer.zero_grad()
      if torch.cuda.is_available():
        inputs = inputs.cuda()
        labels = labels.cuda()
      outputs = model(inputs)
      loss = criterion(outputs, labels.to(torch.long).reshape(-1))
      loss.backward()
      optimizer.step()
      running_loss += loss.item()

      predictions = torch.argmax(outputs, axis=1)
      correct_labels = labels.squeeze()

      correct += (predictions == correct_labels).int().sum()/len(labels) * 100
      iterations += 1
    last_loss = running_loss / len(train_dataloader)
    acc = (correct / iterations)

    return last_loss, acc

In [None]:
from torch.utils.tensorboard import SummaryWriter

In [None]:
from datetime import datetime
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/encoder_trainer_{}'.format(timestamp))

In [None]:
epochs = 150

for epoch in range(epochs):
    print('EPOCH {}:'.format(epoch + 1))

    model.train(True)
    avg_loss, acc = train_one_epoch(epoch, model, criterion, optimizer)

    print(avg_loss)
    print(acc)

EPOCH 1:
0.7078477931874139
tensor(52.81808090, device='cuda:0')
EPOCH 2:
0.6663750793252673
tensor(59.20759201, device='cuda:0')
EPOCH 3:
0.6341418836797986
tensor(64.53683472, device='cuda:0')
EPOCH 4:
0.6114352707351957
tensor(68.33147430, device='cuda:0')
EPOCH 5:
0.5948727205395699
tensor(69.89397430, device='cuda:0')
EPOCH 6:
0.5821993468063218
tensor(71.54018402, device='cuda:0')
EPOCH 7:
0.5763263787542071
tensor(72.57254791, device='cuda:0')
EPOCH 8:
0.5652851579444749
tensor(73.29799652, device='cuda:0')
EPOCH 9:
0.5583260027425629
tensor(74.24665833, device='cuda:0')
EPOCH 10:
0.5512227517153535
tensor(75.47433472, device='cuda:0')
EPOCH 11:
0.5429055956857545
tensor(76.14397430, device='cuda:0')
EPOCH 12:
0.5331535908792701
tensor(77.95759583, device='cuda:0')
EPOCH 13:
0.5274207363171237
tensor(79.07366180, device='cuda:0')
EPOCH 14:
0.5248082838952541
tensor(78.57143402, device='cuda:0')
EPOCH 15:
0.52053167032344
tensor(79.15737152, device='cuda:0')
EPOCH 16:
0.516533245

In [None]:
acc = 0
iteration = 0
for data in test_dataloader:
  iteration += 1
  inputs, labels = data
  if torch.cuda.is_available():
    inputs = inputs.cuda()
    labels = labels.cuda()
  outputs = model(inputs)
  predictions = torch.argmax(outputs, axis=1)
  correct_labels = labels.squeeze().int()

  acc += (predictions == correct_labels).int().sum()/len(labels) * 100
print(acc/iteration)

tensor(77.57812500, device='cuda:0')


In [None]:
inputs, labels
if torch.cuda.is_available():
  inputs = inputs.cuda()
  labels = labels.cuda()
outputs = model(inputs, 4)


  return self._call_impl(*args, **kwargs)


In [None]:
inputs.shape

torch.Size([32, 500, 1])

In [None]:
labels.shape

torch.Size([32, 1])

In [None]:
outputs[0]

tensor([0.70790344, 0.29209656], device='cuda:0', grad_fn=<SelectBackward0>)

In [None]:
labels[0]

tensor([0.], device='cuda:0')

In [None]:
predictions

tensor([0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
        0, 1, 0, 1, 0, 0, 1, 0], device='cuda:0')

In [None]:
labels = labels.squeeze()

In [None]:
(predictions == labels).int().sum()/len(labels) * 100

tensor(56.25000000, device='cuda:0')

In [None]:
22/32

0.6875

In [None]:
try1 = EncoderClassifier(inputs=xp, embed_size=embed_size, num_heads=4, ff_dim=ff_dim, dropout=dropout, num_blocks=4)

In [None]:
try1.cuda()

EncoderClassifier(
  (encoder_block): Sequential(
    (encoder0): PytorchEncoder2(
      (embedding): Linear(in_features=1, out_features=256, bias=True)
      (attention): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
      )
      (linear1): Linear(in_features=256, out_features=1, bias=True)
      (dropout1): Dropout(p=0.25, inplace=False)
      (layer_norm1): LayerNorm((1,), eps=1e-06, elementwise_affine=True)
      (conv1): Conv1d(1, 4, kernel_size=(1,), stride=(1,))
      (relu1): ReLU()
      (dropout2): Dropout(p=0.25, inplace=False)
      (conv2): Conv1d(4, 1, kernel_size=(1,), stride=(1,))
      (layer_norm2): LayerNorm((500,), eps=1e-06, elementwise_affine=True)
    )
    (encoder1): PytorchEncoder2(
      (embedding): Linear(in_features=1, out_features=256, bias=True)
      (attention): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
  

In [None]:
try1.forward(xp)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)

In [None]:
!pip install torchsummaryX==1.1.0

Collecting torchsummaryX==1.1.0
  Downloading torchsummaryX-1.1.0-py3-none-any.whl (2.9 kB)
Installing collected packages: torchsummaryX
Successfully installed torchsummaryX-1.1.0


In [None]:
from torchsummaryX import summary
summary(try1, torch.zeros((1, 500, 1)).cuda())

In [None]:
for _ in range(num_transformer_blocks):
  xp = pt_model(xp)
xp = torch.squeeze(nn.AvgPool1d(kernel_size=1)(xp), 2)

for dim in mlp_units:
  pt_dense1 = nn.Linear(xp.shape[-1], dim)
  xp = pt_dense1(xp)
  xp = nn.ReLU()(xp)
  # xp = nn.Dropout(mlp_dropout)(xp)
pt_dense2 = nn.Linear(mlp_units[0], 2)
pytorch_outputs = pt_dense2(xp)
pytorch_outputs = nn.Softmax()(pytorch_outputs)

In [None]:
pt_weights1 = pt_dense1.weight.detach().numpy().transpose()
pt_bias1 = pt_dense1.bias.detach().numpy()

pt_weights2 = pt_dense2.weight.detach().numpy().transpose()
pt_bias2 = pt_dense2.bias.detach().numpy()


for _ in range(num_transformer_blocks):
  xt = keras_encoder(xt, head_size, num_heads, ff_dim, dropout)
xt = layers.GlobalAveragePooling1D(data_format="channels_first")(xt)
for dim in mlp_units:
  keras_dense = layers.Dense(dim, activation='relu')
  keras_dense(xt)
  keras_dense.set_weights([pt_weights1, pt_bias1])
  xt = keras_dense(xt)
  # xt = layers.Dropout(mlp_dropout)(xt)
keras_dense2 = layers.Dense(2, activation='softmax')
keras_dense2(xt)
keras_dense2.set_weights([pt_weights2, pt_bias2])
keras_outputs = keras_dense2(xt)

In [None]:
print(xt[0][0:5])
print(xp[0][0:5])

tf.Tensor([0.        1.2949885 0.        0.        0.       ], shape=(5,), dtype=float32)
tensor([0.00000000, 1.29498851, 0.00000000, 0.00000000, 0.00000000],
       grad_fn=<SliceBackward0>)


In [None]:
print(keras_outputs)
print(pytorch_outputs)

tf.Tensor([[0.46704075 0.53295934]], shape=(1, 2), dtype=float32)
tensor([[0.46704066, 0.53295940]], grad_fn=<SoftmaxBackward0>)


In [None]:
keras_loss = keras.losses.sparse_categorical_crossentropy(y, keras_outputs)
print(keras_loss)

tf.Tensor([0.6293102], shape=(1,), dtype=float32)


In [None]:
y

tensor([[1.]])

In [None]:
pt_loss = nn.CrossEntropyLoss()(pytorch_outputs, y.to(torch.long).reshape(-1))

In [None]:
pt_loss

tensor(0.66073090, grad_fn=<NllLossBackward0>)

# pytorch training

In [None]:
class pytorch_model(nn.Module):
  def __init__(self):
    super(pytorch_model, self).__init__()

  def forward(self, x):
    x = self.encoder(x)

# keras training

In [None]:
def readucr(filename):
    data = np.loadtxt(filename, delimiter="\t")
    y = data[:, 0]
    x = data[:, 1:]
    return x, y.astype(int)


root_url = "https://raw.githubusercontent.com/hfawaz/cd-diagram/master/FordA/"

x_train, y_train = readucr(root_url + "FordA_TRAIN.tsv")
x_test, y_test = readucr(root_url + "FordA_TEST.tsv")

x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1))
x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], 1))

n_classes = len(np.unique(y_train))

idx = np.random.permutation(len(x_train))
x_train = x_train[idx]
y_train = y_train[idx]

y_train[y_train == -1] = 0
y_test[y_test == -1] = 0

In [None]:
def build_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    dropout=0,
    mlp_dropout=0,
):
    inputs = keras.Input(shape=input_shape)
    x = inputs
    for _ in range(num_transformer_blocks):
        x = keras_encoder(x, head_size, num_heads, ff_dim, dropout)

    x = layers.GlobalAveragePooling1D(data_format="channels_last")(x)
    for dim in mlp_units:
        x = layers.Dense(dim, activation="relu")(x)
        x = layers.Dropout(mlp_dropout)(x)
    outputs = layers.Dense(n_classes, activation="softmax")(x)
    return keras.Model(inputs, outputs)


In [None]:
x_train.shape[1:]

(3601, 500, 1)

In [None]:
y_train.shape

(3601,)

In [None]:
input_shape = x_train.shape[1:]

model = build_model(
    input_shape,
    head_size=256,
    num_heads=4,
    ff_dim=4,
    num_transformer_blocks=4,
    mlp_units=[128],
    mlp_dropout=0.4,
    dropout=0.25,
)

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
    metrics=["sparse_categorical_accuracy"],
)
model.summary()

callbacks = [keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)]

model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    epochs=150,
    batch_size=1,
    callbacks=callbacks,
)

model.evaluate(x_test, y_test, verbose=1)


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 500, 1)]             0         []                            
                                                                                                  
 multi_head_attention_22 (M  (None, 500, 1)               7169      ['input_1[0][0]',             
 ultiHeadAttention)                                                  'input_1[0][0]']             
                                                                                                  
 dropout_65 (Dropout)        (None, 500, 1)               0         ['multi_head_attention_22[0][0
                                                                    ]']                           
                                                                                              

KeyboardInterrupt: 

In [None]:
keras_model_outputs = model.predict(sample_x)



In [None]:
keras_model_outputs.shape

(1, 2)

In [None]:
keras_model_outputs

array([[0.49983075, 0.5001692 ]], dtype=float32)