In [6]:
import torch
import torch.nn as nn
import altair as alt
import pandas as pd
import numpy as np
import warnings
import tokenizers
from torch.utils.data import Dataset, DataLoader, random_split
from pathlib import Path
import sys
import json
from config import Config
from model import Model
from train import DataSetLoader, Training
from dataset import TranslationDataset
from config import Config
from transformer import Transformer
from transformer import TransformerBuilder #Didnt let me import build_transformer itself

# Silence every Python warning for the rest of the notebook session.
# NOTE(review): this is a blanket filter, so deprecation warnings from the imported libraries are hidden too.
warnings.filterwarnings("ignore")

In [7]:
# The notebook must run from the project root so the existing model/tokenizer files can be reused
# while the project modules stay importable. If it was launched from "src", step one level up.
import os

starting_dir = os.getcwd()
print(f"Starting directory: {starting_dir}")

launched_from_src = os.path.basename(starting_dir) == "src"
if launched_from_src:
	os.chdir('../')
	print(f"Moved working directory to: {os.getcwd()}")

Starting directory: d:\Github\SUMMIT


In [8]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
	device = torch.device("cuda")
else:
	device = torch.device("cpu")
print("Using device:", device)

# Release cached GPU memory that is no longer in use
torch.cuda.empty_cache()

Using device: cuda


In [9]:
# Build the project configuration from config.json (resolved relative to the current working directory)
config_file_path = Path('config.json')
config = Config(config_file_path)
# Wrap the configuration in the project's Model object; the underlying network is reached later via model.model
model = Model(config)
# get_dataset returns the three dataset splits plus the source and target tokenizers
train_ds, validation_ds, test_ds, vocab_source, vocab_target = DataSetLoader.get_dataset(model)
# One sentence pair per batch: load_batch() asserts a batch size of 1.
# shuffle=True means a fresh iterator walks the validation set in a different order.
validation_dataloader = DataLoader(validation_ds, batch_size=1, shuffle=True)


Base directory for model/training data: train\opus_books
Tokenize directory: train\opus_books\tokenize
Checkpoint directory: train\opus_books\checkpoints
Model directory: train\opus_books\model
Checking devices...
... found cuda
Seed: 69420
Loading raw dataset...
Creating tokenizers...
Looking for tokenizer file: D:\Github\SUMMIT\train\opus_books\tokenize\de.json
Loading existing tokenizer file for language de...
Looking for tokenizer file: D:\Github\SUMMIT\train\opus_books\tokenize\en.json
Loading existing tokenizer file for language en...
Finding longest items...
Longest items found: de: 479, en: 466
number of rows in raw dataset: 51467
number of items above a certain number): 7202
number of rows in filtered raw dataset: 44265
New longest items found: de: 47, en: 47
Dataset reduced by 116.27019089574155%
Splitting dataset...


In [10]:

# Load the most recent saved model through the project's Model wrapper.
# NOTE(review): in the recorded run this raised "FileNotFoundError: No existing model found!",
# so a saved model has to exist before the cells below can show meaningful attention.
model.load_latest_model()


Loading tokenizers...
Looking for tokenizer file: D:\Github\SUMMIT\train\opus_books\tokenize\de.json
Loading existing tokenizer file for language de...
Looking for tokenizer file: D:\Github\SUMMIT\train\opus_books\tokenize\en.json
Loading existing tokenizer file for language en...
Loading model


FileNotFoundError: No existing model found!

In [None]:
# The iterator is created in its own cell on purpose: calling iter() again would start a new pass over
# the dataloader, so re-running the cells below would not advance to the next sentence.
val_iter = iter(validation_dataloader)

In [None]:
# Sanity check: pull two consecutive samples from the validation iterator and print
# their encoder and decoder inputs as token strings (encoder line first, then decoder line).

for _sample in range(2):
	batch = next(val_iter)  # advances the shared iterator by one batch
	encoder_input = batch["to_encoder"].to(device)  # encoder-input ids of the item in the batch
	decoder_input = batch["to_decoder"].to(device)  # decoder-input ids of the item in the batch

	# Flatten the id tensors and map every id back to its token string
	encoder_ids = encoder_input.cpu().numpy().flatten()
	decoder_ids = decoder_input.cpu().numpy().flatten()
	encoder_input_tokens = [vocab_source.id_to_token(index) for index in encoder_ids]
	decoder_input_tokens = [vocab_target.id_to_token(index) for index in decoder_ids]

	print(encoder_input_tokens)
	print(decoder_input_tokens)



['<S>', 'Ein', 'junger', ',', 'mit', 'Wronski', '<U>', '<U>', ',', 'durch', 'den', 'sie', 'Nachrichten', 'erhielt', 'und', 'der', 'durch', 'Vermittlung', 'der', 'Gräfin', 'Lydia', 'Iwanowna', 'eine', 'Vergünstigung', 'zu', 'erlangen', 'hoffte', ',', 'teilte', 'ihr', 'mit', ',', 'daß', 'die', 'beiden', 'ihre', 'geschäftlichen', 'Angelegenheiten', 'erledigt', 'hätten', 'und', 'am', 'folgenden', 'Tage', 'abreisen', 'würden', '.', '<E>', '<P>', '<P>']
['<S>', 'A', 'young', 'adjutant', ',', 'a', 'comrade', 'of', 'Vronsky', "'", 's', ',', 'through', 'whom', 'she', 'had', 'her', 'information', ',', 'who', 'hoped', 'through', 'her', 'influence', 'to', 'obtain', 'a', 'concession', ',', 'told', 'her', 'that', 'they', 'had', 'finished', 'their', 'affairs', 'and', 'were', 'leaving', 'Petersburg', 'next', 'day', '.', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>']
['<S>', 'Aber', 'ich', 'glaubte', ',', 'daß', 'seine', '<U>', ',', 'seine', 'Härte', 'und', 'seine', 'früheren', 'Sünden', '(', 'ich', 'sage',

In [None]:
# Show the raw token ids of the last encoder input: squeeze(0) drops the leading batch dimension
# and the tensor is moved to the CPU and converted to a numpy array for printing.
# NOTE(review): this rebinds encoder_input_tokens from a list of token strings to an array of ids;
# a later cell assigns it again from load_batch().
encoder_input_tokens = encoder_input.squeeze(0).cpu().numpy()
print(encoder_input_tokens)
# Disabled counterpart for the decoder input, kept for reference:
#decoder_input_tokens = [vocab_target.id_to_token(idx) for idx in decoder_input.cpu().numpy()]

[    1    75    15   537     4    24    64     0     4    64  6278     6
    64  1327  5674   276    15   708  3394     4    85    86   247    11
    14 26316    10    88   564   135 10135    13   362    67  3525     0
   174     5     2     3     3     3     3     3     3     3     3     3
     3     3]


In [None]:
# Loading the next batch from the validation set.
# max_tokens is the configured sentence-length limit; load_batch() passes it to decode_sentence().
max_tokens = config.train_config["max_sentence_tokens"]

def load_batch():
	"""Fetch the next validation batch, run it through the model and return its token strings.

	Reads the notebook globals ``val_iter``, ``device``, ``vocab_source``, ``vocab_target``,
	``model`` and ``max_tokens``.

	Returns:
		tuple: ``(batch, encoder_input_tokens, decoder_input_tokens)`` - the batch exactly as the
		iterator yielded it, and the encoder / decoder input ids converted to token strings.

	Raises:
		StopIteration: if ``val_iter`` has no batches left.
		AssertionError: if the batch does not contain exactly one sample.
	"""
	batch = next(val_iter)  # Loads the next batch of the validation iterator, one after another
	encoder_input = batch["to_encoder"].to(device)  # Encoder-input ids of the item in the batch
	decoder_input = batch["to_decoder"].to(device)  # Decoder-input ids of the item in the batch
	encoder_mask = batch["mask_encoder"].to(device)  # Encoder attention mask

	# The dataloader adds a leading batch dimension. Everything below (squeeze(0), the
	# (1, 1)-shaped decoder seed in decode_sentence) assumes exactly one sentence, so check
	# this before any of that code runs.
	assert encoder_input.size(0) == 1, "Batch size must be 1 for validation"

	# vocab_source / vocab_target are the tokenizers that map ids back to tokens.
	# squeeze(0) removes the batch dimension; .numpy() makes the ids iterable one by one.
	encoder_input_tokens = [vocab_source.id_to_token(idx) for idx in encoder_input.squeeze(0).cpu().numpy()]
	decoder_input_tokens = [vocab_target.id_to_token(idx) for idx in decoder_input.squeeze(0).cpu().numpy()]

	# The decoded sentence itself is not used here. The forward pass is run so the model has
	# processed this sentence before get_attention_map() reads the layers' attention_scores
	# (presumably each attention layer stores the scores of its most recent call - TODO confirm).
	# No gradients are needed for that, so autograd is switched off.
	with torch.no_grad():
		decode_sentence(model.model, encoder_input, encoder_mask, vocab_source, vocab_target, max_tokens, device)

	return batch, encoder_input_tokens, decoder_input_tokens


def decode_sentence(model, to_encoder, mask_encoder, tokenizer_source, tokenizer_target, config, device):
	"""Greedily decode one sentence: at every step append the single most likely next token.

	Args:
		model: object providing ``encode``, ``decode`` and ``project``.
		to_encoder: encoder input ids for one sentence (batch size 1).
		mask_encoder: attention mask belonging to ``to_encoder``.
		tokenizer_source: source tokenizer (not used by the decoding itself).
		tokenizer_target: target tokenizer, used to look up the ids of "<S>" and "<E>".
		config: the maximum number of decoding steps as an ``int`` (this is what ``load_batch``
			passes), or an object exposing ``train_config["max_sentence_tokens"]``.
		device: device on which the decoder input is built.

	Returns:
		1-D tensor of token ids starting with the "<S>" id. The "<E>" id is never appended.
	"""
	# Take the step limit from the argument instead of relying on a notebook-level global.
	if isinstance(config, int):
		token_limit = config
	else:
		token_limit = config.train_config["max_sentence_tokens"]

	s_token = tokenizer_target.token_to_id("<S>")
	e_token = tokenizer_target.token_to_id("<E>")

	# Pure inference: no gradients are needed, so do not build an autograd graph.
	with torch.no_grad():
		# The encoder processes the input tokens once, using the encoder attention mask
		encoded = model.encode(to_encoder, mask_encoder)

		# Decoder input starts as a (1, 1) tensor holding only the start token,
		# with the same dtype as the encoder input
		to_decoder = torch.full((1, 1), s_token, dtype=to_encoder.dtype, device=device)

		for _ in range(token_limit):  # stop at the latest after token_limit steps
			# Mask built by the dataset helper so the decoder cannot look at future positions
			mask_decoder = TranslationDataset.triangular_mask(to_decoder.size(1)).type_as(mask_encoder).to(device)
			output = model.decode(encoded, mask_encoder, to_decoder, mask_decoder)

			# Only the last position is projected; the index of the largest value is the next token
			p = model.project(output[:, -1])
			_, most_likely = torch.max(p, dim=1)
			predicted_id = most_likely.item()
			if predicted_id == e_token:
				break

			# dim=1 appends the new token to the sequence (dim=0 would stack a new row instead)
			next_token = torch.full((1, 1), predicted_id, dtype=to_encoder.dtype, device=device)
			to_decoder = torch.cat([to_decoder, next_token], dim=1)

	return to_decoder.squeeze(0)


In [None]:
def matrix_to_dataframe(attention_matrix, max_row, max_col, row_tokens, col_tokens):
	"""Flatten the top-left corner of an attention matrix into a long-format pandas DataFrame.

	Only cells with ``row < max_row`` and ``col < max_col`` are kept. Each kept cell becomes one
	record with the columns "row", "column", "value", "row_token" and "col_token". The two token
	columns hold labels of the form "<3-digit index> <token>"; positions that have no entry in
	the token list are labelled with "<blank>".
	"""
	def label_for(position, tokens):
		# Zero-padded index first, then the token (or a placeholder past the end of the list)
		token = tokens[position] if position < len(tokens) else "<blank>"
		return f"{position:03d} {token}"

	kept_rows = [r for r in range(attention_matrix.shape[0]) if r < max_row]
	kept_cols = [c for c in range(attention_matrix.shape[1]) if c < max_col]

	records = [
		(r, c, float(attention_matrix[r, c]), label_for(r, row_tokens), label_for(c, col_tokens))
		for r in kept_rows
		for c in kept_cols
	]

	return pd.DataFrame(records, columns=["row", "column", "value", "row_token", "col_token"])


def get_attention_map(attention_type: str, layer: int, head: int):
	"""Return the stored attention scores of one head in one layer of the notebook-global ``model``.

	Args:
		attention_type: "encoder" (encoder self-attention), "decoder" (decoder self-attention)
			or "encoder-decoder" (cross-attention inside the decoder).
		layer: index of the layer in the encoder / decoder module list.
		head: index of the attention head.

	Returns:
		The 2-D score matrix of the first sample in the batch for the requested head.
		The stored scores are expected to have the shape
		(batch_size, num_heads, query_len, key_len).

	Raises:
		ValueError: if ``attention_type`` is not one of the three supported names.
		KeyError: if the module list has no layer with index ``layer``.
	"""
	# The module lists key their children by the index written as a string ("0", "1", ...).
	# Use the requested layer here - a fixed '0' would make every chart show the first layer.
	layer_key = str(layer)

	if attention_type == "encoder":
		block = model.model.encoder.encoder_module_list._modules[layer_key]
		attention = block.self_attention_layer.attention_scores

	elif attention_type == "decoder":
		block = model.model.decoder.decoder_module_list._modules[layer_key]
		attention = block.self_attention_layer.attention_scores

	elif attention_type == "encoder-decoder":
		block = model.model.decoder.decoder_module_list._modules[layer_key]
		attention = block.cross_attention_layer.attention_scores

	else:
		raise ValueError(
			f"Unknown attention_type {attention_type!r}; expected 'encoder', 'decoder' or 'encoder-decoder'"
		)

	# First sample of the batch, requested head; .data yields the raw values without autograd tracking
	return attention[0, head].data

def attention_map(attention_type, layer, head, row_tokens, col_tokens, max_sentence_len):
	"""Build a 200x200 Altair heatmap of one attention head.

	The scores come from get_attention_map() and are cut to at most ``max_sentence_len`` rows and
	columns by matrix_to_dataframe(). The chart is titled "Layer <layer>, Head <head>".
	"""
	scores = get_attention_map(attention_type, layer, head)
	frame = matrix_to_dataframe(scores, max_sentence_len, max_sentence_len, row_tokens, col_tokens)

	# The encoding field names must match the column names of the DataFrame
	heatmap = alt.Chart(frame).mark_rect().encode(
		x="col_token",
		y="row_token",
		color=alt.Color("value", scale=alt.Scale(scheme="greens")),
		tooltip=["row", "column", "value", "row_token", "col_token"],
	)
	return heatmap.properties(height=200, width=200, title=f"Layer {layer}, Head {head}")

def get_all_attention_maps(attention_type: str, layers: list[int], heads: list[int], row_tokens, col_tokens, max_sentence_len: int):
	"""Arrange the attention heatmaps of all given layers and heads in a grid.

	Each layer becomes one horizontal row of charts (one chart per head); the rows are then
	stacked vertically in the order of ``layers``.
	"""
	# The * operator unpacks a list into separate positional arguments, which is what
	# alt.hconcat / alt.vconcat are given here.
	chart_rows = [
		alt.hconcat(*[
			attention_map(attention_type, layer, head, row_tokens, col_tokens, max_sentence_len)
			for head in heads
		])
		for layer in layers
	]
	return alt.vconcat(*chart_rows)


In [None]:
# Load the next validation sentence pair; this also runs the model on it (see load_batch)
batch, encoder_input_tokens, decoder_input_tokens = load_batch()
print(batch.keys())  # Shows all available keys in the batch

# Double quotes outside, single quotes inside: reusing the same quote type inside an f-string
# replacement field is a SyntaxError before Python 3.12.
print(f"Source: {batch['text_source']}")
print(f"Target: {batch['text_target']}")

# Number of source tokens before the first padding token. A sentence that fills the whole
# sequence has no "<P>" at all, so fall back to the full length instead of raising ValueError.
if "<P>" in encoder_input_tokens:
	sentence_len = encoder_input_tokens.index("<P>")
else:
	sentence_len = len(encoder_input_tokens)

dict_keys(['to_encoder', 'to_decoder', 'label', 'text_source', 'text_target', 'mask_encoder', 'mask_decoder'])
Source: ['Tom brüllte, bis er heiser war, aber vergebens.']
Target: ['Tom whooped until he was hoarse, but it was of no use.']


In [None]:
# Layer and head indices to visualize: layers 0-2, heads 0-7
layers = list(range(3))
heads = list(range(8))

# Encoder Self-Attention: rows and columns are both labelled with the source tokens,
# limited to the first 20 positions (or fewer for a shorter sentence)
get_all_attention_maps("encoder", layers, heads, encoder_input_tokens, encoder_input_tokens, min(20, sentence_len))

In [None]:
# Decoder Self-Attention: rows and columns are both labelled with the decoder input tokens.
# NOTE(review): sentence_len is derived from the encoder (source) tokens, and the labels come from
# the reference decoder input of the batch rather than from the tokens produced by decode_sentence -
# confirm that both match what the decoder actually attended over.
get_all_attention_maps("decoder", layers, heads, decoder_input_tokens, decoder_input_tokens, min(20, sentence_len))

In [None]:
# Cross-Attention: the score matrix is laid out as (query_len, key_len). In the decoder's
# cross-attention the queries are the decoder positions and the keys are the encoder positions,
# so the rows must be labelled with the decoder tokens and the columns with the encoder tokens.
# NOTE(review): the decoder labels are the reference decoder input of the batch - confirm they
# match the tokens the decoder actually processed.
get_all_attention_maps("encoder-decoder", layers, heads, decoder_input_tokens, encoder_input_tokens, min(20, sentence_len))