In [1]:
import torch
import torch.nn as nn
import altair as alt
import pandas as pd
import numpy as np
import tokenizers
from torch.utils.data import Dataset, DataLoader, random_split
from pathlib import Path
import sys
import json

from config import Config
from model import Model
from train import DataSetLoader, Training
from dataset import TranslationDataset
from config import Config
from transformer import Transformer
from transformer import TransformerBuilder 


In [2]:
# The notebook is meant to run with the project root as working directory so the
# existing model/tokenizer files can be reused, while the modules under "src"
# stay importable. When it was launched from inside "src", step one level up.
import os

launch_dir = os.getcwd()
print(f"Starting directory: {launch_dir}")
if os.path.basename(launch_dir) == "src":
	os.chdir('../')
	print(f"Moved working directory to: {os.getcwd()}")

Starting directory: d:\Github\SUMMIT\src
Moved working directory to: d:\Github\SUMMIT


In [3]:
# Select the compute device: CUDA when PyTorch reports it as available, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
torch.cuda.empty_cache()  # Frees cached GPU memory that is no longer in use

Using device: cuda


In [4]:
# Build the project configuration from config.json (a relative path, so it is
# resolved against the current working directory).
config_file_path = Path('config.json')
config = Config(config_file_path)
# Wrap the configuration in the project's Model object and load its latest saved
# state (the cell output reports train\opus_books\model\latest.pth being found).
model = Model(config)
model.load_latest_model() 

Base directory for model/training data: train\opus_books
Tokenize directory: train\opus_books\tokenize
Checkpoint directory: train\opus_books\checkpoints
Model directory: train\opus_books\model
Checking devices...
... found cuda
Seed: 69420
Loading tokenizers...
Looking for tokenizer file: D:\Github\SUMMIT\train\opus_books\tokenize\de.json
Loading existing tokenizer file for language de...
Looking for tokenizer file: D:\Github\SUMMIT\train\opus_books\tokenize\en.json
Loading existing tokenizer file for language en...
Loading model
Found latest model at: train\opus_books\model\latest.pth


  state = torch.load(model_path, map_location=self.config.device)


In [5]:
# Fetch the three dataset splits together with the source- and target-language tokenizers.
train_ds, validation_ds, test_ds, tokenizer_source, tokenizer_target = DataSetLoader.get_dataset(model)
# batch_size=1 so every draw is a single sentence pair; shuffle=True randomises which pair comes next.
validation_dataloader = DataLoader(validation_ds, batch_size=1, shuffle=True)
# One shared iterator: every next(val_iter) further down consumes a fresh validation sample.
val_iter = iter(validation_dataloader) 

Loading raw dataset...
Creating tokenizers...
Looking for tokenizer file: D:\Github\SUMMIT\train\opus_books\tokenize\de.json
Loading existing tokenizer file for language de...
Looking for tokenizer file: D:\Github\SUMMIT\train\opus_books\tokenize\en.json
Loading existing tokenizer file for language en...
Finding longest items...
Longest items found: de: 479, en: 466
number of rows in raw dataset: 51467
number of items above a certain number): 7202
number of rows in filtered raw dataset: 44265
New longest items found: de: 47, en: 47
Dataset reduced by 116.27019089574155%
Splitting dataset...


In [6]:
# Helpers for loading the next batch from the validation set and decoding it.
# max_tokens is the upper bound on decoding steps, read from the training configuration.
max_tokens = config.train_config["max_sentence_tokens"]
# NOTE(review): `output` is not read by any visible cell (decode_sentence assigns its own local `output`) - confirm it can be dropped.
output = 0

def load_batch():
	"""Draw the next validation sample and run it through the model.

	Reads the notebook globals `val_iter`, `device`, `model`, `max_tokens`,
	`tokenizer_source` and `tokenizer_target`.

	Returns:
		tuple: `(batch, encoder_input_tokens, decoder_input_tokens)` where `batch`
		is the dictionary yielded by the DataLoader and the two lists hold the
		encoder/decoder input ids mapped back to token strings.

	Raises:
		StopIteration: when `val_iter` has no samples left.
	"""
	batch = next(val_iter)  # Loads the next item of the validation iterator

	# Pull the tensors out of the batch dictionary by key and move them to the device
	encoder_input = batch["to_encoder"].to(device)
	decoder_input = batch["to_decoder"].to(device)
	encoder_mask = batch["mask_encoder"].to(device)

	# The DataLoader always adds a batch dimension (size 1 here), so it is squeezed away.
	# tolist() converts the ids to plain Python ints in one transfer instead of handing
	# one-element tensors to the tokenizer.
	encoder_input_tokens = [tokenizer_source.id_to_token(idx) for idx in encoder_input.squeeze(0).tolist()]
	decoder_input_tokens = [tokenizer_target.id_to_token(idx) for idx in decoder_input.squeeze(0).tolist()]

	# Called for its side effect only: the forward passes refresh the attention scores
	# that the plotting helpers read for the new sentence.
	decode_sentence(model.model, encoder_input, encoder_mask, tokenizer_source, tokenizer_target, max_tokens, device)

	return batch, encoder_input_tokens, decoder_input_tokens


def decode_sentence(model, to_encoder, mask_encoder, tokenizer_source, tokenizer_target, config, device):
	"""Greedily decode a single sentence, one token per step.

	Args:
		model: object exposing `encode()`, `decode()` and `project()`.
		to_encoder: encoder input ids for one sentence (with a batch dimension of 1).
		mask_encoder: mask handed to the encoder and to every decoder call.
		tokenizer_source: not used here; kept so existing calls stay valid.
		tokenizer_target: supplies the ids of the "<S>" and "<E>" tokens.
		config: maximum number of decoding steps when an int is passed (this is
			what load_batch passes). Any other value falls back to the notebook-level
			`max_tokens`, which the loop previously always used.
		device: device on which the decoder input is built.

	Returns:
		The generated id tensor of shape (1, n), starting with the "<S>" id.
		(Earlier versions returned None; callers ignoring the result are unaffected.)
	"""
	step_limit = config if isinstance(config, int) else max_tokens

	s_token = tokenizer_target.token_to_id("<S>")
	e_token = tokenizer_target.token_to_id("<E>")

	# Inference only: no autograd graph is needed for the forward passes.
	# NOTE(review): this function does not change the train/eval mode of `model`;
	# confirm the model is in eval mode if it contains dropout.
	with torch.no_grad():
		encoded = model.encode(to_encoder, mask_encoder)
		to_decoder = torch.full((1, 1), s_token, dtype=to_encoder.dtype, device=device)

		for _ in range(step_limit):  # iterates until it reaches the limit for the sequence length
			mask_decoder = TranslationDataset.triangular_mask(to_decoder.size(1)).type_as(mask_encoder).to(device)
			output = model.decode(encoded, mask_encoder, to_decoder, mask_decoder)

			# Project only the last position into vocabulary scores and take the arg-max
			p = model.project(output[:, -1])
			_, most_likely = torch.max(p, dim=1)
			next_id = most_likely.item()
			if next_id == e_token:
				break

			# Append the chosen token so the next step is conditioned on it
			next_token = torch.full((1, 1), next_id, dtype=to_encoder.dtype, device=device)
			to_decoder = torch.cat([to_decoder, next_token], dim=1)

	return to_decoder


In [7]:
# Sanity check: pull one validation sample and print its token strings

batch = next(val_iter)  # consumes the next item of the validation iterator
encoder_input, decoder_input = (batch[key].to(device) for key in ("to_encoder", "to_decoder"))

# Turn the ids back into tokens; squeeze(0) drops the batch dimension of size 1
encoder_input_tokens = [tokenizer_source.id_to_token(token_id) for token_id in encoder_input.cpu().squeeze(0)]
decoder_input_tokens = [tokenizer_target.id_to_token(token_id) for token_id in decoder_input.cpu().squeeze(0)]

print(encoder_input_tokens)
print(decoder_input_tokens)

['<S>', 'Sie', 'sind', 'vergoldet', ',', 'aber', 'sie', 'zeigen', 'die', 'Stunde', 'nicht', 'an', ',', 'und', 'der', 'Zeiger', 'kann', 'sie', 'entbehren', '.«', '<E>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>']
['<S>', 'They', 'are', 'gilt', ',', 'but', 'they', 'do', 'not', 'indicate', 'the', 'hour', ';', 'and', 'the', 'hands', 'can', 'get', 'on', 'without', 'them', '."', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>']


In [8]:
def matrix_to_dataframe(attention_matrix, max_row, max_col, row_tokens, col_tokens):
	"""Convert the top-left part of an attention matrix into a long-format DataFrame.

	Only cells with row index below `max_row` and column index below `max_col`
	are kept. Each kept cell becomes one record with the columns
	"row", "column", "value", "row_token" and "col_token". The two token columns
	hold labels of the form "<3-digit index> <token>"; positions without a
	matching token are labelled "<blank>".
	"""
	def label(position, tokens):
		# Zero-padded index followed by the token, or "<blank>" past the end of the list
		token = tokens[position] if position < len(tokens) else "<blank>"
		return f"{position:03d} {token}"

	n_rows, n_cols = attention_matrix.shape[0], attention_matrix.shape[1]
	records = [
		(r, c, float(attention_matrix[r, c]), label(r, row_tokens), label(c, col_tokens))
		for r in range(n_rows) if r < max_row
		for c in range(n_cols) if c < max_col
	]

	# The column names are referenced by the chart encodings, so they must stay as they are
	return pd.DataFrame(records, columns=["row", "column", "value", "row_token", "col_token"])


def get_attention_map(attention_type: str, layer: int, head: int):
	"""Return the stored attention scores of one head in one layer.

	Reads the notebook-level `model`; the scores are whatever the most recent
	forward pass left in the layer's `attention_scores` attribute.

	Args:
		attention_type: "encoder" (encoder self-attention), "decoder" (decoder
			self-attention) or "encoder-decoder" (cross-attention).
		layer: index of the encoder/decoder block to read from. Previously this
			argument was ignored and block 0 was always used.
		head: index of the attention head.

	Returns:
		The 2-D score tensor of that head, detached from the autograd graph.

	Raises:
		ValueError: if `attention_type` is not one of the three supported values.
	"""
	valid_types = ("encoder", "decoder", "encoder-decoder")
	if attention_type not in valid_types:
		raise ValueError(f"Unknown attention_type {attention_type!r}; expected one of {valid_types}")

	block_key = str(layer)  # blocks are looked up in `_modules` by their index as a string

	if attention_type == "encoder":
		attention = model.model.encoder.encoder_module_list._modules[block_key].self_attention_layer.attention_scores
	elif attention_type == "decoder":
		attention = model.model.decoder.decoder_module_list._modules[block_key].self_attention_layer.attention_scores
	else:
		attention = model.model.decoder.decoder_module_list._modules[block_key].cross_attention_layer.attention_scores

	# attention is laid out as [batch, heads, query_len, key_len], e.g. Size([1, 8, 50, 50]);
	# attention[0, head] is therefore one [query_len, key_len] matrix.
	return attention[0, head].detach()  # no gradient tracking is needed for plotting

def attention_map(attention_type, layer, head, row_tokens, col_tokens, max_sentence_len):
	"""Build the heatmap chart for a single (layer, head) attention matrix.

	The matrix is cropped to `max_sentence_len` rows and columns, converted to a
	DataFrame and drawn as coloured rectangles titled "Layer <layer>, Head <head>".
	"""
	scores = get_attention_map(attention_type, layer, head)
	frame = matrix_to_dataframe(scores, max_sentence_len, max_sentence_len, row_tokens, col_tokens)

	heatmap = alt.Chart(frame).mark_rect().encode(
		# x and y have to match the column names of the DataFrame
		x="col_token",
		y="row_token",
		color=alt.Color("value", scale=alt.Scale(scheme="greens")),
		# Values shown when hovering over a rectangle
		tooltip=["row", "column", "value", "row_token", "col_token"],
	)
	return heatmap.properties(height=200, width=200, title=f"Layer {layer}, Head {head}")

def get_all_attention_maps(attention_type: str, layers: int, heads: int, row_tokens, col_tokens, max_sentence_len: int):
	"""Arrange one heatmap per (layer, head) in a grid.

	Every layer index in range(layers) becomes one horizontal strip containing the
	charts of range(heads); the strips are then stacked vertically.
	"""
	def layer_strip(layer):
		# Heads of a single layer, laid out left to right
		head_charts = [
			attention_map(attention_type, layer, head, row_tokens, col_tokens, max_sentence_len)
			for head in range(heads)
		]
		return alt.hconcat(*head_charts)  # * unpacks the list into positional arguments

	return alt.vconcat(*(layer_strip(layer) for layer in range(layers)))

In [9]:
batch, encoder_input_tokens, decoder_input_tokens = load_batch()
print(batch.keys())  # Shows all available keys in the batch

# Double quotes inside the single-quoted f-strings: nesting the same quote type
# only parses on Python 3.12+, this form prints the same text on every version.
print(f'Source: {batch["text_source"]}')
print(f'Target: {batch["text_target"]}')

# Number of source positions before the first padding token. A sentence that fills
# the whole window contains no "<P>", in which case every position counts
# (list.index would raise ValueError otherwise).
sentence_len = encoder_input_tokens.index("<P>") if "<P>" in encoder_input_tokens else len(encoder_input_tokens)

dict_keys(['to_encoder', 'to_decoder', 'label', 'text_source', 'text_target', 'mask_encoder', 'mask_decoder'])
Source: ['Dieser funkelnde Anzug, auf welchem das Licht spielte, schien an allen Falten von Flammen zu schillern.']
Target: ['This splendid costume, on which the light played, seemed glazed with flame on every fold.']


In [10]:
# Grid dimensions for the plots, both taken from the training configuration.
number_heads = int(config.train_config["num_heads"])
# NOTE(review): the count comes from "num_encoder_blocks" but is reused by the decoder and cross-attention cells - confirm the decoder has the same number of blocks.
number_layer = int(config.train_config["num_encoder_blocks"])

# Self-Attention-Encoder: the source tokens label both axes; at most the first 20 positions are drawn
get_all_attention_maps("encoder", number_layer, number_heads, encoder_input_tokens, encoder_input_tokens, min(20, sentence_len))

In [11]:
# Self-Attention-Decoder
# NOTE(review): the axis labels come from the batch's reference decoder input, while load_batch() ran decode_sentence(), which builds its own decoder sequence token by token - confirm the labels line up with the stored scores.
# NOTE(review): sentence_len is measured on the encoder tokens, so the crop is sized by the source sentence.
get_all_attention_maps("decoder", number_layer, number_heads, decoder_input_tokens, decoder_input_tokens, min(20, sentence_len))

In [12]:
# Cross-Attention
# NOTE(review): get_attention_map describes the scores as [batch, heads, query_len, key_len], and the first token list labels the rows. If the cross-attention queries come from the decoder, rows should carry decoder tokens and columns encoder tokens, i.e. the two token arguments look swapped - confirm against the model code before changing.
get_all_attention_maps("encoder-decoder", number_layer, number_heads, encoder_input_tokens, decoder_input_tokens, min(20, sentence_len))