In [1]:
import torch
import torch.nn as nn
import altair as alt
import pandas as pd
import numpy as np

import tokenizer
import tokenizers
from pathlib import Path
from torch.utils.data import Dataset, DataLoader, random_split

import sys
import json
from config import Config
from model import Model

In [2]:
# Define the device 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
torch.cuda.empty_cache() 

Using device: cuda


In [3]:
from train import DataSetLoader, Training
from dataset import TranslationDataset
from config import Config
from transformer import Transformer
from transformer import TransformerBuilder #Didnt let me import build_transformer itself

In [4]:
config_file_path = Path('../config.json') #Path of the new config file
config = Config(config_file_path) 
model = Model(config)

model.load_latest_model()

Base directory for model/training data: train\opus_books
Tokenize directory: train\opus_books\tokenize
Checkpoint directory: train\opus_books\checkpoints
Model directory: train\opus_books\model
Checking devices...
... found cuda
Seed: 69420
Loading tokenizers...
Looking for tokenizer file: D:\Github\SUMMIT\src\train\opus_books\tokenize\de.json
Loading existing tokenizer file for language de...
Looking for tokenizer file: D:\Github\SUMMIT\src\train\opus_books\tokenize\en.json
Loading existing tokenizer file for language en...
Loading model
Found latest model at: train\opus_books\model\latest.pth


  state = torch.load(model_path, map_location=self.config.device)


In [5]:
train_ds, validation_ds, test_ds, tokenizer_source, tokenizer_target = DataSetLoader.get_dataset(model) #Get needed variables
validation_dataloader = DataLoader(validation_ds, batch_size=1, shuffle=True) #Getting the dataloader which loads in validation sentences

Loading raw dataset...
Creating tokenizers...
Looking for tokenizer file: D:\Github\SUMMIT\src\train\opus_books\tokenize\de.json
Loading existing tokenizer file for language de...
Looking for tokenizer file: D:\Github\SUMMIT\src\train\opus_books\tokenize\en.json
Loading existing tokenizer file for language en...
Finding longest items...
Longest items found: de: 479, en: 466
number of rows in raw dataset: 51467
number of items above a certain number): 7202
number of rows in filtered raw dataset: 44265
New longest items found: de: 47, en: 47
Dataset reduced by 116.27019089574155%
Splitting dataset...


In [6]:
print(validation_dataloader)
val_iter = iter(validation_dataloader) #Creating an iterator in an extra Code-window because it restarts at the same sentence each time it gets called
#print(val_iter) Object itself is just an iterator

<torch.utils.data.dataloader.DataLoader object at 0x000001AF6DA06480>


In [7]:
"""
#Loading the next batch from the validation set
max_tokens = config.train_config["max_sentence_tokens"] #Max allowed tokens per sentence as per config

batch = next(val_iter)
print(batch)
encoder_input = batch["to_encoder"].to(device).squeeze(0)

print(encoder_input)
"""

def load_batch():
	batch = next(val_iter) # Loads the next iteration of the validation

	#Loads inputs and encoder/decoder masks from the dataset via the key
	encoder_input = batch["to_encoder"].to(device)  
	decoder_input = batch["to_decoder"].to(device)  
	encoder_mask = batch["mask_encoder"].to(device) 
	decoder_mask = batch["mask_decoder"].to(device) 

	#Tokenizer_source is the tokenizer which maps Id's to words, given is ,which is the tensor that contains all of the id's.
	#Via .numpy this is then transformed into a numpy array, which is done so as to be iterable.
	#Converts each token into a word from the vocabulary list

	encoder_input_tokens = [tokenizer_source.id_to_token(idx) for idx in encoder_input.squeeze(0)]
	decoder_input_tokens = [tokenizer_target.id_to_token(idx) for idx in decoder_input.squeeze(0)]

	#The tokenizer is from the tokenizer library and is used on the WordLevel and was once loaded on the sentences of the language source
	#and once on the sentences of the target language, which lets them map the id's into the tokens of the given language

	return batch, encoder_input_tokens, decoder_input_tokens


In [8]:
#Checking if it really works
batch = next(val_iter) # Loads the next iteration of the validation, is a Double-Array of tokens. 

encoder_input = batch["to_encoder"].to(device)  # Gets the encoder-input of the item in the batch
decoder_input = batch["to_decoder"].to(device)  # Is gotten like this because ["to_decoder"] is a key to the dictionary of the dataset we got and gives the input that the encoder is supposed to get
#print(encoder_input) Is a double Array of Id's which still have to be turned into tokens/words

encoder_input_tokens = [tokenizer_source.id_to_token(index) for index in encoder_input.squeeze(0)]
decoder_input_tokens = [tokenizer_target.id_to_token(index) for index in decoder_input.squeeze(0)]
#Now the tokenizer of the target and the source vocabulary are used to transform the Id's into tokens.
#Flattening the tensor into a 1D Vector, by using .squeeze(0) to turn it into a 1D Vector is necessary.

print(encoder_input_tokens)
print(decoder_input_tokens)

['<S>', 'Sie', 'sind', 'vergoldet', ',', 'aber', 'sie', 'zeigen', 'die', 'Stunde', 'nicht', 'an', ',', 'und', 'der', 'Zeiger', 'kann', 'sie', 'entbehren', '.«', '<E>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>']
['<S>', 'They', 'are', 'gilt', ',', 'but', 'they', 'do', 'not', 'indicate', 'the', 'hour', ';', 'and', 'the', 'hands', 'can', 'get', 'on', 'without', 'them', '."', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>', '<P>']


In [9]:
batch, encoder_input_tokens, decoder_input_tokens = load_batch()
print(batch.keys())  # Shows all available keys in the batch which are provided by the dataset

dict_keys(['to_encoder', 'to_decoder', 'label', 'text_source', 'text_target', 'mask_encoder', 'mask_decoder'])


In [None]:
model.model.eval()
attention = model.model.encoder.encoder_module_list._modules['0'].self_attention_layer.attention_scores
print(attention)

attention2 = model.model.decoder.decoder_module_list._modules['0'].self_attention_layer.attention_scores
print(attention2)

def matrix_to_dataframe(attention_matrix, max_row, max_col, row_tokens, col_tokens): # Converts attention matrix into Pandas Dataframe
	data = [] #List for the data to be stored in
	for row in range(attention_matrix.shape[0]): # loop over the rows
		for col in range(attention_matrix.shape[1]): # loop over the columns

			if row < max_row and col < max_col:
				attention_value = float(attention_matrix[row, col])

				row_token = row_tokens[row] if row < len(row_tokens) else "<blank>"
				col_token = col_tokens[col] if col < len(col_tokens) else "<blank>"

				row_label = f"{row:03d} {row_token}"
				col_label = f"{col:03d} {col_token}"
				
				data.append((row, col, attention_value, row_label, col_label))
	dataframe = pd.DataFrame(data, columns=["row", "column", "value", "row_token", "col_token"])

	return dataframe


def get_attention_map(attention_type: str, layer: int, head: int):

	if attention_type == "encoder":
		attention = model.model.encoder.encoder_module_list._modules['0'].self_attention_layer.attention_scores
	
	elif attention_type == "decoder":
		attention = model.model.decoder.decoder_module_list._modules['0'].self_attention_layer.attention_scores

	elif attention_type == "encoder-decoder":
		attention = model.model.decoder.decoder_module_list._modules['0'].cross_attention_layer.attention_scores
	
	return attention[0, head].data #.data gives the raw data without any tracking noise
	
	#Shape (batch_size, num_heads, query_len, key_len) Gets the first sample in the batch for inference and the specified attention head, .data to extract raw tensor values

def attention_map(attention_type, layer, head, row_tokens, col_tokens, max_sentence_len):
	dataframe = matrix_to_dataframe(get_attention_map(attention_type, layer, head), max_sentence_len, max_sentence_len, row_tokens, col_tokens) 

	return(	alt.Chart(dataframe).mark_rect().encode(
		x = "col_token", # X and Y have to match the name of the dataframe columns
		y = "row_token",
		color = alt.Color("value", scale=alt.Scale(scheme="greens")),
		tooltip=["row", "column", "value", "row_token", "col_token"]
	).properties(height=200, width=200, title = f"Layer {layer}, Head {head}")
	)

def get_all_attention_maps(attention_type: str, layers: list[int], heads: list[int], row_tokens, col_tokens, max_sentence_len: int):
	charts = [] 
	for layer in layers:
		rowCharts = []
		for head in heads:
			rowCharts.append(attention_map(attention_type, layer, head, row_tokens, col_tokens, max_sentence_len))
		charts.append(alt.hconcat(*rowCharts))
	return alt.vconcat(*charts)

	# The * operator unpacks the list, so instead of [1, 2, 3] it gets (1, 2, 3) and it can work with that
	#Attention of all heads and all layers that are given as the input

None
None


In [13]:
print(f'Source: {batch['text_source']}')
print(f'Target: {batch['text_target']}')
sentence_len = encoder_input_tokens.index("<P>") #Gets the position of the first padding token

Source: ['Dieser funkelnde Anzug, auf welchem das Licht spielte, schien an allen Falten von Flammen zu schillern.']
Target: ['This splendid costume, on which the light played, seemed glazed with flame on every fold.']


In [14]:
layers = [0, 1, 2]
heads = [0, 1, 2, 3, 4, 5, 6, 7]

# Encoder Self-Attention
get_all_attention_maps("encoder", layers, heads, encoder_input_tokens, encoder_input_tokens, min(20, sentence_len))

TypeError: 'NoneType' object is not subscriptable

In [None]:
# Decoder Self-Attention
get_all_attention_maps("decoder", layers, heads, decoder_input_tokens, decoder_input_tokens, min(20, sentence_len))

In [None]:
# Cross-Attention
get_all_attention_maps("encoder-decoder", layers, heads, encoder_input_tokens, decoder_input_tokens, min(20, sentence_len))