# Tranformer model with PyTorch

This notebook creates and trains a tranformer with encoder-decoder architecture using PyTorch

To use this model, change the seed number of the ..., then run all cells in the notebook. After the first run of the notebook and the model is trained, new playlists can be generated just by editing and running the last code box.

In [1]:
# imports
import torch
import torch.nn as nn
import torch.optim as optim
import math
import json
from keras.preprocessing.sequence import pad_sequences

from spotify import SpotifyClient

__Creating the model__

Creating the layers for model. The multi-head attention layer

In [2]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output
        
    def split_heads(self, x):
        batch_size, seq_length, _ = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        
    def combine_heads(self, x):
        batch_size, _, seq_length, _ = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
        
    def forward(self, Q, K, V, mask=None):
        # get different heads, and combine to output
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output
    
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))
    
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

__Encoder Layer__

In [3]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

__Decoder Layer__

In [4]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, enc_output, title_mask, track_mask):
        attn_output = self.self_attn(x, x, x, track_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, title_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

In [5]:
class Transformer(nn.Module):
    def __init__(self, title_vocab_size, track_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(title_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(track_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, track_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        title_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        track_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
        track_mask = track_mask & nopeak_mask
        return title_mask, track_mask

    def forward(self, src, tgt):
        title_mask, track_mask = self.generate_mask(src, tgt)

        enc_output = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, title_mask)

        dec_output = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, title_mask, track_mask)

        output = self.fc(dec_output)
        return output

__Loading in the data from the challenge set__

In [6]:
# define a batch size for our experiments
BATCH_SIZE = 35
# define a percentage of the data to use for training
SPLIT_PC = .90

# open the file and convert it to json
f = open('spotify_million_playlist_dataset_challenge/challenge_set.json')
js = json.load(f)
playlists = js['playlists']

titles = []
tracks = []

# process and add the playlist names and tracks to lists
for playlist in playlists:
    if not playlist['tracks'] or 'name' not in playlist:
        continue
    titles.append(playlist['name'].lower()) 
    tracks.append(' '.join(track['track_uri'] for track in playlist['tracks']))

END = int(len(titles)*SPLIT_PC)

In [7]:
# custom tokenizer for separating tracks (just separates by spaces)
# we tried using tokenizers from libraries, but they split up the track URIs, which we didn't want
class Tokenizer:
    def __init__(self):
        self.dictionary = {}
        self.reverse_dictionary = {}

        # Add the padding token
        self.dictionary['<UNK>'] = 0
        self.reverse_dictionary[0] = '<UNK>'
        self.cur_token = 1

    # adds tokens from a list of sentences to the tokenizer
    def fit_on_texts(self, sentences):
        for sentence in sentences:
            self.tokenize(sentence)

    # add tokens from a sentence to the tokenizer, but does not tokenize the sentence
    def tokenize(self, text: str):
        for token in text.split():
            if token not in self.dictionary:
                self.dictionary[token] = self.cur_token
                self.reverse_dictionary[self.cur_token] = token
                self.cur_token += 1

    # returns the token associated with the given character/word, or None if the character does not exist
    def character_to_token(self, character):
        return self.dictionary.get(character, None)

    # returns the character/word associated with the given token, or None if the token does not exist
    def token_to_character(self, token):
        return self.reverse_dictionary.get(token, None)
    
    # tokenizes a list of sentences, returning the tokens
    def texts_to_sequences(self, sentences):
        sequences = []
        for sentence in sentences:
            cur_sentence = []
            for token in sentence.split():
                if token not in self.dictionary:
                    cur_sentence.append(self.dictionary['<UNK>'])
                else:
                    cur_sentence.append(self.dictionary[token])
            sequences.append(cur_sentence)
        return sequences
    
    # returns the characters/words from a sequence of tokens
    def sequence_to_texts(self, sequence: list):
        return [self.token_to_character(token) for token in sequence]

    # returns the size of the tokenizer vocab
    def size(self):
        return len(self.dictionary)

__Tokenize and pad titles and tracks__

In [8]:
# tokenizes the given sentences and returns a list of lists of the tokens
def tokenize(sentences):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences)
    return tokenizer.texts_to_sequences(sentences), tokenizer

In [9]:
# tokenize titles and tracks, and also get the tokenizers
titles_tokens, title_tokenizer = tokenize(titles)
tracks_tokens, track_tokenizer = tokenize(tracks)
print(titles_tokens[:10])

title_vocab = title_tokenizer.size() + 1
track_vocab = track_tokenizer.size() + 1
print(f'Vocabularies: {title_vocab}, {track_vocab}')

max_title_length = int(len(max(titles_tokens, key=len)))
max_track_length = int(len(max(tracks_tokens, key=len)))
print(f'Max lengths: {max_title_length}, {max_track_length}')

pad_titles = pad_sequences(titles_tokens, max_title_length, padding = "post")
pad_tracks = pad_sequences(tracks_tokens, max_track_length, padding = "post")
print(f'Example paddings: {pad_titles[:10]}')

title_tensor = torch.LongTensor(pad_titles)
tracks_tensor = torch.LongTensor(pad_tracks)
print(f'Tensor shapes: {title_tensor.shape}, {tracks_tensor.shape}')


[[1], [2], [3, 4], [5, 6], [7], [8, 9], [10], [11], [12, 13], [14]]
Vocabularies: 2487, 63998
Max lengths: 9, 100
Example paddings: [[ 1  0  0  0  0  0  0  0  0]
 [ 2  0  0  0  0  0  0  0  0]
 [ 3  4  0  0  0  0  0  0  0]
 [ 5  6  0  0  0  0  0  0  0]
 [ 7  0  0  0  0  0  0  0  0]
 [ 8  9  0  0  0  0  0  0  0]
 [10  0  0  0  0  0  0  0  0]
 [11  0  0  0  0  0  0  0  0]
 [12 13  0  0  0  0  0  0  0]
 [14  0  0  0  0  0  0  0  0]]
Tensor shapes: torch.Size([7000, 9]), torch.Size([7000, 100])


__Create a data generator for training__

In [10]:
# data generator for title and track tensors
def data_generator(titles: torch.LongTensor, tracks: torch.LongTensor, batch_size: int = BATCH_SIZE):
    i = 0
    while True:
        # return the current batch
        yield titles[i: i + batch_size], tracks[i: i + batch_size]
        
        i += batch_size
        # if we reach the end of the tensor, start from the beginning again
        if i >= len(titles):
            i = 0

In [11]:
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

# initialize a Transformer model (from above) using the declared params
transformer = Transformer(title_vocab, track_vocab, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

transformer

Transformer(
  (encoder_embedding): Embedding(2487, 512)
  (decoder_embedding): Embedding(63998, 512)
  (positional_encoding): PositionalEncoding()
  (encoder_layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attn): MultiHeadAttention(
        (W_q): Linear(in_features=512, out_features=512, bias=True)
        (W_k): Linear(in_features=512, out_features=512, bias=True)
        (W_v): Linear(in_features=512, out_features=512, bias=True)
        (W_o): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): PositionWiseFeedForward(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder_layers): ModuleList(
    (0-5): 6 x DecoderLayer(


__Training__

This took about 30 sec for 10 epochs with batch size of 35 on a M2 Macbook Air

In [12]:
epochs = 20

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)

# set the model to train mode and create generator for training
transformer.train()
generator = data_generator(title_tensor, tracks_tensor)

for epoch in range(epochs):
    optimizer.zero_grad()
    titles, tracks = next(generator)
    output = transformer(titles, tracks)
    loss = criterion(output.contiguous().view(-1, track_vocab), tracks.contiguous().view(-1))
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

Epoch: 1, Loss: 11.189005851745605
Epoch: 2, Loss: 11.276976585388184
Epoch: 3, Loss: 11.185345649719238
Epoch: 4, Loss: 11.214712142944336
Epoch: 5, Loss: 11.159171104431152
Epoch: 6, Loss: 11.037936210632324
Epoch: 7, Loss: 11.10622787475586
Epoch: 8, Loss: 11.050960540771484
Epoch: 9, Loss: 11.041683197021484
Epoch: 10, Loss: 10.912346839904785
Epoch: 11, Loss: 11.091769218444824
Epoch: 12, Loss: 11.01407527923584
Epoch: 13, Loss: 11.002462387084961
Epoch: 14, Loss: 10.9982328414917
Epoch: 15, Loss: 10.865768432617188
Epoch: 16, Loss: 10.556499481201172
Epoch: 17, Loss: 10.712580680847168
Epoch: 18, Loss: 10.907352447509766
Epoch: 19, Loss: 10.831652641296387
Epoch: 20, Loss: 10.94340705871582


__Evaluating and generating predictions__

Currently, the model is only returning the same song

In [13]:
# Change the model to eval mode
transformer.eval()

def generate_playlist(seed: int):
    print('Generating playlist:', ' '.join([token for token in title_tokenizer.sequence_to_texts(title_tensor[seed].tolist()) if token != "<UNK>"]))
    output = transformer(title_tensor[seed: seed + 1], tracks_tensor[seed: seed + 1])

    # Get the track with the highest probability, then convert the tokens back to URIs
    output = output.view(-1, track_vocab).argmax(1)
    output = track_tokenizer.sequence_to_texts(output.numpy())

    output = list(set(output))

    # use Spotify API to get the song names
    client = SpotifyClient()
    playlist = client.get_song_titles(output)
    print([f"{track['name']} by {', '.join([artist['name'] for artist in track['artists']])}" for track in playlist['tracks']])

# change to generate for a different seed
generate_playlist(151)

Generating playlist: wedding playlist
['One Dance by Drake, Wizkid, Kyla']
