In [1]:
# importing necessary libraries
import tensorflow as tf
import transformers
from transformers import AutoTokenizer
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import csv
from transformers import BertTokenizer, TFBertModel
import random

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the challenge-set playlists and build the parallel training lists.
# The context manager closes the file handle as soon as the JSON is parsed
# (the handle was previously left open), and the explicit UTF-8 encoding
# avoids depending on the platform's default text encoding.
with open('spotify_million_playlist_dataset_challenge/challenge_set.json', encoding='utf-8') as f:
    js = json.load(f)

playlists = js['playlists']
titles = []            # one lower-cased playlist name per kept playlist
tracks = []            # one space-joined string of track IDs per kept playlist
trackID_to_name = {}   # lower-cased track ID -> track name

for playlist in playlists:
    # Playlists with no tracks or without a 'name' key cannot form a
    # (title, tracklist) training pair, so they are skipped.
    if not playlist['tracks'] or 'name' not in playlist:
        continue
    titles.append(playlist['name'].lower())

    track_ids = []
    for track in playlist['tracks']:
        # The ID is the third ':'-separated field of the track URI.
        track_id = track['track_uri'].split(":")[2]
        track_ids.append(track_id)
        # Keys are lower-cased here while `tracks` keeps the original case;
        # presumably this matches the lower-cased tokens produced by the
        # tokenizer later on -- confirm against the tokenizer settings.
        trackID_to_name[track_id.lower()] = track['track_name']
    tracks.append(' '.join(track_ids))


In [3]:
def tokenize(sentences):
    """Fit a new ``Tokenizer`` on ``sentences`` and encode those same sentences.

    Args:
        sentences: the texts used both to build the vocabulary and to encode.

    Returns:
        A ``(sequences, tokenizer)`` tuple: the result of
        ``texts_to_sequences(sentences)`` and the fitted tokenizer itself, so
        callers can reuse its ``word_index`` and encode further text.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(sentences)
    encoded = fitted_tokenizer.texts_to_sequences(sentences)
    return encoded, fitted_tokenizer

In [4]:
# Titles and track-ID strings get independent vocabularies.
titles_tokens, title_tokenizer = tokenize(titles)
tracks_tokens, track_tokenizer = tokenize(tracks)

# One extra slot on top of the learned words; index 0 is not in `word_index`
# and is what the post-padding below fills with.
title_vocab = 1 + len(title_tokenizer.word_index)
track_vocab = 1 + len(track_tokenizer.word_index)

# Longest encoded sequence on each side defines the padded width.
max_title_length = max(len(sequence) for sequence in titles_tokens)
max_track_length = max(len(sequence) for sequence in tracks_tokens)

# Right-pad every sequence to the common width, then append a trailing axis
# of size 1 so each array has shape (num_playlists, width, 1).
pad_titles = np.expand_dims(
    pad_sequences(titles_tokens, maxlen=max_title_length, padding="post"),
    axis=-1,
)
pad_tracks = np.expand_dims(
    pad_sequences(tracks_tokens, maxlen=max_track_length, padding="post"),
    axis=-1,
)

In [5]:
# --- Encoder -----------------------------------------------------------------
# Title token ids -> 128-dimensional embeddings -> one 64-unit LSTM output
# (return_sequences=False, so only a single vector per title is produced).
input_sequence = Input(shape=(max_title_length,))
embedding = Embedding(
    input_dim=title_vocab,
    output_dim=128,
)(input_sequence)
encoder = LSTM(units=64, return_sequences=False)(embedding)

# --- Decoder -----------------------------------------------------------------
# The encoder vector is repeated once per output position, run through a
# second 64-unit LSTM (dropout 0.2) that emits one output per position, and
# projected to `track_vocab` scores at every position.
r_vec = RepeatVector(n=max_track_length)(encoder)
decoder = LSTM(units=64, return_sequences=True, dropout=0.2)(r_vec)
logits = TimeDistributed(layer=Dense(units=track_vocab))(decoder)

In [6]:
# Softmax over the per-position scores, then wrap input -> probabilities in a
# Model. The targets are integer ids, hence the sparse cross-entropy loss.
track_probabilities = Activation('softmax')(logits)
enc_dec_model = Model(inputs=input_sequence, outputs=track_probabilities)
enc_dec_model.compile(
    optimizer=Adam(1e-3),
    loss=sparse_categorical_crossentropy,
    metrics=['accuracy'],
)
enc_dec_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 9)]               0         
_________________________________________________________________
embedding (Embedding)        (None, 9, 128)            285440    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                49408     
_________________________________________________________________
repeat_vector (RepeatVector) (None, 100, 64)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 64)           33024     
_________________________________________________________________
time_distributed (TimeDistri (None, 100, 63997)        4159805   
_________________________________________________________________
activation (Activation)      (None, 100, 63997)        0     

In [7]:
# Train for 3 epochs in batches of 30 playlists; the returned object is kept
# in `model_results` for later inspection.
model_results = enc_dec_model.fit(
    x=pad_titles,
    y=pad_tracks,
    batch_size=30,
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [8]:
def logits_to_sentence(logits, tokenizer):
    """Decode per-position class scores into a space-separated token string.

    Rather than taking the best-scoring class at every position, one rank
    ``k`` is drawn at random (2nd- to 9th-best) and the ``k``-th best class is
    used for *all* positions of this call. The result is therefore
    non-deterministic unless the ``random`` module is seeded, and the
    top-scoring class is never selected when at least two classes exist.

    Args:
        logits: 2-D array-like of shape (positions, classes) holding one row
            of class scores per output position.
        tokenizer: object exposing ``word_index`` (a word -> index mapping).

    Returns:
        The decoded words joined by single spaces. Class index 0, which has
        no entry in ``word_index``, is rendered as ``'empty'``.

    Raises:
        KeyError: if a selected class index is neither 0 nor a value in
            ``tokenizer.word_index``.
    """
    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    index_to_words[0] = 'empty'

    ranked = np.argsort(logits, axis=1)  # ascending: best class is last
    n_classes = ranked.shape[1]

    # The rank used to be drawn from 2..9 unconditionally, which indexes out
    # of bounds whenever there are fewer than 9 classes. Clamp it to the
    # number of classes; with 9 or more classes the draw is unchanged.
    highest_rank = min(9, n_classes)
    if highest_rank >= 2:
        rank = random.randrange(2, highest_rank + 1)
    else:
        rank = 1  # a single class leaves nothing but the top choice

    return ' '.join(index_to_words[prediction] for prediction in ranked[:, -rank])

# index = 5679
# print(titles[index])
# print(logits_to_sentence(enc_dec_model.predict(pad_titles[index])[0], track_tokenizer))
# strlog = logits_to_sentence(enc_dec_model.predict(pad_titles[index])[0], track_tokenizer)
# s = set()
# for i in strlog.split():
#     s.add(trackID_to_name[i])
# print(s)


In [9]:
def predict_tracklist(title, enc_dec_model, title_tokenizer, track_tokenizer, max_title_length, max_track_length):
    """Predict a track list for a playlist title.

    Args:
        title: playlist title; it is lower-cased before encoding.
        enc_dec_model: model whose ``predict`` is called on the encoded title.
        title_tokenizer: tokenizer used to encode ``title``.
        track_tokenizer: tokenizer passed to ``logits_to_sentence`` to decode
            the prediction.
        max_title_length: width the encoded title is post-padded to.
        max_track_length: accepted but not used in this function.

    Returns:
        ``(predicted_sentence, track_names)``: the decoded space-separated
        token string, and the set of names looked up for each token in the
        module-level ``trackID_to_name`` mapping (``"Unknown Track Name"``
        when a token has no entry).
    """
    # Encode the single title as a batch of one, pad it, and add the same
    # trailing size-1 axis the training inputs carry.
    encoded_title = title_tokenizer.texts_to_sequences([title.lower()])
    model_input = pad_sequences(encoded_title, max_title_length, padding="post")
    model_input = model_input.reshape(*model_input.shape, 1)

    # First (and only) item of the predicted batch.
    position_scores = enc_dec_model.predict(model_input)[0]
    predicted_sentence = logits_to_sentence(position_scores, track_tokenizer)

    # A set: repeated predictions of the same track collapse to one name.
    track_names = {
        trackID_to_name.get(track_id, "Unknown Track Name")
        for track_id in predicted_sentence.split()
    }

    return predicted_sentence, track_names

# Demo: generate a track list for a one-word title and show the raw token
# string together with the resolved track names.
input_title = "summer"
predicted_sentence, track_names = predict_tracklist(
    title=input_title,
    enc_dec_model=enc_dec_model,
    title_tokenizer=title_tokenizer,
    track_tokenizer=track_tokenizer,
    max_title_length=max_title_length,
    max_track_length=max_track_length,
)

print(
    f"Input Title: {input_title}",
    f"Predicted Tracklist: {predicted_sentence}",
    f"Extracted Track Names: {track_names}",
    sep="\n",
)

Input Title: summer
Predicted Tracklist: 5nqbuaeteogdd6hhcre0dz 5nqbuaeteogdd6hhcre0dz 5nqbuaeteogdd6hhcre0dz 5nqbuaeteogdd6hhcre0dz 5nqbuaeteogdd6hhcre0dz 5nqbuaeteogdd6hhcre0dz 5nqbuaeteogdd6hhcre0dz 5nqbuaeteogdd6hhcre0dz 5nqbuaeteogdd6hhcre0dz 5nqbuaeteogdd6hhcre0dz 5nqbuaeteogdd6hhcre0dz 5nqbuaeteogdd6hhcre0dz 5nqbuaeteogdd6hhcre0dz 5nqbuaeteogdd6hhcre0dz 5nqbuaeteogdd6hhcre0dz 7yyrtczmciyzzjlnzgc9ol 7kxjtscq5nl1loytl7xaws 0v9wz8o0bt8du38r4ddjeh 0v9wz8o0bt8du38r4ddjeh 0v9wz8o0bt8du38r4ddjeh 0v9wz8o0bt8du38r4ddjeh 62vpwi1chwfy7tmicsstl8 62vpwi1chwfy7tmicsstl8 62vpwi1chwfy7tmicsstl8 62vpwi1chwfy7tmicsstl8 62vpwi1chwfy7tmicsstl8 5xjjdnpkwmbuwe79gv0nxk 6gbfpufcjlzwgx4lenp6h2 6gbfpufcjlzwgx4lenp6h2 6gbfpufcjlzwgx4lenp6h2 6gbfpufcjlzwgx4lenp6h2 6gbfpufcjlzwgx4lenp6h2 6gbfpufcjlzwgx4lenp6h2 6gbfpufcjlzwgx4lenp6h2 6gbfpufcjlzwgx4lenp6h2 6gbfpufcjlzwgx4lenp6h2 6gbfpufcjlzwgx4lenp6h2 343ybumqhu19cgogarutsd 343ybumqhu19cgogarutsd 343ybumqhu19cgogarutsd 343ybumqhu19cgogarutsd 1xznggdreh1oqq0x