In [1]:
import tensorflow as tf
import keras
import pandas as pd
import numpy as np
import sklearn
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, Bidirectional, Dropout
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the dataset
file_path = '../Data/podcasts_data.csv'
dataset = pd.read_csv(file_path)
dataset.head()

Unnamed: 0,Genre,Podcast Name,Description,Publisher,Total Episodes,Spotify URL,Cover Image URL
0,arts and entertainment,Easy Stories in English,"Learning a language is hard, but Easy Stories ...","Ariel Goodbody, Polyglot English Teacher & Gla...",216,https://open.spotify.com/show/23zdIqNUb0riR51w...,https://i.scdn.co/image/ab6765630000ba8a767693...
1,arts and entertainment,Podcast Buku Kutu,"EPISODE BARU SETIAP SENIN, RABU, dan JUMAT -- ...",Aditya Hadi - PODLUCK,162,https://open.spotify.com/show/3w5zKrbQ6kgB0RKI...,https://i.scdn.co/image/ab6765630000ba8a04fa1a...
2,arts and entertainment,Underwood and Flinch and Other Audiobooks by M...,Underwood and Flinch is a three-time Parsec aw...,Mike Bennett,244,https://open.spotify.com/show/3VwIE3bG0zpTCNzR...,https://i.scdn.co/image/ab6765630000ba8a4e7b42...
3,arts and entertainment,Podcast Resensi Buku,Kumpulan resensi beragam buku berbagai genre d...,Podcast Resensi Buku - PODLUCK,264,https://open.spotify.com/show/6woLsDl6CSntzeWU...,https://i.scdn.co/image/ab6765630000ba8a1e97ef...
4,arts and entertainment,SupremeMasterTV,Supreme Master Television is an international ...,SupremeMasterTV,500,https://open.spotify.com/show/5bCgERRINgZWhauS...,https://i.scdn.co/image/ab6765630000ba8a7899e5...


In [3]:

# Drop rows with NaN values in 'Podcast Name' column
podcast_data = dataset.dropna(subset=['Podcast Name'])

# Extract podcast names
podcast_names = podcast_data['Podcast Name'].values

# Extract relevant columns
podcast_names = podcast_data['Podcast Name'].values
podcast_descriptions = podcast_data['Description'].values
podcast_publishers = podcast_data['Publisher'].values
podcast_spotify_urls = podcast_data['Spotify URL'].values
podcast_cover_image_urls = podcast_data['Cover Image URL'].values



In [4]:
# Tokenization and Vectorization
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(podcast_names)

# Convert podcast names to sequences of integers
sequences = tokenizer.texts_to_sequences(podcast_names)

# Pad sequences to have the same length
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Get the vocabulary size for the embedding layer
vocab_size = len(tokenizer.word_index) + 1

In [5]:
# Build the model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    TimeDistributed(Dense(vocab_size, activation='softmax'))
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 23, 128)           1199488   
                                                                 
 bidirectional (Bidirectiona  (None, 23, 128)          98816     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 23, 128)           0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 23, 128)          98816     
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 23, 128)           0         
                                                                 
 time_distributed (TimeDistr  (None, 23, 9371)         1

In [6]:
# Prepare labels to match the output shape of the model
labels = np.expand_dims(padded_sequences, axis=-1)

# Train the model
model.fit(padded_sequences, labels, epochs=20, batch_size=64, validation_split=0.2)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x16d2739b8e0>

In [7]:
# Load pre-trained Sentence-BERT model
bert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Encode podcast names
podcast_embeddings = bert_model.encode(podcast_names, convert_to_tensor=True)

def search_podcasts(query, top_k=5):
    query_embedding = bert_model.encode([query], convert_to_tensor=True)
    cosine_scores = cosine_similarity(query_embedding, podcast_embeddings)

    # Get the top_k similar podcasts
    top_k_indices = np.argsort(cosine_scores[0])[-top_k:][::-1]

    # Retrieve the corresponding podcast names
    similar_podcasts = [{
        'Name': podcast_names[idx],
        'Description': podcast_descriptions[idx],
        'Publisher': podcast_publishers[idx],
        'Spotify URL': podcast_spotify_urls[idx],
        'Cover Image URL': podcast_cover_image_urls[idx]
    } for idx in top_k_indices]

    return similar_podcasts



In [8]:
# Example search
query = "cook"
similar_podcasts = search_podcasts(query)
print(similar_podcasts)

[{'Name': 'Creepy Cooking Staff', 'Description': 'You can’t have a main course without getting a little experimental in the kitchen. The Creepy Cooking Staff is a podcast where UCA fixture Allen Chaney and his co-host Mike Macdee use the ‘ingredients’ supplied by listeners and their guests to try and make something tasty (or at the very least edible). Once a month Allen and Mike tackle the popular tropes and common story types of Creepypasta as suggested to them and try to brainstorm a story on the fly with the help of a guest. The results serve as a humorous glance at attempting to defy a genre in an attempt to improve it. Also fart jokes.', 'Publisher': 'Allen Chaney & Creative Horror', 'Spotify URL': 'https://open.spotify.com/show/7MdFr3sB77ifNwBBB7piaU', 'Cover Image URL': 'https://i.scdn.co/image/6562ccdff6aea29b94775336005bed48347dbd1e'}, {'Name': 'Creepy Cooking Staff', 'Description': 'You can’t have a main course without getting a little experimental in the kitchen. The Creepy 

In [9]:
model.save('model.h5')
print('Model saved succesfully')

Model saved succesfully
