In [39]:
import tensorflow as tf
import keras
import pandas as pd
import numpy as np
import sklearn
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, Bidirectional, Dropout
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from keras import backend as K
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

K.clear_session()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [41]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

In [42]:
# Load the dataset
file_path = 'podcasts_data.csv'
dataset = pd.read_csv(file_path)
dataset.head()

Unnamed: 0,Genre,Podcast Name,Description,Publisher,Total Episodes,Spotify URL,Cover Image URL
0,arts and entertainment,Easy Stories in English,"Learning a language is hard, but Easy Stories ...","Ariel Goodbody, Polyglot English Teacher & Gla...",216,https://open.spotify.com/show/23zdIqNUb0riR51w...,https://i.scdn.co/image/ab6765630000ba8a767693...
1,arts and entertainment,Podcast Buku Kutu,"EPISODE BARU SETIAP SENIN, RABU, dan JUMAT -- ...",Aditya Hadi - PODLUCK,162,https://open.spotify.com/show/3w5zKrbQ6kgB0RKI...,https://i.scdn.co/image/ab6765630000ba8a04fa1a...
2,arts and entertainment,Underwood and Flinch and Other Audiobooks by M...,Underwood and Flinch is a three-time Parsec aw...,Mike Bennett,244,https://open.spotify.com/show/3VwIE3bG0zpTCNzR...,https://i.scdn.co/image/ab6765630000ba8a4e7b42...
3,arts and entertainment,Podcast Resensi Buku,Kumpulan resensi beragam buku berbagai genre d...,Podcast Resensi Buku - PODLUCK,264,https://open.spotify.com/show/6woLsDl6CSntzeWU...,https://i.scdn.co/image/ab6765630000ba8a1e97ef...
4,arts and entertainment,SupremeMasterTV,Supreme Master Television is an international ...,SupremeMasterTV,500,https://open.spotify.com/show/5bCgERRINgZWhauS...,https://i.scdn.co/image/ab6765630000ba8a7899e5...


In [43]:
# Cleaning
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = text.strip()  # Remove leading and trailing whitespace
    stop_words = set(stopwords.words('english', 'indonesian'))
    text = ' '.join(word for word in text.split() if word not in stop_words)  # Remove stop words
    return text

dataset['Podcast Name'] = dataset['Podcast Name'].apply(clean_text)
dataset['Genre'] = dataset['Genre'].apply(clean_text)

In [44]:
# Drop rows with NaN values in 'Podcast Name' column
podcast_data = dataset.dropna(subset=['Podcast Name'])

# Extract podcast names
podcast_names = podcast_data['Podcast Name'].values

# Extract relevant columns
podcast_names = podcast_data['Podcast Name'].values
podcast_genres = podcast_data['Genre'].values
podcast_descriptions = podcast_data['Description'].values
podcast_publishers = podcast_data['Publisher'].values
podcast_spotify_urls = podcast_data['Spotify URL'].values
podcast_cover_image_urls = podcast_data['Cover Image URL'].values

In [45]:
# Tokenization and Vectorization
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(podcast_names)

# Convert podcast names to sequences of integers
sequences = tokenizer.texts_to_sequences(podcast_names)

# Pad sequences to have the same length
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Get the vocabulary size for the embedding layer
vocab_size = len(tokenizer.word_index) + 1


In [46]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Dense(128, activation = 'relu'),
    TimeDistributed(Dense(vocab_size, activation='softmax'))
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 23, 128)           1153664   
                                                                 
 bidirectional (Bidirectiona  (None, 23, 128)          98816     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 23, 128)          98816     
 nal)                                                            
                                                                 
 dropout (Dropout)           (None, 23, 128)           0         
                                                                 
 dense (Dense)               (None, 23, 128)           16512     
                                                                 
 time_distributed (TimeDistr  (None, 23, 9013)         1

In [47]:
# Prepare labels to match the output shape of the model
labels = np.expand_dims(padded_sequences, axis=-1)

# Train the model
model.fit(padded_sequences, labels, epochs=20, batch_size=64, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x265a9116c50>

In [55]:
def get_podcast_embeddings(model, data, batch_size=1048):
    embeddings = []
    for i in range(0, len(data), batch_size):
        batch_data = data[i:i + batch_size]
        batch_embeddings = model.predict(batch_data)
        batch_embeddings = batch_embeddings.reshape(batch_embeddings.shape[0], -1)
        embeddings.append(batch_embeddings)
    embeddings = np.vstack(embeddings)
    return embeddings

# Prepare the embeddings for the podcasts
podcast_embeddings = get_podcast_embeddings(model, padded_sequences)



In [56]:
def search_podcasts(query, top_k=5):
    # Tokenize and pad the query
    query_sequence = tokenizer.texts_to_sequences([query])
    query_padded = pad_sequences(query_sequence, maxlen=max_length, padding='post')

    # Encode the query using the trained model
    query_embedding = model.predict(query_padded)
    query_embedding = query_embedding.reshape(1, -1)
    cosine_scores = cosine_similarity(query_embedding, podcast_embeddings)

    # Get the top_k similar podcasts
    top_k_indices = np.argsort(cosine_scores[0])[-top_k:][::-1]

    # Retrieve the corresponding podcast names
    similar_podcasts = [{
        'Name': podcast_names[idx],
        'Genre': podcast_genres[idx],
        #'Description': podcast_descriptions[idx],
        'Publisher': podcast_publishers[idx],
        'Spotify URL': podcast_spotify_urls[idx],
        'Cover Image URL': podcast_cover_image_urls[idx]
    } for idx in top_k_indices]

    return similar_podcasts

In [58]:
# Example search
query = "musik"
similar_podcasts = search_podcasts(query)
print(similar_podcasts)

[{'Name': 'h', 'Genre': 'comedy', 'Publisher': '14H14', 'Spotify URL': 'https://open.spotify.com/show/2Bm4iMJR4GFpLaR9eTV3jJ', 'Cover Image URL': 'https://i.scdn.co/image/85ac2c94a91a0c0cc99a43b3110098afba8e5045'}, {'Name': 'dmold', 'Genre': 'games', 'Publisher': '2DMOld', 'Spotify URL': 'https://open.spotify.com/show/7bdlE1ubxTsV3lmCibQicH', 'Cover Image URL': 'https://i.scdn.co/image/ab6765630000ba8a6982f8da85982ff104e0428d'}, {'Name': 'lingo', 'Genre': 'business', 'Publisher': 'The Lingo', 'Spotify URL': 'https://open.spotify.com/show/3H6cDSrjo2rGrp9saGaX46', 'Cover Image URL': 'https://i.scdn.co/image/7a795c7afe5740355040d5f81d38998432c7a84e'}, {'Name': 'exit', 'Genre': 'comedy', 'Publisher': "Stick'n'Poke Productions", 'Spotify URL': 'https://open.spotify.com/show/7adHI2N3DNDXvQTeCWWZhQ', 'Cover Image URL': 'https://i.scdn.co/image/fab70837c0cf3162e090ec6383e65b01df0c6f21'}, {'Name': 'jancukers', 'Genre': 'language', 'Publisher': 'jembleng edan', 'Spotify URL': 'https://open.spoti

In [59]:
model.save('model.h5')

In [61]:
from keras.models import load_model
import json
model = load_model("model.h5")

In [62]:
model_json = model.to_json()

In [63]:
with open("model.json", "w") as json_file:
    json_file.write(model_json)