In [1]:
import tensorflow as tf
import keras
import pandas as pd
import numpy as np
import sklearn
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, Bidirectional, Dropout
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from keras import backend as K
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

K.clear_session()

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
tf.config.list_physical_devices('GPU') 

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [19]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)
        
gpus

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [5]:
# Load the dataset
file_path = '../Data/podcasts_data.csv'
dataset = pd.read_csv(file_path)
dataset.head()

Unnamed: 0,Genre,Podcast Name,Description,Publisher,Total Episodes,Spotify URL,Cover Image URL
0,arts and entertainment,Easy Stories in English,"Learning a language is hard, but Easy Stories ...","Ariel Goodbody, Polyglot English Teacher & Gla...",216,https://open.spotify.com/show/23zdIqNUb0riR51w...,https://i.scdn.co/image/ab6765630000ba8a767693...
1,arts and entertainment,Podcast Buku Kutu,"EPISODE BARU SETIAP SENIN, RABU, dan JUMAT -- ...",Aditya Hadi - PODLUCK,162,https://open.spotify.com/show/3w5zKrbQ6kgB0RKI...,https://i.scdn.co/image/ab6765630000ba8a04fa1a...
2,arts and entertainment,Underwood and Flinch and Other Audiobooks by M...,Underwood and Flinch is a three-time Parsec aw...,Mike Bennett,244,https://open.spotify.com/show/3VwIE3bG0zpTCNzR...,https://i.scdn.co/image/ab6765630000ba8a4e7b42...
3,arts and entertainment,Podcast Resensi Buku,Kumpulan resensi beragam buku berbagai genre d...,Podcast Resensi Buku - PODLUCK,264,https://open.spotify.com/show/6woLsDl6CSntzeWU...,https://i.scdn.co/image/ab6765630000ba8a1e97ef...
4,arts and entertainment,SupremeMasterTV,Supreme Master Television is an international ...,SupremeMasterTV,500,https://open.spotify.com/show/5bCgERRINgZWhauS...,https://i.scdn.co/image/ab6765630000ba8a7899e5...


In [6]:
# Acak urutan baris
dataset = dataset.sample(frac=1).reset_index(drop=True)

In [8]:
# Ensure you have the stopwords for both English and Indonesian
stop_words = set(stopwords.words('english')).union(set(stopwords.words('indonesian')))

# Cleaning 'Podcast Name' column
dataset['Podcast Name'] = dataset['Podcast Name'].str.lower()  # Convert to lowercase
dataset['Podcast Name'] = dataset['Podcast Name'].str.replace(r'[^\w\s]', '', regex=True)  # Remove punctuation
dataset['Podcast Name'] = dataset['Podcast Name'].str.replace(r'\d+', '', regex=True)  # Remove numbers
dataset['Podcast Name'] = dataset['Podcast Name'].str.replace(r'\s+', ' ', regex=True)  # Remove extra whitespace
dataset['Podcast Name'] = dataset['Podcast Name'].str.strip()  # Remove leading and trailing whitespace
dataset['Podcast Name'] = dataset['Podcast Name'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))  # Remove stop words

# Cleaning 'Genre' column
dataset['Genre'] = dataset['Genre'].str.lower()  # Convert to lowercase
dataset['Genre'] = dataset['Genre'].str.replace(r'[^\w\s]', '', regex=True)  # Remove punctuation
dataset['Genre'] = dataset['Genre'].str.replace(r'\d+', '', regex=True)  # Remove numbers
dataset['Genre'] = dataset['Genre'].str.replace(r'\s+', ' ', regex=True)  # Remove extra whitespace
dataset['Genre'] = dataset['Genre'].str.strip()  # Remove leading and trailing whitespace
dataset['Genre'] = dataset['Genre'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))  # Remove stop words

In [9]:
# Drop rows with NaN values in 'Podcast Name' column
podcast_data = dataset.dropna(subset=['Podcast Name'])

# Extract podcast names
podcast_names = podcast_data['Podcast Name'].values

# Extract relevant columns
podcast_names = podcast_data['Podcast Name'].values
podcast_genres = podcast_data['Genre'].values
podcast_descriptions = podcast_data['Description'].values
podcast_publishers = podcast_data['Publisher'].values
podcast_spotify_urls = podcast_data['Spotify URL'].values
podcast_cover_image_urls = podcast_data['Cover Image URL'].values

In [10]:
# Tokenization and Vectorization
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(podcast_names)

# Convert podcast names to sequences of integers
sequences = tokenizer.texts_to_sequences(podcast_names)

# Pad sequences to have the same length
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Get the vocabulary size for the embedding layer
vocab_size = len(tokenizer.word_index) + 1


In [11]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, padded_sequences, test_size=0.2, random_state=42)


In [12]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
    Bidirectional(LSTM(128, return_sequences=True)),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Dense(512, activation = 'relu'),
    TimeDistributed(Dense(vocab_size, activation='softmax'))
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 23, 128)           1150208   
                                                                 
 bidirectional (Bidirectiona  (None, 23, 256)          263168    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 23, 128)          164352    
 nal)                                                            
                                                                 
 dropout (Dropout)           (None, 23, 128)           0         
                                                                 
 dense (Dense)               (None, 23, 512)           66048     
                                                                 
 time_distributed (TimeDistr  (None, 23, 8986)         4

In [13]:
# Prepare labels to match the output shape of the model
labels = np.expand_dims(padded_sequences, axis=-1)

# Train the model
model.fit(padded_sequences, labels, epochs=20, batch_size=128, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1bb27df9b70>

In [27]:

import tensorflow as tf

In [28]:
# Predict on the test set
y_pred_test = model.predict(X_test)
y_pred_test_reshaped = y_pred_test.reshape(y_pred_test.shape[0], -1)

ResourceExhaustedError: Graph execution error:

Detected at node 'sequential/time_distributed/dense_1/Softmax' defined at (most recent call last):
    File "C:\Users\ACER\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\ACER\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
      app.launch_new_instance()
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
      app.start()
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\ipykernel\kernelapp.py", line 739, in start
      self.io_loop.start()
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\tornado\platform\asyncio.py", line 205, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\ACER\AppData\Local\Programs\Python\Python310\lib\asyncio\base_events.py", line 603, in run_forever
      self._run_once()
    File "C:\Users\ACER\AppData\Local\Programs\Python\Python310\lib\asyncio\base_events.py", line 1909, in _run_once
      handle._run()
    File "C:\Users\ACER\AppData\Local\Programs\Python\Python310\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue
      await self.process_one()
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one
      await dispatch(*args)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell
      await result
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\ipykernel\ipkernel.py", line 362, in execute_request
      await super().execute_request(stream, ident, parent)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request
      reply_content = await reply_content
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\ipykernel\ipkernel.py", line 449, in do_execute
      res = shell.run_cell(
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell
      result = self._run_cell(
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell
      result = runner(coro)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\ACER\AppData\Local\Temp\ipykernel_21244\3067214628.py", line 2, in <module>
      y_pred_test = model.predict(X_test)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\engine\training.py", line 2253, in predict
      tmp_batch_outputs = self.predict_function(iterator)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\engine\training.py", line 2041, in predict_function
      return step_function(self, iterator)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\engine\training.py", line 2027, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\engine\training.py", line 2015, in run_step
      outputs = model.predict_step(data)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\engine\training.py", line 1983, in predict_step
      return self(x, training=False)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\engine\training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\engine\sequential.py", line 410, in call
      return super().call(inputs, training=training, mask=mask)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\engine\functional.py", line 510, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\engine\functional.py", line 667, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\layers\rnn\time_distributed.py", line 252, in call
      y = self.layer(inputs, **kwargs)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\layers\core\dense.py", line 255, in call
      outputs = self.activation(outputs)
    File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\keras\activations.py", line 84, in softmax
      output = tf.nn.softmax(x, axis=axis)
Node: 'sequential/time_distributed/dense_1/Softmax'
OOM when allocating tensor with shape[736,8986] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node sequential/time_distributed/dense_1/Softmax}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_predict_function_25471]

In [16]:
model.save('model.h5')
print('Model saved succesfully')

Model saved succesfully


In [19]:
#Run on terminal!
!tensorflowjs_converter --input_format=keras --output_format=tfjs_layers_model "D:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\Capstone-project-podpicks\Code\model.h5" "D:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\Capstone-project-podpicks\Code\tfjs_model"

2024-06-10 16:46:40.807131: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
usage: TensorFlow.js model converters. [-h]
                                       [--input_format {keras_saved_model,tf_hub,tf_saved_model,keras,tf_frozen_model,tfjs_layers_model}]
                                       [--output_format {keras_saved_model,tfjs_graph_model,keras,tfjs_layers_model}]
                                       [--signature_name SIGNATURE_NAME]
                                       [--saved_model_tags SAVED_MODEL_TAGS]
                                       [--quantize_float16 [QUANTIZE_FLOAT16]]
                                       [--quantize_uint8 [QUANTIZE_UINT8]]
                                       [--quantize_uint16 [QUANTIZE_UINT16]]
                                       [--quantization_bytes {1,2}]
                                       [--split_weights_by_layer] [--ver

In [15]:
def get_podcast_embeddings(model, data, batch_size=512):
    embeddings = []
    for i in range(0, len(data), batch_size):
        batch_data = data[i:i + batch_size]
        batch_embeddings = model.predict(batch_data)
        batch_embeddings = batch_embeddings.reshape(batch_embeddings.shape[0], -1)
        embeddings.append(batch_embeddings)
    embeddings = np.vstack(embeddings)
    return embeddings

# Prepare the embeddings for the podcasts
podcast_embeddings = get_podcast_embeddings(model, padded_sequences)



KeyboardInterrupt: 

In [None]:
def search_podcasts(query, top_k=5):
    # Tokenize and pad the query
    query_sequence = tokenizer.texts_to_sequences([query])
    query_padded = pad_sequences(query_sequence, maxlen=max_length, padding='post')

    # Encode the query using the trained model
    query_embedding = model.predict(query_padded)
    query_embedding = query_embedding.reshape(1, -1)
    cosine_scores = cosine_similarity(query_embedding, podcast_embeddings)

    # Get the top_k similar podcasts
    top_k_indices = np.argsort(cosine_scores[0])[-top_k:][::-1]

    # Retrieve the corresponding podcast names
    similar_podcasts = [{
        'Name': podcast_names[idx],
        'Genre': podcast_genres[idx],
        #'Description': podcast_descriptions[idx],
        'Publisher': podcast_publishers[idx],
        'Spotify URL': podcast_spotify_urls[idx],
        'Cover Image URL': podcast_cover_image_urls[idx]
    } for idx in top_k_indices]

    return similar_podcasts

In [None]:
# Example search
query = "games"
similar_podcasts = search_podcasts(query)
print(similar_podcasts)

[{'Name': 'epiphantastic', 'Genre': 'health', 'Publisher': 'Shrestha S Bharadwaj', 'Spotify URL': 'https://open.spotify.com/show/6Jyd77b4lV8hQ2zqBdFEQf', 'Cover Image URL': 'https://i.scdn.co/image/ab6765630000ba8ac105f22d2145df57bb060798'}, {'Name': 'epiphantastic', 'Genre': 'lifestyle', 'Publisher': 'Shrestha S Bharadwaj', 'Spotify URL': 'https://open.spotify.com/show/6Jyd77b4lV8hQ2zqBdFEQf', 'Cover Image URL': 'https://i.scdn.co/image/ab6765630000ba8ac105f22d2145df57bb060798'}, {'Name': 'epiphantastic', 'Genre': 'books', 'Publisher': 'Shrestha S Bharadwaj', 'Spotify URL': 'https://open.spotify.com/show/6Jyd77b4lV8hQ2zqBdFEQf', 'Cover Image URL': 'https://i.scdn.co/image/ab6765630000ba8ac105f22d2145df57bb060798'}, {'Name': 'epiphantastic', 'Genre': 'selfcare', 'Publisher': 'Shrestha S Bharadwaj', 'Spotify URL': 'https://open.spotify.com/show/6Jyd77b4lV8hQ2zqBdFEQf', 'Cover Image URL': 'https://i.scdn.co/image/ab6765630000ba8ac105f22d2145df57bb060798'}, {'Name': 'joystix', 'Genre': 'v

In [None]:
model.save('model.h5')
print('Model saved succesfully')

Model saved succesfully


In [None]:
#Run on terminal!
!tensorflowjs_converter --input_format=keras --output_format=tfjs_layers_model "D:\PodPicks\Code\model.h5" "D:\PodPicks\Code\tfjs_model"

^C


2024-06-10 15:57:27.160703: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2024-06-10 15:57:27.161587: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Traceback (most recent call last):
  File "C:\Users\ACER\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\ACER\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\Scripts\tensorflowjs_converter.exe\__main__.py", line 4, in <module>
  File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRODUCT\.venv\lib\site-packages\tensorflowjs\__init__.py", line 21, in <module>
    from tensorflowjs import converters
  File "d:\Kuliah Stuff\Kuliah SMT 6\Capstone\PRO