In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, Bidirectional, Dropout
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.utils import shuffle
from sentence_transformers import SentenceTransformer
from keras import backend as K
nltk.download('stopwords')


  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\syari\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Text Cleaning Function
_STOP_WORDS = None  # lazily-built cache of the NLTK English stopword set


def _english_stop_words():
    """Return the English stopword set, reading the NLTK corpus only on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def clean_text(text):
    """
    Normalise a piece of text: lower-case it, remove punctuation and digits,
    collapse whitespace, and drop English stopwords.

    Parameters
    ----------
    text : str or missing value
        The raw text. Anything that is not a ``str`` (e.g. ``NaN`` / ``None``
        coming from an empty DataFrame cell) is returned unchanged so that a
        later ``dropna`` can still detect it.

    Returns
    -------
    str or the original non-string value
        The remaining words joined by single spaces; may be an empty string
        if nothing survives the cleaning.
    """
    # Missing cells arrive as float NaN / None and have no str methods.
    if not isinstance(text, str):
        return text

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)   # punctuation (underscores are \w and are kept)
    text = re.sub(r'\d+', '', text)       # digits
    text = re.sub(r'\s+', ' ', text)      # runs of whitespace -> one space
    text = text.strip()

    # The stopword set is cached: this function is applied once per row.
    stop_words = _english_stop_words()
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

In [3]:
# Load and Preprocess Data
dataset = pd.read_csv('../Data/podcasts_data.csv')

# Fixed seed so that a fresh "Restart & Run All" produces the same row order.
dataset = shuffle(dataset, random_state=42)

# na_action='ignore' leaves missing cells as NaN instead of handing them to
# clean_text; rows without a name can then still be removed with dropna.
dataset['Podcast Name'] = dataset['Podcast Name'].map(clean_text, na_action='ignore')
dataset['Genre'] = dataset['Genre'].map(clean_text, na_action='ignore')


In [4]:
# Keep only the rows whose 'Podcast Name' is present (rows with NaN names are discarded)
podcast_data = dataset.loc[dataset['Podcast Name'].notna()]

In [5]:
# Pull the columns used downstream out of the frame as arrays, one per column
(
    podcast_names,
    podcast_descriptions,
    podcast_publishers,
    podcast_spotify_urls,
    podcast_cover_image_urls,
) = (
    podcast_data[column].values
    for column in ('Podcast Name', 'Description', 'Publisher', 'Spotify URL', 'Cover Image URL')
)

In [6]:
# Tokenization and Vectorization
# Build the word -> integer id vocabulary from the podcast names;
# words not seen during fitting map to the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(podcast_names)

# Encode every name as a list of ids, then pad at the end up to the longest name.
sequences = tokenizer.texts_to_sequences(podcast_names)
max_length = len(max(sequences, key=len))
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_length)

# One more than the number of known words, leaving room for id 0.
vocab_size = 1 + len(tokenizer.word_index)

In [7]:
# Sentence-BERT Embedding
# Load the pretrained sentence encoder and embed each podcast name;
# convert_to_tensor=True asks for a tensor result instead of a NumPy array.
SBERT_MODEL_NAME = 'paraphrase-MiniLM-L6-v2'
bert_model = SentenceTransformer(SBERT_MODEL_NAME)
podcast_embeddings = bert_model.encode(podcast_names, convert_to_tensor=True)



In [17]:
# Sequence model: embeds token ids, runs two stacked bidirectional LSTMs and
# emits a softmax over the vocabulary at every timestep.
model = Sequential([
    # mask_zero=True: the inputs are zero-padded, so timesteps holding id 0 are
    # masked and no longer count towards the loss or the accuracy metric.
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length, mask_zero=True),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    TimeDistributed(Dense(vocab_size, activation='softmax'))
])
# Sparse loss: targets are integer token ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 23, 128)           1153664   
                                                                 
 bidirectional_2 (Bidirectio  (None, 23, 128)          98816     
 nal)                                                            
                                                                 
 dropout_2 (Dropout)         (None, 23, 128)           0         
                                                                 
 bidirectional_3 (Bidirectio  (None, 23, 128)          98816     
 nal)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 23, 128)           0         
                                                                 
 time_distributed_1 (TimeDis  (None, 23, 9013)        

In [18]:
# Prepare labels to match the output shape of the model:
# the targets are the input token ids with a trailing axis added.
labels = np.expand_dims(padded_sequences, axis=-1)

# Train the model
# Keep the returned History object so the per-epoch loss/accuracy can be
# inspected or plotted afterwards instead of only echoing its repr.
history = model.fit(padded_sequences, labels, epochs=30, batch_size=128, validation_split=0.2)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x2a283839b10>

In [19]:
# Write the trained model to an .h5 file in the working directory, then confirm
MODEL_PATH = 'model.h5'
model.save(MODEL_PATH)
print('Model saved succesfully')


Model saved succesfully
