In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [5]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)


In [6]:
data = pd.read_csv('../Data/podcasts_data.csv')
data.head()

Unnamed: 0,Genre,Podcast Name,Description,Publisher,Total Episodes,Spotify URL,Cover Image URL
0,arts and entertainment,Easy Stories in English,"Learning a language is hard, but Easy Stories ...","Ariel Goodbody, Polyglot English Teacher & Gla...",216,https://open.spotify.com/show/23zdIqNUb0riR51w...,https://i.scdn.co/image/ab6765630000ba8a767693...
1,arts and entertainment,Podcast Buku Kutu,"EPISODE BARU SETIAP SENIN, RABU, dan JUMAT -- ...",Aditya Hadi - PODLUCK,162,https://open.spotify.com/show/3w5zKrbQ6kgB0RKI...,https://i.scdn.co/image/ab6765630000ba8a04fa1a...
2,arts and entertainment,Underwood and Flinch and Other Audiobooks by M...,Underwood and Flinch is a three-time Parsec aw...,Mike Bennett,244,https://open.spotify.com/show/3VwIE3bG0zpTCNzR...,https://i.scdn.co/image/ab6765630000ba8a4e7b42...
3,arts and entertainment,Podcast Resensi Buku,Kumpulan resensi beragam buku berbagai genre d...,Podcast Resensi Buku - PODLUCK,264,https://open.spotify.com/show/6woLsDl6CSntzeWU...,https://i.scdn.co/image/ab6765630000ba8a1e97ef...
4,arts and entertainment,SupremeMasterTV,Supreme Master Television is an international ...,SupremeMasterTV,500,https://open.spotify.com/show/5bCgERRINgZWhauS...,https://i.scdn.co/image/ab6765630000ba8a7899e5...


In [7]:
# Cleaning
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = text.strip()  # Remove leading and trailing whitespace
    stop_words = set(stopwords.words('english', 'indonesian'))
    text = ' '.join(word for word in text.split() if word not in stop_words)  # Remove stop words
    return text

data['Podcast Name'] = data['Podcast Name'].apply(clean_text)
data['Genre'] = data['Genre'].apply(clean_text)

In [8]:
df = data[["Genre"]]
df 

Unnamed: 0,Genre
0,arts entertainment
1,arts entertainment
2,arts entertainment
3,arts entertainment
4,arts entertainment
...,...
16850,true crime
16851,true crime
16852,true crime
16853,true crime


In [9]:
tfidf = TfidfVectorizer()
combined_text = df['Genre'] 
combined_text

0        arts entertainment
1        arts entertainment
2        arts entertainment
3        arts entertainment
4        arts entertainment
                ...        
16850            true crime
16851            true crime
16852            true crime
16853            true crime
16854            true crime
Name: Genre, Length: 16855, dtype: object

In [10]:
# Menghitung nilai TF-IDF
tfidf_matrix = tfidf.fit_transform(combined_text)

# Konversi hasil menjadi DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

In [11]:
tfidf_df

Unnamed: 0,arts,baseball,basketball,beauty,books,boxing,business,careers,celebrities,comedy,...,selfcare,sex,soccer,sports,stories,technology,true,tv,video,wrestling
0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16850,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0
16851,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0
16852,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0
16853,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0


In [12]:
# Assuming tfidf_matrix is already defined
similarity_matrix = cosine_similarity(tfidf_matrix)

with tf.device('/CPU:0'):
    similarity_tensor = tf.convert_to_tensor(similarity_matrix)

In [13]:
# Input Podcast Name
input_genre = "books"

# Get index of input Podcast
genre_index = data[data["Genre"] == input_genre].index[0]

# Calculate top_k podcast based on similarity using tf.nn.top_k
k = 5
with tf.device('/CPU:0'):
    top_k_values, top_k_indices = tf.nn.top_k(similarity_tensor[genre_index], k=k+1)

# Get top_k genre' information
top_k_genre = data.loc[top_k_indices.numpy()]

# Print the top_k genre
print(f"Top {k} Podcast Similar to genre '{input_genre}':")
for i, podcast in top_k_genre.iterrows():
    print("Podcast Name:", podcast["Podcast Name"])
    print("Genre:", podcast["Genre"])
    print("Description:", podcast["Description"])
    print("Publisher:", podcast["Publisher"])
    print("Podcast URL:", podcast["Spotify URL"])
    print("Cover Image:", podcast["Cover Image URL"])
    print()


Top 5 Podcast Similar to genre 'books':
Podcast Name: baca buku audiobook indonesia
Genre: books
Description: Berawal dari sebuah artikel yang menyatakan betapa rendahnya tingkat membaca di Indonesia. Maka saya membangun channel yang khusus menyajikan audiobook dalam bahasa Indonesia. Tujuan saya adalah agar kita semua mendapatkan kemudahan dalam hal belajar dan meningkatkan pengetahuan.  Ada ratusan buku best seller yang pernah saya baca dan tersedia dalam bentuk audiobook. Anda akan menemukan buku audio dengan berbagai genre yang berbeda. Mulai dari buku-buku tentang filsafat, pengembangan diri, novel, ilmu pengetahuan, spiritualitas, dan lain sebagainya.  Donation link: https://saweria.co/bacabuku Support this podcast: https://podcasters.spotify.com/pod/show/gunt/support
Publisher: Guntur Sulaksono
Podcast URL: https://open.spotify.com/show/1Gjoi8gXw7PchKe0fgJJKR
Cover Image: https://i.scdn.co/image/ab6765630000ba8a5d37cb4eb8e1296eed09bddb

Podcast Name: easy stories english
Genre: 

In [14]:
# Create DataFrame for the top_k places
df_output = pd.DataFrame(top_k_genre, columns=["Podcast Name", "Genre", "Description", "Publisher", "Spotify URL", "Cover Image URL"])

# Print the DataFrame
df_output

Unnamed: 0,Podcast Name,Genre,Description,Publisher,Spotify URL,Cover Image URL
1000,baca buku audiobook indonesia,books,Berawal dari sebuah artikel yang menyatakan be...,Guntur Sulaksono,https://open.spotify.com/show/1Gjoi8gXw7PchKe0...,https://i.scdn.co/image/ab6765630000ba8a5d37cb...
1001,easy stories english,books,"Learning a language is hard, but Easy Stories ...","Ariel Goodbody, Polyglot English Teacher & Gla...",https://open.spotify.com/show/23zdIqNUb0riR51w...,https://i.scdn.co/image/ab6765630000ba8a767693...
1002,deep talk,books,"Mystery,Fiction, Non-fiction, Ron-com,Fantasy ...",Shafeeqa Maryam,https://open.spotify.com/show/5IjSotMcXVMSUXAg...,https://i.scdn.co/image/24ec7b88de4afbef892ece...
1003,podcast buku kutu,books,"EPISODE BARU SETIAP SENIN, RABU, dan JUMAT -- ...",Aditya Hadi - PODLUCK,https://open.spotify.com/show/3w5zKrbQ6kgB0RKI...,https://i.scdn.co/image/ab6765630000ba8a04fa1a...
1004,michas bookshelf,books,"Welcome to Micha's Bookshelf, the podcast wher...",Micha’s Bookshelf,https://open.spotify.com/show/4yhSv2EJz3DEMrvB...,https://i.scdn.co/image/ab6765630000ba8a26fac8...
1005,books boba,books,Books & Boba is a book club dedicated to books...,Potluck Podcast Collective,https://open.spotify.com/show/6A3AYFQLggeEIEQq...,https://i.scdn.co/image/ab6765630000ba8a9c5412...
