Langkah 1: Preprocessing Teks

In [18]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))

# Fungsi untuk membersihkan teks
def clean_text(text):
    if isinstance(text, str):
        text = re.sub(r'\d+', '', text)  # Hapus angka
        text = re.sub(r'\s+', ' ', text)  # Hapus spasi berlebih
        text = text.lower()  # Ubah teks menjadi huruf kecil
        text = re.sub(r'[^\w\s]', '', text)  # Hapus tanda baca
        text = ' '.join([word for word in text.split() if word not in stop_words])  # Hapus stopwords
        return text
    return ''

# Load the dataset
data = pd.read_excel('data_podcast.xlsx', sheet_name='Sheet1')

# Preprocess the data
data['combined_features'] = data['Genre'].astype(str) + ' ' + data['Podcast Name'].astype(str) + ' ' + data['Description'].astype(str) + ' ' + data['Publisher'].astype(str)
data['combined_features'] = data['combined_features'].apply(clean_text)
data['combined_features'].fillna('', inplace=True)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Langkah 2: Tokenisasi dan Pembuatan Embedding Matrix

In [24]:
embedding_path = 'cc.id.300.vec'
embedding_path = '/path/to/your/embeddings/cc.id.300.vec'


In [25]:
from gensim.models import KeyedVectors

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['combined_features'])
vocab_size = len(tokenizer.word_index) + 1

# Convert text data to sequences
sequences = tokenizer.texts_to_sequences(data['combined_features'])

# Pad sequences to ensure uniform length
max_sequence_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# Load pretrained FastText embeddings
embedding_path = 'cc.id.300.vec'  # Path to FastText embeddings
word_vectors = KeyedVectors.load_word2vec_format(embedding_path)

# Create embedding matrix
embedding_dim = 300
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]
    else:
        embedding_matrix[i] = np.random.normal(size=(embedding_dim,))

# Free memory
del word_vectors

# Convert target variable to numeric
label_column = 'Genre'
labels = pd.get_dummies(data[label_column])


FileNotFoundError: [Errno 2] No such file or directory: 'cc.id.300.vec'

Langkah 3: Menggunakan Word Embeddings dan Model LSTM (Opsional)

In [11]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Load pre-trained word embeddings for Indonesian (e.g., from FastText)
# Here, we assume that `embedding_matrix` and `vocab_size` have been prepared

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['combined_features'])
vocab_size = len(tokenizer.word_index) + 1

# Convert text data to sequences
sequences = tokenizer.texts_to_sequences(data['combined_features'])

# Pad sequences to ensure uniform length
max_sequence_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# Convert target variable to numeric
label_column = 'Genre'
labels = pd.get_dummies(data[label_column])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Define the LSTM model
embedding_dim = 100
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_sequence_length),
    LSTM(128, return_sequences=False),
    Dense(64, activation='relu'),
    Dense(labels.shape[1], activation='softmax')  # Softmax for multi-class classification
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

def recommend_podcast(keyword, data, tokenizer, model, max_sequence_length):
    keyword = clean_text(keyword)
    keyword_sequence = tokenizer.texts_to_sequences([keyword])
    keyword_padded = pad_sequences(keyword_sequence, maxlen=max_sequence_length, padding='post')
    predictions = model.predict(keyword_padded)
    sorted_indices = np.argsort(predictions, axis=1)[:, ::-1]
    top_indices = sorted_indices[0, :5]
    for index in top_indices:
        print("Genre:", data.iloc[index]['Genre'])
        print("Podcast Name:", data.iloc[index]['Podcast Name'])
        print("Spotify URL:", data.iloc[index]['Spotify URL'])
        print()




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 3.886016845703125
Test Accuracy: 0.015909090638160706


In [20]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

# Define the LSTM model
model = Sequential([
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.5),
    LSTM(64),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(labels.shape[1], activation='softmax')  # Softmax for multi-class classification
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)


NameError: name 'embedding_matrix' is not defined

In [15]:
# Example usage
search_keyword = input("Masukkan kata kunci: ")
recommend_podcast(search_keyword, data, tokenizer, model, max_sequence_length)

Masukkan kata kunci: afhaf
Genre: arts and entertainment
Podcast Name: Tea & Strumpets: A Regency Romance Review
Spotify URL: https://open.spotify.com/show/3vIJiD1WQoEv09IRnqo91G

Genre: arts and entertainment
Podcast Name: The GenreVerse Podcast Network by LRM Online
Spotify URL: https://open.spotify.com/show/084cvmiweF4j7gAcN1KN8Q

Genre: arts and entertainment
Podcast Name: Hood Cash Radio: Podcast Edition
Spotify URL: https://open.spotify.com/show/40R8FBPo3zfvBUdyualJHd

Genre: arts and entertainment
Podcast Name: The Sword and Laser
Spotify URL: https://open.spotify.com/show/0VeoMYXPOgXBxSxJBPetRA

Genre: arts and entertainment
Podcast Name: Genre Geschehen
Spotify URL: https://open.spotify.com/show/7mF1YFf8oOKagn769dXrDc



In [16]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

# Define the LSTM model
model = Sequential([
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.5),
    LSTM(64),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(labels.shape[1], activation='softmax')  # Softmax for multi-class classification
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)


NameError: name 'embedding_matrix' is not defined