In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read the podcast CSV and keep only its first two columns.
# NOTE(review): later cells index these as 'Podcast Name' and 'Genre' --
# confirm that matches the column order of the file.
data = pd.read_csv('../Data/podcasts_data.csv').iloc[:, :2]
data.head()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# Cleaning

# Stop words, built once for the whole notebook instead of on every call.
# Each entry is passed through the same punctuation stripping that
# clean_text applies to its input: punctuation is removed from the text
# *before* the stop-word filter runs, so any list entry that still contains
# punctuation (e.g. a contraction written with an apostrophe) could never
# match a cleaned token.
STOP_WORDS = frozenset(
    re.sub(r'[^\w\s]', '', stop_word)
    for stop_word in stopwords.words('english')
)


def clean_text(text):
    """Normalise a podcast title for tokenisation.

    Steps, in order: lowercase, drop every character that is neither a
    word character nor whitespace, drop digit runs, collapse whitespace,
    then remove English stop words.

    Parameters
    ----------
    text : str or any scalar
        The raw title. Missing values (None / NaN / pd.NA) are treated as
        an empty title; any other non-string scalar is converted with
        ``str()`` first.

    Returns
    -------
    str
        The remaining words joined by single spaces; may be ``''``.
    """
    if not isinstance(text, str):
        # pandas represents a missing cell as float NaN, which has no
        # .lower(); return an empty title rather than crash the .apply().
        if pd.isna(text):
            return ''
        text = str(text)

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse and trim whitespace
    # Remove stop words
    return ' '.join(word for word in text.split() if word not in STOP_WORDS)


data['Podcast Name'] = data['Podcast Name'].apply(clean_text)

In [None]:
from sklearn.preprocessing import LabelEncoder

# --- Features -------------------------------------------------------------
# Learn the word -> integer vocabulary from the cleaned podcast names, then
# rewrite every name as a list of those integers.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['Podcast Name'])
sequences = tokenizer.texts_to_sequences(data['Podcast Name'])

# Force every sequence to exactly `maxlen` entries so the network receives
# a fixed-width input matrix.
maxlen = 50
X = pad_sequences(sequences, maxlen=maxlen)

# --- Targets --------------------------------------------------------------
# Map each genre label to an integer class id; the fitted encoder is kept
# so predictions can be translated back to genre labels later.
label_encoder = LabelEncoder().fit(data['Genre'])
y = label_encoder.transform(data['Genre'])

# --- Hold-out split -------------------------------------------------------
# 80% train / 20% test, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout


# Hyper-parameters for the embedding layer.
embedding_dim = 100
# word_index is 1-based in Keras, so +1 keeps index 0 (the padding value)
# inside the embedding table.
vocab_size = len(tokenizer.word_index) + 1

# Assemble the network layer by layer:
#   embedding -> two stacked LSTMs -> dense head -> softmax over genres,
# with dropout after each of the three trainable stages.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen))
# First LSTM returns the full sequence so the second LSTM has timesteps to read.
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
# One output unit per class known to the label encoder.
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Integer class ids (not one-hot vectors) are used as targets, hence the
# sparse variant of categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Train for 30 epochs in mini-batches of 32. Keras carves the validation
# set off the end of the arrays it is given, so validation_split=0.2 holds
# back the last 20% of X_train / y_train; the test split is never seen here.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=30,
    validation_split=0.2,
)


In [None]:
# Score the trained model on the held-out test split; with the single
# 'accuracy' metric compiled in, evaluate() yields (loss, accuracy).
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f'Accuracy: {accuracy*100:.2f}%')


In [None]:
# Function to predict the genre of a given podcast name
def predict_genre(podcast_name):
    """Return the predicted genre label for a single podcast name.

    The name is passed through the same pipeline the training data went
    through -- ``clean_text``, the fitted ``tokenizer``, then padding to
    ``maxlen`` -- before being fed to ``model``.

    Parameters
    ----------
    podcast_name : str
        Raw (uncleaned) podcast title.

    Returns
    -------
    The entry of ``label_encoder.classes_`` with the highest predicted
    probability.
    """
    # The model was trained on clean_text() output; skipping that step here
    # would feed it digits and stop words it never saw during training.
    cleaned_name = clean_text(podcast_name)
    sequence = tokenizer.texts_to_sequences([cleaned_name])
    # NOTE: a name made up only of unknown or removed words becomes an
    # all-padding row, so the returned genre is not meaningful in that case.
    padded_sequence = pad_sequences(sequence, maxlen=maxlen)
    prediction = model.predict(padded_sequence)
    # prediction has one row per input; take the best class within each row
    # instead of an argmax over the flattened array.
    predicted_class = prediction.argmax(axis=-1)
    genre = label_encoder.inverse_transform(predicted_class)[0]
    return genre

# Example usage
print(predict_genre("Career"))