In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

# Download the stop words corpus if necessary
nltk.download('stopwords')

# Define a function to remove stop words
def remove_stop_words(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Load the dataset into a pandas DataFrame
df = pd.read_csv('podcasts.csv')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df.head(5)


Unnamed: 0.1,Unnamed: 0,title,producer,text,ID
0,0,History Hyenas with Chris Distefano and Yannis...,RiotCast Network,History Hyenas with Chris Distefano and Yannis...,0
1,1,Curiosity Daily,Westwood One,Curiosity Daily Westwood One Education The awa...,1
2,2,Spirits,Multitude,Spirits Multitude History A boozy weekly podca...,2
3,3,The Soundtrack Show,iHeartRadio,The Soundtrack Show iHeartRadio TV & Film The ...,3
4,4,Writing Excuses,"Brandon Sanderson, Mary Robinette Kowal, Dan W...","Writing Excuses Brandon Sanderson, Mary Robine...",4


In [5]:


# Apply the remove_stop_words() function to the description column
df['text'] = df['text'].apply(remove_stop_words)

# Display the updated DataFrame
df['text'] = df['text'].str.lower()

import string
df['text'] = df['text'].str.translate(str.maketrans('', '', string.punctuation))


df['text'] = df['text'].str.replace('\d+', '')



df['text'] = df['text'].str.strip()
df['text'] = df['text'].str.replace('\s+', ' ')



# import nltk
# from nltk.corpus import stopwords
# nltk.download('stopwords')

# stop_words = set(stopwords.words('english'))

# def remove_stopwords(text):
#     return " ".join([word for word in str(text).split() if word not in stop_words])

# df['description'] = df['description'].apply(remove_stopwords)




from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

df['text'] = df['text'].apply(stem_words)


print(df.head())


  df['text'] = df['text'].str.replace('\d+', '')
  df['text'] = df['text'].str.replace('\s+', ' ')


   Unnamed: 0                                              title  \
0           0  History Hyenas with Chris Distefano and Yannis...   
1           1                                    Curiosity Daily   
2           2                                            Spirits   
3           3                                The Soundtrack Show   
4           4                                    Writing Excuses   

                                            producer  \
0                                   RiotCast Network   
1                                       Westwood One   
2                                          Multitude   
3                                        iHeartRadio   
4  Brandon Sanderson, Mary Robinette Kowal, Dan W...   

                                                text  ID  
0  histori hyena chri distefano yanni pappa riotc...   0  
1  curios daili westwood one educ awardwin curios...   1  
2  spirit multitud histori boozi weekli podcast m...   2  
3  soundtrack show

In [6]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances


# Convert the preprocessed descriptions into numerical features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])

# Train a KMeans clustering model
kmeans = KMeans(n_clusters=100, random_state=42)
kmeans.fit(X)

# Assign each podcast episode to a cluster
cluster_labels = kmeans.predict(X)

# Recommend episodes from the same cluster as a given podcast name
def recommend_episodes(podcast_name, n_recommendations=5):
    # Find the indices of all episodes of the given podcast name
    podcast_episodes = df.index[df['title'] == podcast_name].tolist()
    
    # Find the cluster labels of all episodes of the given podcast name
    podcast_clusters = [cluster_labels[episode_id] for episode_id in podcast_episodes]
    
    # Find the indices of all episodes in the same clusters as the given podcast
    cluster_episodes = []
    for cluster_label in set(podcast_clusters):
        cluster_indices = np.where(cluster_labels == cluster_label)[0]
        cluster_episodes.extend(cluster_indices)
    
    # Remove the indices of the episodes of the given podcast from the list of cluster episodes
    for episode_id in podcast_episodes:
        if episode_id in cluster_episodes:
            cluster_episodes.remove(episode_id)
    
    # Calculate cosine similarity between each recommended podcast and the given podcast
    similarities = []
    for episode_id in cluster_episodes:
        similarity = pairwise_distances(X[episode_id], X[df.index[df['title'] == podcast_name]][0], metric='cosine')
        similarities.append(similarity[0][0])
    
    # Choose a random set of recommendations from the same clusters, sorted by cosine similarity
    recommendations = [x for _, x in sorted(zip(similarities, cluster_episodes))]
    recommendations = recommendations[:min(n_recommendations, len(recommendations))]
    
    return recommendations

# Calculate Jaccard similarity between recommended podcasts and given podcast
def jaccard_similarity(podcast1, podcast2):
    # Convert description of each podcast to a set of words
    set1 = set(podcast1.split())
    set2 = set(podcast2.split())
    
    # Calculate Jaccard similarity between the sets of words
    similarity = len(set1.intersection(set2)) / len(set1.union(set2))
    
    return similarity

# Example usage: recommend 5 episodes similar to "The Joe Rogan Experience" and calculate cosine and Jaccard similarity
podcast_name = "The Soundtrack Show"
n_recommendations = 5
recommendations = recommend_episodes(podcast_name, n_recommendations=n_recommendations)
print("Recommendations for", podcast_name + ":")

for i, episode_id in enumerate(recommendations):
    podcast = df.loc[episode_id, 'title']
    cosine_similarity = 1 - pairwise_distances(X[episode_id], X[df.index[df['title'] == podcast_name]][0], metric='cosine')
    jaccard_sim = jaccard_similarity(df.loc[df['title'] == podcast_name, 'text'].values[0], 
                                     df.loc[df['title'] == podcast, 'text'].values[0])
    print(i+1, podcast, "(Cosine similarity:", cosine_similarity[0][0], ", Jaccard similarity:", jaccard_sim, ")")
      




Recommendations for The Soundtrack Show:
1 Before Breakfast (Cosine similarity: 0.2998134387370601 , Jaccard similarity: 0.05027932960893855 )
2 Finding Fred (Cosine similarity: 0.2517018572138612 , Jaccard similarity: 0.05789473684210526 )
3 Daniel and Jorge Explain the Universe (Cosine similarity: 0.25161616164595535 , Jaccard similarity: 0.05641025641025641 )
4 Daily Inspiration – The Steve Harvey Morning Show (Cosine similarity: 0.24636977757121326 , Jaccard similarity: 0.045454545454545456 )
5 Ephemeral (Cosine similarity: 0.24575547913677287 , Jaccard similarity: 0.03879310344827586 )


In [7]:


# Recommend episodes with high cosine similarity from the same cluster as a given podcast name
def recommend_episodes(podcast_name, n_recommendations=5):
    # Find the indices of all episodes of the given podcast name
    podcast_episodes = df.index[df['title'] == podcast_name].tolist()
    
    # Find the cluster label of the given podcast
    podcast_cluster = kmeans.predict(X[podcast_episodes])[0]
    
    # Find the indices of all episodes in the same cluster as the given podcast
    cluster_episodes = np.where(kmeans.labels_ == podcast_cluster)[0]
    
    # Remove the indices of the episodes of the given podcast from the list of cluster episodes
    cluster_episodes = np.setdiff1d(cluster_episodes, podcast_episodes)
    
    # Calculate cosine similarity between each recommended podcast and the given podcast
    similarities = []
    for episode_id in cluster_episodes:
        similarity = pairwise_distances(X[episode_id], X[podcast_episodes], metric='cosine')
        similarities.append(similarity[0][0])
    
    # Choose a set of recommendations with high cosine similarity
    recommendations = [x for _, x in sorted(zip(similarities, cluster_episodes), reverse=True)]
    recommendations = recommendations[:min(n_recommendations, len(recommendations))]
    
    return recommendations

# Example usage: recommend 5 episodes with high cosine similarity to "The Video Games Show"
podcast_name = "The Soundtrack Show"
n_recommendations = 5
recommendations = recommend_episodes(podcast_name, n_recommendations=n_recommendations)
print("Recommendations for", podcast_name + ":")

for i, episode_id in enumerate(recommendations):
    podcast = df.loc[episode_id, 'title']
    cosine_similarity = 1 - pairwise_distances(X[episode_id], X[df.index[df['title'] == podcast_name]], metric='cosine')
    print(i+1, podcast, "(Cosine similarity:", cosine_similarity[0][0], ")")

Recommendations for The Soundtrack Show:
1 Creature Feature (Cosine similarity: 0.08897387737806994 )
2 Food 4 Thot (Cosine similarity: 0.09781931429314072 )
3 Who Is? (Cosine similarity: 0.10685613187782672 )
4 The Bechdel Cast (Cosine similarity: 0.11129595882731103 )
5 Modern Ruhles with Stephanie Ruhle: Compelling Conversations in Culturally Complicated Times (Cosine similarity: 0.11406852311617088 )


In [10]:
from sklearn.cluster import AgglomerativeClustering


# Train a hierarchical clustering model
agg_clustering = AgglomerativeClustering(n_clusters=10)
agg_clustering.fit(X.toarray())

# Assign each podcast episode to a cluster
cluster_labels = agg_clustering.labels_

# Recommend episodes from the same cluster as a given podcast name
def recommend_episodes(podcast_name, n_recommendations=5):
    # Find the indices of all episodes of the given podcast name
    podcast_episodes = df.index[df['title'] == podcast_name].tolist()
    
    # Find the cluster labels of all episodes of the given podcast name
    podcast_clusters = [cluster_labels[episode_id] for episode_id in podcast_episodes]
    
    # Find the indices of all episodes in the same clusters as the given podcast
    cluster_episodes = []
    for cluster_label in set(podcast_clusters):
        cluster_indices = np.where(cluster_labels == cluster_label)[0]
        cluster_episodes.extend(cluster_indices)
    
    # Remove the indices of the episodes of the given podcast from the list of cluster episodes
    for episode_id in podcast_episodes:
        if episode_id in cluster_episodes:
            cluster_episodes.remove(episode_id)
    
    # Choose a random set of recommendations from the same clusters
    recommendations = np.random.choice(cluster_episodes, n_recommendations, replace=False)
    
    return recommendations

# Example usage: recommend 5 episodes similar to "The Video Games Show"
recommendations = recommend_episodes("The Soundtrack Show", n_recommendations=5)
print("Recommendations for 'The Soundtrack Show':", df.loc[recommendations, 'title'])


Recommendations for 'The Soundtrack Show': 1734                   Behind the Bastards
1403                            Happy Face
3068                  It Could Happen Here
609     Velvet’s Edge with Kelly Henderson
83                Stuff Mom Never Told You
Name: title, dtype: object


In [13]:
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity

# Train an Affinity Propagation clustering model
aff_clustering = AffinityPropagation(damping=0.5)
aff_clustering.fit(X.toarray())

# Assign each podcast episode to a cluster
cluster_labels = aff_clustering.labels_

# Recommend episodes from the same cluster as a given podcast name
def recommend_episodes(podcast_name, n_recommendations=5):
    # Find the indices of all episodes of the given podcast name
    podcast_episodes = df.index[df['title'] == podcast_name].tolist()

    # Find the cluster labels of all episodes of the given podcast name
    podcast_clusters = [cluster_labels[episode_id] for episode_id in podcast_episodes]

    # Find the indices of all episodes in the same clusters as the given podcast
    cluster_episodes = []
    for cluster_label in set(podcast_clusters):
        cluster_indices = np.where(cluster_labels == cluster_label)[0]
        cluster_episodes.extend(cluster_indices)

    # Remove the indices of the episodes of the given podcast from the list of cluster episodes
    for episode_id in podcast_episodes:
        if episode_id in cluster_episodes:
            cluster_episodes.remove(episode_id)

    # Calculate the pairwise cosine similarity between the selected episodes and the target episode
    similarity_scores = cosine_similarity(X[cluster_episodes], X[podcast_episodes]).flatten()

    # Choose the top recommendations based on the cosine similarity
    recommendations = np.argsort(-similarity_scores)[:n_recommendations]

    return np.array(cluster_episodes).ravel()[recommendations]


In [18]:
recommendations = recommend_episodes("The Soundtrack Show", n_recommendations=5)
# print("Recommendations for 'The Soundtrack Show':", df.loc[recommendations, 'title'])


podcast_episodes = df.index[df['title'] == "The Soundtrack Show"].tolist()
podcast_vector = X[podcast_episodes].toarray()[0]
for episode_id in recommendations:
    episode_vector = X[episode_id].toarray()[0]
    similarity_score = cosine_similarity([podcast_vector], [episode_vector])[0][0]
    print(f"Cosine similarity of '{df.loc[episode_id, 'title']}': {similarity_score}")

Cosine similarity of 'Before Breakfast': 0.29981343873706007
Cosine similarity of 'Finding Fred': 0.251701857213861
Cosine similarity of 'Daniel and Jorge Explain the Universe': 0.25161616164595524
Cosine similarity of 'Daily Inspiration – The Steve Harvey Morning Show': 0.24636977757121312
Cosine similarity of 'Ephemeral': 0.24575547913677284
