In [2]:
import re
import nltk
import pandas as pd
import numpy as np

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from stop_words import get_stop_words

nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/yamini/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/yamini/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data Pre-Processing

In [29]:
podcasts_df = pd.read_pickle('../data/pickle_files/english_podcasts_detailed_cleaned.pkl')

In [30]:
podcasts_df['text'] = podcasts_df[['title', 'producer', 'genre', 'description', 'episode_titles', 'episode_descriptions']].apply(lambda x: ' '.join(x), axis=1)
podcasts_df = podcasts_df.drop(columns=['genre', 'description', 'num_episodes', 'rating', 'num_reviews', 'link', 'episode_titles', 'episode_descriptions'])
podcasts_df['ID'] = list(range(podcasts_df.shape[0]))

In [34]:
# utils
stop = get_stop_words('en')
stop = [re.sub(r'([^\s\w]|_)+', '', x) for x in stop]

tokenizer = RegexpTokenizer(r'\w+')

In [35]:
def remove_stop(text, stop):
    return [word for word in text if word not in stop ]

def lemmatize(text, l_stemmer):
    return [l_stemmer.lemmatize(word) for word in text]

In [32]:
def preprocess_text(text):
    # remove mixed alphanumeric
    text = re.sub(r"""(?x) \b(?=\w*\d)\w+\s*""","", text)
    re.sub(r"http\S+", "", text)
    text = re.sub(r'([^\s\w]|_)+', '', text)
    text = tokenizer.tokenize(text.lower())
    text = remove_stop(text, stop)
    text = lemmatize(text, WordNetLemmatizer())
    
    new_text = ' '.join(text)
    return new_text

In [33]:
podcasts_df['text'] = podcasts_df['text'].map(preprocess_text)
podcasts_df = podcasts_df.query('podcasts_df.text !=""')

In [10]:
podcasts_df.head()

Unnamed: 0,title,producer,text,ID
0,History Hyenas with Chris Distefano and Yannis...,RiotCast Network,"[history, hyena, chris, distefano, yannis, pap...",0
1,Curiosity Daily,Westwood One,"[curiosity, daily, westwood, one, education, a...",1
2,Spirits,Multitude,"[spirit, multitude, history, boozy, mythology,...",2
3,The Soundtrack Show,iHeartRadio,"[soundtrack, show, iheartradio, tv, film, soun...",3
4,Writing Excuses,"Brandon Sanderson, Mary Robinette Kowal, Dan W...","[writing, excuse, brandon, sanderson, mary, ko...",4


# Podcast Recommender utils

In [40]:
# helper functions
def get_title_from_index(index):
    return podcasts_df[podcasts_df.ID == index]["title"].values[0]

def get_index_from_title(title):
    return podcasts_df[podcasts_df.title == title]["ID"].values[0]

In [39]:
def recommend(podcast_title, sim_matrix, number_recs=5):
    podcast_id = get_index_from_title(podcast_title)
    similar_podcasts =  list(enumerate(sim_matrix[podcast_id]))
    sorted_similar_podcast = sorted(similar_podcasts,key=lambda x:x[1],reverse=True)
    
    recommendations = [get_title_from_index(sorted_similar_podcast[i][0]) for i in range(number_recs+2)]
    return recommendations[1:]

In [38]:
# Podcasts we'll use to validate results
sample_podcasts = ['The Daily', "Murder, etc.",'This American Life', 'Call Her Daddy', 'The Joe Rogan Experience']


# Bag of Words + Cosine Similarity

In [36]:
cv = CountVectorizer()
cv_matrix = cv.fit_transform(podcasts_df["text"])
cv_cosine_sim = cosine_similarity(cv_matrix)

In [44]:
for i in sample_podcasts:
    print("If you liked {}, try: ".format(i))
    recs = recommend(i, cv_cosine_sim)
    for i in recs:
        print("     {}".format(i))


Recommendations for The Daily: 
     Impeachment Inquiry: Updates from The Washington Post
     Impeachment: A Daily Podcast
     The Takeaway
     Article II: Inside Impeachment
     The Daily 202's Big Idea
     The 11th Hour with Brian Williams
Recommendations for Murder, etc.: 
     Criminology
     Murderville
     Unsolved Murders: True Crime Stories
     Murder Minute
     Don't Talk to Strangers
     True Crime All The Time Unsolved
Recommendations for This American Life: 
     The Stoop Storytelling Series
     The Story Home Children's Audio Stories
     Spooky Boo's Scary Story Time
     The Story Behind
     This is the Gospel Podcast
     1001 Heroes, Legends, Histories & Mysteries Podcast
Recommendations for Call Her Daddy: 
     Stiff Socks
     Two Judgey Girls
     NAKED with Catt Sadler
     Slay Girl Slay
     Hot Marriage. Cool Parents.
     Safe For Work
Recommendations for The Joe Rogan Experience: 
     The Creative Penn Podcast For Writers
     1001 Classic Shor

# TFIDF + Cosine Similarity 

In [46]:
tf = TfidfVectorizer()
tf_matrix = tf.fit_transform(podcasts_df["text"])
tf_cosine_sim = cosine_similarity(tf_matrix)

In [47]:
for i in sample_podcasts:
    print("If you liked {}, try: ".format(i))
    recs = recommend(i, tf_cosine_sim)
    for i in recs:
        print("     {}".format(i))
    

Recommendations for The Daily: 
     Impeachment Inquiry: Updates from The Washington Post
     The 11th Hour with Brian Williams
     The Daily 202's Big Idea
     Article II: Inside Impeachment
     Impeachment: A Daily Podcast
     The Takeaway
Recommendations for Murder, etc.: 
     Murder Minute
     Criminology
     Murderville
     Unsolved Murders: True Crime Stories
     Don't Talk to Strangers
     True Crime All The Time Unsolved
Recommendations for This American Life: 
     Experimental Brewing
     1A
     Through the Looking Glass: A LOST Retrospective
     The Grave Talks | Haunted, Paranormal & Supernatural
     Darkness Prevails Podcast | TRUE Horror Stories
     BeerSmith Home and Beer Brewing Podcast
Recommendations for Call Her Daddy: 
     hey, girl.
     Girls Night with Stephanie May Wilson
     Stiff Socks
     Fierce Girls
     Becoming Something with Jonathan Pokluda
     Two Judgey Girls
Recommendations for The Joe Rogan Experience: 
     MILLION DOLLAR LIFE 

# Compare results of the two models

In [54]:
def compare(pod, num_recs=5):
    tf_idf_recs = recommend(pod, tf_cosine_sim, num_recs)
    cv_recs = recommend(pod, cv_cosine_sim, num_recs)

    both = list(set(tf_idf_recs).intersection(set(cv_recs)))
    unique_to_tf = list(set(tf_idf_recs).difference(set(cv_recs)))
    unique_to_cv = list(set(cv_recs).difference(set(tf_idf_recs)))
    print("Recs for {}: ".format(pod))
    
    print("    Recommended by both tf-idf and cv:")
    for i in both: print("         {}".format(i))

    print("    Uniqely recommended by tf-idf:")
    for i in unique_to_tf: print("         {}".format(i))

    print("    Uniqely recommended by cv:")
    for i in unique_to_cv: print("         {}".format(i))

In [56]:
for pod in sample_podcasts: compare(pod) 

Recs for The Daily: 
    Recommended by both tf-idf and cv:
         The Daily 202's Big Idea
         Impeachment Inquiry: Updates from The Washington Post
         The 11th Hour with Brian Williams
         The Takeaway
         Article II: Inside Impeachment
         Impeachment: A Daily Podcast
    Uniqely recommended by tf-idf:
    Uniqely recommended by cv:
Recs for Murder, etc.: 
    Recommended by both tf-idf and cv:
         Criminology
         Unsolved Murders: True Crime Stories
         Don't Talk to Strangers
         Murderville
         Murder Minute
         True Crime All The Time Unsolved
    Uniqely recommended by tf-idf:
    Uniqely recommended by cv:
Recs for This American Life: 
    Recommended by both tf-idf and cv:
    Uniqely recommended by tf-idf:
         1A
         Experimental Brewing
         BeerSmith Home and Beer Brewing Podcast
         The Grave Talks | Haunted, Paranormal & Supernatural
         Darkness Prevails Podcast | TRUE Horror Stories
     