In [2]:
import re
import nltk
import pandas as pd
import numpy as np

from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/yamini/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/yamini/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data Pre-Processing

In [29]:
podcasts_df = pd.read_pickle('../data/pickle_files/english_podcasts_detailed_cleaned.pkl')

In [30]:
podcasts_df['text'] = podcasts_df[['title', 'producer', 'genre', 'description', 'episode_titles', 'episode_descriptions']].apply(lambda x: ' '.join(x), axis=1)
podcasts_df = podcasts_df.drop(columns=['genre', 'description', 'num_episodes', 'rating', 'num_reviews', 'link', 'episode_titles', 'episode_descriptions'])
podcasts_df['ID'] = list(range(podcasts_df.shape[0]))

In [34]:
# utils
stop = get_stop_words('en')
stop = [re.sub(r'([^\s\w]|_)+', '', x) for x in stop]

tokenizer = RegexpTokenizer(r'\w+')

In [35]:
def remove_stop(text, stop):
    return [word for word in text if word not in stop ]

def lemmatize(text, l_stemmer):
    return [l_stemmer.lemmatize(word) for word in text]

In [32]:
def preprocess_text(text):
    # remove mixed alphanumeric
    text = re.sub(r"""(?x) \b(?=\w*\d)\w+\s*""","", text)
    re.sub(r"http\S+", "", text)
    text = re.sub(r'([^\s\w]|_)+', '', text)
    text = tokenizer.tokenize(text.lower())
    text = remove_stop(text, stop)
    text = lemmatize(text, WordNetLemmatizer())
    
    new_text = ' '.join(text)
    return new_text

In [33]:
podcasts_df['text'] = podcasts_df['text'].map(preprocess_text)
podcasts_df = podcasts_df.query('podcasts_df.text !=""')

In [10]:
podcasts_df.head()

Unnamed: 0,title,producer,text,ID
0,History Hyenas with Chris Distefano and Yannis...,RiotCast Network,"[history, hyena, chris, distefano, yannis, pap...",0
1,Curiosity Daily,Westwood One,"[curiosity, daily, westwood, one, education, a...",1
2,Spirits,Multitude,"[spirit, multitude, history, boozy, mythology,...",2
3,The Soundtrack Show,iHeartRadio,"[soundtrack, show, iheartradio, tv, film, soun...",3
4,Writing Excuses,"Brandon Sanderson, Mary Robinette Kowal, Dan W...","[writing, excuse, brandon, sanderson, mary, ko...",4


# Podcast Recommenders

In [17]:
# helper functions
def get_title_from_index(index):
    return podcasts_df[podcasts_df.ID == index]["title"].values[0]

def get_index_from_title(title):
    return podcasts_df[podcasts_df.title == title]["ID"].values[0]

In [None]:
def recommend(podcast_title, sim_matrix, number_recs=5):
    podcast_id = get_index_from_title(podcast_title)
    similar_podcasts =  list(enumerate(sim_matrix[podcast_id]))
    sorted_similar_podcast = sorted(similar_podcasts,key=lambda x:x[1],reverse=True)
    
    recommendations = [get_title_from_index(sorted_similar_podcast[i][0]) for i in range(number_recs+2)]
    return recommendations[1:]

In [None]:
# Podcasts we'll use to validate results
sample_podcasts = ['The Daily', "Murder, etc.",'This American Life', 'Call Her Daddy', 'The Joe Rogan Experience']


### Cosine Similarity + CountVectorizer (Bag of Words) Method

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

cv = CountVectorizer()
cv_matrix = cv.fit_transform(podcasts_df["text"])
cv_cosine_sim = cosine_similarity(cv_matrix)

In [19]:
for i in sample_podcasts:
    print(recommend(i, cv_cosine_sim))

['The Daily', 'Impeachment Inquiry: Updates from The Washington Post', 'Impeachment: A Daily Podcast', 'The Takeaway', 'Article II: Inside Impeachment', "The Daily 202's Big Idea", 'The 11th Hour with Brian Williams', 'Bill O’Reilly’s No Spin News and Analysis', 'The Last Word with Lawrence O’Donnell', 'Up First', 'Impeachment Today']

['Up First', 'Impeachment: A Daily Podcast', 'Impeachment Inquiry: Updates from The Washington Post', 'The Daily', 'Article II: Inside Impeachment', 'The 11th Hour with Brian Williams', 'Bill O’Reilly’s No Spin News and Analysis', "The Daily 202's Big Idea", 'The Takeaway', 'Can He Do That?', 'The Last Word with Lawrence O’Donnell']

['VIEWS with David Dobrik and Jason Nash', 'Instant Message', 'I Am In Eskew', 'Jalen & Jacoby', 'The Axe Files with David Axelrod', 'Getting Things Done', 'The Tower', 'The Permaculture Podcast', 'Making It With Jimmy Diresta, Bob Clagett and David Picciuto', 'Psychology of Eating', 'Blank Check with Griffin & David']

['Im

### Cosine Similarity + TFIDF

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer()
tf_matrix = tf.fit_transform(podcasts_df["text"])
tf_cosine_sim = cosine_similarity(tf_matrix)

In [21]:
for i in sample_podcasts:
    print("Recommendations for {}: ".format(i))
    print(recommend(i, tf_cosine_sim))

['The Daily', 'Impeachment Inquiry: Updates from The Washington Post', "The Daily 202's Big Idea", 'The 11th Hour with Brian Williams', 'Article II: Inside Impeachment', 'Impeachment: A Daily Podcast', 'The Takeaway', 'The Last Word with Lawrence O’Donnell', 'Bill O’Reilly’s No Spin News and Analysis', 'The Situation Room with Wolf Blitzer', 'The Rachel Maddow Show']

['Up First', 'Impeachment Inquiry: Updates from The Washington Post', 'The Daily', 'Article II: Inside Impeachment', 'The 11th Hour with Brian Williams', 'Impeachment: A Daily Podcast', "The Daily 202's Big Idea", 'Can He Do That?', 'The Last Word with Lawrence O’Donnell', 'Bill O’Reilly’s No Spin News and Analysis', 'The Takeaway']

['VIEWS with David Dobrik and Jason Nash', 'Instant Message', 'How Did This Get Made?', 'Light The Fight- Parenting Podcast', 'Flow Sessions with Jason Silva', 'This Week in Startups - Audio', 'The Tower', 'True Cold Case Files with Jason and Daisy', 'Horror Hill: A Horror Anthology and Scary