In [3]:
import itertools
import re
import nltk
import pandas as pd
import numpy as np

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from stop_words import get_stop_words

nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/yamini/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Data Pre-Processing

In [4]:
podcasts_df_orig = pd.read_pickle('../data/data/pickle_files/english_podcasts_detailed_cleaned.pkl')

In [5]:
podcasts_df = podcasts_df_orig
podcasts_df['text'] = podcasts_df[['title', 'producer', 'genre', 'description', 'episode_titles', 'episode_descriptions']].apply(lambda x: ' '.join(x), axis=1)
podcasts_df = podcasts_df.drop(columns=['genre', 'description', 'num_episodes', 'rating', 'num_reviews', 'link', 'episode_titles', 'episode_descriptions'])
podcasts_df['idx'] = list(range(podcasts_df.shape[0]))

In [6]:
# utils
stop = get_stop_words('en')
stop = [re.sub(r'([^\s\w]|_)+', '', x) for x in stop]

tokenizer = RegexpTokenizer(r'\w+')

In [7]:
def remove_stop(text, stop):
    return [word for word in text if word not in stop ]

def lemmatize(text, l_stemmer):
    return [l_stemmer.lemmatize(word) for word in text]

In [8]:
def preprocess_text(text):
    # remove mixed alphanumeric
    text = re.sub(r"""(?x) \b(?=\w*\d)\w+\s*""","", text)
    re.sub(r"http\S+", "", text)
    text = re.sub(r'([^\s\w]|_)+', '', text)
    text = tokenizer.tokenize(text.lower())
    text = remove_stop(text, stop)
    text = lemmatize(text, WordNetLemmatizer())
    
    new_text = ' '.join(text)
    return new_text

In [9]:
podcasts_df['text'] = podcasts_df['text'].map(preprocess_text)
podcasts_df = podcasts_df.query('text !=""')

# Podcast-recommender utils

In [19]:
# helper functions
def get_title_from_index(index):
    """get title of podcast from index of podcast
        parameters:
            index: (int)
        returns:
            title (string)
        raises:
            ValueError: index not in podcasts_df
    """
    return podcasts_df[podcasts_df.idx == index]["title"].values[0]

def get_index_from_title(title):
    """get index of podcast from title of podcast
        parameters:
            title: (string)
        returns:
            index (int)
        raises:
            ValueError: string not in podcasts_df['title]
    """
    return podcasts_df[podcasts_df.title == title]["idx"].values[0]

In [20]:
def recommend(podcast_title, sim_matrix, number_recs=5):
    """given a podcast title & a similarity matrix, return n most similar podcasts
        parameters:
            podcast_title: (str) must be in podcasts_tf['title]
            sim_matrix: (np.array) similarity matrix
            number_recs: (int) how many recommendations do you want per title?
        returns:
            recommendations: (list[str]) list of n most similar podcasts 
                            according to the similarity matrix
    """

    podcast_id = get_index_from_title(podcast_title)
    similar_podcasts =  list(enumerate(sim_matrix[podcast_id]))
    sorted_similar_podcast = sorted(similar_podcasts,key=lambda x:x[1],reverse=True)
    
    recommendations = [get_title_from_index(sorted_similar_podcast[i][0]) for i in range(number_recs+2)]
    return recommendations[1:]

In [2]:
def recommend_print(podcast_title, sim_matrix, number_recs=5):
    print("If you liked {}, try: ".format(podcast_title))
    recs = recommend(podcast_title, sim_matrix, number_recs)
    for i in recs:
        print("     {}".format(i))


In [21]:
# Podcasts we'll use to validate results
sample_podcasts = ['The Daily', "Murder, etc.",'This American Life', 'Call Her Daddy', 'The Joe Rogan Experience']

# Bag of Words + Cosine Similarity

In [22]:
cv = CountVectorizer()
cv_matrix = cv.fit_transform(podcasts_df["text"])
cv_cosine_sim = cosine_similarity(cv_matrix)

In [23]:
for i in sample_podcasts:
    print("If you liked {}, try: ".format(i))
    recs = recommend(i, cv_cosine_sim)
    for i in recs:
        print("     {}".format(i))


If you liked The Daily, try: 
     Impeachment Inquiry: Updates from The Washington Post
     Impeachment: A Daily Podcast
     The Takeaway
     Article II: Inside Impeachment
     The Daily 202's Big Idea
     The 11th Hour with Brian Williams
If you liked Murder, etc., try: 
     Criminology
     Murderville
     Unsolved Murders: True Crime Stories
     Murder Minute
     Don't Talk to Strangers
     True Crime All The Time Unsolved
If you liked This American Life, try: 
     The Stoop Storytelling Series
     The Story Home Children's Audio Stories
     Spooky Boo's Scary Story Time
     The Story Behind
     This is the Gospel Podcast
     1001 Heroes, Legends, Histories & Mysteries Podcast
If you liked Call Her Daddy, try: 
     Stiff Socks
     Two Judgey Girls
     NAKED with Catt Sadler
     Slay Girl Slay
     Hot Marriage. Cool Parents.
     Safe For Work
If you liked The Joe Rogan Experience, try: 
     The Creative Penn Podcast For Writers
     1001 Classic Short Stories 

# TFIDF + Cosine Similarity 

In [24]:
tf = TfidfVectorizer()
tf_matrix = tf.fit_transform(podcasts_df["text"])
tf_cosine_sim = cosine_similarity(tf_matrix)

In [25]:
for i in sample_podcasts:
    print("If you liked {}, try: ".format(i))
    recs = recommend(i, tf_cosine_sim)
    for i in recs:
        print("     {}".format(i))
    

If you liked The Daily, try: 
     Impeachment Inquiry: Updates from The Washington Post
     The 11th Hour with Brian Williams
     The Daily 202's Big Idea
     Article II: Inside Impeachment
     Impeachment: A Daily Podcast
     The Takeaway
If you liked Murder, etc., try: 
     Murder Minute
     Criminology
     Murderville
     Unsolved Murders: True Crime Stories
     Don't Talk to Strangers
     True Crime All The Time Unsolved
If you liked This American Life, try: 
     Experimental Brewing
     1A
     Through the Looking Glass: A LOST Retrospective
     The Grave Talks | Haunted, Paranormal & Supernatural
     Darkness Prevails Podcast | TRUE Horror Stories
     BeerSmith Home and Beer Brewing Podcast
If you liked Call Her Daddy, try: 
     hey, girl.
     Girls Night with Stephanie May Wilson
     Stiff Socks
     Fierce Girls
     Becoming Something with Jonathan Pokluda
     Two Judgey Girls
If you liked The Joe Rogan Experience, try: 
     MILLION DOLLAR LIFE LESSONS
  

# Compare results of the two models

In [26]:
def print_compare(pod, num_recs=5):
    """for a given podcast and number of recommendations
        print the recommendations from both tf-idf and cv
        unique to tf-idf
        and unique to cv
    """

    tf_idf_recs = recommend(pod, tf_cosine_sim, num_recs)
    cv_recs = recommend(pod, cv_cosine_sim, num_recs)

    both = list(set(tf_idf_recs).intersection(set(cv_recs)))
    unique_to_tf = list(set(tf_idf_recs).difference(set(cv_recs)))
    unique_to_cv = list(set(cv_recs).difference(set(tf_idf_recs)))
    print("Recs for {}: ".format(pod))
    
    print("    Recommended by both tf-idf and cv:")
    for i in both: print("         {}".format(i))

    print("    Uniqely recommended by tf-idf:")
    for i in unique_to_tf: print("         {}".format(i))

    print("    Uniqely recommended by cv:")
    for i in unique_to_cv: print("         {}".format(i))

In [27]:
for pod in sample_podcasts: print_compare(pod) 

Recs for The Daily: 
    Recommended by both tf-idf and cv:
         The Takeaway
         Impeachment: A Daily Podcast
         The 11th Hour with Brian Williams
         Article II: Inside Impeachment
         Impeachment Inquiry: Updates from The Washington Post
         The Daily 202's Big Idea
    Uniqely recommended by tf-idf:
    Uniqely recommended by cv:
Recs for Murder, etc.: 
    Recommended by both tf-idf and cv:
         Criminology
         Unsolved Murders: True Crime Stories
         Murder Minute
         Don't Talk to Strangers
         True Crime All The Time Unsolved
         Murderville
    Uniqely recommended by tf-idf:
    Uniqely recommended by cv:
Recs for This American Life: 
    Recommended by both tf-idf and cv:
    Uniqely recommended by tf-idf:
         Darkness Prevails Podcast | TRUE Horror Stories
         The Grave Talks | Haunted, Paranormal & Supernatural
         Through the Looking Glass: A LOST Retrospective
         1A
         Experimental Brewi

In [78]:
def coverage(model_name, sim_matrix, num_recs=10):
    """keep track of the top n recommendations for each podcast
    """
    indices = np.argpartition(sim_matrix, -num_recs, axis=1)[:,-num_recs:]
    
    #calculating coverage:
    recommended = set(list(itertools.chain(*indices)))
    coverage = (len(recommended)/indices.shape[0])*100

    print("Stats for {} Model with {} recs".format(model_name, num_recs))
    print("    Coverage: {} %".format(coverage))
    
    return indices

In [80]:
cv_recs_10 = coverage("CountVectorizer", cv_cosine_sim, 5)
tf_idf_recs_10 = coverage("tf-idf", tf_cosine_sim, 5)

#Note: Are these accurate?

Stats for CountVectorizer Model with 5 recs
    Coverage: 100.0 %
Stats for tf-idf Model with 5 recs
    Coverage: 100.0 %


# Generating Fake User Ratings

In [98]:
# We want to create users that have preferences
# Each of them randomly rates 5-20 random podcasts 
# This is a bad way to generate fake user ratings
# but for now it'll do

In [93]:
users_count = 10000
podcasts = np.arange(0, podcasts_df.shape[0]+1)

In [94]:
def generate_user_ratings(users_count):
    user_ratings = []
    for idx, user in enumerate(np.arange(0,users_count)):
        ratings = []
        quantity_rated = np.random.randint(5,21)
        reviewed = set()
        
        for i in np.arange(quantity_rated):
            podcast =  np.random.randint(0, podcasts_df.shape[0]+1)
            
            # don't want the same user to review the same podcast multiple times
            while (podcast in reviewed):
                podcast =  np.random.randint(0, podcasts_df.shape[0]+1)
            reviewed.add(podcast)

            rating = np.random.randint(0,6)
            ratings.append([podcast, rating])
        
        user_df = pd.DataFrame(ratings, columns=['podcast_idx', 'rating'])
        user_df['user_id'] = idx

        user_ratings.append(user_df)

    return pd.concat(user_ratings)


In [99]:
usr = generate_user_ratings(1000)

In [110]:
usr.pivot_table('podcast_idx', index='user_id', columns='rating', aggfunc='count', margins=True)

rating,0,1,2,3,4,5,All
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,6
1,1.0,2.0,2.0,1.0,2.0,,8
2,,4.0,2.0,2.0,5.0,3.0,16
3,3.0,1.0,2.0,6.0,3.0,2.0,17
4,1.0,4.0,3.0,2.0,3.0,1.0,14
...,...,...,...,...,...,...,...
996,4.0,2.0,6.0,5.0,2.0,1.0,20
997,1.0,3.0,,,2.0,1.0,7
998,1.0,1.0,1.0,,2.0,1.0,6
999,,1.0,2.0,4.0,3.0,2.0,12
