In [None]:
!pip install stop-words
!pip install pyspark
!pip install urllib3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stop-words
  Downloading stop-words-2018.7.23.tar.gz (31 kB)
Building wheels for collected packages: stop-words
  Building wheel for stop-words (setup.py) ... [?25l[?25hdone
  Created wheel for stop-words: filename=stop_words-2018.7.23-py3-none-any.whl size=32911 sha256=8cd167ab61550322ac188420086442feff63153fbf07c47d72702be2ab8ef8c8
  Stored in directory: /root/.cache/pip/wheels/fb/86/b2/277b10b1ce9f73ce15059bf6975d4547cc4ec3feeb651978e9
Successfully built stop-words
Installing collected packages: stop-words
Successfully installed stop-words-2018.7.23
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 61 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |█████

In [None]:
import itertools
import re
import nltk
import pandas as pd
import numpy as np
import urllib3

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from stop_words import get_stop_words

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

# Developing a Podcast Recommender!

Final Project - APAM Senior Seminar - Fall 2022 - October 10

Yamini Ananth, Jafar Vohra, Kathy Wang, Abhiram Kolluri

# Data Pre-Processing

Data scraped from Apple Podcasts using BeautifulSoup

Scripts for web scraping attributed to [Siddharth Kumaran](https://github.com/siddgood/podcast-recommendation-engine/blob/master/scripts/get_podcast_info.py)

Scraped data includes: 
* Title (text)
* Producer (text)
* Description (text)
* 6 Recent Episode Titles (text)
* 6 Recent Episode Descriptions (text)

Pre-processing included:
* Filtered out URLs and special characters
* Tokenized (separated each word into its own string)
* Removed stop-words (common words like articles, pronouns etc)
* Lemmatized (reduced each word to its dictionary form, so ‘stories’ becomes ‘story’ and ‘likes’ becomes ‘like’; unlike stemming, words are not truncated to a root such as ‘lik’)


In [None]:
## Experimental: trying to get the pickled dataset over HTTP.
# The fetched value (`podcast_data_pkl`) is not used anywhere else in this notebook;
# the dataset is actually loaded from a local path in the following cell.
#
# NOTE(review): the URL is a github.com `/blob/` page, which presumably serves the HTML
#   file viewer rather than the raw pickle bytes — confirm and switch to the raw file URL.
# NOTE(review): a pickle is binary data; decoding it as UTF-8 and handing the resulting
#   string to pd.read_pickle (commented out below) does not look like it can work —
#   read_pickle expects a path or a binary buffer.
# NOTE(review): unpickling data fetched from the network can execute arbitrary code;
#   only do this for a source you trust.

http = urllib3.PoolManager()
req = http.request("GET", "https://github.com/yaminivibha/podcast-recs/blob/main/data/data/pickle_files/english_podcasts_detailed_cleaned.pkl")
podcast_data_pkl = req.data.decode('utf-8')
#podcasts_df_orig = pd.read_pickle(podcast_data_pkl)



In [None]:
podcasts_df_orig = pd.read_pickle('/media/english_podcasts_detailed_cleaned.pkl')

In [None]:
# Combining all text data into one column for downstream analysis

# Work on a copy: a plain assignment would alias the raw frame, so adding the
# 'text' column below would silently modify podcasts_df_orig as well.
podcasts_df = podcasts_df_orig.copy()

# Every free-text field is concatenated (space-separated) into a single 'text' column.
text_columns = ['title', 'producer', 'genre', 'description', 'episode_titles', 'episode_descriptions']
podcasts_df['text'] = podcasts_df[text_columns].apply(lambda x: ' '.join(x), axis=1)

# Keep only what the recommenders need (title, producer, text).
podcasts_df = podcasts_df.drop(columns=['genre', 'description', 'num_episodes', 'rating', 'num_reviews', 'link', 'episode_titles', 'episode_descriptions'])

# Positional id for each podcast (0 .. n-1 in row order).
podcasts_df['idx'] = list(range(podcasts_df.shape[0]))

In [None]:
# Stop-word list and tokenizer shared by the text-cleaning helpers below.

# Punctuation and underscores are stripped from every English stop word (an entry
# such as "don't" would become "dont"), using the same pattern preprocess_text
# applies to the podcast text before tokenizing.
_punctuation = re.compile(r'([^\s\w]|_)+')
stop = [_punctuation.sub('', word) for word in get_stop_words('en')]

# Tokens are maximal runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
# Creating helper functions to remove stop words 
# and lemmatize tokenized sentences

def remove_stop(text, stop):
    """Drop stop words from a tokenized sentence.

        parameters:
            text: (iterable[str]) tokens, in order
            stop: (iterable[str]) words to remove
        returns:
            (list[str]) the tokens of `text` that are not in `stop`,
                        in their original order
    """
    # Hash the stop words once so each membership test is O(1)
    # instead of a linear scan of the list for every token.
    stop_words = frozenset(stop)
    return [word for word in text if word not in stop_words]

def lemmatize(text, l_stemmer):
    """Lemmatize every token of a tokenized sentence.

        parameters:
            text: (iterable[str]) tokens, in order
            l_stemmer: object exposing a `lemmatize(word)` method
        returns:
            (list[str]) one lemma per input token, order preserved
    """
    lemmas = []
    for token in text:
        lemmas.append(l_stemmer.lemmatize(token))
    return lemmas

In [None]:
def preprocess_text(text):
    """Clean one podcast's raw text and return it as a space-joined string of lemmas.

        Steps, in order: strip URLs, drop tokens containing digits, strip
        punctuation/underscores, lowercase + tokenize, remove stop words,
        lemmatize.

        parameters:
            text: (str) raw text
        returns:
            (str) cleaned text; an empty string when nothing survives cleaning
    """
    # URLs go first, while their punctuation is still intact. The result has to be
    # assigned back: re.sub returns a new string and leaves its input unchanged.
    text = re.sub(r"http\S+", "", text)
    # remove mixed alphanumeric tokens (any word containing a digit) and the
    # whitespace that follows them
    text = re.sub(r"""(?x) \b(?=\w*\d)\w+\s*""","", text)
    # remove remaining punctuation and underscores
    text = re.sub(r'([^\s\w]|_)+', '', text)

    tokens = tokenizer.tokenize(text.lower())
    tokens = remove_stop(tokens, stop)
    tokens = lemmatize(tokens, WordNetLemmatizer())

    return ' '.join(tokens)

In [None]:
podcasts_df['text'] = podcasts_df['text'].map(preprocess_text)

# Drop podcasts whose text is empty after cleaning, then renumber 'idx'.
# The similarity matrices built later have one row per remaining podcast, and
# recommend() uses 'idx' directly as a row/column position in those matrices,
# so 'idx' must stay equal to each podcast's position after the filter.
podcasts_df = podcasts_df.query('text !=""').reset_index(drop=True)
podcasts_df['idx'] = np.arange(podcasts_df.shape[0])

# Preparing Utilities for Recommendation

Collection of helper functions

In [None]:
def get_title_from_index(index):
    """Look up a podcast's title by its 'idx' value in podcasts_df.

        parameters:
            index: (int) value to match against podcasts_df['idx']
        returns:
            title (string) of the first matching row
        raises:
            IndexError: no row of podcasts_df has this idx
    """
    matching_titles = podcasts_df.loc[podcasts_df["idx"] == index, "title"]
    return matching_titles.values[0]

def get_index_from_title(title):
    """Look up a podcast's 'idx' value by its exact title in podcasts_df.

        parameters:
            title: (string) value to match against podcasts_df['title']
        returns:
            index (int) of the first matching row
        raises:
            IndexError: no row of podcasts_df has this title
    """
    matching_ids = podcasts_df.loc[podcasts_df["title"] == title, "idx"]
    return matching_ids.values[0]

In [None]:
def recommend(podcast_title, sim_matrix, number_recs=5, pretty_print=True):
    """given a podcast title & a similarity matrix, return n most similar podcasts
        parameters:
            podcast_title: (str) must be in podcasts_df['title']
            sim_matrix: (np.array) item-item similarity matrix whose row/column
                        positions correspond to podcasts_df['idx']
            number_recs: (int) how many recommendations do you want per title?
            pretty_print: (bool) also print the recommendations when True
        returns:
            recommendations: (list[str]) titles of the (at most) number_recs most
                            similar podcasts, most similar first; the queried
                            podcast itself is never included
    """

    podcast_id = get_index_from_title(podcast_title)

    # Rank every podcast by similarity to the query, most similar first.
    ranked = sorted(enumerate(sim_matrix[podcast_id]), key=lambda pair: pair[1], reverse=True)

    # Exclude the query explicitly instead of assuming it sorts first (another
    # podcast with identical text ties with it), and take exactly number_recs
    # entries; slicing also copes with catalogs smaller than number_recs.
    top_ids = [idx for idx, _ in ranked if idx != podcast_id][:max(number_recs, 0)]
    recommendations = [get_title_from_index(idx) for idx in top_ids]

    ### formatting for pretty printing ###
    if pretty_print:
        print("If you liked {}, try: ".format(podcast_title))
        for title in recommendations:
            print("     {}".format(title))

    return recommendations

In [None]:
# Podcasts we'll use to validate results
sample_podcasts = ['The Daily', "Murder, etc.",'This American Life', 'Call Her Daddy', 'The Joe Rogan Experience']

# Bag of Words + Cosine Similarity

Here, we use the bag of words model to encode the podcast text and use that to generate a cosine similarity matrix.

In [None]:
cv = CountVectorizer()
cv_matrix = cv.fit_transform(podcasts_df["text"])
cv_cosine_sim = cosine_similarity(cv_matrix)

In [None]:
for i in sample_podcasts:
    recs = recommend(i, cv_cosine_sim)
    print('\n')

If you liked The Daily, try: 
     Impeachment Inquiry: Updates from The Washington Post
     Impeachment: A Daily Podcast
     The Takeaway
     Article II: Inside Impeachment
     The Daily 202's Big Idea
     The 11th Hour with Brian Williams


If you liked Murder, etc., try: 
     Criminology
     Murderville
     Unsolved Murders: True Crime Stories
     Murder Minute
     Don't Talk to Strangers
     True Crime All The Time Unsolved


If you liked This American Life, try: 
     The Stoop Storytelling Series
     The Story Home Children's Audio Stories
     Spooky Boo's Scary Story Time
     The Story Behind
     This is the Gospel Podcast
     1001 Heroes, Legends, Histories & Mysteries Podcast


If you liked Call Her Daddy, try: 
     Stiff Socks
     Two Judgey Girls
     NAKED with Catt Sadler
     Slay Girl Slay
     Hot Marriage. Cool Parents.
     Safe For Work


If you liked The Joe Rogan Experience, try: 
     The Creative Penn Podcast For Writers
     1001 Classic Short 

In [None]:
#Try it yourself! 
your_podcast = "Song Exploder" #Replace this with a podcast of your choice!
recs = recommend(your_podcast, cv_cosine_sim)

If you liked Song Exploder, try: 
     All Songs Considered
     The Album Club
     Celebration Rock
     Song Confessional
     And The Writer Is...with Ross Golan
     The Sleeping At Last Podcast


# TFIDF + Cosine Similarity 

Here, we use tf-idf to encode the podcast text and use that to generate a cosine similarity matrix.

In [None]:
tf = TfidfVectorizer()
tf_matrix = tf.fit_transform(podcasts_df["text"])
tf_cosine_sim = cosine_similarity(tf_matrix)

In [None]:
for i in sample_podcasts:
    recs = recommend(i, tf_cosine_sim)
    print('\n')

If you liked The Daily, try: 
     Impeachment Inquiry: Updates from The Washington Post
     The 11th Hour with Brian Williams
     The Daily 202's Big Idea
     Article II: Inside Impeachment
     Impeachment: A Daily Podcast
     The Takeaway


If you liked Murder, etc., try: 
     Murder Minute
     Criminology
     Murderville
     Unsolved Murders: True Crime Stories
     Don't Talk to Strangers
     True Crime All The Time Unsolved


If you liked This American Life, try: 
     Experimental Brewing
     1A
     Through the Looking Glass: A LOST Retrospective
     The Grave Talks | Haunted, Paranormal & Supernatural
     Darkness Prevails Podcast | TRUE Horror Stories
     BeerSmith Home and Beer Brewing Podcast


If you liked Call Her Daddy, try: 
     hey, girl.
     Girls Night with Stephanie May Wilson
     Stiff Socks
     Fierce Girls
     Becoming Something with Jonathan Pokluda
     Two Judgey Girls


If you liked The Joe Rogan Experience, try: 
     MILLION DOLLAR LIFE LE

In [None]:
#Try it yourself! 
your_podcast = "Song Exploder" #Replace this with a podcast of your choice!
recs = recommend(your_podcast, tf_cosine_sim)

If you liked Song Exploder, try: 
     All Songs Considered
     The Album Club
     Celebration Rock
     And The Writer Is...with Ross Golan
     Song Confessional
     Song Talk Radio | Songwriting Tips | Lyrics | Arranging | Live Feedback


# Compare results of the two models

We want to see whether the two models tend to agree, and what fraction of the full podcast catalog is ever actually recommended (i.e., do we address the long-tail problem?).

In [None]:
def print_compare(pod, num_recs=5):
    """Print a side-by-side comparison of tf-idf and cv recommendations.

        For the given podcast, prints three groups: titles recommended by both
        models, titles recommended only by tf-idf, and titles recommended only
        by cv. Within each group titles keep their recommendation rank order.

        parameters:
            pod: (str) podcast title, must be in podcasts_df['title']
            num_recs: (int) number of recommendations requested from each model
        returns:
            None (output is printed)
    """

    tf_idf_recs = recommend(pod, tf_cosine_sim, num_recs, pretty_print=False)
    cv_recs = recommend(pod, cv_cosine_sim, num_recs, pretty_print=False)

    # Sets are used only for membership tests; iterating the ranked lists keeps
    # the output order deterministic (set iteration order is arbitrary).
    # dict.fromkeys drops repeated titles while preserving order.
    tf_idf_set, cv_set = set(tf_idf_recs), set(cv_recs)
    both = list(dict.fromkeys(t for t in tf_idf_recs if t in cv_set))
    unique_to_tf = list(dict.fromkeys(t for t in tf_idf_recs if t not in cv_set))
    unique_to_cv = list(dict.fromkeys(t for t in cv_recs if t not in tf_idf_set))

    print("Recs for {}: ".format(pod))

    print("    Recommended by both tf-idf and cv:")
    for title in both:
        print("         {}".format(title))

    print("    Uniquely recommended by tf-idf:")
    for title in unique_to_tf:
        print("         {}".format(title))

    print("    Uniquely recommended by cv:")
    for title in unique_to_cv:
        print("         {}".format(title))
    print('\n')

In [None]:
for pod in sample_podcasts: print_compare(pod) 

Recs for The Daily: 
    Recommended by both tf-idf and cv:
         The 11th Hour with Brian Williams
         Impeachment: A Daily Podcast
         Article II: Inside Impeachment
         Impeachment Inquiry: Updates from The Washington Post
         The Takeaway
         The Daily 202's Big Idea
    Uniqely recommended by tf-idf:
    Uniqely recommended by cv:


Recs for Murder, etc.: 
    Recommended by both tf-idf and cv:
         Don't Talk to Strangers
         Murder Minute
         Criminology
         Murderville
         True Crime All The Time Unsolved
         Unsolved Murders: True Crime Stories
    Uniqely recommended by tf-idf:
    Uniqely recommended by cv:


Recs for This American Life: 
    Recommended by both tf-idf and cv:
    Uniqely recommended by tf-idf:
         1A
         The Grave Talks | Haunted, Paranormal & Supernatural
         Darkness Prevails Podcast | TRUE Horror Stories
         Experimental Brewing
         BeerSmith Home and Beer Brewing Podcast
 

In [None]:
# Try it yourself!

your_podcast = "Song Exploder" #Replace this with your podcast 
print_compare(your_podcast)

Recs for Song Exploder: 
    Recommended by both tf-idf and cv:
         The Album Club
         All Songs Considered
         Celebration Rock
         And The Writer Is...with Ross Golan
         Song Confessional
    Uniqely recommended by tf-idf:
         Song Talk Radio | Songwriting Tips | Lyrics | Arranging | Live Feedback
    Uniqely recommended by cv:
         The Sleeping At Last Podcast




In [None]:
def coverage(model_name, sim_matrix, num_recs=10):
    """Track what % of the overall library of podcasts
        was ever actually recommended, when we serve
        num_recs recs for each podcast in the library

        An item is never counted as a recommendation for itself: every item's
        self-similarity is the row maximum, so including it would put every
        item in its own top-k and make coverage trivially 100%.

        parameters:
          model_name: (str) label used in the printed report, e.g. 'tf-idf' or 'cv';
                    should correspond to the passed sim_matrix
          sim_matrix: (np.array) a square item-item similarity matrix (not modified)
          num_recs: (int) how many recs for each item in library?
                    (capped at number of items - 1)
        returns:
          indices: (np.array) shape (n_items, k); row i holds the positions of the
                    k items most similar to item i, in no particular order
        raises:
          ValueError: sim_matrix is not a square 2-D matrix
    """
    # Work on a float copy so the caller's matrix is left untouched.
    scores = np.array(sim_matrix, dtype=float)
    if scores.ndim != 2 or scores.shape[0] != scores.shape[1]:
        raise ValueError("sim_matrix must be a square item-item matrix")
    n_items = scores.shape[0]

    # Mask self-similarity so an item cannot be selected for its own row.
    np.fill_diagonal(scores, -np.inf)

    # There are only n_items - 1 other items to recommend.
    k = max(0, min(num_recs, n_items - 1))
    if k > 0:
        indices = np.argpartition(scores, -k, axis=1)[:, -k:]
    else:
        indices = np.empty((n_items, 0), dtype=int)

    #calculating coverage:
    recommended = np.unique(indices)
    coverage_pct = (len(recommended) / n_items) * 100 if n_items else 0.0

    print("Stats for {} Model with {} recs".format(model_name, num_recs))
    print("    Coverage: {} %".format(coverage_pct))

    return indices

In [None]:
cv_recs_10 = coverage("CountVectorizer", cv_cosine_sim, 5)
tf_idf_recs_10 = coverage("tf-idf", tf_cosine_sim, 5)

Stats for CountVectorizer Model with 5 recs
    Coverage: 100.0 %
Stats for tf-idf Model with 5 recs
    Coverage: 100.0 %


# Generating Fake User Ratings

We want to create users that have preferences.
Each user rates between 5 and 20 randomly selected podcasts, giving each a random score from 1 to 5.
This is not a realistic way to generate user ratings (real users tend to like similar things and rate in consistent patterns), but it gives us data to exercise the collaborative-filtering pipeline.

In [None]:
def generate_user_ratings(users_count, seed=None):
    """generates fake user ratings

      Each user rates between 5 and 20 distinct, randomly chosen podcasts
      (fewer if the catalog is smaller) with a random rating from 1 to 5.

      parameters:
        users_count: (int) how many fake users to generate
        seed: (int, optional) seed for the random generator; pass a value
              to make the generated ratings reproducible
      returns:
        users: (pd.DataFrame) one row per rating, with columns
              user_id, podcast_idx, rating, podcast_title
    """
    columns = ['user_id', 'podcast_idx', 'rating', 'podcast_title']
    rng = np.random.default_rng(seed)

    # Sample catalog rows and read idx and title from the same row, so the two
    # always describe the same podcast and only existing idx values are produced.
    catalog_idx = podcasts_df['idx'].to_numpy()
    catalog_titles = podcasts_df['title'].to_numpy()
    n_podcasts = len(catalog_idx)

    user_ratings = []
    for user_id in range(users_count):
        quantity_rated = min(int(rng.integers(5, 21)), n_podcasts)

        # replace=False: don't want the same user to review
        # the same podcast multiple times
        rows = rng.choice(n_podcasts, size=quantity_rated, replace=False)
        ratings = rng.integers(1, 6, size=quantity_rated)

        user_df = pd.DataFrame(
            {
                'user_id': user_id,
                'podcast_idx': catalog_idx[rows],
                'rating': ratings,
                'podcast_title': catalog_titles[rows],
            },
            columns=columns,
        )
        user_ratings.append(user_df)

    # pd.concat refuses an empty list; return an empty table with the right columns.
    if not user_ratings:
        return pd.DataFrame(columns=columns)
    return pd.concat(user_ratings)

In [None]:
def checkUserProfile(user_idx, pretty_print=True):
  """Summarise one user's rating history from the global `usr` ratings table.

    parameters:
      user_idx: (int) user id to look up in usr['user_id']
      pretty_print: (boolean) whether or not printed outcomes are desired

    returns:
      user_profile: (dict) with keys
          'no_reviews'  - number of ratings the user gave
          'top_5_shows' - titles of up to 5 of the user's highest-rated podcasts
          'ave_rating'  - mean of the user's ratings
  """
  # This user's ratings, best-rated first
  ranked_reviews = usr[usr['user_id'] == user_idx].sort_values('rating', ascending=False)

  user_profile = {}
  user_profile['no_reviews'] = len(ranked_reviews)
  user_profile['top_5_shows'] = ranked_reviews['podcast_title'].head(5).to_list()
  user_profile['ave_rating'] = ranked_reviews['rating'].mean()

  if not pretty_print:
    return user_profile

  #### formatting for pretty printing ###
  report = [
      f"User #{user_idx} Profile:",
      f"{user_profile['no_reviews']} reviews",
      f"Average rating: {user_profile['ave_rating']} stars",
      "Top 5 shows:",
  ]
  report.extend(f"       {show}" for show in user_profile['top_5_shows'])
  report.append("                ..            ")
  for line in report:
    print(line)

  return user_profile

In [None]:
num_users = 1000
usr = generate_user_ratings(num_users)

In [None]:
#investigate a random user!
my_random_user = np.random.randint(0, num_users)
profile = checkUserProfile(my_random_user)

User #618 Profile:
14 reviews
Average rating: 3.0714285714285716 stars
Top 5 shows:
       The Vanished Podcast
       Government Accountability Office (GAO) Podcast: Watchdog Report
       Startup Hustle
       Story Break
       Middle:Below
                ..            


# Implement Collaborative Filtering

Now that we have our user rating data, we can implement collaborative filtering to generate recommendations based on user similarity. We specifically used the pyspark implementation of ALS Matrix Factorization with root mean squared error. 

We used a pyspark implementation of ALS code as published by [Jeffrey Chiang](https://github.com/chiang9/Medium_blog/blob/main/ALS_model/movielen%20ALS.ipynb)

In [None]:
# Reuse the running Spark context/session if one exists, otherwise start one.
sc = SparkContext.getOrCreate()
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .getOrCreate()
)

In [None]:
df = spark.createDataFrame(usr)

In [None]:
train, test = df.randomSplit([0.7,0.3],111)

In [None]:
# we use the cross validator to tune the hyperparameters

# ALS matrix factorisation over (user_id, podcast_idx, rating) triples.
# coldStartStrategy="drop" is set so users/items unseen during fitting do not
# contribute NaN predictions to the evaluation metric.
als = ALS(
         userCol="user_id",
         itemCol="podcast_idx",
         ratingCol="rating",
         coldStartStrategy="drop"
)

# Only `rank` is actually searched; regParam and maxIter are held fixed.
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [10, 100]) \
            .addGrid(als.regParam, [.1]) \
            .addGrid(als.maxIter, [10]) \
            .build()

# Candidate models are compared by RMSE between predicted and actual ratings.
evaluator = RegressionEvaluator(
           metricName="rmse",
           labelCol="rating",
           predictionCol="prediction")

# Named `cross_validator` rather than `cv`: `cv` is already the CountVectorizer
# from the bag-of-words section, and rebinding it would break re-running that cell.
cross_validator = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3, parallelism = 6)
model = cross_validator.fit(train)

In [None]:
# Model that the cross validator selected as best.
best_model = model.bestModel

# The chosen hyperparameters are read back from the model's parent object through the
# private `_java_obj` handle.
# NOTE(review): `_java_obj` is not public API, and `parent()` is presumably the ALS
#   estimator that produced the model — confirm this still works when upgrading pyspark.
print(f"Rank = {best_model._java_obj.parent().getRank()}")
print(f"MaxIter = {best_model._java_obj.parent().getMaxIter()}")
print(f"RegParam = {best_model._java_obj.parent().getRegParam()}")

Rank = 100
MaxIter = 10
RegParam = 0.1


In [None]:
prediction = best_model.transform(test)
rmse = evaluator.evaluate(prediction)
print(f'RMSE = {rmse}')

# we can get the user latent factors and item latent factors from the model
user_latent_features = best_model.userFactors
item_latent_features = best_model.itemFactors

RMSE = 2.5798581166142873


In [None]:
user_recs = best_model.recommendForAllUsers(3)
user_recs_pandas = user_recs.toPandas()

In [None]:
def checkUserRecommendations(user_row_idx, pretty_print=True):
  """Look up one user's recommended podcasts and, optionally, print them
    together with the user's profile.

    parameters:
      user_row_idx: (int) positional index of a row in the user_recs_pandas dataframe
                    (a row position, not a user id)
      pretty_print: (boolean) whether or not printed outcomes are desired

    returns:
      user_recs: (list) recommended podcast titles
  """
  user_row = user_recs_pandas.iloc[user_row_idx]
  user_id = user_row['user_id']

  # Forward pretty_print so that a quiet call really prints nothing.
  checkUserProfile(user_id, pretty_print=pretty_print)

  # Each recommendation entry carries the recommended podcast's idx; map it to a title.
  user_recs = [get_title_from_index(rec['podcast_idx'])
               for rec in user_row['recommendations']]

  #### formatting for pretty printing ###
  if pretty_print:
    print("We recommend the following: ")
    for rec_title in user_recs:
      print(f"       {rec_title}")
    print("\n")
  return user_recs

In [None]:
# Checking out the profiles & recommendations 
# for 10 random users

for i in np.random.randint(0, len(user_recs_pandas), 10):
  checkUserRecommendations(i)

User #782 Profile:
6 reviews
Top 5 shows:
       The Premed Years
       The Official Average Boy Podcast
       Appalachian Unsolved
       Dr. Wayne W. Dyer Podcast
       Teaching Hard History: American Slavery
                ..            
We recommend the following: 
       The Premed Years
       Heartland Radio 2.0
       The Sleeping At Last Podcast


User #906 Profile:
9 reviews
Top 5 shows:
       Guitar Music Theory
       Informed Consent
       Family Secrets
       Ain't No Such Thing - Original Southern Horror Stories
       Executive Edge
                ..            
We recommend the following: 
       Family Secrets
       Informed Consent
       The Week in Health Law


User #459 Profile:
7 reviews
Top 5 shows:
       Pregnancy Confidential
       Medical Medium Podcast
       The Rachel Maddow Show
       DISGRACELAND
       Sirenicide
                ..            
We recommend the following: 
       Pregnancy Confidential
       Hang Up and Listen
       Where S

In [None]:
#try it yourself!
my_random_user = np.random.randint(0, len(user_recs_pandas))
recs = checkUserRecommendations(my_random_user)

User #450 Profile:
19 reviews
Top 5 shows:
       Joe Budden: Meet the Musician
       Frankly Speaking About Family Medicine
       Thinking Sideways Podcast
       True Tales From Old Houses
       Breaking Into Startups
                ..            
We recommend the following: 
       14 Days with Felicity
       Noodle Loaf
       Happy Face


