In [1]:
import re
import nltk
import random
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 

import numpy as np
import pandas as pd
import heapq
import numpy as np
from collections import Counter

# Preparing DataFrame

In [2]:
podcasts_df = pd.read_pickle('../data/pickle_files/english_podcasts_detailed_cleaned.pkl')

In [3]:
podcasts_df['text'] = podcasts_df[['title', 'producer', 'genre', 'description', 'episode_titles', 'episode_descriptions']].apply(lambda x: ' '.join(x), axis=1)
podcasts_df = podcasts_df.drop(columns=['genre', 'description', 'num_episodes', 'rating', 'num_reviews', 'link', 'episode_titles', 'episode_descriptions'])
podcasts_df['ID'] = list(range(podcasts_df.shape[0]))

# Preprocessing Text

In [4]:
# create list of stop words
stop = get_stop_words('en')

# remove non-alphanumeric, non-space
stop = [re.sub(r'([^\s\w]|_)+', '', x) for x in stop]

# add in custom stop words
days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']

months = ['january', 'february', 'march', 'april', 'may', 'june', 
          'july', 'august', 'september', 'october', 'november', 'december']

other = ['nan', 'podcast', 'podcasts', 'every', 'new', 'weekly', 
         'stories', 'story', 'episode', 'episodes', 'listen', 
         'host', 'hosted', 'join']

[stop.append(str(day)) for day in days]
[stop.append(str(month)) for month in months]
[stop.append(str(x)) for x in other]

def topKFrequent(tokenized_text, k): 
   
    count = Counter(tokenized_text)   
    
    return heapq.nlargest(k, count.keys(), key=count.get)

def remove_stop(text, stop):
    custom_stop = stop
#     top5 = topKFrequent(text, 5)
#     custom_stop = custom_stop + top5
    
    new_text = []
    for word in text:
        if word not in custom_stop:
            new_text.append(word)
    return new_text

# create tokenizer
tokenizer = RegexpTokenizer(r'\w+')

# create stemmer
p_stemmer = PorterStemmer()
l_stemmer = WordNetLemmatizer() 


def stem_list(text, p_stemmer):
    new_list = []
    for word in text:
        new_list.append(p_stemmer.stem(word))
    return new_list

def lem_list(text, l_stemmer):
    new_list = []
    for word in text:
        new_list.append(l_stemmer.lemmatize(word))
    return new_list

def preprocess_text(text):
    # remove mixed alphanumeric
    text = re.sub(r"""(?x) # verbose regex
                            \b    # Start of word
                            (?=   # Look ahead to ensure that this word contains...
                             \w*  # (after any number of alphanumeric characters)
                             \d   # ...at least one digit.
                            )     # End of lookahead
                            \w+   # Match the alphanumeric word
                            \s*   # Match any following whitespace""", 
                             "", text)
    
    # remove urls (will check and remove http and www later)
    text = re.sub(r'\s([\S]*.com[\S]*)\b', '', text)
    text = re.sub(r'\s([\S]*.org[\S]*)\b', '', text)
    text = re.sub(r'\s([\S]*.net[\S]*)\b', '', text)
    text = re.sub(r'\s([\S]*.edu[\S]*)\b', '', text)
    text = re.sub(r'\s([\S]*.gov[\S]*)\b', '', text)
    
    # remove non-alphanumeric, non-space
    text = re.sub(r'([^\s\w]|_)+', '', text)
    
    # tokenize text
    text = tokenizer.tokenize(text.lower())
    
    # remove stop words
    text = remove_stop(text, stop)
    
    # stem
    text = lem_list(text, l_stemmer)
    
    # remove instances of http or www
    new_text_list = []
    for word in text:
        if re.search(r'http', word):
            continue
        if re.search(r'www', word):
            continue
        new_text_list.append(word)
    
    new_text = ' '.join(new_text_list)
    
    return new_text

In [5]:
podcasts_df['text'] = podcasts_df['text'].map(preprocess_text)
podcasts_df = podcasts_df[podcasts_df.text != '']

In [6]:
podcasts_df.head()

Unnamed: 0,title,producer,text,ID
0,History Hyenas with Chris Distefano and Yannis...,RiotCast Network,history hyena chris distefano yannis pappa rio...,0
1,Curiosity Daily,Westwood One,curiosity daily westwood one education awardwi...,1
2,Spirits,Multitude,spirit multitude history boozy mythology legen...,2
3,The Soundtrack Show,iHeartRadio,soundtrack show iheartradio tv film soundtrack...,3
4,Writing Excuses,"Brandon Sanderson, Mary Robinette Kowal, Dan W...",writing excuse brandon sanderson mary kowal da...,4


# Podcast Recommendation Models

In [7]:
# helper functions
def get_title_from_index(index):
    return podcasts_df[podcasts_df.ID == index]["title"].values[0]

def get_index_from_title(title):
    return podcasts_df[podcasts_df.title == title]["ID"].values[0]

def get_recommendations(podcast_id, sim_matrix):
    recommendations = list()
    
    podcast_title = get_title_from_index(podcast_id)
    similar_podcasts =  list(enumerate(sim_matrix[podcast_id]))
    sorted_similar_podcast = sorted(similar_podcasts,key=lambda x:x[1],reverse=True)
    
    for i in range(11):
        title = get_title_from_index(sorted_similar_podcast[i][0])
        recommendations.append(title)
    
    return recommendations

test_podcasts = ['The Daily', 'Up First', 'VIEWS with David Dobrik and Jason Nash', 'Impaulsive with Logan Paul',
                 'The Bill Simmons Podcast', 'My Favorite Murder with Karen Kilgariff and Georgia Hardstark',
                 'This American Life', 'Joel Osteen Podcast', 'TED Radio Hour', 'Call Her Daddy', 
                 'Skip and Shannon: Undisputed'
                ]

### Cosine Similarity + CountVectorizer (Bag of Words) Method

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

cv = CountVectorizer()
cv_matrix = cv.fit_transform(podcasts_df["text"])
cv_cosine_sim = cosine_similarity(cv_matrix)

In [9]:
for i in test_podcasts:
    print(get_recommendations(get_index_from_title(i), cv_cosine_sim))
    print()

['The Daily', 'Impeachment Inquiry: Updates from The Washington Post', 'Impeachment: A Daily Podcast', 'The Takeaway', 'Article II: Inside Impeachment', "The Daily 202's Big Idea", 'The 11th Hour with Brian Williams', 'Bill O’Reilly’s No Spin News and Analysis', 'The Last Word with Lawrence O’Donnell', 'Up First', 'Impeachment Today']

['Up First', 'Impeachment: A Daily Podcast', 'Impeachment Inquiry: Updates from The Washington Post', 'The Daily', 'Article II: Inside Impeachment', 'The 11th Hour with Brian Williams', 'Bill O’Reilly’s No Spin News and Analysis', "The Daily 202's Big Idea", 'The Takeaway', 'Can He Do That?', 'The Last Word with Lawrence O’Donnell']

['VIEWS with David Dobrik and Jason Nash', 'Instant Message', 'I Am In Eskew', 'Jalen & Jacoby', 'The Axe Files with David Axelrod', 'Getting Things Done', 'The Tower', 'The Permaculture Podcast', 'Making It With Jimmy Diresta, Bob Clagett and David Picciuto', 'Psychology of Eating', 'Blank Check with Griffin & David']

['Im

### Cosine Similarity + TFIDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer()
tf_matrix = tf.fit_transform(podcasts_df["text"])
tf_cosine_sim = cosine_similarity(tf_matrix)

In [11]:
for i in test_podcasts:
    print(get_recommendations(get_index_from_title(i), tf_cosine_sim))
    print()

['The Daily', 'Impeachment Inquiry: Updates from The Washington Post', "The Daily 202's Big Idea", 'The 11th Hour with Brian Williams', 'Article II: Inside Impeachment', 'Impeachment: A Daily Podcast', 'The Takeaway', 'The Last Word with Lawrence O’Donnell', 'Bill O’Reilly’s No Spin News and Analysis', 'The Situation Room with Wolf Blitzer', 'The Rachel Maddow Show']

['Up First', 'Impeachment Inquiry: Updates from The Washington Post', 'The Daily', 'Article II: Inside Impeachment', 'The 11th Hour with Brian Williams', 'Impeachment: A Daily Podcast', "The Daily 202's Big Idea", 'Can He Do That?', 'The Last Word with Lawrence O’Donnell', 'Bill O’Reilly’s No Spin News and Analysis', 'The Takeaway']

['VIEWS with David Dobrik and Jason Nash', 'Instant Message', 'How Did This Get Made?', 'Light The Fight- Parenting Podcast', 'Flow Sessions with Jason Silva', 'This Week in Startups - Audio', 'The Tower', 'True Cold Case Files with Jason and Daisy', 'Horror Hill: A Horror Anthology and Scary

### Cosine Similarity + Custom Word2Vec Embedding

In [12]:
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot

In [13]:
class MyTokenizer:
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        transformed_X = []
        for document in X:
            tokenized_doc = []
            for sent in nltk.sent_tokenize(document):
                tokenized_doc += nltk.word_tokenize(sent)
            transformed_X.append(np.array(tokenized_doc))
        return np.array(transformed_X)
    
    def fit_transform(self, X, y=None):
        return self.transform(X)

class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(word2vec.wv.syn0[0])

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = MyTokenizer().fit_transform(X)
        
        return np.array([
            np.mean([self.word2vec.wv[w] for w in words if w in self.word2vec.wv]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
    
    def fit_transform(self, X, y=None):
        return self.transform(X)

In [14]:
text_list = list(podcasts_df.text)
tokenized_text = [tokenizer.tokenize(i) for i in text_list]

In [15]:
w2v_model = Word2Vec(tokenized_text, sg=1)

In [16]:
mean_embedding_vectorizer = MeanEmbeddingVectorizer(w2v_model)
mean_embedded = mean_embedding_vectorizer.fit_transform(podcasts_df['text'])
w2v_cosine_sim = cosine_similarity(mean_embedded)



In [17]:
for i in test_podcasts:
    print(get_recommendations(get_index_from_title(i), w2v_cosine_sim))
    print()

['The Daily', 'Impeachment: A Daily Podcast', 'Up First', 'The Takeaway', 'Can He Do That?', 'Global News Podcast', 'Skimm This', 'Mark Levin Podcast', 'Mark Levin Podcast', 'Post Reports', 'The Asset']

['Up First', 'Impeachment: A Daily Podcast', 'The Daily', 'The Takeaway', 'Mark Levin Podcast', 'Mark Levin Podcast', 'Post Reports', 'Can He Do That?', 'PBS NewsHour - Full Show', 'Skimm This', 'Global News Podcast']

['VIEWS with David Dobrik and Jason Nash', 'Amy Schumer Presents: 3 Girls, 1 Keith', 'Not Skinny But Not Fat', 'The Archers', 'The Fighter & The Kid', 'Adulting', 'Ringer Dish', 'Drew and Mike Show', 'Andrea Savage: A Grown-Up Woman #buttholes', 'The Three Questions with Andy Richter', 'Dead Pilots Society']

['Impaulsive with Logan Paul', 'Comments by Celebs', 'Drew and Mike Show', 'Heartland Radio 2.0', 'Curious with Josh Peck', 'The MeatEater Podcast', 'KFC Radio', 'Not Skinny But Not Fat', 'Who? Weekly', '#GetSome with Gary Owen', 'Ya Neva Know: you know what I mean?

### Cosine Similarity + GloVe Embedding

In [18]:
from gensim.models import KeyedVectors

In [19]:
glove_model = KeyedVectors.load_word2vec_format("../word2vec/glove.6B.50d.txt.word2vec")

In [20]:
glove_mean_embedding_vectorizer = MeanEmbeddingVectorizer(glove_model)
glove_mean_embedded = glove_mean_embedding_vectorizer.fit_transform(podcasts_df['text'])
glove_cosine_sim = cosine_similarity(glove_mean_embedded)



In [21]:
for i in test_podcasts:
    print(get_recommendations(get_index_from_title(i), glove_cosine_sim))
    print()

['The Daily', 'Up First', 'Impeachment: A Daily Podcast', 'The Lawfare Podcast', 'Bag Man', 'Can He Do That?', 'CBS This Morning', 'Impeachment Inquiry: Updates from The Washington Post', 'All In with Chris Hayes', 'The Takeaway', 'The New Yorker: Politics and More']

['Up First', 'The Daily', 'The Lawfare Podcast', 'Impeachment: A Daily Podcast', 'Bag Man', 'What Next | Daily News and Analysis', 'Impeachment Inquiry: Updates from The Washington Post', 'The Situation Room with Wolf Blitzer', 'PBS NewsHour - Segments', 'All In with Chris Hayes', 'The Lead with Jake Tapper']

['VIEWS with David Dobrik and Jason Nash', 'Help! I Suck at Dating with Dean, Vanessa and Jared', 'Amy Schumer Presents: 3 Girls, 1 Keith', 'Dead Pilots Society', "So Bad It's Good with Ryan Bailey", 'The Ben and Ashley I Almost Famous Podcast', 'Out in the Wild', 'Your 2 Dads w/ Sean & Julian', 'Good Morning From Hell', 'The Bobby Bones Show', 'Scrubbing In with Becca Tilley & Tanya Rad']

['Impaulsive with Logan P

### Cosine Similarity + Word2Vec + Smooth Inverse Frequency

In [22]:
from scipy.sparse.linalg import svds
from sklearn.decomposition import TruncatedSVD

In [23]:
def remove_first_principal_component(X):
    svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
    svd.fit(X)
    pc = svd.components_
    XX = X - X.dot(pc.transpose()) * pc
    return XX

def smooth_inverse_frequency(sent, a=0.001, word2vec_model=w2v_model):
    word_counter = {}
    sentences = []
    total_count = 0
    no_of_sentences = 0
    
    for s in sent:
        for w in s:
            if w in word_counter:
                word_counter[w] = word_counter[w] + 1
            else:
                word_counter[w] = 1
        total_count = total_count + len(s)
        no_of_sentences = no_of_sentences + 1
    
    sents_emd = []
    for s in sent:
        sent_emd = []
        for word in s:
            if word in word2vec_model:
                emd = (a/(a + (word_counter[word]/total_count)))*word2vec_model[word]
                sent_emd.append(emd)
        sum_ = np.array(sent_emd).sum(axis=0)
        sentence_emd = sum_/float(no_of_sentences)
        sents_emd.append(sentence_emd)
        
    new_sents_emb = remove_first_principal_component(np.array(sents_emd))
    return new_sents_emb

In [24]:
sif_text_emd = smooth_inverse_frequency(text_list)
sif_cosine_sim = cosine_similarity(sif_text_emd)



In [25]:
for i in test_podcasts:
    print(get_recommendations(get_index_from_title(i), sif_cosine_sim))
    print()

['The Daily', 'Impeachment Inquiry: Updates from The Washington Post', 'The Drum Show', "Official Prime Minister's Questions (PMQs) Podcast", 'Criminal', 'Robb Wolf - The Paleo Solution Podcast - Paleo diet, nutrition, fitness, and health', 'Queens Podcast', 'Aquarium Co-Op Podcast', 'The Beauty Brains', 'Story Time', 'Plenary Session']

['Up First', 'The Paul Tripp Podcast', "The Daily 202's Big Idea", 'Stay Tuned with Preet', 'The General Hospital Podcast', 'Post Reports', 'Article II: Inside Impeachment', 'Impeachment Inquiry: Updates from The Washington Post', 'Intelligence Squared', 'WGRL NYC', 'The Drum Show']

['VIEWS with David Dobrik and Jason Nash', 'The Man With A Thousand Faces', 'Solid Joys Daily Devotional', 'The Axe Files with David Axelrod', "DJ Private Ryan's Podcast", 'Beach Too Sandy, Water Too Wet', 'Kid Friendly Joke Of The Day', 'AI: Hype vs. Reality', 'Just The Sip', 'Tumble Science Podcast for Kids', 'Binge Mode: Star Wars']

['Impaulsive with Logan Paul', 'Leve