In [35]:
import pandas as pd
import re

In [36]:
import spacy


In [37]:
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tokenize import MWETokenizer
from nltk.stem import WordNetLemmatizer

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [39]:
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import NMF, TruncatedSVD

In [40]:
from gensim import corpora, models, matutils

In [41]:
import numpy as np

## Cleaning podcast descriptions

In [42]:
podcast_names_full= pd.read_pickle('just_podcasts.pkl')

In [43]:
#350 podcasts
podcast_names_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Podcast_Name         350 non-null    object
 1   Podcast_ShowID       350 non-null    object
 2   Podcast_Description  350 non-null    object
dtypes: object(3)
memory usage: 8.3+ KB


In [44]:
podcast_names_full[podcast_names_full.Podcast_Name.duplicated()]

Unnamed: 0,Podcast_Name,Podcast_ShowID,Podcast_Description
45,Outer Limits Of Inner Truth,2urV9aIOdLTipAwJ4C3IPu,The Outer Limits of Inner Truth is a program a...
150,Shit We Don't Tell Mom,26sYAa0ZTK7pbCCOYNYY54,We have depression. Now what? Throw two depres...
200,Twenty Something Girl,1mt4xNvuz6BVMrD8H9EbpA,Twenty Something Girl is a lifestyle podcast t...
250,Hour of Truth & Power,0W7fw3JvjCHTomEUgx2z6U,Godcast of High Frequency Enlightenment with f...
300,Don’t Sleep on the Couch Podcast,0xIme1U7WnLvUPsDeocYoh,"Formed in 2001, the Don't Sleep on the Couch (..."
346,NAH Podcast,0muSoy4HndaTpELvVDu1iW,Hey Hey! My name is Han or Hannah. Whichever y...


In [45]:
podcast_names_df = podcast_names_full.drop_duplicates(subset=['Podcast_Name'])

In [46]:
podcast_names_df.reset_index(inplace=True)

### Preprocessing

In [47]:
nlp = spacy.load('en_core_web_sm')

In [48]:
import string
def clean_regex(series):
    # remove digits
    desc = series.apply(lambda x: re.sub('\d', ' ', x ))
    # remove \xa0 from string in Python: https://stackoverflow.com/questions/10993612/how-to-remove-xa0-from-string-in-python
    desc = desc.apply(lambda x: x.replace(u'\xa0', u''))
    #remove the | and > symbols
    desc = desc.apply(lambda x: re.sub('\|.+', ' ', x))
    desc = desc.apply(lambda x: re.sub('\>.+', ' ', x))
    #remove websites and info that comes after (seems like sponsorship)
    desc = desc.apply(lambda x: re.sub('http.+', ' ', x))
    desc = desc.apply(lambda x: re.sub('www.+', ' ', x))
    #add in space before capital letters if none (some are combined together): referred https://stackoverflow.com/questions/199059/a-pythonic-way-to-insert-a-space-before-capital-letters)
    desc = desc.apply(lambda x: re.sub("([A-Z])(?![A-Z])", r"\1", x))
    #lowercase
    desc=desc.apply(lambda x: x.lower())
    #remove punctuation
    desc = desc.apply(lambda x: re.sub(r'[^\w\s]', '', x))
    return desc

In [49]:
cleaned_pod = clean_regex(podcast_names_df['Podcast_Description'])

In [50]:
mwe_tokenizer = MWETokenizer([('personal', 'development'), ('mental', 'health'),('social', 'media'), ('mental', 'illness')])


In [51]:
tokenized = cleaned_pod.apply(lambda x: mwe_tokenizer.tokenize(word_tokenize(x)))

In [52]:
tokenized

0      [hi, my, name, is, logan, isfeld, i, am, years...
1      [being, black, in, has, its, own, challenges, ...
2      [the, aubrey, marcus, podcast, is, an, illumin...
3      [millions, of, eyes, watching, the, pressure, ...
4      [shrugged, collective, is, a, network, of, fit...
                             ...                        
339    [girls, kickin, up, the, country, is, an, aust...
340    [hello, and, welcome, to, happy, and, healthy,...
341    [ronald, e, bachman, fsa, maaa, chc, president...
342    [atkins, et, al, toward, the, integration, of,...
343    [my, name, is, mauricio, perez, vimukta, i, am...
Name: Podcast_Description, Length: 344, dtype: object

In [53]:
cleaned_pod_strings = tokenized.apply(lambda x: " ".join(x))

In [54]:
cleaned_pod_strings[10]

'the podcast for women of color who affirm their worth value mental_health and seek wholeness biweekly mental_health podcast hosted by davia roberts lpc licensed in wi as of october the affirm podcast has discontinued will no longer release episodes thank you for your support'

In [55]:
#helper function to add nouns a second time (upweighing nouns), given string 
def dup_nouns(string_words):
    tokens = pos_tag(word_tokenize(string_words))
    li = []
    for token in tokens:
        li.append(token[0])
        if (token[1] == 'NN') or (token[1]=='NNS'):
            li.append(token[0])
    return li

In [56]:
nouns_doubled = cleaned_pod_strings.apply(dup_nouns)

In [57]:
lemmatizer = WordNetLemmatizer()

In [58]:
#given list of words, lemmatize words
def lem(low):
    lemmed = [lemmatizer.lemmatize(word) for word in low]
    return lemmed

In [59]:
tokenized_lem = nouns_doubled.apply(lambda x: lem(x))

In [60]:
tokenized_lem

0      [hi, hi, my, name, name, is, logan, isfeld, is...
1      [being, black, in, ha, it, own, challenge, cha...
2      [the, aubrey, marcus, marcus, podcast, podcast...
3      [million, million, of, eye, eye, watching, the...
4      [shrugged, collective, collective, is, a, netw...
                             ...                        
339    [girl, girl, kickin, up, the, country, country...
340    [hello, hello, and, welcome, welcome, to, happ...
341    [ronald, ronald, e, bachman, fsa, fsa, maaa, m...
342    [atkins, atkins, et, al, toward, the, integrat...
343    [my, name, name, is, mauricio, perez, perez, v...
Name: Podcast_Description, Length: 344, dtype: object

In [127]:
from nltk.corpus import stopwords
default_stop = stopwords.words('english')
custom_stop = ["twitter", "instagram", "follow", "youtube", "spotify", "check", 'help', 'ha', 'episode', 'thing', "like", "one", "podcast", "also", "too", "much", "subscriber", "hi", "hello", 'paid', 'week']
#my full list of stop words
full_list = default_stop + custom_stop 

In [128]:
podcast_names_df['Cleaned_Desc'] = tokenized_lem

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  podcast_names_df['Cleaned_Desc'] = tokenized_lem


In [129]:
text = podcast_names_df['Cleaned_Desc'].apply(lambda x: " ".join(x))

### Vectorizer

In [130]:
#NMF with TF-IDF
tfidf_vec = TfidfVectorizer(stop_words=full_list, min_df=2, max_df=0.8, ngram_range=(1,2))

In [131]:
doc_term_tfidf = tfidf_vec.fit_transform(text)

In [132]:
dtm_tfidf = pd.DataFrame(doc_term_tfidf.toarray(), columns = tfidf_vec.get_feature_names_out())

dtm_tfidf

Unnamed: 0,abc,abc abc,ability,ability ability,able,absolutely,abu,abuse,abuse abuse,abuse mental_health,...,youre thinking,youre youre,youth,youth youth,youve,youve got,youve youve,yr,zu,zu zu
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.00000,0.000000,...,0.0,0.0,0.308588,0.154294,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.00000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.00000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.00000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.00000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.00000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
340,0.0,0.0,0.0,0.0,0.029688,0.0,0.0,0.00000,0.00000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
341,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.03894,0.01947,0.022566,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
342,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.00000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


### Topic Modeling 

In [196]:
nmf_act = NMF(10, init = 'nndsvda', max_iter=500)

In [197]:
nmf = nmf_act.fit(dtm_tfidf)

In [198]:
# Function to display the top n terms in each topic- sourced from Metis
def display_topics(model, feature_names, no_top_words, topic_names = None): 
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix + 1)
        else:
            print("\nTopic: ", topic_names[ix])
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
    print("\n")
    return model, feature_names, no_top_words

In [199]:
# output contents for each topic - TFIDF Vectorizer with NMF

output = display_topics(nmf, tfidf_vec.get_feature_names_out(), 10)
output;


Topic  1
story, guest, story story, show, award, stigma, interview, professional, work, host

Topic  2
relationship, sex, relationship relationship, life, friendship, friend, welcome, sex sex, career, host

Topic  3
season, season season, topic, pain, health, science, school, mental_illness, identity, series

Topic  4
health, fitness, health health, wellness, fitness fitness, show, wellness wellness, nutrition, business, dr

Topic  5
woman, woman woman, everything, faith, career, body, talk, host, marriage, let

Topic  6
year, conversation, issue, student, school, education, college, year year, experience, mental_health mental_health

Topic  7
life, life life, people, share, experience, way, journey, people people, year, challenge

Topic  8
loss, weight, journey, loss loss, surgery, weight loss, pound, tip, journey journey, bypass

Topic  9
support, art, support support, culture, art art, people, youd, become, please, art culture

Topic  10
im, therapy, im im, therapist, therapy thera

In [200]:
topics = ['story', 'relationship', 'season', 'fitness', 'women', 'student','life','weight loss', 'art', 'therapy']

In [201]:
podcast_doc_topic = nmf.transform(dtm_tfidf)

In [202]:
podcast_doc_topic

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.05869527],
       [0.00387598, 0.02641184, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00817006, 0.15756237, 0.03108865, ..., 0.00366504, 0.        ,
        0.01759739],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.00754332,
        0.        ],
       [0.        , 0.        , 0.00246465, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.000387  , ..., 0.        , 0.07483999,
        0.1006183 ]])

In [203]:
#to dataframe
podcast_doc_topic_df = pd.DataFrame(podcast_doc_topic.round(3), columns = topics)

In [204]:
podcast_doc_topic_df.head()

Unnamed: 0,story,relationship,season,fitness,women,student,life,weight loss,art,therapy
0,0.0,0.0,0.0,0.0,0.0,0.121,0.012,0.0,0.0,0.059
1,0.004,0.026,0.0,0.0,0.011,0.137,0.009,0.0,0.0,0.0
2,0.008,0.158,0.031,0.16,0.0,0.022,0.0,0.004,0.0,0.018
3,0.033,0.003,0.007,0.0,0.0,0.04,0.229,0.005,0.0,0.0
4,0.013,0.0,0.0,0.256,0.0,0.0,0.0,0.0,0.0,0.0


Testing out the recommendation:

In [207]:
dist = pairwise_distances(np.array(podcast_doc_topic[314]).reshape(1,-1), podcast_doc_topic, metric = 'cosine')

In [208]:
dist.argsort()[0]

array([314, 274,   7, 134,   4, 302,  59, 277,  16, 221, 120, 133, 238,
       136, 256,  95,  56, 106, 206, 324,  50, 186, 160, 289, 263, 192,
        23, 327,  20,  87, 149, 194, 262, 202, 261, 340, 258,  40, 146,
        78, 268,  14, 164, 310,   2, 237, 112, 104,  57,  84, 284, 286,
       247, 113, 177, 176,  64,  18, 341, 307, 173, 306, 126,  61,   5,
       152, 212, 102, 214, 183, 257, 224, 236, 326, 255, 196, 260, 122,
       272, 282, 251, 220,  13,  12, 222,  19, 158,  70,  86,  33, 270,
       138,  90, 217, 156, 157, 182, 229, 128, 174, 254, 301, 142, 169,
       115,  74, 339, 313,  77,  91,   6, 322, 304, 153,  24, 216,  34,
        72, 292, 114,  46, 205,  93, 279,  60, 127, 249, 166,  25,  97,
       259, 253, 243, 147, 116, 125, 154,  94, 159, 148,  51, 319,  65,
       178, 250, 184, 335, 225,  49, 311, 330, 181,  99,  11,  63,  35,
       110,  38, 130, 323, 240, 150, 198, 232,  69, 187, 231,  53, 223,
       234,  39, 185,  73, 318,  98,  83,  32, 213, 271, 190, 20

In [209]:
dist.argsort()[0][1:10]

array([274,   7, 134,   4, 302,  59, 277,  16, 221])

In [210]:
podcast_names_df.Podcast_Description[314]

'Glen Alex is Author of Living In Total Health, the 2021 Indie Book Award Winner for Health/Wellness and Finalist in the Mind, Body, Spirit category, a Clinical Social Worker, and the Wellth Coach.The Glen Alex Show is all about health! Each episode focuses on a specific area of your health, including physical, mental, emotional, and spiritual health. We focus on the whole person because all of you matter. Glen Alex and her guests provide valuable information and insights to help you be joyful, connected, confident, and complete--our mission!About Glen AlexGlen Alex, author of Living in Total Health, has a mission to help people be more joyful, connected, confident and complete. This is a life experience she refers to as Wellth: Health + other riches in life. Glen is a Licensed Clinical Social Worker who delivers counseling and coaching services and is the guiding spirit behind The Glen Alex Show, GlenAlex.com, Healthy Boundaries for Overwhelmed Women online course, and other services 

In [211]:
podcast_names_df.Podcast_Description[274]

'Joytme Fitness presents Be Extraordinary Podcast. Tune in to hear conversations with leaders and influencers that will inspire you to reach your full potential. The show has two HOST "Joetta" 1988, 1992, 1996, 2000 Olympian, Motivator, Author and Leading Authority on Health/Wellness and Achievement. " Tyrone" Retired Firefighter, Sports/Mental Coach and Purpose Master Motivator.'

In [212]:
podcast_names_df.Podcast_Description[7]

'The Your Mind Matters podcast is a weekly show where we chat all things mental wellness, lifestyle, and I share my stories while answering listener questions. Hosted by Natalie Bally, a health and fitness content creator who has been sharing her journey since 2018 on both Instagram and YouTube. My favorite thing to do is talk about all the random *and sometimes insightful* thoughts in my head so join me weekly to hear what’s going on up there 🙃'

In [79]:
#pickling both the unique podcasts dataframe and doc_topic dataframe 
podcast_names_df.to_pickle('unique_podcastnames.pkl')

In [213]:
podcast_doc_topic_df.to_pickle('podcast_doc_topic.pkl')

## Recommender

In [214]:
ep_doc_topic = pd.read_pickle('doc_topic2.pkl')

In [215]:
episode_df = pd.read_pickle('mh_podcasts_unique.pkl')

In [244]:
podcast_names_df

Unnamed: 0,index,Podcast_Name,Podcast_ShowID,Podcast_Description,Cleaned_Desc
0,0,(2020) Mental Health Explained | Created By Yo...,4pwPCZriBVbcLcufvtchsP,"Hi, my name is Logan Isfeld, I am 17 years old...","[hi, hi, my, name, name, is, logan, isfeld, is..."
1,1,Being African American in 2021 and dealing wit...,4eoXzwruqyu2yAh4jYA7EM,Being black in 2021 has its own challenges and...,"[being, black, in, ha, it, own, challenge, cha..."
2,2,Aubrey Marcus Podcast,0n7j2qseg6fu0Fj2dvzXVi,The Aubrey Marcus Podcast is an illuminating c...,"[the, aubrey, marcus, marcus, podcast, podcast..."
3,3,Unfazed and Unbothered with Tasia and Camo,6MZJi1fkxSbqjfQiSqC5OL,"Millions of eyes watching, the pressure, the n...","[million, million, of, eye, eye, watching, the..."
4,4,Barbell Shrugged,6MFeb0x9bw9wjrphztLSn9,"Shrugged Collective is a network of fitness, h...","[shrugged, collective, collective, is, a, netw..."
...,...,...,...,...,...
339,344,Welcome to GKUTC,4kvLOHbayUXH6QZBPQ2OPV,Girls Kickin Up The Country is an Australian A...,"[girl, girl, kickin, up, the, country, country..."
340,345,Happy and Healthy Mind with Dr. Rozina,5XwuvVKnlVtKNBluBl0ITY,Hello and welcome to Happy and Healthy mind wi...,"[hello, hello, and, welcome, welcome, to, happ..."
341,347,Healthcare Insight,5GO3DnQpENyNVJymwG8BjU,"Ronald E. Bachman FSA, MAAA, CHC President & ...","[ronald, ronald, e, bachman, fsa, fsa, maaa, m..."
342,348,Mental Health Education in High Schools,2Ow2pcCGA3rcRDVxSjhI6C,Atkins et al. (2010). Toward the integration o...,"[atkins, atkins, et, al, toward, the, integrat..."


In [243]:
podcast_names_df[podcast_names_df["Podcast_Name"] == 'Healthcare Insight']

Unnamed: 0,index,Podcast_Name,Podcast_ShowID,Podcast_Description,Cleaned_Desc
341,347,Healthcare Insight,5GO3DnQpENyNVJymwG8BjU,"Ronald E. Bachman FSA, MAAA, CHC President & ...","[ronald, ronald, e, bachman, fsa, fsa, maaa, m..."


In [262]:
i = podcast_names_df[podcast_names_df["Podcast_Name"] == 'Barbell Shrugged'].index[0]

In [263]:
i

4

In [219]:
def recommend_podcast(podcast_name, pod_df, pod_doc_topic):
    podcast_row = pod_df[pod_df['Podcast_Name'] == podcast_name]
    podcast_index = podcast_row.index[0]
    #print('Given: ', podcast_name)
    #print('Description: ', pod_df.iloc[podcast_index]['Podcast_Description'])
    dist = pairwise_distances(np.array(pod_doc_topic.iloc[podcast_index]).reshape(1,-1), pod_doc_topic, metric = 'cosine')
    rec_pod_index = dist.argsort()[0][1]
    rec_pod_name = pod_df.iloc[rec_pod_index]['Podcast_Name']
    #print('\nRecommended Podcast: ', rec_pod_name)
    #print('Description: ', pod_df.iloc[rec_pod_index]['Podcast_Description'])
    return rec_pod_name

In [220]:
recommend_podcast('Barbell Shrugged', podcast_names_df, podcast_doc_topic_df)

'Be Extraordinary  Joetta Clark Olympian/Motivator & Tyrone Retired Firefighter/ Mental Coach '

In [227]:
def recommend_episode(ep_index, ep_df, ep_doc_topic, shortened_doc_topic=None):
    if shortened_doc_topic is not None:
        dist = pairwise_distances(np.array(ep_doc_topic.iloc[ep_index]).reshape(1,-1), shortened_doc_topic, metric = 'cosine')
        rec_ep_index = dist.argsort()[0][1]
        rec_ep_name = ep_df.reset_index().iloc[rec_ep_index]['Ep_name']
    else: 
        dist = pairwise_distances(np.array(ep_doc_topic.iloc[ep_index]).reshape(1,-1), ep_doc_topic, metric = 'cosine')
        rec_ep_index = dist.argsort()[0][1]
        rec_ep_name = ep_df.iloc[rec_ep_index]['Ep_name']
    return rec_ep_index, rec_ep_name 

In [228]:
recommend_episode(250, episode_df, ep_doc_topic)

(14560, 'Coming To Terms')

In [450]:
# episode_df[episode_df.Ep_name.duplicated()]

In [609]:
episode_df.iloc[16820]['Ep_desc']

'Enjoy our conversation with my guests today while we talk about motivation, in higher education and higher Fitness Levels.\xa0 Both of my guests are inspiring-wives, mothers, entrepreneurs and educators. Lina Mendez is an Associate Director of ChicanX & Latinx Retention Initiatives at UC Davis. She holds a BA degree in education from New Mexico State, Masters from Harvard, and PhD from UC Davis. Find her @linarmendez for Twitter. \xa0Nicole Sims is an International Federation of BodyBuilding and fitness Professional. She received her BA in Communications from Washington State. She has her own business as a Health and Accountability Coach in the Atlanta, Georgia area. Find her on Instagram @nicolesimswellness. For suggestions: website: www.fuertefitness.com\xa0 mail us at: fuertefitness@gmail.com Facebook page: https://www.facebook.com/fuertefitness/Instagram: @fuertefitness and @funkiecoldmedina'

In [229]:
def rec_ep_diff_podcast(ep_index, pod_index, ep_df, ep_doc_topic, pod_doc_topic):
    pod_dist = pairwise_distances(np.array(pod_doc_topic.iloc[pod_index]).reshape(1,-1), pod_doc_topic, metric = 'cosine')
    ep_dist =  pairwise_distances(np.array(ep_doc_topic.iloc[ep_index]).reshape(1,-1), ep_doc_topic, metric = 'cosine')
    #making dataframe to organize summed cosine similarity
    start_df = ep_df['Podcast_Name'].to_frame()
    start_df['Pod_Index'] = start_df['Podcast_Name'].map(pod_dict).str[0]
    ep_cos_df = pd.DataFrame(ep_dist.reshape(-1,1), columns=['Cosine Similarity for Episodes'])
    full = start_df.join(ep_cos_df)
    full['Cosine Similarity for Podcasts']= full['Pod_Index'].apply(lambda x: pod_dist[0][x])
    full['Summed Similarity'] = full["Cosine Similarity for Episodes"] + full["Cosine Similarity for Podcasts"]
    #find row for smallest summed similarity, excluding rows with same podcast name
    rec_ep_index = full[full['Pod_Index']!= pod_index].sort_values(by=['Summed Similarity']).index[0]
    return rec_ep_index
    

In [224]:
#for Different podcast: first finds podcast rec then closest cosine similarity from there
def recommender(ep_index, ep_df, pod_df, ep_doc_topic, pod_doc_topic, podcast_choice):
    episode_name = ep_df.iloc[ep_index]['Ep_name']
    print('Episode Name: ', episode_name)
    print('Episode Date: ', ep_df.iloc[ep_index]['Ep_date'])
    #ep_row = ep_df[ep_df['Ep_Name'] == episode_name]
    #ep_index = ep_row.index[0]
    podcast_name = ep_df.iloc[ep_index]['Podcast_Name']
    print('From podcast: ', podcast_name)
    print('Description: ', ep_df.iloc[ep_index]['Ep_desc'])
    if podcast_choice == 'Different':
        rec_pod_name = recommend_podcast(podcast_name, pod_df, pod_doc_topic)
        print(rec_pod_name)
        limited_indices = ep_df.index[ep_df['Podcast_Name'] == rec_pod_name].tolist()
        limited_ep_df = ep_df.iloc[limited_indices]
        limited_doc_topic = ep_doc_topic.iloc[limited_indices]
        rec_ep_index, _ = recommend_episode(ep_index, limited_ep_df, ep_doc_topic, shortened_doc_topic=limited_doc_topic)
        #updating these because returned indices are from shortened array
        rec_ep_name = limited_ep_df.reset_index().iloc[rec_ep_index]['Ep_name']
        rec_ep_index = limited_ep_df.reset_index().iloc[rec_ep_index]['level_0']
        #print(rec_ep_index, rec_ep_name)
    if podcast_choice == 'Same':
        limited_indices = ep_df.index[ep_df['Podcast_Name'] == podcast_name].tolist()
        limited_ep_df = ep_df.iloc[limited_indices]
        limited_doc_topic = ep_doc_topic.iloc[limited_indices]
        rec_ep_index, _ = recommend_episode(ep_index, limited_ep_df, ep_doc_topic, shortened_doc_topic= limited_doc_topic)
        #updating these because returned indices are from shortened array
        rec_ep_name = limited_ep_df.reset_index().iloc[rec_ep_index]['Ep_name']
        rec_ep_index = limited_ep_df.reset_index().iloc[rec_ep_index]['level_0']
    if podcast_choice == 'Any':
        rec_ep_index, rec_ep_name = recommend_episode(ep_index, ep_df, ep_doc_topic)
    print('Recommended episode: ', rec_ep_name)
    rec_ep_info = ep_df.iloc[rec_ep_index]
    print('From podcast: ', rec_ep_info['Podcast_Name'])
    print('Date: ', rec_ep_info['Ep_date'])
    print('Description: ', rec_ep_info['Ep_desc'])
        

In [233]:
#dictionary of podcast names to indices
pod_dict = podcast_names_df.groupby('Podcast_Name').indices

In [230]:
##for Different podcast: this does not filter first, but takes everything (both doc-topics) as whole 

def recommender2(ep_index, ep_df, pod_df, ep_doc_topic, pod_doc_topic, podcast_choice):
    episode_name = ep_df.iloc[ep_index]['Ep_name']
    print('Episode Name: ', episode_name)
    print('Episode Date: ', ep_df.iloc[ep_index]['Ep_date'])
    #ep_row = ep_df[ep_df['Ep_Name'] == episode_name]
    #ep_index = ep_row.index[0]
    podcast_name = ep_df.iloc[ep_index]['Podcast_Name']
    print('From podcast: ', podcast_name)
    print('Description: ', ep_df.iloc[ep_index]['Ep_desc'])
    if podcast_choice == 'Different':
        podcast_index = pod_df[pod_df["Podcast_Name"]== podcast_name].index[0]
        rec_ep_index = rec_ep_diff_podcast(ep_index, podcast_index, ep_df, ep_doc_topic, pod_doc_topic)
        rec_ep_name = ep_df.iloc[rec_ep_index]['Ep_name']
    if podcast_choice == 'Same':
        limited_indices = ep_df.index[ep_df['Podcast_Name'] == podcast_name].tolist()
        limited_ep_df = ep_df.iloc[limited_indices]
        limited_doc_topic = ep_doc_topic.iloc[limited_indices]
        rec_ep_index, _ = recommend_episode(ep_index, limited_ep_df, ep_doc_topic, shortened_doc_topic= limited_doc_topic)
        #updating these because returned indices are from shortened array
        rec_ep_name = limited_ep_df.reset_index().iloc[rec_ep_index]['Ep_name']
        rec_ep_index = limited_ep_df.reset_index().iloc[rec_ep_index]['level_0']
    if podcast_choice == 'Any':
        rec_ep_index, rec_ep_name = recommend_episode(ep_index, ep_df, ep_doc_topic)
    print('Recommended episode: ', rec_ep_name)
    rec_ep_info = ep_df.iloc[rec_ep_index]
    print('From podcast: ', rec_ep_info['Podcast_Name'])
    print('Date: ', rec_ep_info['Ep_date'])
    print('Description: ', rec_ep_info['Ep_desc'])
        

In [234]:
recommender2(500, episode_df, podcast_names_df, ep_doc_topic, podcast_doc_topic_df, podcast_choice='Different')

Episode Name:  Strength PHD: The Simplified Equations of Strength and Conditioning w/ Anders Varner, Doug Larson, and Travis Mash  - Barbell Shrugged #526
Episode Date:  2020-11-30
From podcast:  Barbell Shrugged
Description:  Buy Strength PHD and help support weightlifters at Lenoir-Rhyne University   In this Episode of Barbell Shrugged:  Why every coach needs to understand the fundamental equations of strength How to incorporate these equations into a training program. How to master energy systems  What is impulse and how to improve it Nutrition for hypertrophy  Buy Strength PHD and help support weightlifters at Lenoir-Rhyne University   Anders Varner on Instagram Doug Larson on Instagram Coach Travis Mash on Instagram ———————————————— Training Programs to Build Muscle: https://bit.ly/34zcGVw   Nutrition Programs to Lose Fat and Build Muscle: https://bit.ly/3eiW8FF   Nutrition and Training Bundles to Save 67%: https://bit.ly/2yaxQxa   Please Support Our Sponsors   PowerDot - Save 20%

#### Supplemental

In [586]:
p = episode_df['Podcast_Name'].to_frame()

In [588]:
p['Pod_Index'] = p['Podcast_Name'].map(pod_dict).str[0]

In [590]:
df = pd.DataFrame(ep_dist.reshape(-1,1), columns=['Cosine Similarity for Episodes'])

In [592]:
df2 = p.join(df)

In [596]:
df2['Cosine Similarity for Podcasts']= df2['Pod_Index'].apply(lambda x: pod_dist[0][x] )

In [601]:
df2['Summed Similarity'] = df2["Cosine Similarity for Episodes"] + df2["Cosine Similarity for Podcasts"]

In [617]:
df2[df2['Pod_Index']!= 4].sort_values(by=['Summed Similarity'])

Unnamed: 0,Podcast_Name,Pod_Index,Cosine Similarity for Episodes,Cosine Similarity for Podcasts,Summed Similarity
16820,Being Fuerte. It’s Time to Speak!,302,0.137119,0.019527,0.156645
10286,Non Gendered Fitness,120,0.153246,0.010033,0.163280
10260,Non Gendered Fitness,120,0.170357,0.010033,0.180390
9208,The Remote CEO Show,95,0.034548,0.175606,0.210154
10298,Non Gendered Fitness,120,0.204449,0.010033,0.214482
...,...,...,...,...,...
8326,Moments of Clarity with Tiffany,86,1.549724,0.781614,2.331338
8407,Moments of Clarity with Tiffany,86,1.549724,0.781614,2.331338
8626,Moments of Clarity with Tiffany,86,1.549724,0.781614,2.331338
8659,Moments of Clarity with Tiffany,86,1.549724,0.781614,2.331338
