In [15]:
import pickle
import pandas as pd
import numpy as np

In [16]:
import re
import string

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [18]:
import nltk
#nltk.download()

In [19]:

from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tokenize import MWETokenizer
from nltk.corpus import conll2000
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.chunk import ne_chunk

In [20]:
from sklearn.decomposition import NMF, TruncatedSVD

In [21]:
from sklearn.metrics import pairwise_distances

### Overview of Data

In [22]:
mh_podcasts = pd.read_pickle('mh_podcasts.pkl')

In [23]:
mh_podcasts.head()

Unnamed: 0,Podcast_Name,Ep_id,Ep_name,Ep_date,Ep_desc
0,(2020) Mental Health Explained | Created By Yo...,10JraOKEu4gb2dKQEwjhmm,Depression and Tics During Quarantine,2020-12-16,This episode helps explain the effects of quar...
1,Being African American in 2021 and dealing wit...,4Vs1ajXhg5t53zHNDpM3wu,Chipping away at the mental health stigma,2021-10-11,The Black community has made enormous contribu...
2,Being African American in 2021 and dealing wit...,6jFW6wq6Pafs0OLAlHVNRh,Being black in America in 2021,2021-10-08,With love for seven addressing mental health i...
3,Being African American in 2021 and dealing wit...,4F5RugIvvmb8uI5fDqPmhz,Surviving a Narcissistic breakup : The Fear an...,2020-12-12,Moving on and healing from an narcissistic -...
4,Being African American in 2021 and dealing wit...,4eEe5dXg47re6BjpeyZdPx,Love and mental health 2020,2020-12-09,"Love - relationship, mental health and parenti..."


In [24]:
mh_podcasts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20745 entries, 0 to 20744
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Podcast_Name  20745 non-null  object
 1   Ep_id         20745 non-null  object
 2   Ep_name       20745 non-null  object
 3   Ep_date       20745 non-null  object
 4   Ep_desc       20745 non-null  object
dtypes: object(5)
memory usage: 810.5+ KB


In [25]:
#dropping duplicate episode descriptions
mh_podcasts_unique = mh_podcasts.drop_duplicates(subset=['Ep_desc'])

In [None]:
mh_podcasts_unique.info()

In [None]:
podcast_names_df = pd.read_pickle('just_podcasts.pkl')

In [None]:
podcast_names_df.info()

In [None]:
podcast_names_df

## Corpus

In [26]:
ep_corpus = mh_podcasts_unique.Ep_desc.tolist()

In [27]:
len(ep_corpus)

18115

In [28]:
terms_length = sum([len(d.split(' ')) for d in ep_corpus])

In [29]:
terms_length

2616842

In [30]:
ep_corpus[:10]

['This episode helps explain the effects of quarantine on depression and tic disorders. ',
 'The Black community has made enormous contributions to the ongoing fight for social, racial, and economic justice.   Despite these efforts, true social justice among the Black community will remain incomplete until mental health disparities among this group are addressed.   ---   This episode is sponsored by  · Anchor: The easiest way to make a podcast.  https://anchor.fm/app',
 'With love for seven addressing mental health in the black community love marriage and emotional health',
 'Moving on and healing from an narcissistic   ---   This episode is sponsored by  · Anchor: The easiest way to make a podcast.  https://anchor.fm/app',
 'Love - relationship, mental health and parenting all during a pandemic   ---   This episode is sponsored by  · Anchor: The easiest way to make a podcast.  https://anchor.fm/app',
 'What is the operating system that defines truly successful people? In interviewing 

### Cleaning

In [63]:
import string
def clean_regex(series):
    # remove digits
    desc = series.apply(lambda x: re.sub('\d', ' ', x ))
    # remove \xa0 from string in Python: https://stackoverflow.com/questions/10993612/how-to-remove-xa0-from-string-in-python
    desc = desc.apply(lambda x: x.replace(u'\xa0', u''))
    #remove the | and > symbols
    desc = desc.apply(lambda x: re.sub('\|.+', ' ', x))
    desc = desc.apply(lambda x: re.sub('\>.+', ' ', x))
    #remove websites and info that comes after (seems like sponsorship)
    desc = desc.apply(lambda x: re.sub('http.+', ' ', x))
    desc = desc.apply(lambda x: re.sub('www.+', ' ', x))
    #add in space before capital letters if none (some are combined together): referred https://stackoverflow.com/questions/199059/a-pythonic-way-to-insert-a-space-before-capital-letters)
    desc = desc.apply(lambda x: re.sub("([A-Z])(?![A-Z])", r"\1", x))
    #remove punctuation
    desc = desc.apply(lambda x: re.sub(r'[^\w\s]', '', x))
    #remove non-ASCII characters: https://stackoverflow.com/questions/20889996/how-do-i-remove-all-non-ascii-characters-with-regex-and-notepad
    desc = desc.apply(lambda x: re.sub('[^\x00-\x7F]+', '', x))
    return desc

In [64]:
cleaned = clean_regex(mh_podcasts_unique.Ep_desc)

In [None]:
cleaned[20740]

In [65]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [66]:
cleaned

0        This episode helps explain the effects of quar...
1        The Black community has made enormous contribu...
2        With love for seven addressing mental health i...
3        Moving on and healing from an narcissistic    ...
4        Love  relationship mental health and parenting...
                               ...                        
20740    By PR Sarkar founder of Ananda MargaDiscourse ...
20741    By PR Sarkar founder of Ananda MargaDiscourse ...
20742    By PR Sarkar founder of Ananda MargaPublished ...
20743    Discourse given by Prabhat Ranjan Sarkar onJan...
20744    Discourse given by Prabhat Ranjan Sarkar onNov...
Name: Ep_desc, Length: 18115, dtype: object

In [67]:
#use spacy to locate proper nouns and turn them into a list of tuples (to later pass in as compound words)
def find_compound(series):
    named_entities_set = set()
    single_names = set() #also going to remove single names (just first/last name) later
    doc = list(nlp.pipe(series))
    for sent in doc:
        for ent in sent.ents:
            split = ent.text.split()
            if len(split) > 1:
                named_entities_set.add(tuple(split))
            else:
                if ent.label_ == 'PERSON':
                    single_names.add(ent.text)
    return named_entities_set, single_names

In [68]:
ne_set, single_names = find_compound(cleaned)

In [69]:
ne_list = list(ne_set)

In [70]:
len(ne_list)

26687

In [38]:
single_names #downside of removing these is that there are some celebrities/artists (guests on podcasts?) with single name: Lauv, Shakespeare, LGBTQ

{'Eventbrite  ',
 'Wearing',
 'Kettlebell',
 'Ástmar',
 'Goals',
 'Grayson',
 'Talk',
 'Nighthawk',
 'Recipes',
 'Bender',
 'Nirvana',
 'Gulamhusein',
 'Josie',
 'Bowie',
 'Atheer',
 'Clutter',
 'Episode     ',
 'Malcolm',
 'Pinoy',
 'Breakthrough',
 'Alexs',
 'Albuquerque',
 'PYCC',
 'Jena',
 'Marty  ',
 'Devil',
 'Joe  ',
 'Gilbert',
 'Bags',
 'Gospels',
 'Christine',
 'Phentermine',
 'Ikigai',
 'NC',
 'Easy',
 'Mairéad',
 'Bit',
 'Rhino',
 'Sopel',
 'Maggie',
 'Grind',
 'Casey',
 'Sveta',
 'Bharatanatyam',
 'Autumns',
 'Grace',
 'Kerry',
 'Burlesque',
 'Joey',
 'Derz',
 'Gavin',
 'Bob',
 'Sahin',
 'Redefine',
 'Mario',
 'Trey',
 'Practitioner',
 'Ivory',
 'Libby',
 'Horsburgh',
 'Dick',
 'Pentacles       ',
 'Everybodys',
 'Greg',
 'NOW',
 'Marilee',
 'Omar',
 'Watch',
 'Enabling',
 'Whar',
 'Paleo',
 'Jackies',
 'Lekan',
 'Tonglen',
 'Osman',
 'McAdams',
 'Zaidy',
 'bush',
 'Mariostarted',
 'Kelly',
 'Susie',
 'Hejmanowski',
 'Cory',
 'Corinna',
 'Anika',
 'Kaden',
 'Bruins',
 'Ava

In [71]:
#to add into stop words later
single_names_list = list(single_names)

In [72]:
#selectively removing some from single_names_list that I saw don't belong (want to keep)
single_names_list.remove('Shakespeare')
single_names_list.remove('Lauv')
single_names_list.remove('LGBTQ')
single_names_list.remove('Bisexual')
single_names_list.remove('Covid')

In [73]:
#saving single_names list into pickle as backup
open_file = open('singlename_list.pkl', "wb")
pickle.dump(single_names_list, open_file)
open_file.close()

In [74]:
#saving compound names list into pickle as backup
open_file = open('ne_list.pkl', "wb")
pickle.dump(ne_list, open_file)
open_file.close()

In [75]:
mwe_tokenizer = MWETokenizer(ne_list)

In [76]:
def make_tokens(single_desc, tokenizer):
    mwe_tokens = tokenizer.tokenize(word_tokenize(single_desc))
    return mwe_tokens

In [77]:
tokenized_sents = cleaned.apply(lambda x: make_tokens(x, mwe_tokenizer))

In [None]:
tokenized_sents

In [78]:
#join back to string
sents = tokenized_sents.apply(lambda x: " ".join(x))

In [None]:
sents

In [79]:
lowercased = sents.apply(lambda x: x.lower())

In [None]:
lowercased[1000]

In [None]:
#helper function to add nouns a second time (upweighing nouns), given string 
def dup_nouns(string_words):
    tokens = pos_tag(word_tokenize(string_words))
    li = []
    for token in tokens:
        li.append(token[0])
        if (token[1] == 'NN') or (token[1]=='NNS'):
            li.append(token[0])
    return li

In [None]:
nouns_doubled = lowercased.apply(dup_nouns)

In [None]:
nouns_doubled

In [80]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer 
 

In [81]:
lemmatizer = WordNetLemmatizer()
pstemmer = PorterStemmer()
lstemmer = LancasterStemmer()
sstemmer = SnowballStemmer("english")

In [155]:
#will go with lemmatizer first (more conservative approach)
#given a list of words (each item in our 'cleaned' list), lemmatize each word 
def lem(words):
    new_list=[]
    for word in words:
        new_list.append(lemmatizer.lemmatize(word))
    return new_list

In [None]:
lem(nouns_doubled[23])

In [156]:
#if not doubling nouns:
def lem2(words):
    word_list = word_tokenize(words)
    new_list=[]
    for word in word_list:
        if word not in single_names_list:
            new_list.append(lemmatizer.lemmatize(word))
    return new_list

In [157]:
#full_cleaned = nouns_doubled.map(lem)
full_cleaned = lowercased.apply(lem2)

In [158]:
mh_podcasts_unique['Desc_Processed'] = full_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mh_podcasts_unique['Desc_Processed'] = full_cleaned


In [239]:
mh_podcasts_unique.reset_index(inplace=True)

In [241]:
#saving this version into pickle
mh_podcasts_unique.to_pickle('mh_podcasts_unique.pkl')

In [159]:
mh_podcasts_unique['Ep_desc'][1]

'The Black community has made enormous contributions to the ongoing fight for social, racial, and economic justice.   Despite these efforts, true social justice among the Black community will remain incomplete until mental health disparities among this group are addressed.   ---   This episode is sponsored by  · Anchor: The easiest way to make a podcast.  https://anchor.fm/app'

In [160]:
mh_podcasts_unique['Desc_Processed'][1]

['the',
 'black',
 'community',
 'ha',
 'made',
 'enormous',
 'contribution',
 'to',
 'the',
 'ongoing',
 'fight',
 'for',
 'social',
 'racial',
 'and',
 'economic',
 'justice',
 'despite',
 'these',
 'effort',
 'true',
 'social',
 'justice',
 'among',
 'the',
 'black',
 'community',
 'will',
 'remain',
 'incomplete',
 'until',
 'mental',
 'health',
 'disparity',
 'among',
 'this',
 'group',
 'are',
 'addressed',
 'this',
 'episode',
 'is',
 'sponsored',
 'by',
 'anchor',
 'the',
 'easiest',
 'way',
 'to',
 'make',
 'a',
 'podcast']

# Vectorizer

In [161]:
from nltk.corpus import stopwords
default_stop = stopwords.words('english')
custom_stop = ["twitter", "instagram", "follow", "youtube", "spotify", "check", 'help', 'ha', 'episode', 'thing', "like", "one", "podcast", "also", 'facebook']
single_names = [name.lower() for name in single_names_list]
#my full list of stop words
full_list = default_stop + custom_stop + single_names

In [52]:
#just checking words in original stop list
'by' in default_stop

True

In [176]:
corpus = mh_podcasts_unique['Desc_Processed'].apply(lambda x: " ".join(x))

In [177]:
cv = CountVectorizer(stop_words=full_list, min_df=3, max_df=0.8, ngram_range=(1,2))

In [178]:
doc_term = cv.fit_transform(corpus)



In [179]:
dtm = pd.DataFrame(doc_term.toarray(), columns = cv.get_feature_names_out())
### OUTPUT THE CONTENTS OF THE DATAFRAME
dtm

Unnamed: 0,______,______ web,_________,_________ the_wild_soul,__________________________________,__________________________________ please,___________________________________,_brilliant_thoughts,_brilliant_thoughts information,_cry,...,zodiac,zodiac sign,zombie,zone,zoning,zoo,zoom,zoom call,zoom session,zoom workout
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18110,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18111,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18112,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18113,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [180]:
top_words_dict = {k: v for k, v in sorted(cv.vocabulary_.items(), key=lambda x: x[1], reverse=True)}

In [181]:
top_words_dict

{'zoom workout': 48242,
 'zoom session': 48241,
 'zoom call': 48240,
 'zoom': 48239,
 'zoo': 48238,
 'zoning': 48237,
 'zone': 48236,
 'zombie': 48235,
 'zodiac sign': 48234,
 'zodiac': 48233,
 'zip code': 48232,
 'zip': 48231,
 'zion': 48230,
 'zero': 48229,
 'zencastrcompricing try': 48228,
 'zencastrcompricing': 48227,
 'zencastr_visit zencastrcompricing': 48226,
 'zencastr_visit': 48225,
 'zen_oxherding image': 48224,
 'zen_oxherding': 48223,
 'zen tradition': 48222,
 'zen story': 48221,
 'zen master': 48220,
 'zen koan': 48219,
 'zen': 48218,
 'zeitgeist': 48217,
 'zay': 48216,
 'zach_bush': 48215,
 'yup': 48214,
 'ysa service': 48213,
 'ysa bishop': 48212,
 'ysa assignment': 48211,
 'ysa': 48210,
 'yr': 48209,
 'youyou': 48208,
 'youwhy': 48207,
 'youwhat': 48206,
 'youve working': 48205,
 'youve wondered': 48204,
 'youve wanted': 48203,
 'youve waiting': 48202,
 'youve told': 48201,
 'youve struggled': 48200,
 'youve seen': 48199,
 'youve put': 48198,
 'youve probably': 48197,
 

In [182]:
#TF-IDF
tfidf_vec = TfidfVectorizer(stop_words=full_list, min_df=3, max_df=0.8)

In [183]:
doc_term_tfidf = tfidf_vec.fit_transform(corpus)



In [184]:
dtm_tfidf = pd.DataFrame(doc_term_tfidf.toarray(), columns = tfidf_vec.get_feature_names_out())

dtm_tfidf

Unnamed: 0,______,_________,__________________________________,___________________________________,_brilliant_thoughts,_cry,_nikeshmurali,a_bad_day,a_big_week,a_burn_boot_camp,...,zencastrcompricing,zero,zion,zip,zodiac,zombie,zone,zoning,zoo,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Topic Modeling

#### Trying NMF first, using the vectorized data from CountVectorizer and TF-IDF Vectorizer (to compare)

CV Vectorizer with NMF

In [243]:
#NMF 
nmf_act = NMF(10, init = 'nndsvda')

In [None]:
nmf = nmf_act.fit(dtm)

In [None]:
# Function to display the top n terms in each topic- sourced code from Metis
def display_topics(model, feature_names, no_top_words, topic_names = None): 
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix + 1)
        else:
            print("\nTopic: ", topic_names[ix])
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
    print("\n")
    return model, feature_names, no_top_words

In [None]:
# output contents for each topic - Count Vectorizer with NMF

output = display_topics(nmf, cv.get_feature_names_out(), 10)
output;

In [None]:
doc_topic = nmf.transform(dtm)

In [None]:
doc_topic_df = pd.DataFrame(doc_topic.round(3))

In [None]:
doc_topic_df.head()

In [None]:
doc_topic_df[doc_topic_df[2] > 1]

In [None]:
ep_corpus[6431]

CV Vectorizer with NMF

In [None]:
nmf_tfidf = nmf_act.fit(dtm_tfidf)

In [None]:
# output contents for each topic - TF-IDF Vectorizer with NMF
output = display_topics(nmf_tfidf, tfidf_vec.get_feature_names_out(), 10)
output;

#### Trying LSA now, using the vectorized data from CountVectorizer and TF-IDF Vectorizer (to compare)

CV Vectorizer with LSA

In [187]:
#LSA 
lsa_act = TruncatedSVD(n_components=15, n_iter=8)

In [None]:
lsa = lsa_act.fit(dtm)

In [None]:
# output contents for each topic - Count Vectorizer with LSA

output = display_topics(lsa, cv.get_feature_names_out(), 10)
output;

In [None]:
#turning to doc-topic matrix for recommender
doc_topic = lsa.transform(dtm)

In [None]:
doc_topic_df = pd.DataFrame(doc_topic.round(3))

TF-IDF Vectorizer with LSA

In [189]:

lsa_tfidf = lsa_act.fit(dtm_tfidf)

In [190]:
# output contents for each topic - TF-IDF Vectorizer with LSA
output = display_topics(lsa_tfidf, tfidf_vec.get_feature_names_out(), 10)
output;


Topic  1
date, life, created, first, story, wa, get, dating, mental, time

Topic  2
life, mental, wa, get, way, people, time, make, join, find

Topic  3
sanctuary, ssanrakkha_buddhist, guided, meditation, awareness, open, hmr, mindful, yasm, aggacitta_ssanrakkha_buddhist

Topic  4
mental, factor, delve, condition, bobby_temps, thrives, brainchild, thursday, managing, manage

Topic  5
full_ep, surprise, host, abbie_chatfield, oh, dear, the_edge, nat_penfold, brooklyn_ross, kyle_jackie

Topic  6
contain, violence, may, drug, message, voice, indian, sex, indiannoir, classifiable

Topic  7
anchor, easiest, make, way, contain, mental, violence, drug, may, mind

Topic  8
healing, card, the_tarot, the_wild_soul, soul_tarot, honoring, work, tribe, called, medicine

Topic  9
information, privacy, omnystudiocomlistener, see, supporter, become, national_gratitude_month, visit, acastcomprivacy, optout

Topic  10
supporter, become, get, let, feel, review, find, know, card, free

Topic  11
join, gr

In [191]:
#turning to doc-topic matrix for recommender
doc_topic = lsa_tfidf.transform(dtm_tfidf)

In [192]:
doc_topic_df = pd.DataFrame(doc_topic.round(3))

#### Saving preferred doc-topic matrix (to combine with podcast topic modeling later)

Using TF-IDF with LSA: doc-topic matrix with 15 topics for more granularity

In [218]:
#saving doc_topic_df into pickle
doc_topic_df.to_pickle('doc_topic2.pkl')

In [219]:
doc_topic_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.000,0.028,-0.000,0.009,-0.000,-0.006,0.015,0.004,-0.008,-0.017,0.003,-0.002,-0.014,-0.002,0.007
1,0.002,0.114,-0.002,-0.009,-0.000,-0.009,0.216,0.018,-0.025,-0.044,0.049,-0.058,-0.011,-0.015,0.015
2,0.002,0.118,-0.004,0.070,-0.000,-0.015,-0.013,0.019,-0.045,-0.004,-0.010,-0.057,0.017,0.044,-0.009
3,0.002,0.117,-0.002,-0.135,-0.001,-0.103,0.527,0.147,-0.022,-0.029,-0.032,-0.027,0.041,-0.016,0.003
4,0.003,0.201,-0.005,-0.042,-0.001,-0.122,0.575,0.055,-0.049,-0.080,-0.059,-0.041,0.009,-0.002,-0.003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18110,0.001,0.011,-0.000,-0.002,-0.000,-0.000,-0.004,0.002,-0.003,-0.008,-0.003,-0.006,-0.003,-0.003,0.001
18111,0.001,0.009,-0.000,-0.002,-0.000,-0.000,-0.004,0.002,-0.003,-0.007,-0.002,-0.005,-0.002,-0.002,0.000
18112,0.000,0.013,-0.000,-0.002,-0.000,-0.000,-0.006,0.002,-0.003,-0.009,-0.002,-0.007,-0.002,-0.007,-0.005
18113,0.001,0.062,-0.001,-0.018,-0.000,0.002,-0.014,-0.004,-0.011,-0.011,0.003,0.003,-0.003,-0.014,0.010


In [204]:
dist = pairwise_distances(np.array(doc_topic[1000]).reshape(1,-1), doc_topic, metric = 'cosine')

In [205]:
dist.argsort()[0][1:10]

array([1007,  638,  997, 1006, 1003,  990, 1004, 1009, 3939])

In [195]:
dist

array([[2.77476989e-01, 1.11022302e-16, 6.81156450e-01, ...,
        8.15402629e-01, 7.12887586e-01, 7.31555484e-01]])

In [242]:
mh_podcasts_unique['Ep_desc'][1007]

"Paul Chek is an internationally-renowned expert in the fields of corrective and high-performance exercise kinesiology. \xa0 For over 20 years, Chek’s unique, holistic health approach to treatment and education has transformed the lives of countless men and women through programs like the P~P~S Success Mastery Coaching Program.  Not surprisingly, Chek is sought after as an international presenter and consultant for successful organizations like the Chicago Bulls, Australia’s Canberra Raiders, and the U.S. Air Force Academy. \xa0 In this episode, we learn about Chek’s life and training philosophies, the last 4 doctors you’ll ever need, working in vs. working out, the 7 primary movements, and much more. Enjoy! \xa0 Enjoy! \xa0 -Mike, Doug and Anders \xa0 ------------------------------------------------------------------------------------------------------------ \xa0 Please support our partners! \xa0 Thrive Market is a proud supporter of us here at Barbell Shrugged. \xa0We very much appre

In [237]:
doc_topic_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.000,0.028,-0.000,0.009,-0.000,-0.006,0.015,0.004,-0.008,-0.017,0.003,-0.002,-0.014,-0.002,0.007
1,0.002,0.114,-0.002,-0.009,-0.000,-0.009,0.216,0.018,-0.025,-0.044,0.049,-0.058,-0.011,-0.015,0.015
2,0.002,0.118,-0.004,0.070,-0.000,-0.015,-0.013,0.019,-0.045,-0.004,-0.010,-0.057,0.017,0.044,-0.009
3,0.002,0.117,-0.002,-0.135,-0.001,-0.103,0.527,0.147,-0.022,-0.029,-0.032,-0.027,0.041,-0.016,0.003
4,0.003,0.201,-0.005,-0.042,-0.001,-0.122,0.575,0.055,-0.049,-0.080,-0.059,-0.041,0.009,-0.002,-0.003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18110,0.001,0.011,-0.000,-0.002,-0.000,-0.000,-0.004,0.002,-0.003,-0.008,-0.003,-0.006,-0.003,-0.003,0.001
18111,0.001,0.009,-0.000,-0.002,-0.000,-0.000,-0.004,0.002,-0.003,-0.007,-0.002,-0.005,-0.002,-0.002,0.000
18112,0.000,0.013,-0.000,-0.002,-0.000,-0.000,-0.006,0.002,-0.003,-0.009,-0.002,-0.007,-0.002,-0.007,-0.005
18113,0.001,0.062,-0.001,-0.018,-0.000,0.002,-0.014,-0.004,-0.011,-0.011,0.003,0.003,-0.003,-0.014,0.010
