In [3]:
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

In [4]:
# Read the truncated podcast metadata table into a DataFrame.
df = pd.read_csv(filepath_or_buffer="backend/data/trunc_metadata.csv")

In [5]:
# Show the metadata frame read from trunc_metadata.csv as this cell's output.
df

Unnamed: 0,show_name,show_uri,show_description,publisher,rss_link,avg_duration,category,subcategory,publish_time
0,The Grier Podcast,spotify:show:1spDjPLku0UCnir7H2hEG4,A podcast recorded and created by the students...,Bryan Beckel,https://anchor.fm/s/11399a34/podcast/rss,21.389133,Education,,2022-02-21 12:00:00
1,"""ACIM"" A Course In Miracles For The Millennium",spotify:show:5wb7vWvrCtgsY3jMvIAOXW,"""A study group for the transformation of life ...",Rafael Hidalgo,https://anchor.fm/s/25197c4/podcast/rss,87.984350,Religion & Spirituality,,2019-02-21 19:27:18
2,AfroLatina_BeautifitlyBalanced,spotify:show:3zbdWAjWe2sp91t3Ly1j49,Health+SELF Love+Spotless Clean Home! Thats ev...,The AfroLatina_BeautifitlyBalanced,https://anchor.fm/s/9571594/podcast/rss,19.906850,Health & Fitness,Mental Health,2022-07-08 03:24:43
3,Apostles of Tomorrow,spotify:show:31r4WwUvNQ0CHGU2xobnZP,"A podcast to help fellow Gen Z Apostolics,To f...",Your host Garrett,https://anchor.fm/s/ddebcac/podcast/rss,29.358917,Religion & Spirituality,Spirituality,2020-10-29 17:00:00
4,Brave Broccoli,spotify:show:7K1LCJSpW2OoDcI9PU9iyV,Just start talking and we can go from there,The brave vegetable,https://anchor.fm/s/122f9614/podcast/rss,37.889942,Comedy,Comedy Interviews,2020-01-19 01:02:16
...,...,...,...,...,...,...,...,...,...
15837,free your mind: podcast,spotify:show:2DqGCojWyryoDFq0argY9E,Sharing the journey of life & how to set your ...,Ima,https://anchor.fm/s/e94e5e0/podcast/rss,10.529867,Education,Self-Improvement,2021-03-02 03:49:00
15838,The Best Comes from the worst,spotify:show:31aNxmwRKE2zlQbYVglQxe,In order to live your best life you must exper...,James Taylor,https://anchor.fm/s/b2b7450/podcast/rss,16.416900,Religion & Spirituality,,2019-10-14 17:02:45
15839,RED CUP VIBEz,spotify:show:5omDM65kJwYhlJdAiA86fn,Sipping too many adult beverages out of a red ...,BURBERRYLYTE,https://anchor.fm/s/12949c94/podcast/rss,36.678233,Comedy,Improv,2020-01-19 10:15:58
15840,PositiveVibes 24/7,spotify:show:5ueyekcDdst5X5ntxReUz8,PositiveVibes Only,PositiveVibes,https://anchor.fm/s/26f0a70/podcast/rss,3.333292,Education,,2019-02-04 20:59:58


In [6]:
# Per-show metadata keyed by show name (one inner dict of column -> value per show).
shows = df.set_index('show_name').to_dict('index')

# Lookup tables between a show name and its position in `shows`.
show_name_to_index = {name: position for position, name in enumerate(shows)}
show_index_to_name = dict(enumerate(shows))

In [7]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code, so only point this
    # at artifact files you produced or otherwise trust.
    with open(path, "rb") as handle:
        return pickle.load(handle)


docs_compressed = _load_pickle("backend/data/docs_compressed.p")
words_compressed = _load_pickle("backend/data/words_compressed.p")
tfidf_vec = _load_pickle("backend/data/tfidf.p")

genre_tf_idf = _load_pickle("backend/data/genre_tf_idf_dict.p")
publisher_tf_idf = _load_pickle("backend/data/publisher_tf_idf_dict.p")

In [6]:
# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes, instead of whenever the anonymous handle gets collected.
with open("words_compressed.p", "wb") as out_file:
    pickle.dump(words_compressed, out_file)

In [24]:
# Inspect the dimensions of the word matrix.
np.shape(words_compressed)

(20000, 40)

In [8]:
# Orient words_compressed so its columns line up with the columns of
# docs_compressed; get_phrase_tfidf multiplies a query row by this matrix.
# The shape check keeps the cell idempotent: an unconditional transpose would
# flip the matrix back every time the cell is re-run.
# NOTE(review): assumes the number of terms differs from the number of
# columns in docs_compressed - confirm if the matrices could ever be square.
if words_compressed.shape[1] != docs_compressed.shape[1]:
    words_compressed = words_compressed.transpose()

# Row-normalise both matrices.
words_compressed_normed = normalize(words_compressed, axis = 1)
docs_compressed_normed = normalize(docs_compressed)

In [9]:
def get_genre_tfidf(pref_list):
    """
    Average the stored vectors of the requested genres.

    Params:
    {
        pref_list: list of genre names, each a key of genre_tf_idf
    }

    Returns: the element-wise mean of the genres' vectors, or an all-zero
    vector (one entry per column of docs_compressed_normed) if pref_list is empty

    Raises: KeyError if a genre is missing from genre_tf_idf
    """
    tf_idf_vec = np.zeros(docs_compressed_normed.shape[1])

    # Dividing the zero vector by len(pref_list) == 0 would produce NaNs,
    # so an empty preference list yields the zero vector instead.
    if len(pref_list) == 0:
        return tf_idf_vec

    for genre in pref_list:
        tf_idf_vec += genre_tf_idf[genre]

    return tf_idf_vec / len(pref_list)

In [10]:
def get_publisher_tfidf(pref_list):
    """
    Average the stored vectors of the requested publishers.

    Params:
    {
        pref_list: list of publisher names, each a key of publisher_tf_idf
    }

    Returns: the element-wise mean of the publishers' vectors, or an all-zero
    vector (one entry per column of docs_compressed_normed) if pref_list is empty

    Raises: KeyError if a publisher is missing from publisher_tf_idf
    """
    tf_idf_vec = np.zeros(docs_compressed_normed.shape[1])

    # Dividing the zero vector by len(pref_list) == 0 would produce NaNs,
    # so an empty preference list yields the zero vector instead.
    if len(pref_list) == 0:
        return tf_idf_vec

    for publisher in pref_list:
        tf_idf_vec += publisher_tf_idf[publisher]

    return tf_idf_vec / len(pref_list)

In [11]:
def get_phrase_tfidf(pref_list):
    """
    Average the compressed-space vectors of free-text phrases.

    Each phrase is split on spaces, stemmed word by word with the global
    `stemmer`, vectorised with `tfidf_vec`, multiplied by `words_compressed`
    and normalised before being accumulated.

    Params:
    {
        pref_list: list of phrase strings
    }

    Returns: the element-wise mean of the phrase vectors, or an all-zero
    vector (one entry per column of docs_compressed_normed) if pref_list is empty
    """
    tf_idf_vec = np.zeros(docs_compressed_normed.shape[1])

    # Dividing the zero vector by len(pref_list) == 0 would produce NaNs,
    # so an empty preference list yields the zero vector instead.
    if len(pref_list) == 0:
        return tf_idf_vec

    for phrase in pref_list:
        # Stem each word before vectorising the phrase.
        # NOTE(review): presumably this mirrors how the corpus behind
        # tfidf_vec was preprocessed - confirm.
        stemmed_words = [stemmer.stem(word) for word in phrase.split(" ")]
        query = ' '.join(stemmed_words)

        # Project the query's TF-IDF row through words_compressed, then normalise it.
        query_tfidf = tfidf_vec.transform([query]).toarray()
        query_vec = normalize(np.dot(query_tfidf, words_compressed)).squeeze()
        tf_idf_vec += query_vec

    return tf_idf_vec / len(pref_list)

In [27]:
# Import only the class that is used; a wildcard import would also pull every
# other public name of nltk.stem.porter into the notebook namespace.
from nltk.stem.porter import PorterStemmer

# Shared stemmer instance, read as a global by get_phrase_tfidf.
stemmer = PorterStemmer()
stemmer.stem("you'll")

"you'll"

In [62]:
# Scratch check: stem every space-separated word of a sample phrase and re-join.
string = "basketball game"
words = string.split(" ")
stemmed_words = list(map(stemmer.stem, words))
' '.join(stemmed_words)

'basketbal game'

In [12]:
def get_podcast_tfidf(pref_list):
    """
    Average the normalised document vectors of the named podcasts.

    Params:
    {
        pref_list: list of show names, each a key of show_name_to_index
    }

    Returns: the element-wise mean of the matching rows of
    docs_compressed_normed, or an all-zero vector (one entry per column of
    docs_compressed_normed) if pref_list is empty

    Raises: KeyError if a show name is missing from show_name_to_index
    """
    tf_idf_vec = np.zeros(docs_compressed_normed.shape[1])

    # Dividing the zero vector by len(pref_list) == 0 would produce NaNs,
    # so an empty preference list yields the zero vector instead.
    if len(pref_list) == 0:
        return tf_idf_vec

    for podcast in pref_list:
        show_idx = show_name_to_index[podcast]
        tf_idf_vec += docs_compressed_normed[show_idx, :]

    return tf_idf_vec / len(pref_list)

In [13]:
def get_specific_tfidf(pref_type, pref_list):
    """
    Build the averaged vector for one kind of preference.

    Params:
    {
        pref_type: "GENRE", "PUBLISHER" or "PHRASE"; any other value is
                   handled as "PODCAST"
        pref_list: list of preferences of that kind
    }

    Returns: an all-zero vector (one entry per column of docs_compressed_normed)
    when pref_list is empty, otherwise the result of the matching get_*_tfidf helper
    """
    # Nothing selected for this preference type.
    if not pref_list:
        return np.zeros(docs_compressed_normed.shape[1])

    if pref_type == "GENRE":
        return get_genre_tfidf(pref_list)

    if pref_type == "PUBLISHER":
        return get_publisher_tfidf(pref_list)

    if pref_type == "PHRASE":
        return get_phrase_tfidf(pref_list)

    # Every remaining pref_type is treated as "PODCAST".
    return get_podcast_tfidf(pref_list)

In [14]:
def get_total_tfidf(genres, publishers, phrases, podcasts):
    """
    Combine one individual's preferences into a single vector.

    Params:
    {
        genres, publishers, phrases, podcasts: lists of preferences, any of
        which may be empty
    }

    Returns: the mean of the per-type vectors from get_specific_tfidf, taken
    over the non-empty preference types only; an all-zero vector (one entry per
    column of docs_compressed_normed) when every list is empty
    """
    combined = np.zeros(docs_compressed_normed.shape[1])
    used_types = 0

    typed_prefs = (
        ("GENRE", genres),
        ("PUBLISHER", publishers),
        ("PHRASE", phrases),
        ("PODCAST", podcasts),
    )

    for pref_type, prefs in typed_prefs:
        # Empty preference types do not count towards the average.
        if not prefs:
            continue
        combined += get_specific_tfidf(pref_type, prefs)
        used_types += 1

    return combined / used_types if used_types else combined

In [29]:
def get_best_preference_match(indiv_one_pref, indiv_two_pref, rec_podcast_idx):
    """
    Find which preference type is most similar to a recommended podcast.

    For each preference type, the two individuals' vectors from
    get_specific_tfidf are averaged and dotted with the recommended podcast's
    row of docs_compressed_normed.

    Params:
    {
        indiv_one_pref, indiv_two_pref: dicts with the keys "genres",
            "publishers", "phrases" and "podcasts", each a list of preferences
        rec_podcast_idx: row index of the recommended podcast in docs_compressed_normed
    }

    Returns: a tuple (preference type, similarity) for the highest similarity,
    where the type is "genre", "publisher", "phrase" or "podcast"; on ties the
    earliest type in that order wins. Returns (0, "") if no similarity compares
    equal to the maximum (e.g. when the scores are NaN).
    """
    # (label returned to the caller, pref_type for get_specific_tfidf, key in the preference dicts)
    pref_specs = [
        ("genre", "GENRE", "genres"),
        ("publisher", "PUBLISHER", "publishers"),
        ("phrase", "PHRASE", "phrases"),
        ("podcast", "PODCAST", "podcasts"),
    ]

    # Average of both individuals' vectors, one per preference type.
    avg_vectors = [
        (get_specific_tfidf(pref_type, indiv_one_pref[key]) + get_specific_tfidf(pref_type, indiv_two_pref[key])) / 2
        for _, pref_type, key in pref_specs
    ]

    recommended_tfidf = docs_compressed_normed[rec_podcast_idx, :]

    similarities = [avg_vector.dot(recommended_tfidf) for avg_vector in avg_vectors]
    max_similarity = max(similarities)

    for (label, _, _), similarity in zip(pref_specs, similarities):
        if similarity == max_similarity:
            return (label, max_similarity)

    return (0, "")

In [34]:
# Best-matching preference type for the podcast at row 12.
# pref_1 and pref_2 must already be defined when this cell runs.
get_best_preference_match(indiv_one_pref=pref_1, indiv_two_pref=pref_2, rec_podcast_idx=12)

('GENRE', 0.014859702208444563)

In [129]:
def get_top_k_recommendations(indiv_one_pref, indiv_two_pref, k = 10):
    """
    Recommend podcasts for two individuals at once.

    Params:
    {
        indiv_one_pref, indiv_two_pref: dicts of the form
            {
                "genres": [] (string list),
                "publishers": [] (string list),
                "phrases": [] (string list),
                "podcasts": [] (string list),
                "duration": [min_duration, max_duration]
            }
        k: maximum number of recommendations returned (default = 10)
    }

    Returns: a list of at most k tuples (podcast name, similarity), sorted by
    decreasing similarity and restricted to shows whose avg_duration lies in
    the overlap of both individuals' duration ranges
    """
    indiv_one_tfidf = get_total_tfidf(indiv_one_pref["genres"], indiv_one_pref["publishers"], indiv_one_pref["phrases"], indiv_one_pref["podcasts"])
    indiv_two_tfidf = get_total_tfidf(indiv_two_pref["genres"], indiv_two_pref["publishers"], indiv_two_pref["phrases"], indiv_two_pref["podcasts"])

    avg_tfidf = (indiv_one_tfidf + indiv_two_tfidf) / 2

    # Find overlap of both user durations
    min_duration = max(indiv_one_pref["duration"][0], indiv_two_pref["duration"][0])
    max_duration = min(indiv_one_pref["duration"][1], indiv_two_pref["duration"][1])

    # If max_duration is 60, user is okay with any length greater than 60 as well
    if max_duration == 60:
        max_duration = float('inf')

    # Row positions (not index labels) of podcasts that satisfy the duration
    # constraints: docs_compressed_normed and show_index_to_name are addressed
    # by position, so this stays correct even if df's index is not 0..n-1.
    duration_mask = (df["avg_duration"] >= min_duration) & (df["avg_duration"] <= max_duration)
    valid_idx = np.flatnonzero(duration_mask.to_numpy())

    # Only calculate similarity for podcasts meeting duration
    valid_docs = docs_compressed_normed[valid_idx, :]
    similarities = valid_docs.dot(avg_tfidf)
    sorted_idx = np.argsort(similarities)[::-1]

    top_matches = []
    # Slicing clamps to the number of valid podcasts; max(k, 0) keeps a
    # negative k returning an empty list.
    for rank in sorted_idx[:max(k, 0)]:
        podcast_idx = int(valid_idx[rank])
        top_matches.append((show_index_to_name[podcast_idx], similarities[rank]))

    return top_matches

In [132]:
# Sample preferences for two individuals; "duration" is [min_duration, max_duration].
pref_1 = dict(genres=[], publishers=[], phrases=["baseball"], podcasts=[], duration=[0, 30])

pref_2 = dict(genres=[], publishers=[], phrases=[], podcasts=[], duration=[20, 60])

In [133]:
# Joint recommendations for the two sample preference sets (default k).
get_top_k_recommendations(indiv_one_pref=pref_1, indiv_two_pref=pref_2)

[]

In [15]:
def get_top_k_recs_given_query(query, k = 10):
    """
    Rank every podcast against a query vector.

    Params:
    {
        query: vector representing a query, one entry per column of
               docs_compressed_normed (shape of (40, ) per the original notes)
        k: maximum number of recommendations returned (default = 10)
    }

    Returns: a list of at most k tuples in format (podcast name, similarity),
    sorted by decreasing similarity, where similarity is the dot product of the
    query with the podcast's row of docs_compressed_normed
    """
    similarities = docs_compressed_normed.dot(query)
    sorted_idx = np.argsort(similarities)[::-1]

    # Slicing clamps k to the number of podcasts, so asking for more results
    # than there are shows no longer raises IndexError; max(k, 0) keeps a
    # negative k returning an empty list.
    return [
        (show_index_to_name[idx], similarities[idx])
        for idx in sorted_idx[:max(k, 0)]
    ]

In [None]:
# Use the normalised vector of the show at row 1 as a sample query.
query = docs_compressed_normed[1]

In [None]:
# Shows ranked against the sample query (default k).
get_top_k_recs_given_query(query=query)

In [None]:
# Vector produced for a single two-word phrase.
get_phrase_tfidf(pref_list=["nba basketball"])