# Recommender output

#### Set input & output parameters

In [183]:
# Folder containing the merged show-metadata CSV that is loaded into `lookup_df`.
# Paths are built by plain string concatenation, so keep the trailing slash.
raw_data_directory = "../data/"
# Folder containing the topic-model output CSV that is loaded into `df`.
output_directory = "../outputs/"

In [184]:
# Filename (without the ".csv" extension) of the modeling-results file inside
# `output_directory`; edit this by hand to switch to a different model run.
input_model_name = "lda_lsi_weighted_1_run1"

In [185]:
# Default for the `anime_output_only` parameter of the recommendation functions below.
# NOTE: the functions capture this value as a default argument when they are defined,
# so after changing it the function-definition cells must be re-run to pick it up.
anime_output_only = True

#### Imports

In [186]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances

# for progress bar on loops
from tqdm import tqdm

#### Read in Dataframe of raw data with info columns

In [187]:
# Per-show metadata (name, years, ratings, is_anime, ...); used later to look up
# the displayable details of each recommended href.
lookup_df = pd.read_csv(f"{raw_data_directory}merged_nowiki_df.csv")

In [188]:
lookup_df.head(2)

Unnamed: 0,name,href,years,imdb_description,pg_rating,imdb_genre_tags,imdb_rating,num_votes,img_thumbnail,tmdb_id,...,tmdb_vote_count,first_air_date,tmdb_adult_content,tmdb_poster_path,tmdb_overview,tmdb_tagline,tmdb_genres,tv_networks,tmdb_keywords,is_anime
0,Game of Thrones,/title/tt0944947/,(2011–2019),Nine noble families fight for control over the...,TV-MA,"Action, Adventure, Drama",9.2,2148311,https://m.media-amazon.com/images/M/MV5BYTRiND...,1399.0,...,20934.0,2011-04-17,False,/7WUHnWGx5OO145IRxPDUkQSh4C7.jpg,Seven noble families fight for control of the ...,Winter Is Coming,"Sci-Fi & Fantasy, Drama, Action & Adventure",HBO,"based on novel or book, kingdom, dragon, king,...",0
1,Prison Break,/title/tt0455275/,(2005–2017),"Due to a political conspiracy, an innocent man...",TV-14,"Action, Crime, Drama",8.3,548267,https://m.media-amazon.com/images/M/MV5BMTg3NT...,2288.0,...,4269.0,2005-08-29,False,/ux7OfhhrXO4FzJtuew18ShiBLq7.jpg,"Due to a political conspiracy, an innocent man...",Break in. Break out. Save your brother's life.,"Action & Adventure, Crime, Drama",FOX,"prison, prisoner, escape, brother, fugitive, c...",0


In [189]:
lookup_df.columns

Index(['name', 'href', 'years', 'imdb_description', 'pg_rating',
       'imdb_genre_tags', 'imdb_rating', 'num_votes', 'img_thumbnail',
       'tmdb_id', 'tmdb_name', 'original_name', 'original_language',
       'origin_country', 'tmdb_popularity', 'tmdb_vote_average',
       'tmdb_vote_count', 'first_air_date', 'tmdb_adult_content',
       'tmdb_poster_path', 'tmdb_overview', 'tmdb_tagline', 'tmdb_genres',
       'tv_networks', 'tmdb_keywords', 'is_anime'],
      dtype='object')

#### Read in Dataframe of modeling results

In [190]:
# Topic-model output for the run selected by `input_model_name`:
# one row per show, one column per topic score.
df = pd.read_csv(f"{output_directory}{input_model_name}.csv")
df.head()

Unnamed: 0,name,href,lda_topic0_match_pct,lda_topic1_match_pct,lda_topic2_match_pct,lda_topic3_match_pct,lda_topic4_match_pct,lsa_topic0_cos_match,lsa_topic1_cos_match,lsa_topic2_cos_match,...,forced_alt_lda_topic10_match_pct,forced_alt_lda_topic11_match_pct,forced_alt_lda_topic12_match_pct,forced_alt_lda_topic13_match_pct,forced_alt_lda_topic14_match_pct,forced_alt_lda_topic15_match_pct,forced_alt_lda_topic16_match_pct,forced_alt_lda_topic17_match_pct,forced_alt_lda_topic18_match_pct,forced_alt_lda_topic19_match_pct
0,Game of Thrones,/title/tt0944947/,0.009928,0.009928,0.960288,0.009928,0.009928,0.268433,-0.020426,-0.090606,...,0.002477,0.002477,0.002477,0.640004,0.080045,0.002477,0.002477,0.002477,0.002477,0.002477
1,Prison Break,/title/tt0455275/,0.017903,0.017903,0.928387,0.017903,0.017903,0.137642,0.094227,-0.045861,...,0.004469,0.004469,0.004469,0.617398,0.065515,0.004469,0.004469,0.004469,0.004469,0.004469
2,Vikings,/title/tt2306299/,0.025177,0.025177,0.899292,0.025177,0.025177,0.118722,-0.028396,-0.080808,...,0.006285,0.006285,0.006285,0.373787,0.081534,0.006285,0.006285,0.006285,0.006285,0.134926
3,The Boys,/title/tt1190634/,0.015854,0.015854,0.936583,0.015854,0.015854,0.295292,0.046381,-0.039057,...,0.003956,0.003956,0.003956,0.618656,0.086282,0.003956,0.003956,0.003956,0.003956,0.003956
4,The Mandalorian,/title/tt8111088/,0.018286,0.018286,0.926856,0.018286,0.018286,0.149727,-0.046886,-0.102071,...,0.004563,0.004563,0.004563,0.589268,0.152775,0.004563,0.004563,0.004563,0.004563,0.004563


In [191]:
df.shape

(2484, 37)

In [192]:
df.isna().sum()

name                                0
href                                0
lda_topic0_match_pct                0
lda_topic1_match_pct                0
lda_topic2_match_pct                0
lda_topic3_match_pct                0
lda_topic4_match_pct                0
lsa_topic0_cos_match                0
lsa_topic1_cos_match                0
lsa_topic2_cos_match                0
lsa_topic3_cos_match                0
lsa_topic4_cos_match                0
lsa_topic5_cos_match                0
lsa_topic6_cos_match                0
lsa_topic7_cos_match                0
lsa_topic8_cos_match                0
lsa_topic9_cos_match                0
forced_alt_lda_topic0_match_pct     0
forced_alt_lda_topic1_match_pct     0
forced_alt_lda_topic2_match_pct     0
forced_alt_lda_topic3_match_pct     0
forced_alt_lda_topic4_match_pct     0
forced_alt_lda_topic5_match_pct     0
forced_alt_lda_topic6_match_pct     0
forced_alt_lda_topic7_match_pct     0
forced_alt_lda_topic8_match_pct     0
forced_alt_l

In [193]:
# Only non-numeric (text) columns get the "" placeholder. Writing "" into a numeric
# topic column would silently turn it into an object column and break the distance
# computation further down. Reassigning (instead of inplace=True) keeps the cell
# safe to re-run.
df = df.fillna({col: "" for col in df.select_dtypes(exclude="number").columns})

#### Make lookup tables for href to name, href to years

> Because there are instances of 'name' not being unique, we use 'href' as the unique index instead.

In [194]:
# href -> show name. Built in one pass from the two columns, so it does not depend on
# `df` having a default 0..n-1 index. If an href repeats, the last row wins (as before).
href_to_name_dict = dict(zip(df['href'], df['name']))

In [195]:
# href -> "(start–end)" years string, taken from the metadata table. Built in one pass,
# so it does not depend on `lookup_df` having a default 0..n-1 index. If an href
# repeats, the last row wins (as before).
href_to_years_dict = dict(zip(lookup_df['href'], lookup_df['years']))

In [196]:
len(href_to_name_dict)

2484

#### Select only the columns we want

In [197]:
df.columns

Index(['name', 'href', 'lda_topic0_match_pct', 'lda_topic1_match_pct',
       'lda_topic2_match_pct', 'lda_topic3_match_pct', 'lda_topic4_match_pct',
       'lsa_topic0_cos_match', 'lsa_topic1_cos_match', 'lsa_topic2_cos_match',
       'lsa_topic3_cos_match', 'lsa_topic4_cos_match', 'lsa_topic5_cos_match',
       'lsa_topic6_cos_match', 'lsa_topic7_cos_match', 'lsa_topic8_cos_match',
       'lsa_topic9_cos_match', 'forced_alt_lda_topic0_match_pct',
       'forced_alt_lda_topic1_match_pct', 'forced_alt_lda_topic2_match_pct',
       'forced_alt_lda_topic3_match_pct', 'forced_alt_lda_topic4_match_pct',
       'forced_alt_lda_topic5_match_pct', 'forced_alt_lda_topic6_match_pct',
       'forced_alt_lda_topic7_match_pct', 'forced_alt_lda_topic8_match_pct',
       'forced_alt_lda_topic9_match_pct', 'forced_alt_lda_topic10_match_pct',
       'forced_alt_lda_topic11_match_pct', 'forced_alt_lda_topic12_match_pct',
       'forced_alt_lda_topic13_match_pct', 'forced_alt_lda_topic14_match_pct',
   

In [198]:
# Technique Reference: https://stackoverflow.com/questions/21285380/find-column-whose-name-contains-a-specific-string

# Keep only 'href' plus the `lda_topic*` columns: every column whose name contains
# one of these markers belongs to a model we do not use for the similarity lookup.
dropcol_prefix = "lsa_topic"
dropcol_prefix2 = "forced_alt_lda_"

columnns_to_drop = [
    col for col in df.columns
    if dropcol_prefix in col or dropcol_prefix2 in col
]

# Reassign instead of dropping in place, and ignore columns that are already gone,
# so re-running this cell does not raise a KeyError.
df = df.drop(columns=['name'] + columnns_to_drop, errors="ignore")
df.head(3)

Unnamed: 0,href,lda_topic0_match_pct,lda_topic1_match_pct,lda_topic2_match_pct,lda_topic3_match_pct,lda_topic4_match_pct
0,/title/tt0944947/,0.009928,0.009928,0.960288,0.009928,0.009928
1,/title/tt0455275/,0.017903,0.017903,0.928387,0.017903,0.017903
2,/title/tt2306299/,0.025177,0.025177,0.899292,0.025177,0.025177


In [199]:
df.columns

Index(['href', 'lda_topic0_match_pct', 'lda_topic1_match_pct',
       'lda_topic2_match_pct', 'lda_topic3_match_pct', 'lda_topic4_match_pct'],
      dtype='object')

In [200]:
# Use 'href' (unique per show) as the row label. Guarded so the cell can be re-run:
# once 'href' has become the index it is no longer available as a column.
if 'href' in df.columns:
    df = df.set_index('href')

#### Generate cosine distance matrix (smaller distance = more similar)

In [201]:
# Build a square href-by-href lookup matrix of pairwise cosine *distances* between the
# topic vectors in `df`. Despite the variable name below, the values are distances, not
# similarities: 0 on the diagonal, and smaller values mean more similar shows.
dists = cosine_distances(df)
cos_similarities_df = pd.DataFrame(dists, index=df.index, columns=df.index)
cos_similarities_df.head()

# Reference: General Assembly DSI Lesson 705-lesson-recommender-systems

href,/title/tt0944947/,/title/tt0455275/,/title/tt2306299/,/title/tt1190634/,/title/tt8111088/,/title/tt5180504/,/title/tt6468322/,/title/tt10919420/,/title/tt3322312/,/title/tt2193021/,...,/title/tt3804114/,/title/tt18335752/,/title/tt11771270/,/title/tt0052451/,/title/tt6106704/,/title/tt0047736/,/title/tt0068093/,/title/tt0482857/,/title/tt0058855/,/title/tt8873996/
href,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
/title/tt0944947/,0.0,0.0001597553,0.000622,8.7e-05,0.0001760198,0.000123,7.634777e-05,7.190903e-05,0.000129,5.6e-05,...,0.00051,0.00057,0.002978,8.576955e-05,0.000272,0.0001603878,5.1e-05,0.00025,2.8e-05,0.000449
/title/tt0455275/,0.00016,0.0,0.000151,1.1e-05,3.941778e-07,2e-06,1.522421e-05,1.730255e-05,2e-06,2.7e-05,...,9.9e-05,0.000126,0.001759,1.141324e-05,1.5e-05,6.248796e-10,3e-05,1e-05,5.4e-05,7.3e-05
/title/tt2306299/,0.000622,0.0001511151,0.0,0.000244,0.000136074,0.000191,0.0002622623,0.0002706785,0.000184,0.000305,...,6e-06,1e-06,0.000879,0.0002455826,7.1e-05,0.0001505012,0.000316,8.3e-05,0.000386,1.4e-05
/title/tt1190634/,8.7e-05,1.10782e-05,0.000244,0.0,1.565173e-05,3e-06,3.288209e-07,6.90996e-07,4e-06,3e-06,...,0.000176,0.000212,0.002049,2.495487e-09,5.1e-05,1.124523e-05,5e-06,4.2e-05,1.6e-05,0.000141
/title/tt8111088/,0.000176,3.941778e-07,0.000136,1.6e-05,0.0,5e-06,2.051776e-05,2.291983e-05,4e-06,3.4e-05,...,8.7e-05,0.000113,0.001706,1.604949e-05,1e-05,3.63414e-07,3.7e-05,6e-06,6.4e-05,6.3e-05


## Define functions for generating results from requests

#### Function to generate a dataframe of the looked up show info, given a dataframe/list of similarity results 

In [203]:
def display_results_info(raw_results, num_results_to_show=5, anime_output_only=anime_output_only, lookup_df=lookup_df):
    """Return the metadata rows for the best-ranked hrefs in `raw_results`.

    Parameters
    ----------
    raw_results : list or pd.DataFrame
        Ranked hrefs, best match first. For a DataFrame the hrefs are read from
        its index; the columns are ignored.
    num_results_to_show : int
        Maximum number of shows to return. Clamped (with a warning) to the number
        of candidates supplied.
    anime_output_only : bool
        When True, only shows whose `is_anime` value equals 1 are eligible.
    lookup_df : pd.DataFrame
        Metadata table with an `href` column (and `is_anime` when filtering).

    Returns
    -------
    pd.DataFrame or None
        Rows of `lookup_df` in ranking order, keeping their original index labels.
        An empty frame with `lookup_df`'s columns when nothing qualifies.
        None when `raw_results` is neither a list nor a DataFrame.
    """
    # Check the type before calling len() so unsupported inputs get the friendly
    # message instead of a TypeError.
    if isinstance(raw_results, list):
        ranked_hrefs = raw_results
    elif isinstance(raw_results, pd.DataFrame):
        ranked_hrefs = list(raw_results.index)
    else:
        print("Error- unsupported type passed to display_results function. Only takes: list or dataframe")
        return

    if num_results_to_show > len(ranked_hrefs):
        num_results_to_show = len(ranked_hrefs)
        print("Warning: requested number of results exceeds number of possible outputs.")

    # Filter the metadata once up front. The set makes the per-candidate check cheap and
    # lets hrefs missing from lookup_df be skipped instead of raising on an empty row.
    eligible_df = lookup_df[lookup_df['is_anime'] == 1] if anime_output_only else lookup_df
    eligible_hrefs = set(eligible_df['href'])

    selected_hrefs = []
    for href in ranked_hrefs:
        if len(selected_hrefs) >= num_results_to_show:
            break
        if href in eligible_hrefs:
            selected_hrefs.append(href)

    if len(selected_hrefs) < num_results_to_show:
        print(f"Sorry- there are not {num_results_to_show} results to be shown.")

    if not selected_hrefs:
        return pd.DataFrame(columns=lookup_df.columns)

    # Concatenate once at the end rather than growing a DataFrame inside the loop.
    return pd.concat([eligible_df[eligible_df['href'] == href] for href in selected_hrefs])

#### Function to generate a list of recommendations (most similar entries), based on cosine similarity

In [204]:
def results_from_href(href, num_results=5, anime_output_only=anime_output_only, lookup_df=lookup_df):
    """Return the shows closest to `href` according to the precomputed distance matrix.

    Parameters
    ----------
    href : str
        Identifier of the query show, e.g. "/title/tt0944947/".
    num_results : int
        Maximum number of shows to return.
    anime_output_only : bool
        Passed through to `display_results_info` to keep only anime entries.
    lookup_df : pd.DataFrame
        Metadata table used both to validate `href` and to build the output.

    Returns
    -------
    pd.DataFrame or None
        Output of `display_results_info` for the ranked candidates, or None (after
        printing an error) when `href` is unknown to either the metadata table or
        the distance matrix.

    Notes
    -----
    The query show has distance 0 to itself, so it normally appears as the first row.
    """
    # The href must exist in BOTH tables: lookup_df for display info and the matrix
    # for ranking. Checking only lookup_df would raise a KeyError on the matrix lookup.
    known_href = lookup_df['href'].eq(href).any() and href in cos_similarities_df.columns
    if not known_href:
        print(f"(Error in results_from_href function.) No exact match found for the following href input. Please try again:\n{href}")
        return

    # The matrix holds cosine distances, so an ascending sort puts the most similar first.
    raw_rec_results_df = cos_similarities_df[href].sort_values().to_frame()

    # Attach the show names in one pass ('href' stays as the index).
    raw_rec_results_df['name'] = [href_to_name_dict.get(item) for item in raw_rec_results_df.index]

    return display_results_info(raw_rec_results_df, num_results, anime_output_only, lookup_df)


# Technique reference for boolean checking on search string: https://www.statology.org/pandas-check-if-column-contains-string/

#### (Helper function for getting 'href' info in proper string form)

In [205]:
def get_href_as_str(href_entry):
    """Extract a clean "/<section>/<id>/" href from a longer string.

    Intended for text such as the string form of a one-row pandas Series, where the
    href is surrounded by extra index/dtype information. The input is split on "/"
    and only the second and third pieces are kept.
    """
    pieces = href_entry.split("/")
    section = pieces[1]
    title_id = pieces[2]
    return "/{}/{}/".format(section, title_id)

#### The "main" function, that generates full output based on user entry. (Accepts show titles or href strings as input.)

In [206]:
def _recommend_for_href(href, num_results, anime_output_only):
    """Fetch, announce and return the recommendations for one resolved href."""
    try:
        temp_results = results_from_href(href, num_results, anime_output_only=anime_output_only)
    except KeyError:
        # The href exists in the metadata but has no entry in the modeling results.
        print(f"No modeling results available for href: {href}")
        return None

    if temp_results is None:
        # results_from_href has already printed why the lookup failed.
        return None

    print(f"Displaying results for: {href_to_name_dict.get(href, '')} {href_to_years_dict.get(href, '')}. href: {href}\n")

    if len(temp_results) < num_results:
        print(f"Only {len(temp_results)} results found, compared to the requested {num_results}.")

    return temp_results


def get_recommendations(input_name, num_results=5, anime_output_only=anime_output_only):
    """Entry point: recommend shows similar to a title or href typed by the user.

    Parameters
    ----------
    input_name : str
        Either an href containing "/title/" or a (partial, case-insensitive) show
        title.
    num_results : int
        Maximum number of shows to return.
    anime_output_only : bool
        When True, only anime entries are returned.

    Returns
    -------
    pd.DataFrame or None
        The recommendation table when the input resolves to exactly one show.
        None when the input is invalid, matches nothing, or is ambiguous; in those
        cases an explanation (and any candidate list) is printed instead.
    """
    if anime_output_only:
        print("Generating Anime Output Only... \n")
    else:
        print("All results types being displayed... \n")

    if not isinstance(input_name, str):
        print("Error: Input a string, plz")
        return

    if "/title/" in input_name:
        return _recommend_for_href(input_name.strip(), num_results, anime_output_only)

    name = input_name.lower().strip()
    if not name:
        # An empty query would "match" every title.
        print("Error: empty search string. Please enter a show title or href.")
        return

    lowered_names = lookup_df['name'].str.lower()

    def titles_containing(fragment):
        # Literal (non-regex) substring match, so characters such as "(" or "+" in a
        # title are safe. Rows without a name never match.
        return lookup_df[lowered_names.str.contains(fragment, regex=False, na=False)]

    found_names_df = titles_containing(name)
    num_results_found = len(found_names_df)

    if num_results_found == 0:
        # Fall back to the first half, then the first quarter, of the query.
        # Empty fragments are skipped because they would match every row.
        for divisor in (2, 4):
            fragment = name[:len(name) // divisor]
            if not fragment:
                continue
            found_names_df = titles_containing(fragment)
            num_results_found = len(found_names_df)
            if num_results_found > 0:
                print(f"No exact matches.  {num_results_found} results starting the same found. Please copy/paste in href of desired entry from list or try again. \n")
                print(found_names_df[['name', 'years', 'href']])
                return
        print("No matches found starting with those characters.  Try again, focusing on the first few chars.")
        return

    if num_results_found > 1:
        print(f"{num_results_found} results found starting with your entry. Please copy/paste in href of desired entry from list or try again. \n")
        print(found_names_df[['name', 'years', 'href']])
        return

    # Exactly one candidate. Compare case-insensitively: `name` is already lowercased.
    if str(found_names_df['name'].iloc[0]).lower() == name:
        print(f'Exact match found for "{input_name}". Generating recommendations... \n')
    else:
        print(f'1 "match" found starting with "{input_name}". Generating recommendations...')

    # Read the href value directly rather than parsing the Series' string form.
    search_href = found_names_df['href'].iloc[0]

    return _recommend_for_href(search_href, num_results, anime_output_only)


# Testing out our function

In [207]:
# Demo: full-title search (case-insensitive) with results restricted to anime.
get_recommendations("dragon ball z kai", num_results=3, anime_output_only=True)

Generating Anime Output Only... 

1 "match" found starting with "dragon ball z kai". Generating recommendations...
Displaying results for: Dragon Ball Z Kai (2009–2015). href: /title/tt1409055/



Unnamed: 0,name,href,years,imdb_description,pg_rating,imdb_genre_tags,imdb_rating,num_votes,img_thumbnail,tmdb_id,...,tmdb_vote_count,first_air_date,tmdb_adult_content,tmdb_poster_path,tmdb_overview,tmdb_tagline,tmdb_genres,tv_networks,tmdb_keywords,is_anime
276,Dragon Ball Z Kai,/title/tt1409055/,(2009–2015),An HD and enhanced remaster of Dragon Ball Z.,TV-14,"Animation, Action, Adventure",8.3,23392,https://m.media-amazon.com/images/M/MV5BOWIxZG...,61709.0,...,698.0,2009-04-05,False,/mU7i4WdnBrtDKJAxU8vl41ej6Ly.jpg,Rejoin Goku and his friends in a series of cos...,,"Sci-Fi & Fantasy, Animation, Action & Adventur...",Fuji TV,"martial arts, japan, super power, shounen, anime",1
882,Ranma ½,/title/tt0096686/,(1989),A girl is involuntarily engaged to a boy who t...,TV-14,"Animation, Action, Comedy",7.9,5343,https://m.media-amazon.com/images/M/MV5BMDQ0OG...,57706.0,...,1144.0,1989-04-15,False,/lT7HqWVZoSuMKsrSiib0l6eTKXc.jpg,"Saotome Ranma, a teenage martial artist, and h...",,"Comedy, Action & Adventure, Animation, Sci-Fi ...",Fuji TV,"mixed martial arts, curse, based on manga, ani...",1
316,Blood of Zeus,/title/tt10009170/,(2020– ),"A commoner living in ancient Greece, Heron dis...",TV-MA,"Animation, Action, Adventure",7.5,19351,https://m.media-amazon.com/images/M/MV5BOWEzZm...,111111.0,...,394.0,2020-10-27,False,/zXRR5tgGLtKrRmuN4ko9SLAdCiZ.jpg,In a brewing war between the gods of Olympus a...,"When evil descends, a new myth rises.","Animation, Action & Adventure, Sci-Fi & Fantasy",Netflix,"greek mythology, gods, ancient greece, dark fa...",1


In [208]:
# Demo: partial-title search with the anime filter switched off.
get_recommendations("the last of", num_results=3, anime_output_only=False)

All results types being displayed... 

1 "match" found starting with "the last of". Generating recommendations...
Displaying results for: The Last of Us (2023– ). href: /title/tt3581920/



Unnamed: 0,name,href,years,imdb_description,pg_rating,imdb_genre_tags,imdb_rating,num_votes,img_thumbnail,tmdb_id,...,tmdb_vote_count,first_air_date,tmdb_adult_content,tmdb_poster_path,tmdb_overview,tmdb_tagline,tmdb_genres,tv_networks,tmdb_keywords,is_anime
11,The Last of Us,/title/tt3581920/,(2023– ),"After a global pandemic destroys civilization,...",TV-MA,"Action, Adventure, Drama",8.9,374882,https://m.media-amazon.com/images/M/MV5BZGUzYT...,100088.0,...,3198.0,2023-01-15,False,/uKvVjHNqB5VmOrdxqAt2F7J78ED.jpg,Twenty years after modern civilization has bee...,"When you're lost in the darkness, look for the...",Drama,HBO,"people smuggling, post-apocalyptic future, inf...",0
2318,EastEnders,/title/tt0088512/,(1985– ),The everyday lives of working-class inhabitant...,TV-PG,"Crime, Drama, Romance",4.7,8731,https://m.media-amazon.com/images/M/MV5BNTQ3Mj...,1871.0,...,188.0,1985-02-19,False,/z4jgyI5TpoRZiJTNchkVkMrGQyz.jpg,The everyday lives of working-class residents ...,,"Soap, Drama, Crime",BBC One,"london, england, british pub, east end of lond...",0
1355,Watchmen,/title/tt7049682/,(2019),Set in an alternate history where masked vigil...,TV-MA,"Crime, Drama, Mystery",8.2,124957,https://m.media-amazon.com/images/M/MV5BOWU3OD...,79788.0,...,1108.0,2019-10-20,False,/m8rWq3j73ZGhDuSCZWMMoE9ePH1.jpg,Set in an alternate history where superheroes ...,Nothing ever ends...,"Crime, Drama, Action & Adventure, Sci-Fi & Fan...",HBO,"superhero, vigilante, based on graphic novel",0


In [209]:
# # Calling our function by prompting user for query string

# user_input = input ("Enter a show title to search: ")

# get_recommendations(user_input)