In [19]:
import pandas as pd
import numpy as np
import pickle
import joblib
import warnings
import re

# Silence every warning for the rest of the session: no category or module
# filter is given, so this also hides pandas warnings raised by later cells.
warnings.filterwarnings('ignore')

In [20]:
# Pickled DataFrames (presumably written by an earlier training notebook).
# NOTE: pickle/joblib files can execute arbitrary code when loaded; only load
# artifacts from a trusted source.
features_df = pd.read_pickle('collab-model-features-df.pkl')
truncated_final_df = pd.read_pickle('collab-model-final-df.pkl')

# joblib artifacts: the model and the matrix whose rows are passed to
# model.kneighbors() in recommend().
model = joblib.load('collab-model.pkl')
features_matrix = joblib.load('collab-model-features-matrix.pkl')

In [21]:
# IMDb title metadata: tab-separated, first row holds the column names.
imdb = pd.read_table('imdb/title.basics.tsv', header=0)

In [22]:
# Which title types exist? Used to decide which ones count as movies below.
imdb['titleType'].unique()

array(['short', 'movie', 'tvEpisode', 'tvSeries', 'tvShort', 'tvMovie',
       'tvMiniSeries', 'tvSpecial', 'video', 'videoGame', 'tvPilot'],
      dtype=object)

In [23]:
# Keep only feature films and TV movies, and only the columns needed to build
# a "Title (Year)" lookup key.
# A single .loc selection followed by a non-inplace rename yields an
# independent frame, so later column assignments on clean_imdb do not operate
# on a slice of `imdb` (the original chained slice + inplace rename did).
clean_imdb = (
    imdb.loc[
        imdb['titleType'].isin(['movie', 'tvMovie']),
        ['tconst', 'primaryTitle', 'startYear'],
    ]
    .rename(columns={'tconst': 'imdbId'})
)

In [24]:
# startYear must be text so it can be concatenated into the search title.
clean_imdb = clean_imdb.astype({'startYear': str})

In [25]:
# Build a "Title (Year)" key, the same form the titles in truncated_final_df
# use (e.g. "Toy Story (1995)").
clean_imdb['searchTitle'] = (
    clean_imdb['primaryTitle']
    + ' ('
    + clean_imdb['startYear']
    + ')'
)

# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would silently turn this
# clean-up into a no-op.
# NOTE: the pattern removes the first character outside the allowed set
# *together with everything after it*, so affected titles are cut short
# (possibly down to a single letter or an empty string). The same pattern is
# applied to the titles from truncated_final_df so both sides stay comparable.
clean_imdb['searchTitle'] = clean_imdb['searchTitle'].str.replace(
    r"[^A-Za-z0-9 ():',-].+",
    '',
    regex=True,
)

In [26]:
# Move a trailing article to the front: "Name, The (Year)" -> "The Name (Year)"
# (presumably the source catalogue stores leading articles this way).
# The word boundary stops ", The" from matching inside ", Them", ", There",
# ", Theory", ... which the plain substring test used to corrupt.
# na=False keeps missing titles on the untouched branch.
_trailing_the = r', The\b'
truncated_final_df['Title'] = np.where(
    truncated_final_df['title'].str.contains(_trailing_the, regex=True, na=False),
    'The ' + truncated_final_df['title'].str.replace(_trailing_the, '', regex=True),
    truncated_final_df['title']
)

In [27]:
# Distinct display titles, lower-cased for case-insensitive matching
# against the IMDb search titles.
unique_movie_list = [
    movie_title.lower()
    for movie_title in truncated_final_df['Title'].drop_duplicates()
]

In [28]:
# Apply the same clean-up used for the IMDb search titles: drop the first
# character outside the allowed set and everything that follows it.
_disallowed_tail = re.compile(r"[^A-Za-z0-9 ():',-].+")
unique_movie_list = [
    _disallowed_tail.sub('', movie_title) for movie_title in unique_movie_list
]

In [29]:
# IMDb rows whose lower-cased search title appears in the cleaned title list.
_in_movie_list = clean_imdb['searchTitle'].str.lower().isin(unique_movie_list)

# Keep the id and the title (renamed to the merge key used by recommend());
# when several IMDb entries share a title, the first one wins.
imdb_df = (
    clean_imdb.loc[_in_movie_list, ['imdbId', 'searchTitle']]
    .rename(columns={'searchTitle': 'Title'})
    .drop_duplicates(subset='Title', keep='first')
)

# Pickle export is disabled; the CSV export at the end of the notebook is
# used instead.
#imdb_df.to_pickle('imdb-titles-df.pkl')
#truncated_final_df.to_pickle('collab-model-final-df-processed.pkl')

In [30]:
def recommend(title, n_movies_to_recommend=10):
    """Return nearest-neighbour recommendations for a movie title.

    Relies on the notebook-level objects ``truncated_final_df``,
    ``features_df``, ``features_matrix``, ``model`` and ``imdb_df``.

    Parameters
    ----------
    title : str
        Text to look for in ``truncated_final_df['Title']``. It is matched as
        a literal, case-insensitive substring (not as a regular expression),
        and the first matching row decides which movie is used.
    n_movies_to_recommend : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        A frame with one row per recommendation, nearest first, holding
        ``Title`` and ``Distance`` plus the columns left-merged from
        ``imdb_df`` on ``Title`` (missing where no IMDb title matched).
        If the title is blank or no movie can be resolved, the message
        string ``"No movies found. ..."`` is returned instead.
    """
    not_found = "No movies found. Please try again with a different title."

    # A blank query would match every title and silently pick the first row.
    if not isinstance(title, str) or not title.strip():
        return not_found

    # regex=False: user input such as "Toy Story (1995)" contains regex
    # metacharacters and must be compared literally.
    is_match = truncated_final_df['Title'].str.contains(
        title, case=False, regex=False, na=False
    )
    matches = truncated_final_df[is_match]
    if matches.empty:
        return not_found

    query_movie_id = matches['movieId'].iloc[0]

    # Row *position* of the movie in features_df. The neighbour indices
    # returned below are resolved with .iloc, so the query row has to be
    # positional as well rather than an index label.
    hits = (features_df['movieId'] == query_movie_id).to_numpy().nonzero()[0]
    if len(hits) == 0:
        return not_found
    query_pos = int(hits[0])

    # Ask for one extra neighbour because the query movie itself is normally
    # among the results.
    distances, indices = model.kneighbors(
        features_matrix[query_pos],
        n_neighbors=n_movies_to_recommend + 1,
    )

    # Drop the query by its index (it is not guaranteed to be the first hit
    # when other rows are equally close), then keep the closest ones.
    neighbours = sorted(
        (
            (pos, dist)
            for pos, dist in zip(indices.ravel().tolist(), distances.ravel().tolist())
            if pos != query_pos
        ),
        key=lambda pair: pair[1],
    )[:n_movies_to_recommend]

    neighbour_ids = features_df['movieId'].iloc[[pos for pos, _ in neighbours]].tolist()

    # Resolve all titles with one pass over the large frame instead of one
    # full scan per recommendation; the first row per movieId supplies the title.
    title_by_id = (
        truncated_final_df.loc[
            truncated_final_df['movieId'].isin(neighbour_ids),
            ['movieId', 'Title'],
        ]
        .drop_duplicates(subset='movieId', keep='first')
        .set_index('movieId')['Title']
    )

    # Neighbours without a title in truncated_final_df are left out.
    records = [
        {'Title': title_by_id[movie_id], 'Distance': dist}
        for movie_id, (_, dist) in zip(neighbour_ids, neighbours)
        if movie_id in title_by_id.index
    ]

    # Explicit columns keep the merge key present even when there are no rows.
    res = pd.DataFrame(records, columns=['Title', 'Distance'])
    return pd.merge(res, imdb_df, how='left', on='Title')

In [33]:
# Example query: the text is matched case-insensitively against the 'Title'
# column and the first matching movie is used as the starting point.
recommend('finding dory')

Unnamed: 0,Title,Distance,imdbId
0,Moana (2016),0.457703,tt3521164
1,Zootopia (2016),0.499123,tt2948356
2,Inside Out (2015),0.55855,tt2096673
3,Fantastic Beasts and Where to Find Them (2016),0.569504,tt3183660
4,Big Hero 6 (2014),0.569944,tt2245084
5,The Jungle Book (2016),0.570164,tt3040964
6,Frozen (2013),0.573171,tt2294629
7,Monsters University (2013),0.576619,tt1453405
8,Doctor Strange (2016),0.578088,tt1211837
9,The Secret Life of Pets (2016),0.582578,tt2709768


In [42]:
# Inspect the ratings frame, including the derived 'Title' column added above.
truncated_final_df

Unnamed: 0,userId,movieId,rating,title,AvgRating,CountRating,Title
0,8,1,4.0,Toy Story (1995),3.893708,57309,Toy Story (1995)
1,8,3,4.0,Grumpier Old Men (1995),3.142028,11804,Grumpier Old Men (1995)
2,8,6,3.0,Heat (1995),3.854909,24588,Heat (1995)
3,8,7,1.0,Sabrina (1995),3.363666,12132,Sabrina (1995)
4,8,10,4.0,GoldenEye (1995),3.421458,28265,GoldenEye (1995)
...,...,...,...,...,...,...,...
9614712,162534,192379,3.5,First Man (2018),3.639213,686,First Man (2018)
9614713,162534,193065,3.0,Roma (2018),3.744922,837,Roma (2018)
9614714,162534,193944,3.0,The Ballad of Buster Scruggs (2018),3.669110,955,The Ballad of Buster Scruggs (2018)
9614715,162534,194400,3.0,Widows (2018),3.407609,368,Widows (2018)


In [41]:
# Inspect the IMDb lookup table. Very short or empty titles in this frame are
# presumably a side effect of the title clean-up regex, which drops the first
# disallowed character together with the rest of the string.
imdb_df

Unnamed: 0,imdbId,Title
936,tt0000947,L
1037,tt0001049,G
2127,tt0002152,D
2568,tt0002595,
2700,tt0002727,B
...,...,...
8463864,tt8760684,Apollo 11 (2019)
8469033,tt8772262,Midsommar (2019)
8616017,tt9092964,Who am I
8686297,tt9243946,El Camino: A Breaking Bad Movie (2019)


In [39]:
# Also export both frames as CSV, because Streamlit appears to have issues
# reading pickled dataframes.
# NOTE(review): no `index=` argument is passed, so pandas' default applies and
# the row index is presumably written as an extra leading column — confirm
# that the consumer of these files expects it.
imdb_df.to_csv('imdb-titles-df.csv')
truncated_final_df.to_csv('collab-model-final-df-processed.csv')