### MoviEeze - Data Analysis and Model Creation
* Todd McCullough [Git](https://github.com/tamccullough)

In [1]:
import heapq
import re
from math import floor

import numpy as np
import pandas as pd

##### Import Surprise
[Surprise](http://surpriselib.com/) is a Python scikit for building and analyzing recommender systems that deal with explicit rating data.

In [2]:
from surprise import Reader, Dataset
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate

In [3]:
import pickle

filename = 'model/moviEeze_recommender_model_2010_up.sav'
# NOTE(review): unpickling can execute arbitrary code; only load model files
# that come from a trusted source.
# The context manager closes the file handle once the model is loaded.
with open(filename, 'rb') as model_file:
    ml_model = pickle.load(model_file)

##### Import Data

In [4]:
# Core tables used by the recommender.
movies = pd.read_csv('datasets/movies_2010.csv')
ratings = pd.read_csv('datasets/ratings_2010.csv')
links = pd.read_csv('datasets/links_2010.csv')

# Only the 'image' column of the genre artwork table is needed, as a plain array.
genre_image_list = pd.read_csv('datasets/genre_image_list.csv')['image'].values

In [5]:
# Show the values of the 'image' column loaded from genre_image_list.csv.
genre_image_list

array(['actionadventure.jpg', 'actioncomedy.jpg',
       'actioncrimethriller.jpg', 'actiondrama.jpg', 'action.jpg',
       'actionthriller.jpg', 'adventure.jpg', 'animationaction.jpg',
       'animation.jpg', 'childrendrama.jpg', 'comedyaction.jpg',
       'comedyanimation.jpg', 'comedydrama.jpg', 'comedydramaromance.jpg',
       'comedyhorror.jpg', 'comedy.jpg', 'comedyromance.jpg',
       'crimedrama.jpg', 'crimedramathriller.jpg', 'crimethriller.jpg',
       'documentary.jpg', 'dramaaction.jpg', 'dramahorrorthriller.jpg',
       'drama.jpg', 'dramamysterythriller.jpg', 'dramaromance.jpg',
       'dramathriller.jpg', 'dramawar.jpg', 'fantasy.jpg',
       'horroraction.jpg', 'horrordrama.jpg', 'horror.jpg',
       'horrorthriller.jpg', 'romance.jpg', 'sci-fi.jpg', 'thriller.jpg'],
      dtype=object)

### Inference

The main function to run the model and get inferences

In [6]:
def get_r(user_id, top_n=2000):
    """Predict ratings for the films a user has not rated and return the best ones.

    Parameters
    ----------
    user_id
        Id matched against ``ratings['userId']`` and passed to the model's
        ``predict`` call.
    top_n : int, default 2000
        Maximum number of films to keep.

    Returns
    -------
    pandas.DataFrame
        The rows of ``movies`` for the ``top_n`` highest predicted films, with
        a ``pred_rating`` column (prediction truncated to an integer) inserted
        at position 1, sorted by ``pred_rating`` in descending order.
    """
    # Per the original author's note: due to memory constraints, the item
    # based model is the only viable option.
    recommender_system = ml_model

    # A set gives de-duplication and constant-time membership tests below.
    rated_items = set(ratings.loc[ratings['userId'] == user_id, 'movieId'].tolist())

    # Only consider rated films that are present in the movies table.
    catalogue_ids = movies['movieId'].values.tolist()
    reduced_ratings = ratings.loc[ratings['movieId'].isin(catalogue_ids)]
    all_item_ids = list(set(reduced_ratings['movieId'].tolist()))

    # Candidate films: everything the user has not rated yet.
    new_items = [item_id for item_id in all_item_ids if item_id not in rated_items]

    # Estimate a rating for every candidate.
    predicted_ratings = {
        item_id: recommender_system.predict(user_id, item_id).est
        for item_id in new_items
    }

    # Ids of the top_n highest estimates.
    recommended_ids = heapq.nlargest(top_n, predicted_ratings, key=predicted_ratings.get)

    recommended_df = movies.loc[movies['movieId'].isin(recommended_ids)].copy()
    # Attach each prediction by movieId rather than by row position, so the
    # result is correct whatever the row order of `movies` is. The cast
    # truncates the estimate to an integer.
    pred_rating = recommended_df['movieId'].map(predicted_ratings).astype(int)
    recommended_df.insert(1, 'pred_rating', pred_rating)

    return recommended_df.head(top_n).sort_values('pred_rating', ascending=False)

def cap_str(item):
    """Return ``item`` with its first character upper-cased and the rest lower-cased."""
    return item.capitalize()

In [66]:
def reg_frame(f_list, words):
    """Keep only the films whose ``genres`` text contains every requested word.

    Parameters
    ----------
    f_list : pandas.DataFrame
        Frame with a ``genres`` string column.
    words : iterable of str
        Genre names in any letter case; surrounding whitespace is ignored and
        blank entries are skipped.

    Returns
    -------
    pandas.DataFrame
        The rows of ``f_list`` that match all words.
    """
    # One look-ahead per word means every word must be present, in any order.
    # Words are escaped so user text cannot change the meaning of the pattern.
    lookaheads = ''.join(
        f'(?=.*{re.escape(word.strip())})' for word in words if word.strip()
    )
    # Matching is case-insensitive so 'sci-fi', 'Sci-Fi' and 'imax' all hit the
    # spelling stored in the data. na=False treats a missing genre as no match.
    matches = f_list['genres'].str.contains(
        fr'^\b{lookaheads}\b', regex=True, case=False, na=False
    )
    return f_list[matches]

In [8]:
def set_up_ml(user_id, genre_list):
    """Build the genre-filtered recommendation frame for one user.

    Parameters
    ----------
    user_id
        User id forwarded to ``get_r``.
    genre_list : str
        Comma-separated genre names, e.g. ``'sci-fi,comedy'``.

    Returns
    -------
    pandas.DataFrame
        The output of ``get_r`` filtered by ``reg_frame``, without the
        ``date`` column.
    """
    priority = {'comedy', 'drama', 'horror', 'thriller', 'documentary'}
    words = [word.strip() for word in genre_list.split(',')]
    # Put the priority genres first with a stable sort instead of removing and
    # re-inserting items while iterating over the same list.
    words = sorted(words, key=lambda word: word.lower() not in priority)

    film_list = get_r(user_id)
    film_list = reg_frame(film_list, words)
    # Drop the column without raising if the movies table has no 'date'.
    return film_list.drop(columns='date', errors='ignore')

In [59]:
def _tmdb_link(tmdb_id):
    """Return the themoviedb.org URL for ``tmdb_id``, or None when the id is missing."""
    if pd.isna(tmdb_id):
        return None
    return 'https://www.themoviedb.org/movie/' + str(int(tmdb_id))


def _ordered_genres(genres):
    """Split a '|'-separated genre string and move the headline genres to the front."""
    headline = ('Comedy', 'Drama', 'Horror', 'Animation')
    tokens = genres.split('|')
    # The headline genre that appears last in the string leads ...
    front = [genre for genre in tokens if genre in headline][::-1]
    # ... except Comedy, which always comes first.
    if 'Comedy' in front:
        front.remove('Comedy')
        front.insert(0, 'Comedy')
    return front + [genre for genre in tokens if genre not in headline]


def _genre_image(genres, known_images):
    """Choose the image filename for a '|'-separated genre string."""
    # Preferred name: the first one or two ordered genres, lower-cased and joined.
    image = ''.join(genre.lower() for genre in _ordered_genres(genres)[:2]) + '.jpg'
    if image in known_images:
        return image

    tokens = genres.split('|')
    popular = ('Comedy', 'Drama', 'Horror', 'Animation', 'Action', 'Romance', 'Documentary')
    # First fallback: the first popular genre, in the order stored in the data.
    for genre in tokens:
        if genre in popular:
            return genre.lower() + '.jpg'
    # Second fallback: any single-genre image that actually exists.
    for genre in tokens:
        single = genre.lower() + '.jpg'
        if single in known_images:
            return single
    # Nothing suitable was found; keep the preferred name.
    return image


def get_final_recommendation(list_1, list_2, list_3):  # combine all recommendations
    """Merge three per-user recommendation frames into one de-duplicated list.

    Parameters
    ----------
    list_1, list_2, list_3 : pandas.DataFrame
        Frames with ``movieId``, ``genres`` and ``pred_rating`` columns.

    Returns
    -------
    pandas.DataFrame
        The combined films sorted by predicted rating (highest first), without
        ``pred_rating`` and ``movieId``, with a ``link`` column (TMDB URL, or
        None when ``links`` has no usable id for the film) and an ``image``
        column (genre image filename). Duplicate rows are removed.
    """
    film_recommendation = (
        pd.concat([list_1, list_2, list_3])
        .sort_values('pred_rating', ascending=False)
        .drop(columns='pred_rating')
        .reset_index(drop=True)
    )

    # Build the movieId -> id lookup once instead of filtering `links` per row.
    # As before, the id is read from the third column of `links` and the first
    # row for each movieId wins.
    first_links = links.drop_duplicates(subset='movieId')
    tmdb_by_movie = dict(zip(first_links['movieId'], first_links.iloc[:, 2]))
    known_images = set(genre_image_list)

    film_recommendation['link'] = [
        _tmdb_link(tmdb_by_movie.get(movie_id))
        for movie_id in film_recommendation['movieId']
    ]
    film_recommendation['image'] = [
        _genre_image(genres, known_images)
        for genres in film_recommendation['genres']
    ]

    # Several users can be recommended the same film; keep a single copy.
    return (
        film_recommendation
        .drop(columns='movieId')
        .drop_duplicates()
        .reset_index(drop=True)
    )

### Get a Recommendation Based on Genres
The final code, which will be implemented in a cleaner fashion through the browser interface.

In [48]:
# Keep only the 'users' column of users.csv, as a plain array.
users = pd.read_csv('datasets/users.csv')['users'].values

In [50]:
# Show the array of values read from the 'users' column.
users

array([   187,    548,    606, ..., 162286, 162334, 162516])

In [51]:
import random

In [60]:
# Draw one entry of `users` at random; no seed is set, so the value changes per run.
random.choice(users)

60778

In [61]:
# Requested genres as one comma-separated string; set_up_ml splits it on ','.
genre_list = 'sci-fi'

In [70]:
# Draw three entries of `users` at random, one after another.
user_1, user_2, user_3 = (random.choice(users) for _ in range(3))

In [71]:
# Generate one genre-filtered recommendation frame per selected user.
recommended_list_1, recommended_list_2, recommended_list_3 = (
    set_up_ml(user, genre_list) for user in (user_1, user_2, user_3)
)

In [72]:
# Combine the three per-user frames into a single frame with link and image columns.
final_recommendation = get_final_recommendation(recommended_list_1,recommended_list_2,recommended_list_3)

In [73]:
# Display the combined recommendation frame.
final_recommendation

Unnamed: 0,title,genres,link,image
0,Avengers: Infinity War - Part II (2019),Action|Adventure|Sci-Fi,https://www.themoviedb.org/movie/299534,actionadventure.jpg
1,Spider-Man: Into the Spider-Verse (2018),Action|Adventure|Animation|Sci-Fi,https://www.themoviedb.org/movie/324857,animationaction.jpg
2,Blade Runner 2049 (2017),Sci-Fi,https://www.themoviedb.org/movie/335984,sci-fi.jpg
3,Upgrade (2018),Action|Comedy|Horror|Sci-Fi|Thriller,https://www.themoviedb.org/movie/500664,comedyhorror.jpg
4,Ant-Man and the Wasp (2018),Action|Adventure|Comedy|Fantasy|Sci-Fi,https://www.themoviedb.org/movie/363088,comedyaction.jpg
...,...,...,...,...
188,Predators (2010),Action|Sci-Fi|Thriller,https://www.themoviedb.org/movie/34851,action.jpg
189,Transformers: Age of Extinction (2014),Action|Adventure|Sci-Fi,https://www.themoviedb.org/movie/91314,actionadventure.jpg
190,Divergent (2014),Adventure|Romance|Sci-Fi|IMAX,https://www.themoviedb.org/movie/157350,romance.jpg
191,Resident Evil: Afterlife (2010),Action|Horror|Sci-Fi|Thriller|IMAX,https://www.themoviedb.org/movie/35791,horroraction.jpg
