### MoviEeze - Data Analysis and Model Creation
* Todd McCullough [Git](https://github.com/tamccullough)

In [1]:
import numpy as np
import pandas as pd
import heapq
from math import floor

##### Import Surprise
[Surprise](http://surpriselib.com/) is a Python scikit for building and analyzing recommender systems that deal with explicit rating data.

In [2]:
from surprise import Reader, Dataset
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate

##### Import Data

In [3]:
# Load the full ratings table.
# NOTE(review): absolute path on the author's machine - point it at your own copy of ml-25m.
ratings_full = pd.read_csv('/home/todd/Documents/v-envs/ml-25m/ratings.csv')

In [4]:
ratings_full.shape

(25000095, 4)

In [5]:
ratings_full.tail(5)

Unnamed: 0,userId,movieId,rating,timestamp
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434
25000094,162541,63876,5.0,1240952515


In [6]:
# Load the film catalogue and the free-text user tags.
# NOTE(review): absolute paths on the author's machine - point them at your own copy of ml-25m.
movies_full = pd.read_csv('/home/todd/Documents/v-envs/ml-25m/movies.csv')
tags = pd.read_csv('/home/todd/Documents/v-envs/ml-25m/tags.csv')

In [7]:
# Load the movieId -> imdbId / tmdbId mapping, later used to build themoviedb.org links.
# NOTE(review): absolute path on the author's machine - point it at your own copy of ml-25m.
links_full = pd.read_csv('/home/todd/Documents/v-envs/ml-25m/links.csv')

#### Data Analysis
dataset by [grouplens](https://grouplens.org/datasets/movielens/)

In [8]:
movies_full.head(2)

Unnamed: 0,movieId,title,genres,date
0,1.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2.0,Jumanji (1995),Adventure|Children|Fantasy,1995


In [9]:
movies_full.shape

(62425, 4)

In [10]:
tags.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [11]:
tags.shape

(1093360, 4)

In [12]:
links_full.head(2)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0


In [13]:
links_full.shape

(62423, 3)

In [14]:
# Keep only films dated 2010 or later and renumber the rows from zero.
movies = movies_full.loc[movies_full['date'] >= 2010].reset_index(drop=True)
movies.shape

(20489, 4)

In [15]:
movies.head(5)

Unnamed: 0,movieId,title,genres,date
0,73268.0,Daybreakers (2010),Action|Drama|Horror|Thriller,2010
1,73319.0,Leap Year (2010),Comedy|Romance,2010
2,73321.0,"Book of Eli, The (2010)",Action|Adventure|Drama,2010
3,73744.0,If You Love (Jos rakastat) (2010),Drama|Musical|Romance,2010
4,73929.0,Legion (2010),Action|Fantasy|Horror|Thriller,2010


In [16]:
# Films from 2010 onward that only carry the '(no genres listed)' placeholder.
no_g = movies.loc[movies['genres'].eq('(no genres listed)')]
no_g

Unnamed: 0,movieId,title,genres,date
3179,116046.0,Christmas Oranges (2012),(no genres listed),2012
3183,116126.0,Complicit (2013),(no genres listed),2013
3852,122571.0,The Wrong Woman (2013),(no genres listed),2013
3853,122573.0,Expecting Amish (2014),(no genres listed),2014
3875,122888.0,Ben-hur (2016),(no genres listed),2016
...,...,...,...,...
20469,209037.0,Our Wonderful Nature - The Common Chameleon (2...,(no genres listed),2016
20472,209051.0,Jeff Garlin: Our Man in Chicago (2019),(no genres listed),2019
20474,209063.0,The Prep School Negro (2012),(no genres listed),2012
20483,209133.0,The Riot and the Dance (2018),(no genres listed),2018


In [17]:
# User tags attached to movieId 122888, one of the films without listed genres.
no_tag = tags.loc[tags['movieId'] == 122888]
no_tag

Unnamed: 0,userId,movieId,tag,timestamp
157475,6550,122888,ancient rome,1527450145
157476,6550,122888,betrayal,1527450145
157477,6550,122888,vengeance,1527450145
324310,21096,122888,based on a book,1454332900
324311,21096,122888,revenge,1454332900
457712,46290,122888,Silvia,1488015208
711226,92018,122888,Biblical,1525755580
711227,92018,122888,Changed Ending,1525755542
711228,92018,122888,Remake,1525755500
711229,92018,122888,revenge,1525755494


In [18]:
# One row per film: its genre string plus a constant 1 that the next cell counts.
genres = movies[['genres']].assign(count=1)

In [19]:
genres

Unnamed: 0,genres,count
0,Action|Drama|Horror|Thriller,1
1,Comedy|Romance,1
2,Action|Adventure|Drama,1
3,Drama|Musical|Romance,1
4,Action|Fantasy|Horror|Thriller,1
...,...,...
20484,Animation|Documentary,1
20485,Drama,1
20486,(no genres listed),1
20487,Drama,1


In [20]:
# Number of films per distinct genre combination, most common first.
genres_g = (
    genres.groupby('genres')
    .count()
    .reset_index()
    .sort_values('count', ascending=False)
)
genres_g.head(50)

Unnamed: 0,genres,count
710,Drama,3021
688,Documentary,2593
562,Comedy,1908
0,(no genres listed),1424
585,Comedy|Drama,894
788,Horror,602
749,Drama|Romance,528
829,Thriller,500
640,Comedy|Romance,499
801,Horror|Thriller,427


In [21]:
# to get items in the list df = df[df['date'].isin(a)]
# to get the items not in the list df = df[~df['date'].isin(a)]

In [22]:
# IDs of the films that remain after cutting everything before 2010.
movie_id = movies['movieId'].to_numpy()
movie_id

array([ 73268.,  73319.,  73321., ..., 209151., 209157., 209163.])

In [None]:
# Keep only the ratings whose movieId is one of the remaining films in movie_id.
ratings = ratings_full.loc[ratings_full['movieId'].isin(movie_id)]
ratings.shape

We do not require the timestamp column from this set, and therefore it will be dropped.

In [None]:
links_full.shape

In [None]:
# Keep only the link rows whose movieId is one of the remaining films in movie_id.
links = links_full.loc[links_full['movieId'].isin(movie_id)]
links.shape

In [None]:
# Drop the unused timestamp column. drop() builds a new frame rather than mutating the
# filtered slice in place (as pop() did), and errors='ignore' keeps the cell re-runnable.
ratings = ratings.drop(columns='timestamp', errors='ignore')
ratings.head(5)

In [None]:
ratings.shape

In [None]:
# Keep only films and users that each have more than the given number of ratings.
min_movie_ratings = 250
movie_rating_counts = ratings['movieId'].value_counts()
filter_movies = movie_rating_counts[movie_rating_counts > min_movie_ratings].index.tolist()

min_user_ratings = 250
user_rating_counts = ratings['userId'].value_counts()
filter_users = user_rating_counts[user_rating_counts > min_user_ratings].index.tolist()

keep_rows = ratings['movieId'].isin(filter_movies) & ratings['userId'].isin(filter_users)
ratings = ratings[keep_rows]
ratings.shape

For the purposes of testing, it would be good to get a few users who have rated many films and to test with their user ids.

In [None]:
# Per-user row counts on the filtered ratings, heaviest raters first.
grouped = (
    ratings.groupby('userId')
    .count()
    .reset_index()
    .sort_values('rating', ascending=False)
)

In [None]:
grouped

##### Define a Ratings scale
This scale is determined by the lowest and highest rating possible. 
In this case the lowest rating is 0.5, while the highest is 5 (MovieLens ratings use half-star increments).

In [None]:
# Surprise clips its estimates to rating_scale, so the scale must span every possible rating.
# MovieLens ratings are half-star values from 0.5 to 5.0 (2.5 and 4.5 appear in the sample
# rows shown earlier), so the lower bound is 0.5 rather than 1.
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader=reader)

### Build the Recommender Model

##### KNN with Means - Surprise

[KNN with Means](https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans), a basic collaborative filtering algorithm that takes into account the mean ratings of each user, has been chosen for the recommender.

In [None]:
def build_recommender(user_based=False, sim_type='cosine'):
    """Create a new, unfitted KNNWithMeans recommender.

    Args:
        user_based: Passed through as the ``user_based`` similarity option.
        sim_type: Passed through as the ``name`` similarity option.

    Returns:
        A ``KNNWithMeans`` instance configured with those similarity options.
    """
    return KNNWithMeans(sim_options=dict(name=sim_type, user_based=user_based))

##### Calculate the Similarity Matrix

Ignoring folds, this builds the *Trainset* using [build_full_trainset()](https://surprise.readthedocs.io/en/stable/dataset.html#surprise.dataset.DatasetAutoFolds.build_full_trainset)

The Trainset is built using the data, but then contains more information about the data

In [None]:
# Build one trainset from all of `data` (no folds) and fit the recommender on it.
trainset = data.build_full_trainset()
item_based_recommender = build_recommender()  # defaults: user_based=False, sim_type='cosine'
item_based_recommender.fit(trainset)

### Evaluate the Model

Using [cross_validate()](https://surprise.readthedocs.io/en/stable/model_selection.html#cross-validation) from surprise, we can quickly evaluate the model using a few metrics. 

In [None]:
# cross_validate() calls fit() on the algorithm it receives once per fold. Evaluate a fresh,
# identically configured recommender so that item_based_recommender keeps the fit on the
# full trainset that the inference and pickling cells rely on.
cross_validate(build_recommender(), data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

### Inference

The main function to run the model and get inferences

In [None]:
# Alias that get_r() reads as the recommender to predict with.
ml_model = item_based_recommender

In [None]:
def get_r(user_id, n=2000):
    """Recommend films the user has not rated yet, with truncated predicted ratings.

    Reads the notebook-level ``ml_model``, ``ratings`` and ``movies`` objects.

    Args:
        user_id: Raw user id, matched against ``ratings['userId']`` and passed to the model.
        n: Maximum number of films to return (2000 by default, the former fixed value).

    Returns:
        A copy of the selected ``movies`` rows with an integer ``pred_rating`` column
        inserted at position 1, sorted by ``pred_rating`` in descending order.
    """
    # Due to memory constraints, the item-based model is the only viable option.
    recommender_system = ml_model

    # A set makes the "already rated?" check below constant-time per candidate.
    rated_items = set(ratings.loc[ratings['userId'] == user_id, 'movieId'].tolist())

    # Candidates are the films listed in `movies` that have at least one rating.
    known_movie_ids = movies['movieId'].values.tolist()
    all_item_ids = set(ratings.loc[ratings['movieId'].isin(known_movie_ids), 'movieId'].tolist())

    # Everything the user has not rated yet.
    new_items = [item_id for item_id in all_item_ids if item_id not in rated_items]

    # Estimate a rating for every unrated film.
    predicted_ratings = {
        item_id: recommender_system.predict(user_id, item_id).est
        for item_id in new_items
    }

    # Ids of the n films with the highest estimates.
    recommended_ids = heapq.nlargest(n, predicted_ratings, key=predicted_ratings.get)

    recommended_df = movies.loc[movies['movieId'].isin(recommended_ids)].copy()
    # Look up each row's estimate by that row's own movieId, so the result does not depend
    # on `movies` being sorted by movieId or free of duplicate ids.
    # int() truncates the estimate, keeping the integer column callers already receive.
    recommended_df.insert(
        1,
        'pred_rating',
        [int(predicted_ratings[movie_id]) for movie_id in recommended_df['movieId']],
    )
    # Sort first, then cap, so the cap can only ever remove the lowest-rated rows.
    return recommended_df.sort_values('pred_rating', ascending=False).head(n)

def cap_str(item):
    """Return ``item`` with its first character upper-cased and the rest lower-cased."""
    return item.capitalize()

def reg_frame(f_list, items):
    """Keep the rows of ``f_list`` whose ``genres`` value mentions every requested genre.

    Each term is matched as a case-insensitive literal substring. This lets ``'sci-fi'``
    find ``'Sci-Fi'`` and ``'imax'`` find ``'IMAX'``, and it means regex metacharacters in
    a term can neither raise an error nor change what is searched for.

    Args:
        f_list: DataFrame with a ``genres`` column of strings.
        items: Iterable of genre terms; surrounding whitespace is ignored.

    Returns:
        The matching rows of ``f_list``. Rows with a missing ``genres`` value never match.
    """
    genre_column = f_list['genres']
    keep = None
    for raw_term in items:
        term = raw_term.strip()
        # na=False turns missing genres into "no match" instead of an unusable NaN mask.
        hit = genre_column.str.contains(term, case=False, regex=False, na=False)
        keep = hit if keep is None else keep & hit
    if keep is None:
        # No terms were supplied, so there is nothing to filter on.
        return f_list
    return f_list[keep]

def set_up_ml(user_id, genre_list):
    """Recommend films for one user, restricted to a comma-separated list of genres.

    Args:
        user_id: User id handed to ``get_r``.
        genre_list: Comma-separated genre names, e.g. ``'comedy, horror'``.

    Returns:
        The genre-filtered recommendations from ``get_r`` without the ``date`` column.
    """
    recommended = get_r(user_id)
    wanted_genres = genre_list.split(',')
    return reg_frame(recommended, wanted_genres).drop(columns='date')

### Get a Recommendation Based on Genres
The final code that will be implemented in a cleaner fashion through the browser interface.

In [None]:
# Comma-separated genres to filter on; set_up_ml() splits this string on ','.
genre_list = 'comedy, horror'

In [None]:
user_1 = 123711  # user with a medium amount of ratings
user_2 = 15078  # user with the lowest number of ratings
user_3 = 72315  # user with a lot of ratings
# Generate a genre-filtered list of recommendations for each user, in the same order.
recommended_list_1, recommended_list_2, recommended_list_3 = (
    set_up_ml(test_user, genre_list) for test_user in (user_1, user_2, user_3)
)

In [None]:
def get_final_recommendation(list_1, list_2, list_3, links_df=None):
    """Merge three recommendation lists into one de-duplicated, linked list of films.

    Args:
        list_1, list_2, list_3: DataFrames with at least ``movieId`` and ``pred_rating``
            columns, as returned by ``set_up_ml``.
        links_df: Optional DataFrame with ``movieId`` and ``tmdbId`` columns. Defaults to
            the notebook-level ``links`` frame.

    Returns:
        A DataFrame ordered by predicted rating (highest first) with one row per film,
        without the ``movieId`` and ``pred_rating`` columns, plus a ``link`` column
        holding the themoviedb.org URL (missing when the film has no tmdbId).
    """
    if links_df is None:
        links_df = links

    # Sort before de-duplicating on movieId: a film recommended to several users with
    # different predicted ratings is kept once, at its highest rating.
    film_recommendation = (
        pd.concat([list_1, list_2, list_3])
        .sort_values('pred_rating', ascending=False, kind='stable')
        .drop_duplicates(subset='movieId', keep='first')
        .drop(columns='pred_rating')
        .reset_index(drop=True)
    )

    # One dictionary lookup per film instead of re-filtering `links_df` for every row.
    tmdb_by_movie = (
        links_df.drop_duplicates(subset='movieId')
        .set_index('movieId')['tmdbId']
        .to_dict()
    )
    tmdb_ids = [tmdb_by_movie.get(movie_id) for movie_id in film_recommendation['movieId']]
    # A film without a link row or with a NaN tmdbId gets no link instead of raising.
    film_recommendation['link'] = [
        'https://www.themoviedb.org/movie/' + str(int(tmdb_id)) if pd.notna(tmdb_id) else None
        for tmdb_id in tmdb_ids
    ]
    return film_recommendation.drop(columns='movieId')

In [None]:
# Combine the three per-user lists into one table of films with themoviedb.org links.
final_recommendation = get_final_recommendation(recommended_list_1,recommended_list_2,recommended_list_3)
final_recommendation

## Save the Model

In [None]:
import pickle
filename = 'model/moviEeze_recommender_model_2010_up.sav'
# The context manager flushes and closes the file even if pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(item_based_recommender, model_file)

In [None]:
# Save the filtered frames (without their index) under datasets/.
# NOTE(review): relative paths - presumably the datasets/ directory must already exist; confirm.
movies.to_csv('datasets/movies_2010.csv',index=False)
ratings.to_csv('datasets/ratings_2010.csv',index=False)
links.to_csv('datasets/links_2010.csv',index=False)