## This notebook can help you debug your code

In [31]:
# Enable autoreload (mode 2) so that edits to the recommendation_system module
# are picked up before each cell runs, without restarting the kernel.
%reload_ext autoreload
%autoreload 2
from recommendation_system import User, Movie, RecommendationSystem

# Build a recommender and load the small ratings file used by the cells below.
recommender = RecommendationSystem()
recommender.load_data('tiny_ratings.csv')

In [32]:
# Inspect the ratings stored for user 1 (presumably movie_id -> rating; confirm in User).
recommender.users[1].ratings

{11: 5, 12: 3, 13: 1}

In [33]:
# Inspect the ratings stored for user 2 (presumably movie_id -> rating; confirm in User).
recommender.users[2].ratings

{11: 4, 13: 2}

In [34]:
# Users most similar to user 1. The result is a list of (id, score) pairs;
# presumably (user_id, similarity) in descending order - confirm in RecommendationSystem.
sim = recommender.calculate_top_n_similar_users(1)
sim

[(4, 1.0), (5, 1.0), (2, 0.9647638212377322), (3, 0.5738045840530311)]

In [35]:
# Same check for user 2 (rebinds `sim`). The result is a list of (id, score) pairs;
# presumably (user_id, similarity) in descending order - confirm in RecommendationSystem.
sim = recommender.calculate_top_n_similar_users(2)
sim

[(4, 1.0), (5, 1.0), (1, 0.9647638212377322), (3, 0.6139406135149205)]

In [27]:
# Recommendations for user 1.
# NOTE(review): the empty result is presumably because user 1 has already rated
# movies 11, 12 and 13 - confirm against recommend_movies.
recommender.recommend_movies(1)

[]

In [28]:
# Recommendations for user 5, returned as a list of (id, score) pairs.
recommender.recommend_movies(5)

[(12, 3.5), (13, 2.6666666666666665)]

In [30]:
# Recommendations for user 4, returned as a list of (id, score) pairs.
recommender.recommend_movies(4)

[(12, 3.5), (13, 2.6666666666666665)]

## Testing on a real dataset

In [44]:
# Preview the first lines of the movies file (columns: movieId, title, genres)
!head ~/data/ml-latest-small/movies.csv

movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action|Crime|Thriller
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children
9,Sudden Death (1995),Action


In [36]:
# The ratings file uses camelCase headers (userId, movieId); they are renamed to
# user_id / movie_id further down before the file is loaded into the recommender.
!head ~/data/ml-latest-small/ratings.csv

userId,movieId,rating,timestamp
1,1,4.0,964982703
1,3,4.0,964981247
1,6,4.0,964982224
1,47,5.0,964983815
1,50,5.0,964982931
1,70,3.0,964982400
1,101,5.0,964980868
1,110,4.0,964982176
1,151,5.0,964984041


In [37]:
# NOTE(review): imports conventionally live in the first cell of a notebook.
import pandas as pd
# Read the full ratings file into a DataFrame and check its (rows, columns) size.
df = pd.read_csv("~/data/ml-latest-small/ratings.csv")
df.shape

(100836, 4)

In [38]:
# Switch the id columns to snake_case; rename() returns a new frame, so rebind df.
df = df.rename(
    columns={
        "userId": "user_id",
        "movieId": "movie_id",
    }
)
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [39]:
# Keep only the three columns needed downstream, dropping `timestamp`.
# Selecting by name (instead of the positional slice iloc[:, :3]) does not
# silently pick the wrong columns if the column order ever changes, and the
# cell stays safe to re-run.
df = df[["user_id", "movie_id", "rating"]]
df.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [41]:
# Save the reformatted ratings. index=False keeps pandas from writing the row
# index as an extra unnamed leading column in the CSV.
# NOTE(review): assumes load_data picks columns by name - confirm.
df.to_csv("~/data/ml-latest-small/my_ratings.csv", index=False)

In [42]:
# Replace the tiny-dataset recommender with a fresh instance and load the
# reformatted ratings file written above.
recommender = RecommendationSystem()
recommender.load_data("~/data/ml-latest-small/my_ratings.csv")

In [46]:
# Let's get recommendations for user 1.
# The result is a list of (id, score) pairs; note the ids come back as floats
# (e.g. 3996.0), which is why they are cast with int() before the title lookup.
rec = recommender.recommend_movies(1)
rec

[(3996.0, 5.0), (5349.0, 5.0), (5378.0, 5.0), (5952.0, 5.0), (8636.0, 5.0)]

In [47]:
# Let's find out whether this group of movies makes sense
df_movies = pd.read_csv("~/data/ml-latest-small/movies.csv")
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [48]:
# Lookup table: movieId -> (title, genres), built column-wise in one pass.
movies = dict(
    zip(
        df_movies["movieId"],
        zip(df_movies["title"], df_movies["genres"]),
    )
)

In [49]:
# Print the (title, genres) entry of each recommended movie.
# The recommender returns ids as floats (e.g. 3996.0), so cast to int to match
# the integer keys of `movies`. The loop variables avoid the names `id` and `_`:
# at notebook scope they would shadow the builtin id() and IPython's
# last-output variable for every later cell.
for movie_id, _score in rec:
    print(movies[int(movie_id)])

('Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000)', 'Action|Drama|Romance')
('Spider-Man (2002)', 'Action|Adventure|Sci-Fi|Thriller')
('Star Wars: Episode II - Attack of the Clones (2002)', 'Action|Adventure|Sci-Fi|IMAX')
('Lord of the Rings: The Two Towers, The (2002)', 'Adventure|Fantasy')
('Spider-Man 2 (2004)', 'Action|Adventure|Sci-Fi|IMAX')


## Extra assignment for fun
Given two movies, it would be great to compute a similarity based on the rating data. Can you come up with a movie similarity metric? Can you write the code?