# Recommender systems - practical exercise 1
non-personalized recommendations

## program

In [252]:
import pandas as pd
from functools import lru_cache

In [270]:
def read_dat_files(data_dir="/ml-1m"):
    """Load the ratings, movies and users tables of the MovieLens-1M dump.

    Parameters
    ----------
    data_dir : str, default "/ml-1m"
        Directory holding ``ratings.dat``, ``movies.dat`` and ``users.dat``.

    Returns
    -------
    (R, I, U) : tuple of pandas.DataFrame
        R -- ratings with columns userID, movieID, rating (timestamp dropped).
        I -- movies with columns movieID, title, genre.
        U -- users with columns userID, gender, age, occupation, zip.
    """
    def _read(file_name, names):
        # The files have no header row and use the multi-character separator
        # "::", which only the python parsing engine supports.
        return pd.read_csv(f"{data_dir}/{file_name}", sep="::", engine="python", names=names)

    R = _read("ratings.dat", ["userID", "movieID", "rating", "timestamp"]).drop(columns="timestamp")
    I = _read("movies.dat", ["movieID", "title", "genre"])
    # Explicit names keep the first user record from being consumed as a header.
    # NOTE(review): column names follow the MovieLens-1M README -- confirm
    # against the local users.dat.
    U = _read("users.dat", ["userID", "gender", "age", "occupation", "zip"])
    return R, I, U

In [271]:
# R = ratings, I = items (movies), U = users.
R, I, U = read_dat_files()
I  # display the movies table

Unnamed: 0,movieID,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [255]:
@lru_cache
def calculate_simple_association(movie_id_1,movie_id_2):
    """Simple association of movie_id_2 (Y) with movie_id_1 (X).

    Joins the ratings of X with the ratings of Y on userID and divides the
    number of joined rows by the number of ratings of X. Returns 0 when X has
    no ratings. Reads the module-level ratings frame ``R``; results are
    memoised per (movie_id_1, movie_id_2) pair.
    """
    ratings_x = R.loc[R.movieID == movie_id_1]
    n_x = len(ratings_x)
    if n_x == 0:
        # Nothing to condition on: avoid dividing by zero.
        return 0

    ratings_y = R.loc[R.movieID == movie_id_2]
    rated_both = ratings_x.merge(ratings_y, how="inner", on="userID")
    return len(rated_both) / n_x

In [296]:
@lru_cache
def calculate_advanced_association(movie_id_1,movie_id_2):
    """Advanced association of movie_id_2 (Y) with movie_id_1 (X).

    Computes ``simple(X, Y) * (|not X| / |not X and Y|)``, where ``|not X|``
    is the number of users in ``R`` who did not rate X and ``|not X and Y|``
    the number of those users who did rate Y.

    Returns 0 when no user without X rated Y, because the ratio is undefined
    in that case. Reads the module-level ratings frame ``R``; results are
    memoised per (movie_id_1, movie_id_2) pair.
    """
    # Set arithmetic on the user ids gives the same counts as visiting every
    # user's rated-movie set, without a Python-level loop over all users.
    users_x = set(R.loc[R.movieID == movie_id_1, "userID"])
    users_y = set(R.loc[R.movieID == movie_id_2, "userID"])

    not_x = R.userID.nunique() - len(users_x)
    not_x_and_y = len(users_y - users_x)
    if not_x_and_y == 0:
        # Guard the division below.
        return 0
    return calculate_simple_association(movie_id_1,movie_id_2)*(not_x/not_x_and_y)

In [297]:
calculate_simple_association(661,1091)

0.15047619047619049

In [299]:
calculate_advanced_association(661,1091)

2.8227081308713964

In [305]:
def highest_simple(movie_id,n):
    """Return the n movies with the highest simple association to movie_id.

    Every other movie that occurs in ``R`` is scored with
    ``calculate_simple_association(movie_id, other)``. Ties are broken by
    descending movie ID. The top n rows are inner-joined with the movie table
    ``I``, so a movie missing from ``I`` is dropped from the result.

    Returns a DataFrame with columns movieID, simple, plus the columns of I.
    Raises KeyError if movie_id does not occur in ``R``.
    """
    candidates = set(R.movieID)
    candidates.remove(movie_id)
    scored = [(other, calculate_simple_association(movie_id, other)) for other in candidates]
    scored.sort(key=lambda pair: (-pair[1], -pair[0]))
    # Column labels must be an ordered list: a set has no defined order (the
    # two labels could be swapped) and recent pandas versions reject it.
    top = pd.DataFrame(scored[:n], columns=["movieID", "simple"])
    return pd.merge(top, I, how="inner", on="movieID")

In [306]:
highest_simple(661,10)

Unnamed: 0,movieID,simple,title,genre
0,480,0.75619,Jurassic Park (1993),Action|Adventure|Sci-Fi
1,1580,0.750476,Men in Black (1997),Action|Adventure|Comedy|Sci-Fi
2,1196,0.731429,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
3,1,0.729524,Toy Story (1995),Animation|Children's|Comedy
4,260,0.725714,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
5,2858,0.71619,American Beauty (1999),Comedy|Drama
6,589,0.710476,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
7,1210,0.693333,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
8,2571,0.681905,"Matrix, The (1999)",Action|Sci-Fi|Thriller
9,1097,0.68,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi


In [307]:
def highest_advanced(movie_id,n):
    """Return the n movies with the highest advanced association to movie_id.

    Every other movie that occurs in ``R`` is scored with
    ``calculate_advanced_association(movie_id, other)``. Ties are broken by
    descending movie ID. The top n rows are inner-joined with the movie table
    ``I``, so a movie missing from ``I`` is dropped from the result.

    Returns a DataFrame with columns movieID, simple, plus the columns of I.
    Raises KeyError if movie_id does not occur in ``R``.
    """
    candidates = set(R.movieID)
    candidates.remove(movie_id)
    scored = [(other, calculate_advanced_association(movie_id, other)) for other in candidates]
    scored.sort(key=lambda pair: (-pair[1], -pair[0]))
    # Column labels must be an ordered list: a set has no defined order (the
    # two labels could be swapped) and recent pandas versions reject it.
    # NOTE(review): the value column holds the *advanced* association but is
    # still labelled "simple" so the returned frame keeps its existing shape;
    # consider renaming it to "advanced".
    top = pd.DataFrame(scored[:n], columns=["movieID", "simple"])
    return pd.merge(top, I, how="inner", on="movieID")

In [292]:
# NOTE(review): the ZeroDivisionError recorded below appears to be stale output
# from an earlier run (In [292]); calculate_advanced_association returns 0 when
# its divisor is 0. Re-run this cell to refresh the output -- TODO confirm.
highest_advanced(661,10)

ZeroDivisionError: division by zero

In [289]:
def get_most_rated(n):
    """Return the n most frequently rated movies.

    Counts the rows of ``R`` per movieID, keeps the n largest counts and
    inner-joins them with the movie table ``I``.

    Returns a DataFrame with columns movieID, freq, plus the columns of I.
    """
    # rename_axis + reset_index(name=...) labels the two columns explicitly,
    # so the result does not depend on how the installed pandas version names
    # the output of value_counts().
    counts = (
        R.movieID.value_counts()
        .head(n)
        .rename_axis("movieID")
        .reset_index(name="freq")
    )
    return pd.merge(counts, I, how="inner", on="movieID")

get_most_rated(10)

Unnamed: 0,movieID,freq,title,genre
0,2858,3428,American Beauty (1999),Comedy|Drama
1,260,2991,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
2,1196,2990,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
3,1210,2883,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
4,480,2672,Jurassic Park (1993),Action|Adventure|Sci-Fi
5,2028,2653,Saving Private Ryan (1998),Action|Drama|War
6,589,2649,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
7,2571,2590,"Matrix, The (1999)",Action|Sci-Fi|Thriller
8,1270,2583,Back to the Future (1985),Comedy|Sci-Fi
9,593,2578,"Silence of the Lambs, The (1991)",Drama|Thriller


In [288]:
def get_most_rated_with_least_rating(n,least=4):
    """Return the n movies with the most ratings of at least ``least``.

    Keeps only the rows of ``R`` whose rating is >= least, counts them per
    movieID, keeps the n largest counts and inner-joins them with the movie
    table ``I``.

    Returns a DataFrame with columns movieID, freq, plus the columns of I.
    """
    # rename_axis + reset_index(name=...) labels the two columns explicitly,
    # so the result does not depend on how the installed pandas version names
    # the output of value_counts().
    counts = (
        R.loc[R.rating >= least, "movieID"]
        .value_counts()
        .head(n)
        .rename_axis("movieID")
        .reset_index(name="freq")
    )
    return pd.merge(counts, I, how="inner", on="movieID")

get_most_rated_with_least_rating(10,4)

Unnamed: 0,movieID,freq,title,genre
0,2858,2853,American Beauty (1999),Comedy|Drama
1,260,2622,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
2,1196,2510,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
3,1198,2260,Raiders of the Lost Ark (1981),Action|Adventure
4,2028,2260,Saving Private Ryan (1998),Action|Drama|War
5,593,2252,"Silence of the Lambs, The (1991)",Drama|Thriller
6,2571,2171,"Matrix, The (1999)",Action|Sci-Fi|Thriller
7,2762,2163,"Sixth Sense, The (1999)",Thriller
8,1210,2127,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
9,608,2074,Fargo (1996),Crime|Drama|Thriller


# Analysis

Consider the movie with ID 1. What is the value of the simple product association for the
movie with ID 1064?

In [228]:
calculate_simple_association(1,1064)

0.08281174771304767

Consider the movie with ID 1. What is the value of the advanced product association for the
movie with ID 1064?

In [245]:
calculate_advanced_association(1,1064)

8.869809626670484

Explain the difference between the values of the simple and advanced product association
(question 1 and 2). How do we have to interpret these numbers?

Answer:

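- Simple association is an asymmetric measure: the fraction of the people who bought (here: rated) X who also bought Y, i.e. $\frac{|X \cap Y|}{|X|}$, an estimate of $P(Y \mid X)$. X is something that occurred; what is now the probability that Y occurs? The value 0.083 for movies 1 and 1064 means that about 8% of the users who rated movie 1 also rated movie 1064.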

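- Advanced product association measures whether X makes Y more likely than it is without X: it divides the share of X-users who also have Y by the share of non-X-users who have Y, i.e. $\frac{|X \cap Y| / |X|}{|\lnot X \cap Y| / |\lnot X|}$. A value above 1 means X makes Y more likely, a value below 1 means X makes Y less likely. The value 8.87 for movies 1 and 1064 means that users who rated movie 1 are almost nine times as likely to have rated movie 1064 as users who did not rate movie 1.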

Consider the movie with ID 1. What is the value of the simple product association for the
movie with ID 2858?

In [230]:
calculate_simple_association(1,2858)

0.6783822821376986

What are the movie titles and genres of movies with ID 1, 1064, and 2858?

In [322]:
# Look up the titles and genres of the movies discussed in this analysis.
I[I.movieID.isin([1, 1064, 2858, 3941])]

Unnamed: 0,movieID,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1050,1064,Aladdin and the King of Thieves (1996),Animation|Children's|Comedy
2789,2858,American Beauty (1999),Comedy|Drama
3871,3941,Sorority House Massacre (1986),Horror


Compare the results of question 1 and 4. Which movie has the highest simple association
value (1064 or 2858) with movie ID 1 and explain why?

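- answer: The movie with id 2858, American Beauty (1999), has the higher simple association (0.678 versus 0.083). This is because the term "X and Y" is higher: far more of the users who rated movie 1 also rated American Beauty. This means that if people rated Toy Story, they very often rated American Beauty as well. This is largely because American Beauty is the most frequently rated movie in the dataset (3428 ratings), not because the two movies are similar.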

Consider the movie with ID 1. What is the value of the advanced product association for the
movie with ID 2858?

In [273]:
calculate_advanced_association(1,2858)

1.331564628089004

Compare the results of question 2 and 7. Which movie has the highest advanced association
value (1064 or 2858) with movie ID 1 and explain why?

Calculate the top 10 most frequently rated movies. Provide the movie ID, number of users
who rated the movie, and title for each.

In [320]:
get_most_rated(10)

Unnamed: 0,movieID,freq,title,genre
0,2858,3428,American Beauty (1999),Comedy|Drama
1,260,2991,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
2,1196,2990,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
3,1210,2883,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
4,480,2672,Jurassic Park (1993),Action|Adventure|Sci-Fi
5,2028,2653,Saving Private Ryan (1998),Action|Drama|War
6,589,2649,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
7,2571,2590,"Matrix, The (1999)",Action|Sci-Fi|Thriller
8,1270,2583,Back to the Future (1985),Comedy|Sci-Fi
9,593,2578,"Silence of the Lambs, The (1991)",Drama|Thriller


Now you have to find the movies that can be associated to the movie with id 3941. Calculate
the Top 5 movies with the highest simple association value. Provide the movie ID,
association value, and title for each.

In [308]:
highest_simple(3941,5)

Unnamed: 0,movieID,simple,title,genre
0,110,0.954545,Braveheart (1995),Action|Drama|War
1,480,0.909091,Jurassic Park (1993),Action|Adventure|Sci-Fi
2,589,0.863636,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
3,1544,0.818182,"Lost World: Jurassic Park, The (1997)",Action|Adventure|Sci-Fi|Thriller
4,260,0.818182,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi


For the same movie as in question 10 (id 3941), calculate the Top 5 movies with the highest
advanced association value. Provide the movie ID, association value, and title for each.

In [309]:
highest_advanced(3941,5)

Unnamed: 0,movieID,simple,title,genre
0,3940,547.090909,"Slumber Party Massacre III, The (1990)",Horror
1,3938,465.027273,"Slumber Party Massacre, The (1982)",Horror
2,3939,437.672727,"Slumber Party Massacre II, The (1987)",Horror
3,3942,422.752066,Sorority House Massacre II (1990),Horror
4,3777,156.311688,Nekromantik (1987),Comedy|Horror


Compare the resulting lists of question 9 and 10. What do you witness and how can you
explain this?

Compare the resulting lists of question 10 and 11. Which one is the best (=most accurate)
according to you, and why?

Recalculate the Top 10 most frequently rated movies. But use only ratings of at least 4 stars.
Provide the movie ID, number of users who rated the movie, and title for each.

In [321]:
get_most_rated_with_least_rating(10,4)

Unnamed: 0,movieID,freq,title,genre
0,2858,2853,American Beauty (1999),Comedy|Drama
1,260,2622,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
2,1196,2510,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
3,1198,2260,Raiders of the Lost Ark (1981),Action|Adventure
4,2028,2260,Saving Private Ryan (1998),Action|Drama|War
5,593,2252,"Silence of the Lambs, The (1991)",Drama|Thriller
6,2571,2171,"Matrix, The (1999)",Action|Sci-Fi|Thriller
7,2762,2163,"Sixth Sense, The (1999)",Thriller
8,1210,2127,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
9,608,2074,Fargo (1996),Crime|Drama|Thriller


Compare the resulting lists of question 9 and 14. What do you witness and how can you
explain this?