In [24]:
import random
import csv

class MovieRatings:
    pass
    #responsible for knowing a movie, and knowing how to handle rating math?
    def __init__(self, movie, rating_list):
        self.movie = movie
        self.rating_list = rating_list
    
    def avg(self):
        return sum([r.rating for r in self rating_list])/len(self.rating_list)
    
    

class Movie:
    def __init__(self, id, title, release_date, imdb_link):
        self.id = id
        self.title = title
        self.release_date = release_date
        self.imdb = imdb_link
    
    def __str__(self):
        return "{} {}\nreleased on: {}\nclick here for more info: {}".format(self.id, self.title, self.release_date, self.imdb)
    
    def __repr__(self):
        return "{} {}".format(self.id, self.title)
    
    

class Rating:
    def __init__(self, movie_id, user_id, rating, timestamp):
        self.movie_id = movie_id
        self.user_id = user_id
        self.rating = int(rating)
        self.timestamp = timestamp
    
    def __repr__(self):
        return "{} {} {}".format(self.movie_id, self.user_id, self.rating)

        
class User:
    @staticmethod
    def first_name():
        return random.choice(["Sam", "Nadia", "Bernadette", "Raj", "Ryan", "Josh", "Luis", "Will", "Dana", "Graham", "Eric", "Bryce", "Dan", "Nicole"])

    @staticmethod
    def middle_name():
        return random.choice(["David", "John", "Michael", "Mike", "Jim", "John", "Joe", "Nathan", "Luke", "Mark", "Gertrude", "Lois",])

    @staticmethod
    def last_name():
        return random.choice(["Smith", "Adams", "Jones", "Cooper", "Thomas", "Doe", "Dough", "Warren", "Walker"])

    
    def __init__(self, id, age, gender, occupation, zipcode):
        self.id = id
        self.name = User.first_name() + " " + User.middle_name() + " " + User.last_name()
        self.age = age
        self.gender = gender
        self.job = occupation
        self.zipcode = zipcode
    
    def __repr__(self):
        return self.name


IndentationError: expected an indented block (<ipython-input-24-fbb05d0e0c4a>, line 6)

In [2]:
#Jupyter notebook uses 
!echo item == movies
!head -2 data/ml-100k/u.item
!echo user == user
!head -2 data/ml-100k/u.user
!echo data == ratings
!head -2 data/ml-100k/u.data


item == movies
1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0
2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0
user == user
1|24|M|technician|85711
2|53|F|other|94043
data == ratings
196	242	3	881250949
186	302	3	891717742


In [17]:
movies_by_id = {}
users_by_id = {}
ratings_by_movie_id = {}
ratings_by_user_id = {}

def load_data():
    movie_filename = "data/ml-100k/u.item"
    user_filename = "data/ml-100k/u.user"
    rating_filename = "data/ml-100k/u.data"
    
    with open(movie_filename, "r", encoding="latin_1") as f:
        dict_reader = csv.DictReader(f, fieldnames=("id", "title", "release_date", "imdb_link"), delimiter="|")
        for row in dict_reader:
            del row[None]
            m = Movie(**row)
            movies_by_id[m.id] = m
    
    with open(user_filename, "r") as f:
        dict_reader = csv.DictReader(f, fieldnames=("id", "age", "gender", "occupation", "zipcode"), delimiter="|")
        for row in dict_reader:
            u = User(**row)
            users_by_id[u.id] = u

    with open(rating_filename, "r") as f:
        dict_reader = csv.DictReader(f, fieldnames=("user_id", "movie_id", "rating", "timestamp"), delimiter="\t")
        for row in dict_reader:
            r = Rating(**row)
            if r.movie_id in ratings_by_movie_id:
                ratings_by_movie_id[r.movie_id].append(r)
            else:
                ratings_by_movie_id[r.movie_id] = [r]
            
            ratings_by_user_id.setdefault(r.user_id, []).append(r)
    
#     return movies_by_id, users_by_id, ratings_by_user_id, ratings_by_movie_id
    

def average_rating_for_movie(movie):
    return average_rating_for_movie_id(movie.id)
    
def average_rating_for_movie_id(movie_id):
    return sum([r.rating for r in ratings_by_movie_id[movie_id]]) / len(ratings_by_movie_id[movie_id])

def uf_avg_rating_for_movie(movie_id):
    print("{}({}) {}".format(average_rating_for_movie(movie_id), len(ratings_by_movie_id[movie_id]), movies_by_id[movie_id].title))
    
def top_movies_by_rating(how_many_to_show, min_ratings):
    #sort by this: average_rating_for_movie(movie_id)
    return sorted([(m, average_rating_for_movie(m)) for m in movies_by_id.values() if len(ratings_by_movie_id[m.id]) >= min_ratings], key=lambda x: x[1], reverse=True)[:how_many_to_show]



def top_movies_by_rating_for_user(how_many_to_show, min_ratings, user):
    #sort by this: average_rating_for_movie(movie_id)
    unseen_movies = []
    for movie in movies_by_id.values():
        if movie.id not in [r.movie_id for r in ratings_by_user_id[user.id]]:
            if len(ratings_by_movie_id[movie.id]) >= min_ratings:
                unseen_movies.append((movie, average_rating_for_movie(movie)))
    unseen_movies.sort(key=lambda t: t[1], reverse=True)
    return unseen_movies[:how_many_to_show]


In [18]:
load_data()

In [19]:
top_movies_by_rating(20, 10)

[(408 Close Shave, A (1995), 4.491071428571429),
 (318 Schindler's List (1993), 4.466442953020135),
 (169 Wrong Trousers, The (1993), 4.466101694915254),
 (483 Casablanca (1942), 4.45679012345679),
 (114 Wallace & Gromit: The Best of Aardman Animation (1996),
  4.447761194029851),
 (64 Shawshank Redemption, The (1994), 4.445229681978798),
 (603 Rear Window (1954), 4.3875598086124405),
 (12 Usual Suspects, The (1995), 4.385767790262173),
 (50 Star Wars (1977), 4.3584905660377355),
 (178 12 Angry Men (1957), 4.344),
 (513 Third Man, The (1949), 4.333333333333333),
 (134 Citizen Kane (1941), 4.292929292929293),
 (963 Some Folks Call It a Sling Blade (1993), 4.2926829268292686),
 (427 To Kill a Mockingbird (1962), 4.292237442922374),
 (357 One Flew Over the Cuckoo's Nest (1975), 4.291666666666667),
 (98 Silence of the Lambs, The (1991), 4.28974358974359),
 (480 North by Northwest (1959), 4.284916201117318),
 (127 Godfather, The (1972), 4.283292978208232),
 (285 Secrets & Lies (1996), 4.265

In [20]:
top_movies_by_rating_for_user(20, 10, users_by_id['1'])

[(408 Close Shave, A (1995), 4.491071428571429),
 (318 Schindler's List (1993), 4.466442953020135),
 (483 Casablanca (1942), 4.45679012345679),
 (603 Rear Window (1954), 4.3875598086124405),
 (513 Third Man, The (1949), 4.333333333333333),
 (963 Some Folks Call It a Sling Blade (1993), 4.2926829268292686),
 (427 To Kill a Mockingbird (1962), 4.292237442922374),
 (357 One Flew Over the Cuckoo's Nest (1975), 4.291666666666667),
 (480 North by Northwest (1959), 4.284916201117318),
 (285 Secrets & Lies (1996), 4.265432098765432),
 (657 Manchurian Candidate, The (1962), 4.259541984732825),
 (474 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963),
  4.252577319587629),
 (479 Vertigo (1958), 4.251396648044692),
 (313 Titanic (1997), 4.2457142857142856),
 (511 Lawrence of Arabia (1962), 4.23121387283237),
 (641 Paths of Glory (1957), 4.212121212121212),
 (484 Maltese Falcon, The (1941), 4.2101449275362315),
 (515 Boot, Das (1981), 4.203980099502488),
 (488 Sunset Blvd.

In [22]:
def compare_users(u1, u2):
    
    rl1 = ratings_by_user_id[u1.id]
    rl2 = ratings_by_user_id[u2.id]
    
    print(rl1)
    print(rl2)
    

In [23]:
compare_users(users_by_id['1'], users_by_id['2'])

[61 1 4, 189 1 3, 33 1 4, 160 1 4, 20 1 4, 202 1 5, 171 1 5, 265 1 4, 155 1 2, 117 1 3, 47 1 4, 222 1 4, 253 1 5, 113 1 5, 227 1 4, 17 1 3, 90 1 4, 64 1 5, 92 1 3, 228 1 5, 266 1 1, 121 1 4, 114 1 5, 132 1 4, 74 1 1, 134 1 4, 98 1 4, 186 1 4, 221 1 5, 84 1 4, 31 1 3, 70 1 3, 60 1 5, 177 1 5, 27 1 2, 260 1 1, 145 1 2, 174 1 5, 159 1 3, 82 1 5, 56 1 4, 272 1 3, 80 1 4, 229 1 4, 140 1 1, 225 1 2, 235 1 5, 120 1 1, 125 1 3, 215 1 3, 6 1 5, 104 1 1, 49 1 3, 206 1 4, 76 1 4, 72 1 4, 185 1 4, 96 1 5, 213 1 2, 233 1 2, 258 1 5, 81 1 5, 78 1 1, 212 1 4, 143 1 1, 151 1 4, 51 1 4, 175 1 5, 107 1 4, 218 1 3, 209 1 4, 259 1 1, 108 1 5, 262 1 3, 12 1 5, 14 1 5, 97 1 3, 44 1 5, 53 1 3, 163 1 4, 210 1 4, 184 1 4, 157 1 4, 201 1 3, 150 1 5, 183 1 5, 248 1 4, 208 1 5, 128 1 4, 242 1 5, 148 1 2, 112 1 1, 193 1 4, 264 1 2, 219 1 1, 232 1 3, 236 1 4, 252 1 2, 200 1 3, 180 1 3, 250 1 4, 85 1 3, 91 1 5, 10 1 3, 254 1 1, 129 1 5, 241 1 4, 130 1 3, 255 1 2, 103 1 1, 118 1 3, 54 1 3, 267 1 4, 24 1 3, 86 1 5, 19