# Q1: How many movies were released in each year?

In [None]:
import csv
from itertools import islice
from collections import Counter

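# Alternatively, without a context manager, you can do
# f = open('imdb.csv')
# reader = csv.reader(f, delimiter='\t')
# (but then remember to call f.close() yourself when you are done)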

# Tally the number of rows per release year; the year is read from column 1
# of the tab-separated file and converted to int.
movie_counter = Counter()
with open('imdb.csv') as f:
    reader = csv.reader(f, delimiter='\t')
    next(reader, None)  # discard the first (header) row
    movie_counter.update(int(row[1]) for row in reader)

# Print one "year count" pair per line, in the order the years were first seen.
for year in movie_counter:
    print(year, movie_counter[year])


Actually, if you use `Counter` with a generator expression, you can do it with the following one-liner. ;)

In [None]:
# Count the second tab-separated field of every line after the first.
# Unlike the loop version, the keys stay strings (no int() conversion), and the
# file object created inline by open() is never explicitly closed.
Counter(map(lambda line: line.split('\t')[1], islice(open('imdb.csv'), 1, None)))

# Q2: Average ratings and votes

In [16]:
import numpy as np

# Gather the rating (column 2, as float) and the vote count (column 3, as int)
# of every data row, then report the arithmetic mean of each list.
ratings, votes = [], []
with open('imdb.csv') as f:
    reader = csv.reader(f, delimiter='\t')
    next(reader, None)  # discard the first (header) row
    for row in reader:
        ratings.append(float(row[2]))
        votes.append(int(row[3]))

for label, values in (("Average rating:", ratings),
                      ("Average number of votes:", votes)):
    print(label, np.mean(values))

Average rating: 6.29619534138
Average number of votes: 1691.2317746


If you're crazy...

In [31]:
# Same averages as above in a single expression: split each line once, build
# (rating, votes) pairs, transpose them with zip(*...), and take the mean of
# each resulting column. The inline open() is never explicitly closed.
[np.mean(column) for column in zip(*((float(fields[2]), int(fields[3])) for fields in (line.split('\t') for line in islice(open('imdb.csv'), 1, None))))]

[6.2961953413777811, 1691.2317746021706]

# Q3: Top 10 movies

In [39]:
# pprint is not a builtin; import it here so this cell runs on a fresh kernel.
from pprint import pprint

# Map each title (column 0) to its rating (column 2) and vote count (column 3).
# Titles are used as dict keys, so a later row with the same title overwrites
# the values stored for an earlier one.
movie_rating = {}
movie_vote = {}

with open('imdb.csv') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in islice(reader, 1, None):  # skip the first (header) row
        movie_rating[row[0]] = float(row[2])
        movie_vote[row[0]] = int(row[3])

# a little bit more succinct way to sort by values with an anonymous function
# (sorted() is stable, so titles with equal values keep their dict order)
pprint(sorted(movie_rating.items(), key=lambda x: x[1], reverse=True)[:10])
pprint(sorted(movie_vote.items(), key=lambda x: x[1], reverse=True)[:10])

# this is totally fine
#import operator
#sorted(movie_rating.items(), key=operator.itemgetter(1), reverse=True)

[('Adolfo Perez Esquivel: Rivers of Hope', 9.9),
 ('Girls Loving Girls', 9.8),
 ('High-Rise', 9.8),
 ('The Red Shirt Diaries', 9.8),
 ('Mari White Presents the Newsboys', 9.7),
 ('A New Born', 9.6),
 ('The Cafeteria', 9.6),
 ('Band of Brothers', 9.6),
 ('TVF Pitchers', 9.5),
 ('Planet Earth', 9.5)]
[('The Shawshank Redemption', 1511933),
 ('The Dark Knight', 1487023),
 ('Inception', 1285905),
 ('Fight Club', 1189053),
 ('Pulp Fiction', 1177471),
 ('The Lord of the Rings: The Fellowship of the Ring', 1108361),
 ('Forrest Gump', 1090430),
 ('The Lord of the Rings: The Return of the King', 1083793),
 ('The Matrix', 1083405),
 ('The Godfather', 1036276)]


In [42]:
# Top 10 (title, rating) pairs by rating, straight from the file: each line is
# split once on tabs, and the pairs are sorted in descending order of rating.
# Duplicate titles are kept here (no dict involved); the inline open() is never
# explicitly closed.
sorted(((fields[0], float(fields[2])) for fields in (line.split('\t') for line in islice(open('imdb.csv'), 1, None))), key=lambda pair: pair[1], reverse=True)[:10]

[('Adolfo Perez Esquivel: Rivers of Hope', 9.9),
 ('The Red Shirt Diaries', 9.8),
 ('Girls Loving Girls', 9.8),
 ('High-Rise', 9.8),
 ('Mari White Presents the Newsboys', 9.7),
 ('Band of Brothers', 9.6),
 ('A New Born', 9.6),
 ('The Cafeteria', 9.6),
 ('Breaking Bad', 9.5),
 ('Game of Thrones', 9.5)]

In [43]:
# Top 10 (title, votes) pairs by vote count, straight from the file: each line
# is split once on tabs, and the pairs are sorted in descending order of votes.
# The inline open() is never explicitly closed.
sorted(((fields[0], int(fields[3])) for fields in (line.split('\t') for line in islice(open('imdb.csv'), 1, None))), key=lambda pair: pair[1], reverse=True)[:10]

[('The Shawshank Redemption', 1511933),
 ('The Dark Knight', 1487023),
 ('Inception', 1285905),
 ('Fight Club', 1189053),
 ('Pulp Fiction', 1177471),
 ('The Lord of the Rings: The Fellowship of the Ring', 1108361),
 ('Forrest Gump', 1090430),
 ('The Lord of the Rings: The Return of the King', 1083793),
 ('The Matrix', 1083405),
 ('The Godfather', 1036276)]

How many of the movies in the top-10 rating list do you know? They have very high average ratings, but they may have only a very small number of votes. For instance, the first one has only 9 votes (compare that with The Shawshank Redemption, which has more than 1.5 million votes).

In [52]:
# Vote count stored in movie_vote for the title at the top of the rating list.
movie_vote['Adolfo Perez Esquivel: Rivers of Hope']

9

One very simple way to mitigate this problem is to use a [Bayesian average](http://fulmicoton.com/posts/bayesian_rating/). It 'corrects' the average rating based on a prior belief. For instance, if we assume that the default rating is about 3.2 (`m`) and give that assumption some weight (`C`, our confidence in it), then the corrected rating of a movie with $n$ votes and average rating $\bar{r}$ is $\frac{C m + n \bar{r}}{C + n}$, and we get a more meaningful list. You can play with the two parameters; you'll notice that the result is fairly robust.

In [57]:
def bayes_avg(num_rating, avg_rating, C=5, m=3.2):
    """Return the Bayesian average of a rating.

    The observed average is blended with a prior rating ``m`` that is given
    the weight of ``C`` votes: ``(C*m + num_rating*avg_rating) / (C + num_rating)``.

    Args:
        num_rating: number of votes behind the observed average.
        avg_rating: observed average rating.
        C: weight (in votes) given to the prior rating.
        m: prior rating.

    Raises:
        ZeroDivisionError: if ``C + num_rating`` is zero.
    """
    prior_mass = C * m
    observed_mass = num_rating * avg_rating
    total_weight = C + num_rating
    return (prior_mass + observed_mass) / total_weight

# Bayesian-adjusted rating for every title in movie_rating, using the vote
# count stored for the same title in movie_vote.
movie_bayes_rating = {
    title: bayes_avg(movie_vote[title], rating)
    for title, rating in movie_rating.items()
}

# The 20 titles with the highest adjusted rating, best first.
sorted(movie_bayes_rating.items(), key=lambda item: item[1], reverse=True)[:20]

[('Band of Brothers', 9.599825268376852),
 ('Game of Thrones', 9.499962219706202),
 ('Breaking Bad', 9.499956854988932),
 ('Planet Earth', 9.499611135115117),
 ('TVF Pitchers', 9.484002031488066),
 ('The Wire', 9.399808900313774),
 ('Cosmos: A Spacetime Odyssey', 9.399215110390926),
 ('Hababam Sinifi', 9.39864391951006),
 ('The Civil War', 9.393290043290044),
 ('The Beatles Anthology', 9.391899660308336),
 ('Sarabhai vs Sarabhai', 9.391766268260293),
 ('The Shawshank Redemption', 9.299979827215138),
 ('Sherlock', 9.299924929174985),
 ('CM101MMXI Fundamentals', 9.299095277645943),
 ('Pink Floyd: P. U. L. S. E. Live at Earls Court', 9.290489554100406),
 ('WARx2', 9.28621157323689),
 ('Two Down', 9.26980198019802),
 ('The Godfather', 9.199971050323223),
 ('True Detective', 9.199888282719199),
 ('The Sopranos', 9.19981641382771)]