# Q1: how many movies released in each year?

In [10]:
import csv
from itertools import islice
from collections import Counter
from pprint import pprint

# You can do 
# f = open('imdb.csv')
# reader = csv.reader(f, delimimter='\t')

movie_counter = Counter()
with open('imdb.csv') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in islice(reader, 1, None):
        movie_counter[int(row[1])] += 1

for year, count in movie_counter.items():
    # print(year, count)
    pass


Actually if you use `Counter` and list magic, you can do it with the following one-liner. ;)

In [11]:
Counter(x.split('\t')[1] for x in islice(open('imdb.csv'), 1, None))

Counter({'2011': 13944, '2012': 13887, '2013': 13048, '2010': 12931, '2009': 12268, '2008': 11095, '2014': 10862, '2007': 10147, '2006': 10115, '2005': 9508, '2004': 8584, '2003': 7355, '2002': 6694, '2001': 6042, '2000': 5575, '1999': 5138, '1998': 4651, '2015': 4402, '1997': 4353, '1996': 3923, '1995': 3698, '1994': 3415, '1989': 3193, '1992': 3136, '1993': 3128, '1990': 3093, '1988': 3054, '1987': 3049, '1991': 2993, '1985': 2908, '1986': 2882, '1984': 2779, '1983': 2647, '1982': 2537, '1979': 2526, '1981': 2485, '1972': 2445, '1980': 2438, '1976': 2399, '1974': 2392, '1978': 2386, '1971': 2370, '1973': 2325, '1969': 2320, '1975': 2286, '1977': 2264, '1970': 2240, '1968': 2199, '1967': 2086, '1966': 2025, '1965': 1896, '1964': 1823, '1962': 1669, '1963': 1635, '1961': 1623, '1957': 1604, '1959': 1572, '1960': 1567, '1958': 1533, '1956': 1479, '1955': 1476, '1954': 1397, '1953': 1393, '1951': 1318, '1952': 1316, '1950': 1283, '1937': 1245, '1936': 1235, '1938': 1230, '1949': 1208, '1

# Q2: Average ratings and votes

In [12]:
import numpy as np

ratings = []
votes = []
with open('imdb.csv') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in islice(reader, 1, None):
        ratings.append(float(row[2]))
        votes.append(int(row[3]))

print("Average rating:", np.mean(ratings))
print("Average number of votes:", np.mean(votes))

Average rating: 6.29619534138
Average number of votes: 1691.2317746


If you're crazy...

In [13]:
list(map(np.mean, (zip(*[(float(x.split('\t')[2]), int(x.split('\t')[3])) for x in islice(open('imdb.csv'), 1, None)]))))

[6.2961953413777811, 1691.2317746021706]

# Q3: Top 10 movies

In [14]:
movie_rating = {}
movie_vote = {}

with open('imdb.csv') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in islice(reader, 1, None):
        movie_rating[row[0]] = float(row[2])
        movie_vote[row[0]] = int(row[3])
        
# a little bit more succinct way to sort by values with an anonymous function
pprint(sorted(movie_rating.items(), key=lambda x: x[1], reverse=True)[:10])
pprint(sorted(movie_vote.items(), key=lambda x: x[1], reverse=True)[:10])

# this is totally fine
#import operator
#sorted(movie_rating.items(), key=operator.itemgetter(1), reverse=True)

[('Adolfo Perez Esquivel: Rivers of Hope', 9.9),
 ('Girls Loving Girls', 9.8),
 ('High-Rise', 9.8),
 ('The Red Shirt Diaries', 9.8),
 ('Mari White Presents the Newsboys', 9.7),
 ('The Cafeteria', 9.6),
 ('A New Born', 9.6),
 ('Band of Brothers', 9.6),
 ('Breaking Bad', 9.5),
 ('TVF Pitchers', 9.5)]
[('The Shawshank Redemption', 1511933),
 ('The Dark Knight', 1487023),
 ('Inception', 1285905),
 ('Fight Club', 1189053),
 ('Pulp Fiction', 1177471),
 ('The Lord of the Rings: The Fellowship of the Ring', 1108361),
 ('Forrest Gump', 1090430),
 ('The Lord of the Rings: The Return of the King', 1083793),
 ('The Matrix', 1083405),
 ('The Godfather', 1036276)]


In [15]:
sorted([(x.split('\t')[0], float(x.split('\t')[2])) for x in islice(open('imdb.csv'), 1, None)], key=lambda x: x[1], reverse=True)[:10]

[('Adolfo Perez Esquivel: Rivers of Hope', 9.9),
 ('The Red Shirt Diaries', 9.8),
 ('Girls Loving Girls', 9.8),
 ('High-Rise', 9.8),
 ('Mari White Presents the Newsboys', 9.7),
 ('Band of Brothers', 9.6),
 ('A New Born', 9.6),
 ('The Cafeteria', 9.6),
 ('Breaking Bad', 9.5),
 ('Game of Thrones', 9.5)]

In [16]:
sorted([(x.split('\t')[0], int(x.split('\t')[3])) for x in islice(open('imdb.csv'), 1, None)], key=lambda x: x[1], reverse=True)[:10]

[('The Shawshank Redemption', 1511933),
 ('The Dark Knight', 1487023),
 ('Inception', 1285905),
 ('Fight Club', 1189053),
 ('Pulp Fiction', 1177471),
 ('The Lord of the Rings: The Fellowship of the Ring', 1108361),
 ('Forrest Gump', 1090430),
 ('The Lord of the Rings: The Return of the King', 1083793),
 ('The Matrix', 1083405),
 ('The Godfather', 1036276)]

How many movies in the top 10 rating list do you know? They have very high average rating but they may have a very small number of votes. For instance, the first one only has 9 ratings. (compare with The Shawshank Redemption with more than 1.5 million votes). 

In [17]:
movie_vote['Adolfo Perez Esquivel: Rivers of Hope']

9

One very simple way to mitigate this problem is using [Bayesian average](http://fulmicoton.com/posts/bayesian_rating/). It 'corrects' the average star rating based on prior belief. For instance, if we assume that the default rating is about 3.2 (`m`) and we have some confidence (`C`), then we can get more meaningful list. You can play with the two parameters. You'll notice that it's fairly robust. 

In [18]:
def bayes_avg(num_rating, avg_rating, C=5, m=3.2):
    return (C*m + num_rating*avg_rating)/(C + num_rating)

movie_bayes_rating = {}
for movie in movie_rating:
    movie_bayes_rating[movie] = bayes_avg(movie_vote[movie], movie_rating[movie])
    
sorted(movie_bayes_rating.items(), key=lambda x: x[1], reverse=True)[:20]

[('Band of Brothers', 9.599825268376852),
 ('Game of Thrones', 9.499962219706202),
 ('Breaking Bad', 9.499956854988932),
 ('Planet Earth', 9.499611135115117),
 ('TVF Pitchers', 9.484002031488066),
 ('The Wire', 9.399808900313774),
 ('Cosmos: A Spacetime Odyssey', 9.399215110390926),
 ('Hababam Sinifi', 9.39864391951006),
 ('The Civil War', 9.393290043290044),
 ('The Beatles Anthology', 9.391899660308336),
 ('Sarabhai vs Sarabhai', 9.391766268260293),
 ('The Shawshank Redemption', 9.299979827215138),
 ('Sherlock', 9.299924929174985),
 ('CM101MMXI Fundamentals', 9.299095277645943),
 ('Pink Floyd: P. U. L. S. E. Live at Earls Court', 9.290489554100406),
 ('WARx2', 9.28621157323689),
 ('Two Down', 9.26980198019802),
 ('The Godfather', 9.199971050323223),
 ('True Detective', 9.199888282719199),
 ('The Sopranos', 9.19981641382771)]

# Q4: Median ratings of movies in each decade 

In [24]:
from collections import defaultdict

dec_ratings = defaultdict(list)

with open('imdb.csv') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in islice(reader, 1, None):
        decade = row[1][:3] + '0s'
        rating = float(row[2])
        dec_ratings[decade].append(rating)
        
for decade in sorted(dec_ratings):
    print(decade, np.median(dec_ratings[decade]))

1870s 6.9
1880s 5.7
1890s 4.6
1900s 5.1
1910s 5.7
1920s 6.1
1930s 6.1
1940s 6.3
1950s 6.2
1960s 6.3
1970s 6.3
1980s 6.3
1990s 6.3
2000s 6.6
2010s 6.8


# Q5: 5 movies with highest ratings in each decade

In [31]:
dec_movie_ratings = defaultdict(set)

with open('imdb.csv') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in islice(reader, 1, None):
        decade = row[1][:3] + '0s'
        title = row[0]
        rating = float(row[2])
        dec_movie_ratings[decade].add( (title, rating) )
        
for decade in sorted(dec_movie_ratings):
    print('#', decade)
    for movie, rating in sorted(dec_movie_ratings[decade], key=lambda x: x[1], reverse=True)[:5]:
        print(rating, movie)


# 1870s
7.3 Sallie Gardner at a Gallop
6.5 Passage de Venus
# 1880s
7.7 Roundhay Garden Scene
7.2 Traffic Crossing Leeds Bridge
6.8 Leisurely Pedestrians, Open Topped Buses and Hansom Cabs with Trotting Horses
5.7 Accordion Player
5.7 Brighton Street Scene
# 1890s
8.3 Tossing a Nigger in a Blanket
7.8 Dancing Darkies
7.6 La lune à un mètre
7.6 Un homme de têtes
7.6 Yale Athletes Broad Jumping
# 1900s
8.4 Another Picture Showing Demonstration of a Pneumatic Shell Riveter
8.4 Battle Royal
8.4 The Laughing Nigger
8.3 Halloween
8.3 Your Dog Ate My Lunch Mum
# 1910s
8.9 Victorious Serbia
8.8 Unfaithful
8.8 Lincoln's Gettysburg Address
8.8 A Régiséggyüjtö
8.8 Dead Man's Shoes
# 1920s
8.9 Perekop
8.9 Hot Doggie
8.8 Balto's Race to Nome
8.7 Scrooge
8.6 A Briny Boob
# 1930s
9.3 Screen Snapshots Series 16, No. 8
9.2 Screen Snapshots Series 15, No. 10
9.2 Screen Snapshots Series 17, No. 5
9.1 Screen Snapshots Series 16, No. 1
9.1 Screen Snapshots Series 17, No. 9
# 1940s
8.9 Meet the Stars #7: Me