In [None]:
import pandas as pd

# Housekeeping: find out what is the current directory and adjust the path to the data file
# import os
# cwd = os.getcwd()
# print ('working_directory = ' + cwd)

# Read the IMDB top-100 CSV into the pandas dataframe `df`
df = pd.read_csv('imdb_100.csv')

# Preview the first five rows to get a feel for the columns
df.head(5)

In [None]:
# API key helpers

def get_api_key():
    """
    Read the OMDB API key from the first line of config.txt in the current working directory.

    The key is kept in a file rather than in the notebook so that it is not exposed in the code.
    Surrounding whitespace (including the trailing newline that readline() keeps) is stripped,
    so the returned value can be embedded in a URL as-is.

    Returns:
        str: the API key; never empty.

    Raises:
        Exception: if the first line of config.txt is empty or contains only whitespace.
        OSError: if config.txt cannot be opened (propagated from open()).
    """
    with open('config.txt') as f:
        api_key = f.readline().strip()
    if not api_key:
        raise Exception("please configure OMDB API key in the config.txt file")
    return api_key

# Read the key once when this cell runs; the OMDB request loop further down uses this module-level name.
api_key=get_api_key()

In [None]:
# helper functions and objects to represent movies and convert the OMDB data

def extract_rt_rating_percent(movie, ratings):
    """
    Return the Rotten Tomatoes score from the 'Ratings' portion of an OMDB result.

    Args:
        movie: only used (through str()) to identify the movie in the error message.
        ratings: iterable of rating dicts {"Source": ..., "Value": ...}; may be empty or None.

    Returns:
        float: the value of the first rating whose Source equals 'Rotten Tomatoes'
        (case-insensitive) and whose Value parses as '<float>%' (the % sign is optional).
        None: if no rating has that source, or if no matching value can be parsed
        (an error message is printed for every value that fails to parse).
    """
    wanted_source = 'Rotten Tomatoes'.lower()
    # `ratings or []` makes an empty or missing list fall through to the final `return None`.
    for rating in ratings or []:
        # print ('Source = {}, Value = {}'.format(rating['Source'], rating['Value']))
        if str(rating.get('Source', '')).lower() != wanted_source:
            continue
        try:
            return float(rating['Value'].replace('%', ''))
        except (KeyError, TypeError, AttributeError, ValueError):
            # Keep scanning: a later entry with the same source may still be parseable.
            print ('Unable to extract Roten Tomatoes rating for movie ' + str(movie))
    return None

def extract_imdb_rating_percent(movie, ratings):
    """
    Return the IMDB rating, as a percentage, from the 'Ratings' portion of an OMDB result.

    Args:
        movie: only used (through str()) to identify the movie in the error message.
        ratings: iterable of rating dicts {"Source": ..., "Value": ...}; may be empty or None.

    Returns:
        float: for the first rating whose Source equals 'Internet Movie Database'
        (case-insensitive) and whose Value has the form '<number>/<number>', the
        numerator times 100 divided by the denominator, i.e. '8.9/10' -> 89.
        None: if no rating has that source or no matching value can be parsed
        (an error message is printed for every value that fails to parse).
    """
    wanted_source = 'Internet Movie Database'.lower()
    # `ratings or []` makes an empty or missing list fall through to the final `return None`.
    for rating in ratings or []:
        # print ('Source = {}, Value = {}'.format(rating['Source'], rating['Value']))
        if str(rating.get('Source', '')).lower() != wanted_source:
            continue
        try:
            parts = rating['Value'].split('/')
            if len(parts) != 2:
                print ('Cannot parse IMDB rating [{}], expected "<int>/<int>"'.format(rating['Value']))
                continue
            return float(parts[0]) * 100 / float(parts[1])
        except (KeyError, TypeError, AttributeError, ValueError, ZeroDivisionError):
            # ZeroDivisionError covers a '<number>/0' value.
            print ('Unable to extract IMDB rating for movie ' + str(movie))
    return None


def value_or_default(value, default):
    """Return `value`, unless it is None, in which case return `default`."""
    if value is not None:
        return value
    return default

class Movie:
    """
    A movie built from the top-level keys of an OMDB JSON result, e.g. Movie(**json_dict).

    Attributes (every one is always present and is None when it could not be extracted):
        title: value of the 'Title' key.
        year: 'Year' converted to int.
        genre: value of the 'Genre' key.
        rt_rating_percent: Rotten Tomatoes score taken from 'Ratings'.
        imdb_rating_percent: IMDB rating as a percentage taken from 'Ratings'.

    Extraction runs in the order title, year, genre, ratings and stops at the first
    field that cannot be extracted, after printing a message; later fields stay None.
    Positional arguments and any other keyword arguments are ignored.
    """
    def __init__(self, *args, **kwargs):
        # Bind every attribute up front so callers can read any of them even when
        # extraction stops early below.
        self.title = None
        self.year = None
        self.genre = None
        self.rt_rating_percent = None
        self.imdb_rating_percent = None

        # Title
        if 'Title' not in kwargs:
            print ('Unable to extract title for movie ' + str(self))
            return
        self.title = kwargs['Title']
        # Year
        try:
            self.year = int(kwargs['Year'])
        except (KeyError, TypeError, ValueError):
            # missing key, or a value int() cannot convert
            print ('Unable to extract year for movie ' + str(self))
            return
        # Genre
        if 'Genre' not in kwargs:
            print ('Unable to extract genre for movie ' + str(self))
            return
        self.genre = kwargs['Genre']
        # Ratings
        if 'Ratings' not in kwargs:
            print ('Unable to extract Ratings for movie ' + str(self))
            return
        ratings = kwargs['Ratings']
        # An empty or None list holds no scores, so both ratings simply stay None.
        if ratings:
            self.rt_rating_percent = extract_rt_rating_percent(self, ratings)
            self.imdb_rating_percent = extract_imdb_rating_percent(self, ratings)

    def __repr__(self):
        # getattr keeps repr usable on an instance whose __init__ did not run.
        return 'Movie[title={}, year={}, genre={}, rt_rating_percent={}, imdb_rating_percent={}]'.format(
            getattr(self, 'title', None),
            getattr(self, 'year', None),
            getattr(self, 'genre', None),
            getattr(self, 'rt_rating_percent', None),
            getattr(self, 'imdb_rating_percent', None)
        )

In [None]:
# debugging slot, please ignore

# Sample OMDB response for "Pulp Fiction". Its Rotten Tomatoes entry is spelled "Rotten tomatoes"
# (lower-case t). This sample is not used by the code at the bottom of this cell, which parses json_example2.
json_example = """{
  "Title": "Pulp Fiction",
  "Year": "1994",
  "Rated": "R",
  "Released": "14 Oct 1994",
  "Runtime": "154 min",
  "Genre": "Crime, Drama",
  "Director": "Quentin Tarantino",
  "Writer": "Quentin Tarantino (stories), Roger Avary (stories), Quentin Tarantino",
  "Actors": "Tim Roth, Amanda Plummer, Laura Lovelace, John Travolta",
  "Plot": "The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwine in four tales of violence and redemption.",
  "Language": "English, Spanish, French",
  "Country": "USA",
  "Awards": "Won 1 Oscar. Another 69 wins & 75 nominations.",
  "Poster": "https://m.media-amazon.com/images/M/MV5BNGNhMDIzZTUtNTBlZi00MTRlLWFjM2ItYzViMjE3YzI5MjljXkEyXkFqcGdeQXVyNzkwMjQ5NzM@._V1_SX300.jpg",
  "Ratings": [
    {
      "Source": "Internet Movie Database",
      "Value": "8.9/10"
    },
    {
      "Source": "Rotten tomatoes",
      "Value": "92%"
    },
    {
      "Source": "Metacritic",
      "Value": "94/100"
    }
  ],
  "Metascore": "94",
  "imdbRating": "8.9",
  "imdbVotes": "1,831,004",
  "imdbID": "tt0110912",
  "Type": "movie",
  "DVD": "21 Apr 2016",
  "BoxOffice": "$107,928,762",
  "Production": "Miramax Films*, A Band Apart, Jersey Films",
  "Website": "N/A",
  "Response": "True"
}"""

# Sample OMDB response for "The Silence of the Lambs"; this is the sample parsed at the bottom of this cell.
json_example2="""
{
  "Title": "The Silence of the Lambs",
  "Year": "1991",
  "Rated": "R",
  "Released": "14 Feb 1991",
  "Runtime": "118 min",
  "Genre": "Crime, Drama, Thriller",
  "Director": "Jonathan Demme",
  "Writer": "Thomas Harris (based on the novel by), Ted Tally (screenplay by)",
  "Actors": "Jodie Foster, Lawrence A. Bonney, Kasi Lemmons, Lawrence T. Wrentz",
  "Plot": "A young F.B.I. cadet must receive the help of an incarcerated and manipulative cannibal killer to help catch another serial killer, a madman who skins his victims.",
  "Language": "English, Latin",
  "Country": "USA",
  "Awards": "Won 5 Oscars. Another 63 wins & 51 nominations.",
  "Poster": "https://m.media-amazon.com/images/M/MV5BNjNhZTk0ZmEtNjJhMi00YzFlLWE1MmEtYzM1M2ZmMGMwMTU4XkEyXkFqcGdeQXVyNjU0OTQ0OTY@._V1_SX300.jpg",
  "Ratings": [
    {
      "Source": "Internet Movie Database",
      "Value": "8.6/10"
    },
    {
      "Source": "Rotten Tomatoes",
      "Value": "96%"
    },
    {
      "Source": "Metacritic",
      "Value": "85/100"
    }
  ],
  "Metascore": "85",
  "imdbRating": "8.6",
  "imdbVotes": "1,274,200",
  "imdbID": "tt0102926",
  "Type": "movie",
  "DVD": "27 Aug 2015",
  "BoxOffice": "$130,742,922",
  "Production": "Orion Pictures",
  "Website": "N/A",
  "Response": "True"
}
"""

import json
# Parse the second sample and build a Movie from its top-level keys
j = json.loads(json_example2)
movie = Movie(**j)

from pprint import pprint
# Show the attributes that the Movie constructor set on the instance
pprint(vars(movie))

In [None]:
# Your script should pull the Rotten Tomato score for each of the movies in the top 100 IMDB movies and join it onto your Pandas dataframe. You may need to clean up the data!

import json
import urllib.parse  # `import urllib` alone does not guarantee that the `parse` submodule is loaded

import requests

# HTTPS, so that the API key in the query string is not sent in clear text.
omdb_url = 'https://www.omdbapi.com/?apikey={}&t={}&y={}&r=json'

# Fields a movie must have (not None) to be kept for the analysis.
required_fields = ('title', 'year', 'rt_rating_percent', 'imdb_rating_percent')

# Loop through the rows of the dataframe and issue one request per row against the OMDB API.
# Each JSON result is converted into a Movie object; the collected objects are converted
# into another pandas dataframe in a later cell.
movies = []
for index, row in df.iterrows():
    quoted_title = urllib.parse.quote(str(row['title']))
    # strip() guards against a key that still carries the newline from the config file
    ready_url = omdb_url.format(api_key.strip(), quoted_title, row['year'])
    # The same URL with the key masked; only this one is printed, so the key does not
    # end up in the saved cell output.
    logged_url = omdb_url.format('***', quoted_title, row['year'])
    try:
        receive = requests.get(ready_url, timeout=10)
        receive.raise_for_status()
        received_json = json.loads(receive.content)
    except (requests.RequestException, ValueError) as error:
        # Only the exception type is printed: the message of a requests error can contain the full URL.
        print('IGNORED movie [{}] through url [{}]'.format(index, logged_url) + ' (' + type(error).__name__ + ')')
        continue
    if not isinstance(received_json, dict):
        print('IGNORED movie [{}] through url [{}]'.format(index, logged_url))
        continue
    movie = Movie(**received_json)
    # getattr: an attribute that was never set on the Movie counts as missing instead of raising.
    if all(getattr(movie, field, None) is not None for field in required_fields):
        movies.append(movie)
        print('PROCESSED movie [{}] through url [{}]'.format(index, logged_url))
    else:
        print('IGNORED movie [{}] through url [{}]'.format(index, logged_url))

In [None]:
# convert the list of the collected movies into the panda's dataframe

# One [title, genre, year, rt, imdb] row per collected Movie object
data = [
    [m.title, m.genre, m.year, m.rt_rating_percent, m.imdb_rating_percent]
    for m in movies
]

omdb_df = pd.DataFrame(
    data,
    columns=['title', 'genre', 'year', 'rt_rating_percent', 'imdb_rating_percent'],
)

omdb_df.head(12)

In [None]:
# Your script should pull the Rotten Tomato score for each of the movies in the top 100 IMDB movies and join it onto your Pandas dataframe.
# You may need to clean up the data!

# Note: the data has been cleaned up at the time when loaded from the OMDB API

# merge original dataframe with the one obtained from the omdb by using the 'title' and the 'year' columns
# note: this is the inner join, i.e. the rows will not be included if there is no match

# Inner join of the original frame with the OMDB frame on the (title, year) pair
df_merged = df.merge(omdb_df, how='inner', on=['title', 'year'])

df_merged.head()

In [None]:
# What are the top five R-Rated movies?

# The five movies with the highest Rotten Tomatoes score are found with 'nlargest' on the merged dataframe.
# NOTE(review): "R-Rated" in the question may mean the content rating "R" rather than
# "Rotten Tomatoes rated"; if so, the frame needs filtering on a content-rating column first — confirm.

df_merged.nlargest(5, 'rt_rating_percent')

In [None]:
# What is the average Rotten Tomato score for the top 100 IMDB films

# mean of a column is calculated by using the 'mean' on the column of interest ('rt_rating_percent' in our case)

df_merged['rt_rating_percent'].mean()

In [None]:
# What is the Five Number Summary like for top rated films as per IMDB? Is it skewed?

# The 'Five Number Summary' (plus a few more statistics) is extracted with the 'describe()' function
# It is skewed towards the higher rating due to the fact that the dataset is based on top 100 movies from IMDB

df_merged['rt_rating_percent'].describe()

In [None]:
# Create a column that is the ratio between Rotten Tomato rating vs IMDB rating. What film has the highest IMDB : Rotten Tomato ratio? The lowest?

# a new column representing ratio between imdb/rt rating

# Element-wise IMDB percent divided by Rotten Tomatoes percent, stored as a new column
df_merged['imdb_over_rt'] = df_merged['imdb_rating_percent'].div(df_merged['rt_rating_percent'])

df_merged.head()

In [None]:
# Create a column that is the ratio between Rotten Tomato rating vs IMDB rating. What film has the highest IMDB : Rotten Tomato ratio? The lowest?

# the movie with the maximum imdb_over_rt value

# idxmax() returns an index label, so the row is fetched with the label-based .loc
# (.iloc is positional and only agrees with labels while the index is a default 0..n-1 range).
df_merged.loc[df_merged.imdb_over_rt.idxmax()]

In [None]:
# Create a column that is the ratio between Rotten Tomato rating vs IMDB rating. What film has the highest IMDB : Rotten Tomato ratio? The lowest?

# the movie with the minimum imdb_over_rt value

# idxmin() returns an index label, so the row is fetched with the label-based .loc
# (.iloc is positional and only agrees with labels while the index is a default 0..n-1 range).
df_merged.loc[df_merged.imdb_over_rt.idxmin()]

In [None]:
# Create your own question... then answer it! 
# What are the mean, variance and standard deviation of the rt rating for top 100 IMDB movies?

# Mean, standard deviation and variance of the Rotten Tomatoes column
mean, std, var = (
    df_merged['rt_rating_percent'].mean(axis=0),
    df_merged['rt_rating_percent'].std(axis=0),
    df_merged['rt_rating_percent'].var(),
)

print('mean = {}, std = {}, var = {}'.format(mean, std, var))

In [None]:
# What is the relationship between IMDB ratings and Rotten Tomato ratings?

# draw a single histogram of imdb_over_rt ratio to show that vast majority of the movies have lower IMDB rating than RT rating
# x axis represents the ratio
# y axis represents the number of movies (frequency) in a specific ratio range

import matplotlib.pyplot as plt

# The plotted series is the imdb/rt ratio, so it is labelled as such
plt.hist(df_merged.imdb_over_rt.tolist(), bins=10, label='imdb/rt ratio')
plt.xlabel('imdb/rt ratio')
plt.ylabel('frequency')
plt.title('Distribution of the imdb/rt ratio')
# Render the figure explicitly so the cell does not echo the repr of the last call
plt.show()

In [None]:
# What is the relationship between IMDB rating and movie duration?

# Longer movies tend to have higher IMDB rating (and vice versa)

import matplotlib.pyplot as plt

# Duration on the x axis against the IMDB percentage on the y axis
plt.scatter(
    x=df_merged['duration'].tolist(),
    y=df_merged['imdb_rating_percent'].tolist(),
)
plt.xlabel('Duration')
plt.ylabel('IMDB Percent')
plt.show()

In [None]:
# How many movies are there in each category?

# Unanswered: it is not clear which column "category" refers to (possibly the genre) — TODO

In [None]:
# What does the distribution of Rotten Tomato ratings look like?

# The distribution is skewed towards the higher rating due to the fact that top 100 IMDB movies are considered

import matplotlib.pyplot as plt

plt.hist(df_merged.rt_rating_percent.tolist(), bins=15, label='rt_rating')
plt.xlabel('RT Percent')
plt.ylabel('Frequency')
plt.title('Distribution of Rotten Tomatoes ratings')
# Render the figure explicitly, as the scatter-plot cell does, so the cell does not echo the repr of the last call
plt.show()