In [1]:
'''
CLASS: Getting Data from APIs

What is an API?
- Application Programming Interface
- Structured way to expose specific functionality and data access to users
- Web APIs usually follow the "REST" standard

How to interact with an API:
- Make a "request" to a specific URL (an "endpoint"), and get the data back in a "response"
- Most relevant request method for us is GET (other methods: POST, PUT, DELETE)
- Response is often JSON format
- Web console is sometimes available (allows you to explore an API)
'''

'\nCLASS: Getting Data from APIs\n\nWhat is an API?\n- Application Programming Interface\n- Structured way to expose specific functionality and data access to users\n- Web APIs usually follow the "REST" standard\n\nHow to interact with an API:\n- Make a "request" to a specific URL (an "endpoint"), and get the data back in a "response"\n- Most relevant request method for us is GET (other methods: POST, PUT, DELETE)\n- Response is often JSON format\n- Web console is sometimes available (allows you to explore an API)\n'

In [2]:
import sys

import pandas as pd
import requests

In [3]:
# read IMDb data into a DataFrame: we want a year column!
movies = pd.read_csv('../data/imdb_1000.csv')
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [4]:
print movies.shape
movies.describe()

(979, 6)


Unnamed: 0,star_rating,duration
count,979.0,979.0
mean,7.889785,120.979571
std,0.336069,26.21801
min,7.4,64.0
25%,7.6,102.0
50%,7.8,117.0
75%,8.1,134.0
max,9.3,242.0


In [5]:
###### exercise #######

# Is the title column unique? If not, what are the non unique names?
from collections import Counter
for title, count in Counter(movies['title']).items():
    if count > 1:
        print title

The Girl with the Dragon Tattoo
Les Miserables
True Grit
Dracula


In [6]:
# use requests library to interact with a URL http://www.omdbapi.com
r = requests.get('http://www.omdbapi.com?t=the shawshank redemption&r=json&type=movie')

In [7]:
# check the status: 200 means success, 4xx or 5xx means error
r.status_code

200

In [8]:
# view the raw response text
r.text

u'{"Title":"The Shawshank Redemption","Year":"1994","Rated":"R","Released":"14 Oct 1994","Runtime":"142 min","Genre":"Crime, Drama","Director":"Frank Darabont","Writer":"Stephen King (short story \\"Rita Hayworth and Shawshank Redemption\\"), Frank Darabont (screenplay)","Actors":"Tim Robbins, Morgan Freeman, Bob Gunton, William Sadler","Plot":"Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.","Language":"English","Country":"USA","Awards":"Nominated for 7 Oscars. Another 19 wins & 30 nominations.","Poster":"https://images-na.ssl-images-amazon.com/images/M/MV5BODU4MjU4NjIwNl5BMl5BanBnXkFtZTgwMDU2MjEyMDE@._V1_SX300.jpg","Metascore":"80","imdbRating":"9.3","imdbVotes":"1,725,904","imdbID":"tt0111161","Type":"movie","Response":"True"}'

In [9]:
# decode the JSON response body into a dictionary
r.json()

{u'Actors': u'Tim Robbins, Morgan Freeman, Bob Gunton, William Sadler',
 u'Awards': u'Nominated for 7 Oscars. Another 19 wins & 30 nominations.',
 u'Country': u'USA',
 u'Director': u'Frank Darabont',
 u'Genre': u'Crime, Drama',
 u'Language': u'English',
 u'Metascore': u'80',
 u'Plot': u'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.',
 u'Poster': u'https://images-na.ssl-images-amazon.com/images/M/MV5BODU4MjU4NjIwNl5BMl5BanBnXkFtZTgwMDU2MjEyMDE@._V1_SX300.jpg',
 u'Rated': u'R',
 u'Released': u'14 Oct 1994',
 u'Response': u'True',
 u'Runtime': u'142 min',
 u'Title': u'The Shawshank Redemption',
 u'Type': u'movie',
 u'Writer': u'Stephen King (short story "Rita Hayworth and Shawshank Redemption"), Frank Darabont (screenplay)',
 u'Year': u'1994',
 u'imdbID': u'tt0111161',
 u'imdbRating': u'9.3',
 u'imdbVotes': u'1,725,904'}

In [10]:
# extracting the year from the dictionary
r.json()['Year']

u'1994'

In [11]:
# what happens if the movie name is not recognized?
r = requests.get('http://www.omdbapi.com/?t=thebestmovieevermade&r=json&type=movie')
print r.status_code
r.json()

200


{u'Error': u'Movie not found!', u'Response': u'False'}

In [12]:
##### Exercise #####

# define a function to return the year
def get_movie_year(title):
    response = requests.get('http://www.omdbapi.com/?t='+title+'&r=json&type=movie').json()
    if 'Error' not in response: return response['Year']


In [13]:
# test the function
print get_movie_year('finding dory')
print get_movie_year('blahblahblah')

2016
None


In [14]:
# create a smaller DataFrame for testing
# the copy method makes a carbon copy of the dataframe
top_movies = movies.head().copy()

In [15]:
# write a for loop to build a list of years
from time import sleep # timey wimey stuff
years = []
for title in top_movies.title:
    years.append(get_movie_year(title))
    sleep(1)
    
# the sleep is used to not over hit the API
# this is called "rate limiting"
# Most APIs don't allow you to hit it too much

In [16]:
# assert will throw an error if the value inside is NOT True

assert(3==4)

AssertionError: 

In [17]:
# check that the DataFrame and the list of years are the same length
assert(len(top_movies) == len(years))

In [18]:
# save that list as a new column
top_movies['year'] = years
top_movies

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,year
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...",1994
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']",1972
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv...",1974
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E...",2008
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L....",1994


In [19]:
'''
Bonus content: Updating the DataFrame as part of a loop
'''

# enumerate allows you to access the item location while iterating
letters = ['a', 'b', 'c']
for index, letter in enumerate(letters):
    print index, letter

0 a
1 b
2 c


In [20]:
# iterrows method for DataFrames is similar
for index, row in top_movies.iterrows():
    print index, row.title

0 The Shawshank Redemption
1 The Godfather
2 The Godfather: Part II
3 The Dark Knight
4 Pulp Fiction


In [21]:
# create a new column and set a default value
movies['yearsr'] = None
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,yearsr
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...",
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']",
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv...",
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E...",
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L....",


In [22]:
# loc method allows you to access a DataFrame element by 'label'
movies.loc[0, 'year'] = 1994
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,yearsr,year
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...",,1994.0
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']",,
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv...",,
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E...",,
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L....",,


In [23]:
# write a for loop to update the year for the first three movies
for index, row in movies.iterrows():
    if index < 3:
        movies.loc[index, 'year'] = get_movie_year(row.title)
        sleep(1)
    else:
        break

In [24]:
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,yearsr,year
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...",,1994.0
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']",,1972.0
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv...",,1974.0
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E...",,
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L....",,
