# Extract Data

#### In this notebook we extract movie-related information from the OMDB API and save a CSV file that combines the original Kaggle dataset with the newly collected data.

1. Data source: [Kaggle "Three decades of movies"](https://www.kaggle.com/danielgrijalvas/movies)
2. API source: [OMDB API](http://www.omdbapi.com/)

In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import json
import random
from pprint import pprint
from config import LOCAL_API_KEY
import time

In [15]:
# Show every column when a dataframe is displayed (no column truncation)
pd.set_option('display.max_columns', None)

## Initial clean of the downloaded CSV

In [12]:
# Load the downloaded Kaggle CSV (6820 movies) and preview the first rows.
# NOTE(review): the rendered output further down shows a replacement character
# in at least one star name (row 6818), which suggests the file is not UTF-8;
# consider passing an explicit `encoding=` here — confirm the file's encoding.
file_path = '../resources/raw_data/movies.csv'
kaggle_df = pd.read_csv(file_path, sep=',', engine='python')
kaggle_df.head()

Unnamed: 0,budget,company,country,director,genre,gross,name,rating,released,runtime,score,star,votes,writer,year
0,8000000.0,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,52287414.0,Stand by Me,R,1986-08-22,89,8.1,Wil Wheaton,299174,Stephen King,1986
1,6000000.0,Paramount Pictures,USA,John Hughes,Comedy,70136369.0,Ferris Bueller's Day Off,PG-13,1986-06-11,103,7.8,Matthew Broderick,264740,John Hughes,1986
2,15000000.0,Paramount Pictures,USA,Tony Scott,Action,179800601.0,Top Gun,PG,1986-05-16,110,6.9,Tom Cruise,236909,Jim Cash,1986
3,18500000.0,Twentieth Century Fox Film Corporation,USA,James Cameron,Action,85160248.0,Aliens,R,1986-07-18,137,8.4,Sigourney Weaver,540152,James Cameron,1986
4,9000000.0,Walt Disney Pictures,USA,Randal Kleiser,Adventure,18564613.0,Flight of the Navigator,PG,1986-08-01,90,6.9,Joey Cramer,36636,Mark H. Baker,1986


In [42]:
# Summary statistics for the numeric columns (count, mean, std, min, quartiles, max)
kaggle_df.describe()

Unnamed: 0,budget,gross,runtime,score,votes,year
count,6820.0,6820.0,6820.0,6820.0,6820.0,6820.0
mean,24581130.0,33497830.0,106.55132,6.374897,71219.52,2001.000293
std,37022540.0,58197600.0,18.02818,1.003142,130517.6,8.944501
min,0.0,70.0,50.0,1.5,27.0,1986.0
25%,0.0,1515839.0,95.0,5.8,7665.25,1993.0
50%,11000000.0,12135680.0,102.0,6.4,25892.5,2001.0
75%,32000000.0,40065340.0,115.0,7.1,75812.25,2009.0
max,300000000.0,936662200.0,366.0,9.3,1861666.0,2016.0


In [44]:
# Row count of the raw Kaggle dataframe
kaggle_df.shape[0]

6820

In [27]:
# Mapping of original Kaggle column names to the names used from here on.
# The `_kaggle` suffix keeps these columns distinguishable from the
# corresponding dimensions scraped from the OMDB API.
kaggle_column_renames = {
    'company': 'production',
    'country': 'country_kaggle',
    'genre': 'genre_kaggle',
    'star': 'star_kaggle',
    'writer': 'writer_kaggle',
    'score': 'score_imdb',
    'votes': 'votes_imdb',
}

renamed_df = kaggle_df.rename(columns=kaggle_column_renames)
renamed_df.head(2)

Unnamed: 0,budget,production,country_kaggle,director,genre_kaggle,gross,name,rating,released,runtime,score_imdb,star_kaggle,votes_imdb,writer_kaggle,year
0,8000000.0,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,52287414.0,Stand by Me,R,1986-08-22,89,8.1,Wil Wheaton,299174,Stephen King,1986
1,6000000.0,Paramount Pictures,USA,John Hughes,Comedy,70136369.0,Ferris Bueller's Day Off,PG-13,1986-06-11,103,7.8,Matthew Broderick,264740,John Hughes,1986


In [35]:
# Columns that will be filled from the OMDB API responses, in display order
omdb_columns = [
    'genres_omdb',
    'writers_omdb',
    'actors_omdb',
    'plot',
    'language_omdb',
    'country_omdb',
    'awards',
    'poster',
    'score_metacritic',
    'type',
]

# Work on a copy so `renamed_df` stays untouched, then append each
# placeholder column initialised to the empty string
new_df = renamed_df.copy()
for column in omdb_columns:
    new_df[column] = ''

new_df.head(2)

Unnamed: 0,budget,production,country_kaggle,director,genre_kaggle,gross,name,rating,released,runtime,score_imdb,star_kaggle,votes_imdb,writer_kaggle,year,genres_omdb,writers_omdb,actors_omdb,plot,language_omdb,country_omdb,awards,poster,score_metacritic,type
0,8000000.0,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,52287414.0,Stand by Me,R,1986-08-22,89,8.1,Wil Wheaton,299174,Stephen King,1986,,,,,,,,,,
1,6000000.0,Paramount Pictures,USA,John Hughes,Comedy,70136369.0,Ferris Bueller's Day Off,PG-13,1986-06-11,103,7.8,Matthew Broderick,264740,John Hughes,1986,,,,,,,,,,


## Perform API Calls to OMDB API

The extracted information is saved in the dataframe.

In [62]:
# API key sent with every OMDB request; LOCAL_API_KEY is imported from `config`
# in the imports cell, so the key itself never appears in this notebook
api_key = LOCAL_API_KEY

In [160]:
# Query the OMDB API once per movie title and copy the returned fields into
# the matching row of `new_df`.

# OMDB response key -> `new_df` column that receives its value
omdb_field_to_column = {
    'Genre': 'genres_omdb',
    'Writer': 'writers_omdb',
    'Actors': 'actors_omdb',
    'Plot': 'plot',
    'Language': 'language_omdb',
    'Country': 'country_omdb',
    'Awards': 'awards',
    'Poster': 'poster',
    'Metascore': 'score_metacritic',
    'Type': 'type',
}

# Upper bound (seconds) on a single request, so one stalled connection
# cannot hang the whole retrieval loop
request_timeout_seconds = 30

print('Beginning Data Retrieval...')
print('-----------------------------')

for index, movie_name in new_df['name'].items():

    # Progress marker every 100 rows
    if index % 100 == 0:
        print(f'Processing Movie Index #{index}')

    # Pause every 250 rows to space out the requests
    if index % 250 == 0:
        time.sleep(10)

    # Pass the title through `params` so requests URL-encodes it; a title
    # containing characters such as '&' or '#' would otherwise corrupt the
    # query string and look up the wrong (or no) movie.
    try:
        api_data = requests.get(
            'http://www.omdbapi.com/',
            params={'apikey': api_key, 't': movie_name, 'plot': 'full'},
            timeout=request_timeout_seconds,
        ).json()
    except (requests.exceptions.RequestException, ValueError) as error:
        # A network failure or a non-JSON body affects this row only.
        # Report just the exception type: the full message can contain the
        # request URL, which carries the API key.
        print(f'Request failed for index {index} ({type(error).__name__})... skipping.')
        continue

    # Anything other than a JSON object carries none of the expected fields
    if not isinstance(api_data, dict):
        api_data = {}

    # Collect every expected field that is present, so one missing key no
    # longer discards the fields that come after it
    available = {
        column: api_data[field]
        for field, column in omdb_field_to_column.items()
        if field in api_data
    }

    if len(available) < len(omdb_field_to_column):
        print('Missing field/result... skipping.')

    for column, value in available.items():
        new_df.loc[index, column] = value

print('-----------------------------')
print('Data Retrieval Complete.')
print('-----------------------------')

Beginning Data Retrieval...
-----------------------------
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Processing Movie Index #6700
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Missing field/result... skipping.
Processing Movie Index #6800
Missing field/result... skipping.
-----------------------------
Data Retrieval Complete.
-----------------------------


In [171]:
# Display the dataframe, now holding the OMDB columns alongside the Kaggle ones
new_df

Unnamed: 0,budget,production,country_kaggle,director,genre_kaggle,gross,name,rating,released,runtime,score_imdb,star_kaggle,votes_imdb,writer_kaggle,year,genres_omdb,writers_omdb,actors_omdb,plot,language_omdb,country_omdb,awards,poster,score_metacritic,type
0,8000000.0,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,52287414.0,Stand by Me,R,1986-08-22,89,8.1,Wil Wheaton,299174,Stephen King,1986,"Adventure, Drama","Stephen King (novel), Raynold Gideon (screenpl...","Wil Wheaton, River Phoenix, Corey Feldman, Jer...","It's the summer of 1959 in Castlerock, Oregon ...",English,USA,Nominated for 1 Oscar. Another 5 wins & 10 nom...,https://m.media-amazon.com/images/M/MV5BODJmY2...,75,movie
1,6000000.0,Paramount Pictures,USA,John Hughes,Comedy,70136369.0,Ferris Bueller's Day Off,PG-13,1986-06-11,103,7.8,Matthew Broderick,264740,John Hughes,1986,Comedy,John Hughes,"Matthew Broderick, Alan Ruck, Mia Sara, Jeffre...",High school student Ferris Bueller wants a day...,"English, German",USA,Nominated for 1 Golden Globe. Another 2 wins.,https://m.media-amazon.com/images/M/MV5BMDA0Nj...,61,movie
2,15000000.0,Paramount Pictures,USA,Tony Scott,Action,179800601.0,Top Gun,PG,1986-05-16,110,6.9,Tom Cruise,236909,Jim Cash,1986,"Action, Drama","Jim Cash, Jack Epps Jr., Ehud Yonay (magazine ...","Tom Cruise, Kelly McGillis, Val Kilmer, Anthon...","Lieutenant Pete ""Maverick"" Mitchell is an expe...",English,USA,Won 1 Oscar. Another 10 wins & 5 nominations.,https://m.media-amazon.com/images/M/MV5BZjQxYT...,50,movie
3,18500000.0,Twentieth Century Fox Film Corporation,USA,James Cameron,Action,85160248.0,Aliens,R,1986-07-18,137,8.4,Sigourney Weaver,540152,James Cameron,1986,"Action, Adventure, Sci-Fi, Thriller","James Cameron (story by), David Giler (story b...","Sigourney Weaver, Carrie Henn, Michael Biehn, ...",Fifty seven years after Ellen Ripley survived ...,English,"UK, USA",Won 2 Oscars. Another 18 wins & 23 nominations.,https://m.media-amazon.com/images/M/MV5BZGU2OG...,84,movie
4,9000000.0,Walt Disney Pictures,USA,Randal Kleiser,Adventure,18564613.0,Flight of the Navigator,PG,1986-08-01,90,6.9,Joey Cramer,36636,Mark H. Baker,1986,"Adventure, Comedy, Family, Sci-Fi","Mark H. Baker (story), Michael Burton (screenp...","Joey Cramer, Paul Reubens, Cliff De Young, Ver...","A 12-year-old boy goes missing in 1978, only t...",English,USA,4 nominations.,https://m.media-amazon.com/images/M/MV5BMjUwNm...,64,movie
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6815,0.0,Fox Searchlight Pictures,UK,Mandie Fletcher,Comedy,4750497.0,Absolutely Fabulous: The Movie,R,2016-07-22,91,5.4,Jennifer Saunders,9161,Jennifer Saunders,2016,"Comedy, Crime",Jennifer Saunders (screenplay),"Lulu, Gwendoline Christie, Jennifer Saunders, ...",Edina and Patsy are still oozing glitz and gla...,"English, French","UK, USA",1 win & 7 nominations.,https://m.media-amazon.com/images/M/MV5BMjI4ND...,59,movie
6816,0.0,Siempre Viva Productions,USA,Paul Duddridge,Drama,28368.0,Mothers and Daughters,PG-13,2016-05-06,90,4.9,Selma Blair,1959,Paige Cameron,2016,Drama,"Paige Cameron, Paul Duddridge","Selma Blair, Luke Mitchell, Symmetry",Interwoven stories of what it is to be a mom s...,English,United States,1 win & 2 nominations,https://m.media-amazon.com/images/M/MV5BNDQyMD...,29,movie
6817,3500000.0,Warner Bros. Animation,USA,Sam Liu,Animation,3775000.0,Batman: The Killing Joke,R,2016-07-25,76,6.5,Kevin Conroy,36333,Brian Azzarello,2016,"Animation, Action, Crime, Drama, Thriller","Brian Azzarello, Brian Bolland (based on the g...","Kevin Conroy, Mark Hamill, Tara Strong, Ray Wise","As Batman hunts for the escaped Joker, the Clo...",English,USA,1 win & 2 nominations.,https://m.media-amazon.com/images/M/MV5BMTdjZT...,,movie
6818,0.0,Borderline Presents,USA,Nicolas Pesce,Drama,25981.0,The Eyes of My Mother,R,2016-12-02,76,6.2,Kika Magalh�es,6947,Nicolas Pesce,2016,"Drama, Horror, Thriller",Nicolas Pesce,"Diana Agostini, Olivia Bond, Will Brill, Joey ...","In their secluded farmhouse, a mother, formerl...","English, Portuguese",USA,5 wins & 14 nominations.,https://m.media-amazon.com/images/M/MV5BMTcyMD...,63,movie


In [151]:
# Scratch cell: request a single title and pretty-print the raw OMDB JSON
# to see which fields the API returns.

# Request data
movie= 'The Men Who Stare at Goats'
test_url = f'http://www.omdbapi.com/?apikey={api_key}&t={movie}&plot=full'
response = requests.get(test_url)
data = response.json()

pprint(data)

{'Actors': 'Ewan McGregor, George Clooney, Kevin Spacey',
 'Awards': '1 win & 1 nomination',
 'BoxOffice': '$32,428,195',
 'Country': 'United States, United Kingdom',
 'DVD': '31 Mar 2017',
 'Director': 'Grant Heslov',
 'Genre': 'Comedy, War',
 'Language': 'English, Arabic',
 'Metascore': '54',
 'Plot': 'A reporter, trying to lose himself in the romance of war after his '
         'marriage fails, gets more than he bargains for when he meets a '
         'special forces agent who reveals the existence of a secret, psychic '
         'military unit whose goal is to end war as we know it. The founder of '
         'the unit has gone missing and the trail leads to another psychic '
         'soldier who has distorted the mission to serve his own ends.',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BMjIwOTQwNzg1MV5BMl5BanBnXkFtZTcwODc4MDU4Mg@@._V1_SX300.jpg',
 'Production': 'Smoke House, Paul Lister',
 'Rated': 'R',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '6.2/10'

In [47]:
# Pretty-print the OMDB response currently held in `data`.
# NOTE(review): the saved output shows a different title than the request cell
# above, so it appears to come from an earlier run — re-run to refresh.
pprint(data)

{'Actors': 'Jeff Goldblum, Geena Davis, John Getz, Joy Boushel',
 'Awards': 'Won 1 Oscar. Another 6 wins & 10 nominations.',
 'BoxOffice': '$40,456,565',
 'Country': 'USA, UK, Canada',
 'DVD': '25 Nov 2015',
 'Director': 'David Cronenberg',
 'Genre': 'Drama, Horror, Sci-Fi',
 'Language': 'English',
 'Metascore': '79',
 'Plot': 'Seth Brundle (Jeff Goldblum), a brilliant but eccentric scientist '
         'attempts to woo investigative journalist Veronica Quaife (Geena '
         'Davis) by offering her a scoop on his latest research in the field '
         'of matter transportation, which against all the expectations of the '
         'scientific establishment have proved successful. Up to a point. '
         'Brundle thinks he has ironed out the last problem when he '
         'successfully transports a living creature, but when he attempts to '
         'teleport himself a fly enters one of the transmission booths, and '
         'Brundle finds he is a changed man. This Science-Gone-M

## Now we export the new dataframe with newly collected columns

In [162]:
# Snapshot the enriched dataframe and write it to CSV, leaving out the
# row index so it is not stored as an extra column
output_path = '../resources/cleaned_data/movies_complete.csv'
final_df = new_df.copy()
final_df.to_csv(output_path, index=False)

In [173]:
# Display the dataframe that was written to disk
final_df

Unnamed: 0,budget,production,country_kaggle,director,genre_kaggle,gross,name,rating,released,runtime,score_imdb,star_kaggle,votes_imdb,writer_kaggle,year,genres_omdb,writers_omdb,actors_omdb,plot,language_omdb,country_omdb,awards,poster,score_metacritic,type
0,8000000.0,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,52287414.0,Stand by Me,R,1986-08-22,89,8.1,Wil Wheaton,299174,Stephen King,1986,"Adventure, Drama","Stephen King (novel), Raynold Gideon (screenpl...","Wil Wheaton, River Phoenix, Corey Feldman, Jer...","It's the summer of 1959 in Castlerock, Oregon ...",English,USA,Nominated for 1 Oscar. Another 5 wins & 10 nom...,https://m.media-amazon.com/images/M/MV5BODJmY2...,75,movie
1,6000000.0,Paramount Pictures,USA,John Hughes,Comedy,70136369.0,Ferris Bueller's Day Off,PG-13,1986-06-11,103,7.8,Matthew Broderick,264740,John Hughes,1986,Comedy,John Hughes,"Matthew Broderick, Alan Ruck, Mia Sara, Jeffre...",High school student Ferris Bueller wants a day...,"English, German",USA,Nominated for 1 Golden Globe. Another 2 wins.,https://m.media-amazon.com/images/M/MV5BMDA0Nj...,61,movie
2,15000000.0,Paramount Pictures,USA,Tony Scott,Action,179800601.0,Top Gun,PG,1986-05-16,110,6.9,Tom Cruise,236909,Jim Cash,1986,"Action, Drama","Jim Cash, Jack Epps Jr., Ehud Yonay (magazine ...","Tom Cruise, Kelly McGillis, Val Kilmer, Anthon...","Lieutenant Pete ""Maverick"" Mitchell is an expe...",English,USA,Won 1 Oscar. Another 10 wins & 5 nominations.,https://m.media-amazon.com/images/M/MV5BZjQxYT...,50,movie
3,18500000.0,Twentieth Century Fox Film Corporation,USA,James Cameron,Action,85160248.0,Aliens,R,1986-07-18,137,8.4,Sigourney Weaver,540152,James Cameron,1986,"Action, Adventure, Sci-Fi, Thriller","James Cameron (story by), David Giler (story b...","Sigourney Weaver, Carrie Henn, Michael Biehn, ...",Fifty seven years after Ellen Ripley survived ...,English,"UK, USA",Won 2 Oscars. Another 18 wins & 23 nominations.,https://m.media-amazon.com/images/M/MV5BZGU2OG...,84,movie
4,9000000.0,Walt Disney Pictures,USA,Randal Kleiser,Adventure,18564613.0,Flight of the Navigator,PG,1986-08-01,90,6.9,Joey Cramer,36636,Mark H. Baker,1986,"Adventure, Comedy, Family, Sci-Fi","Mark H. Baker (story), Michael Burton (screenp...","Joey Cramer, Paul Reubens, Cliff De Young, Ver...","A 12-year-old boy goes missing in 1978, only t...",English,USA,4 nominations.,https://m.media-amazon.com/images/M/MV5BMjUwNm...,64,movie
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6815,0.0,Fox Searchlight Pictures,UK,Mandie Fletcher,Comedy,4750497.0,Absolutely Fabulous: The Movie,R,2016-07-22,91,5.4,Jennifer Saunders,9161,Jennifer Saunders,2016,"Comedy, Crime",Jennifer Saunders (screenplay),"Lulu, Gwendoline Christie, Jennifer Saunders, ...",Edina and Patsy are still oozing glitz and gla...,"English, French","UK, USA",1 win & 7 nominations.,https://m.media-amazon.com/images/M/MV5BMjI4ND...,59,movie
6816,0.0,Siempre Viva Productions,USA,Paul Duddridge,Drama,28368.0,Mothers and Daughters,PG-13,2016-05-06,90,4.9,Selma Blair,1959,Paige Cameron,2016,Drama,"Paige Cameron, Paul Duddridge","Selma Blair, Luke Mitchell, Symmetry",Interwoven stories of what it is to be a mom s...,English,United States,1 win & 2 nominations,https://m.media-amazon.com/images/M/MV5BNDQyMD...,29,movie
6817,3500000.0,Warner Bros. Animation,USA,Sam Liu,Animation,3775000.0,Batman: The Killing Joke,R,2016-07-25,76,6.5,Kevin Conroy,36333,Brian Azzarello,2016,"Animation, Action, Crime, Drama, Thriller","Brian Azzarello, Brian Bolland (based on the g...","Kevin Conroy, Mark Hamill, Tara Strong, Ray Wise","As Batman hunts for the escaped Joker, the Clo...",English,USA,1 win & 2 nominations.,https://m.media-amazon.com/images/M/MV5BMTdjZT...,,movie
6818,0.0,Borderline Presents,USA,Nicolas Pesce,Drama,25981.0,The Eyes of My Mother,R,2016-12-02,76,6.2,Kika Magalh�es,6947,Nicolas Pesce,2016,"Drama, Horror, Thriller",Nicolas Pesce,"Diana Agostini, Olivia Bond, Will Brill, Joey ...","In their secluded farmhouse, a mother, formerl...","English, Portuguese",USA,5 wins & 14 nominations.,https://m.media-amazon.com/images/M/MV5BMTcyMD...,63,movie
