In [1]:
import pandas as pd
import numpy as np
import requests

In [2]:
movies = pd.read_csv('../data/preprocessed/movies_id_updated.csv')

In [3]:
movies

Unnamed: 0,id,title,imdbID,spanishTitle,imdbPictureURL,year,rtID,rtPictureURL
0,1,Toy story,tt0114709,Toy story (juguetes),http://ia.media-imdb.com/images/M/MV5BMTMwNDU0...,1995.0,toy_story,http://content7.flixster.com/movie/10/93/63/10...
1,2,Jumanji,tt0113497,Jumanji,http://ia.media-imdb.com/images/M/MV5BMzM5NjE1...,1995.0,1068044-jumanji,http://content8.flixster.com/movie/56/79/73/56...
2,3,Grumpy Old Men,tt0107050,Dos viejos gruñones,http://ia.media-imdb.com/images/M/MV5BMTI5MTgy...,1993.0,grumpy_old_men,http://content6.flixster.com/movie/25/60/25602...
3,4,Waiting to Exhale,tt0114885,Esperando un respiro,http://ia.media-imdb.com/images/M/MV5BMTczMTMy...,1995.0,waiting_to_exhale,http://content9.flixster.com/movie/10/94/17/10...
4,5,Father of the Bride Part II,tt0113041,Vuelve el padre de la novia (Ahora también abu...,http://ia.media-imdb.com/images/M/MV5BMTg1NDc2...,1995.0,father_of_the_bride_part_ii,http://content8.flixster.com/movie/25/54/25542...
...,...,...,...,...,...,...,...,...
10192,65088,Bedtime Stories,tt0960731,Más allá de los sueños,http://ia.media-imdb.com/images/M/MV5BMjA5Njk5...,2008.0,bedtime_stories,http://content6.flixster.com/movie/10/94/33/10...
10193,65091,Manhattan Melodrama,tt0025464,El enemigo público número 1,http://ia.media-imdb.com/images/M/MV5BMTUyODE3...,1934.0,manhattan_melodrama,http://content9.flixster.com/movie/66/44/64/66...
10194,65126,Choke,tt1024715,Choke,http://ia.media-imdb.com/images/M/MV5BMTMxMDI4...,2008.0,choke,http://content6.flixster.com/movie/10/85/09/10...
10195,65130,Revolutionary Road,tt0959337,Revolutionary Road,http://ia.media-imdb.com/images/M/MV5BMTI2MzY2...,2008.0,revolutionary_road,http://content8.flixster.com/movie/10/88/40/10...


In [4]:
def get_features(api_key, ID):
    """
    api_key: key for accessing the API
    ID: selects which slice of IDs to download
    """
    omdb_features = pd.DataFrame()    
    counter = 0
    # iterate over each 900 imdb IDs, specified by param ID
    for i in movies['imdbID'][ID*900:(ID+1)*900]:
        try:
            # make request to omdb API
            #print("Request for: imdbID" +str(i))
            req = requests.get('http://www.omdbapi.com/?apikey='+api_key+'&i='+i)
            # convert into json
            data = req.json()
            # convert into dataframe
            feature = pd.DataFrame(data)
            # Several ratings are available in column 'Ratings' (max 3) which creates a row for each rating: Select each rating, create individual column and add to feature dataframe 
            
            if (len(feature['Ratings'])>0):
                dummy = pd.DataFrame(feature['Ratings'][0], index=[0]).pivot(columns='Source', values='Value')
                for k in range(1,feature.shape[0]):
                    dummy = dummy.join(pd.DataFrame(feature['Ratings'][k], index=[0]).pivot(columns='Source', values='Value'))
                feature = feature.head(1).join(dummy)
            
            # delete unnecessary column
            del feature['Ratings']
            # append to large df
            omdb_features = omdb_features.append(feature)
            counter+=1
            if counter % 100 == 0 :
                print('number of retrieved IDs: ', counter)
        except ValueError:
            print('No entry found for ID ',i)
            print(data)
            counter+=1

    # rename columns
    feature.rename(columns={'Internet Movie Database':'Rating_IMDB', 'Rotten Tomatoes': 'Rating_Rotten_Tomatoes', 'Metacritic':'Rating_Metacritic'})
    # save to csv, according to which part of the IDs
    omdb_features.to_csv('../data/raw/omdb_'+str(ID)+'.csv')

Insert your api key and run one cell a day

Max: ID 0-2

In [42]:
get_features(api_key='57aa589f', ID=0)

number of retrieved IDs:  100
number of retrieved IDs:  200
number of retrieved IDs:  300
number of retrieved IDs:  400
number of retrieved IDs:  500
number of retrieved IDs:  600
number of retrieved IDs:  700
number of retrieved IDs:  800
number of retrieved IDs:  900


In [5]:
get_features(api_key = '57aa589f', ID=1)

number of retrieved IDs:  100
number of retrieved IDs:  200
number of retrieved IDs:  300
number of retrieved IDs:  400
number of retrieved IDs:  500
number of retrieved IDs:  600
number of retrieved IDs:  700
number of retrieved IDs:  800
number of retrieved IDs:  900


In [10]:
get_features(api_key = '694aa939', ID=2)

number of retrieved IDs:  100
number of retrieved IDs:  200
number of retrieved IDs:  300
No entry found for ID  tt0800175
number of retrieved IDs:  400
number of retrieved IDs:  500
number of retrieved IDs:  600
number of retrieved IDs:  700
number of retrieved IDs:  800


In [6]:
#troubleshooting
get_features(api_key = 'ff8f29d6', ID=9)

number of retrieved IDs:  100
number of retrieved IDs:  200
number of retrieved IDs:  300
number of retrieved IDs:  400
number of retrieved IDs:  500
number of retrieved IDs:  600
number of retrieved IDs:  700
number of retrieved IDs:  800
number of retrieved IDs:  900


In [5]:
get_features(api_key = 'e1382141', ID=3, sort='True')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


number of retrieved IDs:  100
number of retrieved IDs:  200
number of retrieved IDs:  300
number of retrieved IDs:  400
number of retrieved IDs:  500
number of retrieved IDs:  600
number of retrieved IDs:  700
number of retrieved IDs:  800
number of retrieved IDs:  900


In [12]:
get_features(api_key = 'e1382141', ID=4)

number of retrieved IDs:  100
number of retrieved IDs:  200
number of retrieved IDs:  300
number of retrieved IDs:  400
number of retrieved IDs:  500
number of retrieved IDs:  600
number of retrieved IDs:  700
number of retrieved IDs:  800
number of retrieved IDs:  900


In [46]:
get_features(api_key = 'e1382141', ID=5)

number of retrieved IDs:  100
number of retrieved IDs:  200
number of retrieved IDs:  300
number of retrieved IDs:  400
number of retrieved IDs:  500
number of retrieved IDs:  600
number of retrieved IDs:  700
number of retrieved IDs:  800
number of retrieved IDs:  900


Felix: ID 6-7

In [8]:
get_features(api_key = '57aa589f', ID=6)

number of retrieved IDs:  100
number of retrieved IDs:  200
number of retrieved IDs:  300
number of retrieved IDs:  400
number of retrieved IDs:  500
number of retrieved IDs:  600
number of retrieved IDs:  700
number of retrieved IDs:  800
number of retrieved IDs:  900


In [9]:
get_features(api_key = '694aa939', ID=7)

number of retrieved IDs:  100
number of retrieved IDs:  200
number of retrieved IDs:  300
number of retrieved IDs:  400
number of retrieved IDs:  500
number of retrieved IDs:  600
number of retrieved IDs:  700
number of retrieved IDs:  800
number of retrieved IDs:  900


Christin: ID 8-9

In [6]:
get_features(api_key = 'ff8f29d6', ID=8)

number of retrieved IDs:  100
No entry found for ID  tt0054462
number of retrieved IDs:  200
number of retrieved IDs:  300
number of retrieved IDs:  400
number of retrieved IDs:  500
number of retrieved IDs:  600
number of retrieved IDs:  700
number of retrieved IDs:  800
No entry found for ID  tt0439663
number of retrieved IDs:  900


In [5]:
get_features(api_key = 'ff8f29d6', ID=9)

number of retrieved IDs:  100
number of retrieved IDs:  200
number of retrieved IDs:  300
number of retrieved IDs:  400
number of retrieved IDs:  500
number of retrieved IDs:  600
number of retrieved IDs:  700
number of retrieved IDs:  800
number of retrieved IDs:  900


Carmen: ID 10-11

In [5]:
get_features(api_key = '4c3b1f8c', ID=10)

number of retrieved IDs:  100
number of retrieved IDs:  200
number of retrieved IDs:  300
number of retrieved IDs:  400
number of retrieved IDs:  500
number of retrieved IDs:  600
number of retrieved IDs:  700
number of retrieved IDs:  800
number of retrieved IDs:  900


In [7]:
get_features(api_key = '4c3b1f8c', ID=11)

number of retrieved IDs:  100
number of retrieved IDs:  200


Try to access three previously not available IDs

In [10]:
omdb_features = pd.DataFrame()    
counter = 0
# iterate over the three not available movies
for i in ['tt0800175', 'tt0054462', 'tt0439663']:
    try:
        # make request to omdb API
        #print("Request for: imdbID" +str(i))
        req = requests.get('http://www.omdbapi.com/?apikey=e1382141&i='+i)
        # convert into json
        data = req.json()
        # convert into dataframe
        feature = pd.DataFrame(data)
        # Several ratings are available in column 'Ratings' (max 3) which creates a row for each rating: Select each rating, create individual column and add to feature dataframe 

        if (len(feature['Ratings'])>0):
            dummy = pd.DataFrame(feature['Ratings'][0], index=[0]).pivot(columns='Source', values='Value')
            for k in range(1,feature.shape[0]):
                dummy = dummy.join(pd.DataFrame(feature['Ratings'][k], index=[0]).pivot(columns='Source', values='Value'))
            feature = feature.head(1).join(dummy)

        # delete unnecessary column
        del feature['Ratings']
        # append to large df
        omdb_features = omdb_features.append(feature)
        counter+=1
        if counter % 100 == 0 :
            print('number of retrieved IDs: ', counter)
    except ValueError:
        print('No entry found for ID ',i)
        print(data)
        counter+=1

# rename columns
feature.rename(columns={'Internet Movie Database':'Rating_IMDB', 'Rotten Tomatoes': 'Rating_Rotten_Tomatoes', 'Metacritic':'Rating_Metacritic'})
# save to csv, according to which part of the IDs
omdb_features.to_csv('../data/raw/omdb_'+str(12)+'.csv')

No entry found for ID  tt0800175
{'Response': 'False', 'Error': 'Error getting data.'}
No entry found for ID  tt0439663
{'Response': 'False', 'Error': 'Error getting data.'}


If all features downloaded, aggregated via:

In [18]:
all_omdb_features = pd.DataFrame()
for i in range(0,13):
    all_omdb_features = all_omdb_features.append(pd.read_csv('../data/raw/omdb_'+str(i)+'.csv'))
all_omdb_features = all_omdb_features.reset_index(drop=True)
del all_omdb_features['Unnamed: 0']
all_omdb_features.to_csv('../data/raw/omdb_total.csv')

In [19]:
all_omdb_features

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,Production,Website,Response,Internet Movie Database,Rotten Tomatoes,Metacritic,totalSeasons,Season,Episode,seriesID
0,Toy Story,1995,G,22 Nov 1995,81 min,"Animation, Adventure, Comedy, Family, Fantasy",John Lasseter,"John Lasseter (original story by), Pete Docter...","Tom Hanks, Tim Allen, Don Rickles, Jim Varney",A cowboy doll is profoundly threatened and jea...,...,,,True,8.3/10,100%,95/100,,,,
1,Jumanji,1995,PG,15 Dec 1995,104 min,"Adventure, Comedy, Family, Fantasy",Joe Johnston,"Jonathan Hensleigh (screenplay by), Greg Taylo...","Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",When two kids find and play a magical board ga...,...,,,True,7.0/10,54%,39/100,,,,
2,Grumpy Old Men,1993,PG-13,25 Dec 1993,103 min,"Comedy, Drama, Romance",Donald Petrie,Mark Steven Johnson,"Jack Lemmon, Walter Matthau, Ann-Margret, Burg...",A lifelong feud between two neighbors since ch...,...,,,True,7.0/10,63%,53/100,,,,
3,Waiting to Exhale,1995,R,22 Dec 1995,124 min,"Comedy, Drama, Romance",Forest Whitaker,"Terry McMillan (novel), Terry McMillan (screen...","Whitney Houston, Angela Bassett, Loretta Devin...","Based on Terry McMillan's novel, this film fol...",...,,,True,5.9/10,56%,,,,,
4,Father of the Bride Part II,1995,PG,08 Dec 1995,106 min,"Comedy, Family, Romance",Charles Shyer,"Albert Hackett (screenplay ""Father's Little Di...","Steve Martin, Diane Keaton, Martin Short, Kimb...",George Banks must deal not only with the pregn...,...,,,True,6.0/10,48%,49/100,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10187,Manhattan Melodrama,1934,NOT RATED,04 May 1934,93 min,"Crime, Drama, Romance","W.S. Van Dyke, George Cukor","Oliver H.P. Garrett (screen play), Joseph L. M...","Clark Gable, William Powell, Myrna Loy, Leo Ca...",The friendship between two orphans endures eve...,...,,,True,7.2/10,80%,,,,,
10188,Choke,2008,R,26 Sep 2008,92 min,"Comedy, Drama",Clark Gregg,"Clark Gregg (screenplay), Chuck Palahniuk (novel)","Kathryn Alexander, Teodorina Bello, Kate Blumb...",A sex-addicted con-man pays for his mother's h...,...,,,True,6.4/10,55%,47/100,,,,
10189,Revolutionary Road,2008,R,23 Jan 2009,119 min,"Drama, Romance",Sam Mendes,"Justin Haythe (screenplay), Richard Yates (novel)","Kate Winslet, Leonardo DiCaprio, Christopher F...",A young couple living in a Connecticut suburb ...,...,,,True,7.3/10,67%,69/100,,,,
10190,Blackadder Back & Forth,1999,,06 Dec 1999,33 min,"Short, Comedy, History, Sci-Fi",Paul Weiland,"Richard Curtis, Ben Elton","Rowan Atkinson, Tony Robinson, Stephen Fry, Hu...","At a New Millennium Eve party, Blackadder and ...",...,,,True,7.7/10,,,,,,
