## Fetch External Features
#### Import packages

In [1]:
import datetime
import dask.dataframe as dd
import json
import os
import pandas as pd
import requests

from dask.diagnostics import ProgressBar
from dask.multiprocessing import get

#### Read TMDB API key

In [2]:
# Load the TMDB credentials file and keep the value stored under 'api'.
with open('../api/tmdb.json') as key_file:
    credentials = json.load(key_file)
api_key = credentials['api']

#### Read training and testing datasets

In [3]:
# Both CSV files live in the sibling 'dataset' directory; their first column
# is used as the index when loading.
dataset_folder = os.getcwd() + '/../dataset/'
train, test = (
    pd.read_csv(dataset_folder + file_name, index_col=0)
    for file_name in ('train.csv', 'test.csv')
)

# Stack train on top of test with a fresh 0..n-1 index, then restore the
# column order (and column set) of the training frame.
dataset = pd.concat([train, test], ignore_index=True, sort=True)[train.columns]

In [8]:
# Keep only the first ten rows (presumably to limit the number of API calls
# while trying the notebook out -- remove this cell for a full run).
dataset = dataset.head(10)
dataset.shape

(10, 22)

#### Retrieve features 'vote_average' and 'vote_count' using TMDB API

In [9]:
def function(x):
    """Fetch the TMDB vote statistics for a single movie.

    Parameters
    ----------
    x : str
        Movie identifier appended to the TMDB ``/3/movie/`` endpoint (the
        notebook passes values of the ``imdb_id`` column).

    Returns
    -------
    pandas.Series
        Series with the entries ``vote_average`` and ``vote_count``. Each
        entry is 0 when the identifier is missing, the request fails, or the
        response does not contain that field.
    """
    vote_average = 0
    vote_count = 0

    # A missing id (NaN, None, empty string) cannot form a valid movie URL,
    # so skip the request and keep the defaults instead of raising TypeError
    # on the string concatenation.
    if isinstance(x, str) and x:
        try:
            # `params` lets requests URL-encode the key; `timeout` keeps one
            # stalled connection from hanging the whole run.
            payload = requests.get(
                'https://api.themoviedb.org/3/movie/' + x,
                params={'api_key': api_key},
                timeout=10,
            ).json()
        except (requests.RequestException, ValueError):
            # Transport error or non-JSON body: fall back to the defaults,
            # the same way a response without the vote fields is handled.
            payload = None

        if isinstance(payload, dict):
            vote_average = payload.get('vote_average', vote_average)
            vote_count = payload.get('vote_count', vote_count)

    return pd.Series({'vote_average': vote_average,
                      'vote_count': vote_count})

#### Partition the dataset using dask.dataframe and apply the function to each partition

In [17]:
# Split the ids into partitions and look every id up with `function`; the
# per-partition results are gathered into `external_features` by compute().
# `scheduler='processes'` selects the multiprocessing scheduler. It replaces
# the `get=get` keyword, which dask deprecated in favour of `scheduler=` and
# later removed.
ddata = dd.from_pandas(dataset['imdb_id'], npartitions=50)
with ProgressBar():
    external_features = ddata.map_partitions(
        lambda df: df.apply(function)
    ).compute(scheduler='processes')

[########################################] | 100% Completed | 11.1s


#### Write external features to file

In [18]:
# Timestamp (local time, minute resolution) that prefixes the output file
# name, e.g. '2019-03-01_14-05'.
time = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M')

# index=False writes only the feature columns, without the row index.
external_features.to_csv(dataset_folder + time + '_external_features.csv', index=False)