## Problem description

### Daily ID file exports

Retrieve the daily ID file exports, containing a list of the valid IDs that can be found on TMDb and some higher level attributes that are helpful for filtering items like the adult, video and popularity values.

In [1]:
ids_path = "./ids/"
ids_archives_path = ids_path + "/archives/"

In [2]:
import requests
from datetime import date

today = date.today()
dt = today.strftime("%m_%d_%Y")
id_urls = {
    "movies" : "http://files.tmdb.org/p/exports/movie_ids_{0}.json.gz".format(dt),
    "tv_series" : "http://files.tmdb.org/p/exports/tv_series_ids_{0}.json.gz".format(dt),
    "people" : "http://files.tmdb.org/p/exports/person_ids_{0}.json.gz".format(dt),
    "collections" : "http://files.tmdb.org/p/exports/collection_ids_{0}.json.gz".format(dt),
    "tv_networks" : "http://files.tmdb.org/p/exports/tv_network_ids_{0}.json.gz".format(dt),
    "keywords" : "http://files.tmdb.org/p/exports/keyword_ids_{0}.json.gz".format(dt),
    "production_companies" : "http://files.tmdb.org/p/exports/production_company_ids_{0}.json.gz".format(dt),
}

#Retrieve all the archives containing the resources IDs
for name, url in id_urls.items():
    r = requests.get(url)
    with open(ids_archives_path + name +".ids.json.gz", 'wb') as f:
        f.write(r.content)

In [3]:
#List all the .gz files inside the IDs folder
from os import listdir
from os.path import isfile, join

id_archives = [f for f in listdir(ids_archives_path) if isfile(join(ids_archives_path, f)) and f.endswith(".gz")]

In [4]:
import gzip
import shutil

#Extract all the .gz files
for id_archive in id_archives:
    with open(ids_path + id_archive.replace(".gz", ""), 'wb+') as f_out:
        with gzip.open(ids_archives_path + id_archive, 'rb') as f_in:
            shutil.copyfileobj(f_in, f_out)

### Call

Now we have an updated and comprehensive list of IDs. We will call some APIs function to retrieve more info on these IDs.

In [5]:
!pip install tmdbsimple

Collecting tmdbsimple
  Downloading tmdbsimple-2.7.0-py3-none-any.whl (38 kB)
Installing collected packages: tmdbsimple
Successfully installed tmdbsimple-2.7.0


## Data gathering

In [19]:
import tmdbsimple as tmdb

movies_path = "./movies/"

tmdb.API_KEY = '34ed3ca4d3adae63d1bed1b1598bfbbb'
search = tmdb.Discover()
response = search.movie(primary_release_year='2020')
print("Total results:", response['total_results'])
print("Page {0} of {1}".format(response['page'], response['total_pages']))


Total results: 10000
Page 1 of 500


{'page': 1,
 'results': [{'adult': False,
   'backdrop_path': '/cjaOSjsjV6cl3uXdJqimktT880L.jpg',
   'genre_ids': [12, 14, 10751, 16],
   'id': 529203,
   'original_language': 'en',
   'original_title': 'The Croods: A New Age',
   'overview': 'After leaving their cave, the Croods encounter their biggest threat since leaving: another family called the Bettermans, who claim and show to be better and evolved. Grug grows suspicious of the Betterman parents, Phil and Hope,  as they secretly plan to break up his daughter Eep with her loving boyfriend Guy to ensure that their daughter Dawn has a loving and smart partner to protect her.',
   'popularity': 1937.566,
   'poster_path': '/tK1zy5BsCt1J4OzoDicXmr0UTFH.jpg',
   'release_date': '2020-11-25',
   'title': 'The Croods: A New Age',
   'video': False,
   'vote_average': 8.1,
   'vote_count': 397},
  {'adult': False,
   'backdrop_path': '/wk58aoyWpMTVkKkdjw889XfWGdL.jpg',
   'genre_ids': [53, 80, 9648],
   'id': 646593,
   'original_languag

In [3]:
# uncomment to install
import sys
from pymongo import MongoClient
import json

client = MongoClient('localhost', 27017, username='mongoadmin', password='pass1234')

# let's create a new DB - bda
database = client['bda']

# let's create a collection
movies = database.movies



## Data description
## Data cleaning
## Data analysis
## Conclusions

In [1]:
import sys
from pymongo import MongoClient
import json

client = MongoClient('localhost', 27017, username='mongoadmin', password='pass1234')
database = client['bda']

In [2]:
import pandas as pd
import numpy as np

In [3]:
cursor = movies.find({},{ 'adult': 0, 'backdrop_path': 0, 'original_title': 0, 'overview': 0, 'poster_path': 0, 'video': 0, '_id': 0 })
moviesdf = pd.DataFrame(list(cursor))

NameError: name 'movies' is not defined