# Imports

These imports are required by every stage below. Each stage can be run on its own, provided the previous stage has already been completed and its output saved.

In [1]:
import pandas as pd
import numpy as np

import random
import json
import os
import os.path as osp
import datetime

from youtube_dl import YoutubeDL

from movie_data_scraper import TMDB_Scraper

from google_auth_oauthlib.flow import InstalledAppFlow
from apiclient.discovery import build
from apiclient.http import MediaFileUpload
from apiclient import errors

# Prefix that turns a YouTube video id into a watch URL.
yt_url_prefix = 'https://www.youtube.com/watch?v='
# Folder (relative to the notebook) where stage outputs are written.
output_folder = 'data/'

# Read the TMDb API key from the environment so the secret is not stored in the notebook.
# NOTE(review): the key that used to be hard-coded here has been published and should be rotated.
tmdb_api_key = os.environ.get("TMDB_API_KEY")
if not tmdb_api_key:
    raise RuntimeError("Set the TMDB_API_KEY environment variable to your TMDb API key before running this notebook.")
tmdb_scraper = TMDB_Scraper(api_key=tmdb_api_key)

# 1) Find Movie Titles

Run a preliminary query against the TMDb discover API for the movies we want to aggregate, and store the total number of result pages so that we can iterate over all of them later.

In [3]:
# Preliminary discover query: movies with at least 2000 votes, best rated first.
discover_query_args = {
    'sort_by': 'vote_average.desc',
    'vote_count.gte': 2000
}
discover_query_results = tmdb_scraper.run_api("discover/movie", js_query_args=discover_query_args)
# Total number of result pages reported for this query.
num_pages = discover_query_results['total_pages']

Iterate through and aggregate all results

In [4]:
def get_movie_ids(cache_file=None):
    """Get the TMDb ids of the movies we want to analyze.

    By default the discover API is queried for movies with at least 2000
    votes, sorted by average vote (highest first), and every result page is
    aggregated.

    Args:
        cache_file: Optional name (without the ".npy" extension) of a
            previously saved id snapshot. When given, that dated snapshot is
            loaded instead of querying the API.

    Returns:
        A numpy array of TMDb movie ids.
    """
    if cache_file:
        return np.load(get_tmdb_cache_path(cache_file))

    query_args = {
        'sort_by': 'vote_average.desc',
        'vote_count.gte': 2000,
    }
    movie_ids = []
    # The page count is taken from the API response itself, so this function
    # does not rely on a `num_pages` global left behind by another cell.
    page, total_pages = 1, 1
    while page <= total_pages:
        this_page_results = tmdb_scraper.run_api(
            "discover/movie", js_query_args={**query_args, 'page': page}
        )
        total_pages = this_page_results['total_pages']
        print(f"Aggregating movie IDs from page {page} of {total_pages}", end='\r')
        movie_ids.extend(m['id'] for m in this_page_results['results'])
        page += 1
    return np.array(movie_ids)
        
def get_tmdb_cache_path(cache_file):
    """Return the path of the cached TMDb id snapshot named ``cache_file``.

    The path is ``output_folder`` followed by ``tmdb_id_history/``, the given
    name, and the ``.npy`` extension.
    """
    path_parts = [output_folder, 'tmdb_id_history/', cache_file, ".npy"]
    return "".join(path_parts)

In [5]:
# Load the movie ids from the cached snapshot saved under this timestamp.
tmdb_ids = get_movie_ids('2019-10-30_01-28-00')
#tmdb_ids = get_movie_ids() # Use this line instead to get new, current data for new analyses

In [9]:
# Uncomment to save these TMDb id's with current datetime.
# np.save(get_tmdb_cache_path(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")), tmdb_ids)

# 2) Aggregate Movie Metadata, Crew Metadata, and Trailer Links

Collect Data in-memory

In [33]:
# Collect one record per trailer and build the DataFrame once at the end.
# Growing a DataFrame row by row is quadratic, and DataFrame.append was
# removed in pandas 2.0.
trailer_columns = ["tmdb_id", 'tmdb_title', "trailer_title", "trailer_youtube_key"]
trailer_records = []
# Per-movie JSON payloads, keyed by the TMDb id as a string
movie_details = {}
movie_crew = {}

for i, tmdb_id in enumerate(tmdb_ids, start=1):
    print(f"Aggregating data from movie {i} of {len(tmdb_ids)}", end='\r')
    this_movie_details = tmdb_scraper.movie_details(tmdb_id)
    movie_details[str(tmdb_id)] = this_movie_details
    movie_crew[str(tmdb_id)] = tmdb_scraper.movie_crew(tmdb_id)
    videos = tmdb_scraper.movie_videos(tmdb_id)
    # Keep only the videos TMDb labels as trailers
    for video in videos['results']:
        if video['type'] != 'Trailer':
            continue
        trailer_records.append({
            "tmdb_id": this_movie_details['id'],
            'tmdb_title': this_movie_details['title'],
            'trailer_title': video['name'],
            'trailer_youtube_key': video['key']
        })

# Passing `columns` keeps the expected schema even when no trailer was found.
all_trailers = pd.DataFrame(trailer_records, columns=trailer_columns)

Aggregating data from of movie 1186 of 1187

Save Data!

In [34]:
# Write the trailer table as CSV (the index is written as the first column)
trailers_csv_path = osp.join(output_folder, "trailers.csv")
all_trailers.to_csv(trailers_csv_path)

# Write the two per-movie dictionaries as JSON files, details first, then crew
for json_name, json_payload in (('movie_details.json', movie_details),
                                ('movie_crew.json', movie_crew)):
    with open(osp.join(output_folder, json_name), 'w') as outfile:
        json.dump(json_payload, outfile)

# 3) Download Movie Trailers to Local Drive

In [18]:
# Reload the stage-2 outputs from disk so this stage does not depend on
# variables left in memory by the previous stage.
with open(osp.join(output_folder, 'movie_details.json'), 'r') as infile:
    movie_details = json.load(infile)

with open(osp.join(output_folder, 'movie_crew.json'), 'r') as infile:
    movie_crew = json.load(infile)

# trailers.csv is written with its index as the first column; index_col=0
# restores it instead of adding a stray "Unnamed: 0" column.
trailers = pd.read_csv(osp.join(output_folder, 'trailers.csv'), index_col=0)

In [9]:
def download_trailers(dataset_dir, ydl_opts=None, verbose=False):
    """Download every trailer listed in the global ``trailers`` table that is
    not yet present in ``dataset_dir``.

    A trailer counts as already downloaded when ``dataset_dir`` contains a file
    whose name, up to the first '.', equals its YouTube key.

    Args:
        dataset_dir: Existing directory that receives the video files.
        ydl_opts: Optional dict of YoutubeDL options. The dict is copied, never
            modified; its 'outtmpl' entry is always replaced so files are
            named ``<id>.<ext>`` inside ``dataset_dir``.
        verbose: When True, print the set of keys still to be downloaded.
    """
    # Copy the options: the original mutated the caller's dict (and the shared
    # default dict), leaking 'outtmpl' from one call into the next.
    opts = dict(ydl_opts or {})
    opts['outtmpl'] = os.path.join(dataset_dir, '%(id)s.%(ext)s')

    downloaded = {name.split('.')[0] for name in os.listdir(dataset_dir)}
    remaining_trailers = set(trailers['trailer_youtube_key']) - downloaded
    if verbose:
        print(f'remaining trailers (len({len(remaining_trailers)})) {remaining_trailers}')
    with YoutubeDL(opts) as ydl:
        for t in remaining_trailers:
            try:
                print(f"Downloading {t}")
                ydl.extract_info(yt_url_prefix + t, download=True)
            except Exception as e:
                # Best effort: report the failure and continue with the next video.
                print("Failed with exception")
                print(e)
            print()

In [None]:
def download_yt_videos(dataset_dir, yt_ids, ydl_opts=None, verbose=False):
    """Download the YouTube videos in ``yt_ids`` that are not yet present in
    ``dataset_dir``.

    A video counts as already downloaded when ``dataset_dir`` contains a file
    whose name, up to the first '.', equals its id.

    Args:
        dataset_dir: Existing directory that receives the video files.
        yt_ids: Iterable of YouTube video ids; duplicates are ignored.
        ydl_opts: Optional dict of YoutubeDL options. The dict is copied, never
            modified; its 'outtmpl' entry is always replaced so files are
            named ``<id>.<ext>`` inside ``dataset_dir``.
        verbose: When True, print the set of ids still to be downloaded.
    """
    # Copy the options: the original mutated the caller's dict (and the shared
    # default dict), leaking 'outtmpl' from one call into the next.
    opts = dict(ydl_opts or {})
    opts['outtmpl'] = os.path.join(dataset_dir, '%(id)s.%(ext)s')

    downloaded = {name.split('.')[0] for name in os.listdir(dataset_dir)}
    remaining_trailers = set(yt_ids) - downloaded
    if verbose:
        print(f'remaining trailers (len({len(remaining_trailers)})) {remaining_trailers}')
    with YoutubeDL(opts) as ydl:
        # Count from 1 so the last video is reported as n/n rather than (n-1)/n.
        for i, t in enumerate(remaining_trailers, start=1):
            try:
                print(f"Downloading {t} - {i}/{len(remaining_trailers)}")
                ydl.extract_info(yt_url_prefix + t, download=True)
            except Exception as e:
                # Best effort: report the failure and continue with the next video.
                print("Failed with exception")
                print(e)
            print()

In [20]:
# Download every trailer key listed in the trailers table into ../data/
download_yt_videos(
    dataset_dir="../data/",
    yt_ids=trailers['trailer_youtube_key'],
    ydl_opts={'format': 'mp4/bestvideo'},
    verbose=True,
)

# 4) Download Casual Videos

In [11]:
# Keep only the lines that contain a link and take the last 11 characters of
# each one, after stripping surrounding whitespace
# (assumes every link ends with the video id; verify against casual_videos.txt).
with open("./casual_videos.txt") as links_file:
    casual_yt_ids = [line.strip()[-11:] for line in links_file if 'https' in line]
casual_yt_ids

['a7JUoSiJG-U',
 'W0foraj84oU',
 'rtMSMWm-xw0',
 'WE5uaL0urok',
 'APMcESZjelM',
 'FZ7wBmLvcSA',
 'Uy6gWBGP9k4',
 '4hUhAFVpip8',
 'tFNT50-fCGk',
 'bHKQfui91yc',
 'J_EDZOodc14',
 'x2irNRNLrrI',
 'ggNIcxiUv6Q',
 'b9lpQ3P07Ds',
 'yAFmSm5l8gE',
 'sCFi2O1vyWQ',
 'kQM6Q9Axyx0',
 'GK2_2dUxtLI',
 'O2VSHC9DFbQ',
 'I_1oQYpONjs']

In [21]:
# Download the casual videos parsed from casual_videos.txt into ../data/casual/
download_yt_videos(
    dataset_dir="../data/casual/",
    yt_ids=casual_yt_ids,
    ydl_opts={'format': 'mp4/bestvideo'},
    verbose=True,
)

remaining trailers (len(6)) {'4hUhAFVpip8', 'tFNT50-fCGk', 'ggNIcxiUv6Q', 'sCFi2O1vyWQ', 'bHKQfui91yc', 'APMcESZjelM'}
Downloading 4hUhAFVpip8
[youtube] 4hUhAFVpip8: Downloading webpage
[youtube] 4hUhAFVpip8: Downloading video info webpage
[download] Destination: ../data/casual/4hUhAFVpip8.mp4
[download] 100% of 18.11MiB in 00:1709MiB/s ETA 00:000

Downloading tFNT50-fCGk
[youtube] tFNT50-fCGk: Downloading webpage
[youtube] tFNT50-fCGk: Downloading video info webpage
[download] Destination: ../data/casual/tFNT50-fCGk.mp4
[download] 100% of 105.06MiB in 01:3610MiB/s ETA 00:009

Downloading ggNIcxiUv6Q
[youtube] ggNIcxiUv6Q: Downloading webpage
[youtube] ggNIcxiUv6Q: Downloading video info webpage
[youtube] ggNIcxiUv6Q: Downloading js player vflq5GyJR
[youtube] ggNIcxiUv6Q: Downloading js player vflq5GyJR
[download] Destination: ../data/casual/ggNIcxiUv6Q.mp4
[download] 100% of 55.38MiB in 00:5010MiB/s ETA 00:001

Downloading sCFi2O1vyWQ
[youtube] sCFi2O1vyWQ: Downloading webpage
[youtub

# 5) Upload Movie Trailers to Google Drive (optional)

In [2]:
# Uncomment to install dependencies
#!pip install pandas
#!pip install youtube_dl
#!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib

In [None]:
# GOOGLE DRIVE API CONSTANTS
# Id of the Drive folder that is listed and uploaded into further below
folder_id = '1UnDIe4VHMM8bZzIfKEG8NJdQGnNrrxEG'

# Run the installed-app OAuth flow with the client secrets in credentials.json,
# then build a Drive v3 client from the resulting credentials
flow = InstalledAppFlow.from_client_secrets_file(
    'credentials.json',
    scopes=['https://www.googleapis.com/auth/drive'],
)
creds = flow.run_local_server(port=0)
drive_api = build('drive', 'v3', credentials=creds)

In [19]:
# Helpful function
def get_file_list_from_folder(service, folder_id):
    """Return the file entries of a Drive folder.

    Pages through the results of the files().list query
    ``'<folder_id>' in parents`` and concatenates the 'files' entry of every
    response page.

    Args:
    service: Drive API service instance.
    folder_id: ID of the folder whose files are listed.

    Returns:
    A list with the 'files' items of all response pages, in page order.
    """
    parent_query = f"'{folder_id}' in parents"
    collected = []
    request = service.files().list(q=parent_query)
    # list_next returns None once there are no further pages
    while request is not None:
        page = request.execute()
        collected += page['files']
        request = service.files().list_next(request, page)
    return collected

In [24]:
# Reload the trailer table written in stage 2 (its first CSV column is the saved index)
trailers = pd.read_csv(osp.join(output_folder, "trailers.csv"), index_col=0)
# Number of distinct movie titles that have at least one trailer.
# The table stores titles in 'tmdb_title'; it has no 'movie_title' column.
trailers['tmdb_title'].unique().size

1171

In [None]:
# youtube-dl options: save each video as <id>.<ext> inside output_folder
ydl_opts = {
    'outtmpl': output_folder + '%(id)s.%(ext)s',
    'format': 'bestvideo'
}

# Names (up to the first '.') of the files already present in the Drive folder
uploaded = get_file_list_from_folder(drive_api, folder_id)
#print(uploaded)
uploaded = [d['name'].split('.')[0] for d in uploaded]
# How often each name occurs in the folder; useful for spotting duplicate uploads
duplicates = pd.Series(uploaded).value_counts()
uploaded = set(uploaded)
#print(duplicates)
#print(f'files already uploaded {uploaded}')

# The trailer table stores the YouTube id in 'trailer_youtube_key';
# it has no 'youtube_key' column.
all_to_download = set(trailers['trailer_youtube_key'])
#print(f'files already uploaded & requested for upload{uploaded & all_to_download}')
remaining_trailers = all_to_download - uploaded
print(f'remaining trailers (len({len(remaining_trailers)})) {remaining_trailers}')

In [None]:
# For every trailer missing from Drive: download it, upload it, then delete the local copy
with YoutubeDL(ydl_opts) as ydl:
    for yt_key in remaining_trailers:
        try:
            # Download the video into output_folder
            print(f"Downloading {yt_key}")
            ydl.extract_info(yt_url_prefix + yt_key, download=True)

            # The extension is filled in by the output template, so locate the file by its id
            matching_files = [name for name in os.listdir(output_folder) if yt_key in name]
            file_name = matching_files[0]
            print(file_name)
            local_path = output_folder + file_name

            # Upload to the target Drive folder
            print(f"Uploading file {file_name}...")
            upload_body = {'name': file_name, 'parents': [folder_id]}
            upload_media = MediaFileUpload(local_path)
            drive_file = drive_api.files().create(body=upload_body, media_body=upload_media).execute()
            print(f"Created file '{drive_file.get('name')}' id '{drive_file.get('id')}'.")

            # Free local disk space once the upload has gone through
            os.remove(local_path)
            print(f"Removed {file_name}")
            print("Success!")
        except Exception as err:
            # Best effort: report the failure and continue with the next trailer
            print("Failed with exception")
            print(err)
        print()