In [None]:
import ndjson
import os
import re
import requests
from apiclient.discovery import build
from dotenv import load_dotenv

# Load environment variables (presumably from a local .env file) so the API
# key does not have to be hard-coded in the notebook.
load_dotenv()

# os.getenv returns None when the variable is unset; API calls made with a
# missing key are expected to be rejected by the service.
YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY')
YOUTUBE_API_SERVICE_NAME = 'youtube'
YOUTUBE_API_VERSION = 'v3'

# Module-level API client. It is built from the constants above so the
# service name and version are defined in exactly one place.
youtube = build(
    YOUTUBE_API_SERVICE_NAME,
    YOUTUBE_API_VERSION,
    developerKey=YOUTUBE_API_KEY,
)

In [None]:
# Short channel key -> YouTube channel ID. The key is what callers pass as
# ``channel_name``; the value is sent to the API as ``channelId``.
CHANNEL_ID_LISTS = dict(
    mlb="UCoLrcjPV5PbUrUyXq5mjc_A",
    fifa="UCpcTrCXblq78GZrTUTLWeBw",
    major_league_soccer="UCSZbXT5TLLW_i-5W8FZpFsg",
    laliga="UCTv-XvfzLX3i4IGWAm4sbmA",
    serie_A="UCBJeMCIeLQos7wacox4hmLQ",
    bundesliga="UC6UL29enLNe4mqwTfAyeNuw",
    nba="UCWJ2lWNubArHWmf3FIHbfcQ",
    wnba="UCO9a_ryN_l7DIDS-VIt-zmw",
    fiba="UCtInrnU3QbWqFGsdKT1GZtg",
    march_madness="UCKjEtnnXEHsXE9IvCb92V7g",
    nfl="UCDVYQ4Zhbm3S2dlz7P1GBDg",
    college_football="UCzRWWsFjqHk1an4OnVPsl9g",
)


def write_to_ndjson(path, result):
    """Append ``result`` as one row to the NDJSON file at ``path``.

    The file is opened in append mode, so it is created when it does not
    exist and rows written earlier are kept.

    Args:
        path: Location of the NDJSON file.
        result: The row to write; handed unchanged to ``ndjson.writer``.
    """
    with open(path, 'a') as out_file:
        ndjson.writer(out_file).writerow(result)


def is_duration_over_time_threshold(duration, time_threshold):
    """Return whether an ISO 8601 duration lasts at least ``time_threshold`` minutes.

    Supported components are weeks, days, hours, minutes and seconds, e.g.
    ``PT4M13S``, ``PT1H2M``, ``P1DT2H3M4S`` or ``P0D``. Each component is
    matched to its own designator letter, so a day (or week) count can never
    be mistaken for the hour or minute count.

    Args:
        duration: ISO 8601 duration string.
        time_threshold: Minimum length in minutes (inclusive).

    Returns:
        True when the duration is ``time_threshold`` minutes or longer. A
        string that does not match the supported format is treated as a
        zero-length duration.
    """
    match = re.fullmatch(
        r'P(?:(\d+)W)?(?:(\d+)D)?'
        # Only the integer part of a fractional seconds value is captured.
        r'(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)(?:\.\d+)?S)?)?',
        duration,
    )
    if match is None:
        return 0 >= time_threshold

    # Components missing from the string come back as None and count as zero.
    weeks, days, hours, minutes, seconds = (
        int(value) if value else 0 for value in match.groups()
    )
    total_minutes = (weeks * 7 + days) * 24 * 60 + hours * 60 + minutes + seconds / 60

    return total_minutes >= time_threshold


def get_thumbnails(channel_name, save_path, save_ndjson, published_after, published_before, page_token, time_threshold):
    """Save thumbnails and metadata for one page of a channel's long-enough videos.

    Searches the channel for videos published inside the given window, looks
    up each video's duration, and for every video lasting at least
    ``time_threshold`` minutes writes ``<save_path>/<video_id>.jpg`` and
    appends a metadata row to ``save_ndjson``.

    Args:
        channel_name: Key into ``CHANNEL_ID_LISTS``.
        save_path: Directory for the thumbnail images; created if missing.
        save_ndjson: Path of the NDJSON file that receives one row per saved video.
        published_after: Lower bound passed to the search as ``publishedAfter``.
        published_before: Upper bound passed to the search as ``publishedBefore``.
        page_token: Token of the result page to fetch (empty for the first page).
        time_threshold: Minimum video length in minutes (inclusive).

    Returns:
        The search response's ``nextPageToken`` (also printed), or ``None``
        when the response carries no further page.

    Raises:
        KeyError: If ``channel_name`` is not in ``CHANNEL_ID_LISTS``.
    """
    videos_response = youtube.search().list(
        part="snippet",
        channelId=CHANNEL_ID_LISTS[channel_name],
        maxResults=50,
        order="date",
        publishedAfter=published_after,
        publishedBefore=published_before,
        pageToken=page_token,
        # Only video results carry the "videoId" that everything below needs.
        type="video",
    ).execute()

    # The last page has no "nextPageToken", so a plain lookup would raise.
    next_page_token = videos_response.get("nextPageToken")
    print(next_page_token)

    # Defensive filter: skip any result that still lacks a video id.
    video_results = [
        result for result in videos_response.get("items", [])
        if result.get("id", {}).get("videoId")
    ]
    if not video_results:
        return next_page_token

    # One batched lookup for the whole page instead of one request per video.
    details_response = youtube.videos().list(
        part="contentDetails",
        id=",".join(result["id"]["videoId"] for result in video_results),
    ).execute()
    durations = {
        item["id"]: item["contentDetails"]["duration"]
        for item in details_response.get("items", [])
    }

    if save_path:
        os.makedirs(save_path, exist_ok=True)

    for result in video_results:
        video_id = result["id"]["videoId"]
        duration = durations.get(video_id)
        # Videos without details (e.g. removed since the search) are skipped.
        if duration is None or not is_duration_over_time_threshold(duration, time_threshold):
            continue

        snippet = result["snippet"]
        # Download only after the duration filter passes, so rejected videos
        # cost no extra request.
        thumbnail_response = requests.get(snippet["thumbnails"]["high"]["url"], timeout=30)
        if not thumbnail_response.ok:
            # Never store an HTTP error body as if it were an image.
            print(f'skipping {video_id}: thumbnail request returned HTTP {thumbnail_response.status_code}')
            continue

        file_name = f'{video_id}.jpg'
        # save image
        with open(f'{save_path}/{file_name}', "wb") as f:
            f.write(thumbnail_response.content)
        video_info = {
            'channel': channel_name,
            'id': video_id,
            'title': snippet["title"],
            'time': snippet["publishedAt"],
        }
        write_to_ndjson(save_ndjson, video_info)

    return next_page_token

In [None]:
# First page of results: there is no page token yet, so an empty string is passed.
get_thumbnails(
    channel_name='nfl',
    save_path='./american_football',
    save_ndjson='./american_football.ndjson',
    published_after="2018-09-01T00:00:00Z",
    published_before="2018-12-01T00:00:00Z",
    page_token='',
    time_threshold=6,
)