## Before coding

Create a new project

https://console.developers.google.com/projectcreate

Once you have created the project, enable access to the YouTube Data API

https://console.developers.google.com/apis/library

Once the API is enabled, you need to obtain credentials for your project

https://console.developers.google.com/apis/credentials/wizard?api=youtube.googleapis.com

From the options select:  

| Option  | Value |
| ------------- | ------------- |
| ¿Qué API estás usando?  | **YouTube Data API v3**  |
| ¿Desde dónde llamarás a la API? | **Servidor Web**  |
| ¿A qué tipo de datos accederás? | **Datos públicos**  |  

After selecting these values, press **"¿Qué credenciales necesito?"**. You will be given an alphanumeric string that is your API key; place this value into the `api_key` variable:

In [None]:
# YouTube Data API key obtained through the credentials wizard described
# above. It is sent as the `key` parameter of the API requests made below.
api_key = ""

## Now, coding

Import the necessary packages

In [None]:
import requests
import json
import urllib
import bleach
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlencode
from slugify import slugify
from pytube import YouTube

In [None]:
# Base URL of the YouTube Data API v3 search endpoint; a URL-encoded query
# string is appended to it for every search request.
search_url = "https://www.googleapis.com/youtube/v3/search?"
# Base URL of YouTube's timed-text endpoint.
# NOTE(review): not referenced by the code visible in this notebook — confirm
# whether it is still needed.
caption_url = "https://www.youtube.com/api/timedtext?"

In [None]:
# Caption language codes to try for each video, in order of preference.
language_preferences = ['es-MX', 'es']

# Channels to scrape: human-readable name -> YouTube channel ID.
channels = dict([
    ('Gobierno de la República', 'UCfBRIj1Tq8k7-SD8PPVPJpQ'),
    ('Enrique Peña Nieto', 'UC4hlUPhJxEvAwW-yli8kHpw'),
    ('Andrés Manuel López Obrador', 'UCxEgOKuI-n-WOJaNcisHvSg'),
])

# Channel used to seed the initial search parameters.
starting_channel = channels['Gobierno de la República']

In [None]:
# Upper bound on the number of result pages requested per channel.
max_pages = 10

# Query parameters for the search endpoint. `channelId` (and later
# `pageToken`) are overwritten while iterating over channels and pages.
parameters = dict(
    key=api_key,
    part='snippet',
    type='video',
    channelId=starting_channel,
    maxResults=50,
    order='relevance',
    q='discurso',
)

# Show the encoded query string as a quick sanity check.
query_string = urlencode(parameters)
print(query_string)

In [None]:
# Query the search endpoint for every configured channel, following
# pagination for at most `max_pages` pages per channel.
#   videos: channel name -> list of raw search result items
#   count:  total number of items collected across all channels
videos = {}
count = 0
for channel, channel_id in channels.items():
    print("Searching for", channel)
    videos[channel] = []
    parameters['channelId'] = channel_id
    # Every channel starts from its first page: drop any token left over from
    # the previous channel or from an earlier run of this cell.
    parameters.pop('pageToken', None)
    for _ in range(max_pages):
        # Rebuilt on every request so it always reflects the current channel
        # and page token.
        query_string = urlencode(parameters)
        qurl = search_url + query_string
        print(qurl)
        r = requests.get(qurl)
        result = json.loads(r.text)
        if 'items' not in result:
            # The response carried no result list (e.g. an error payload);
            # report it and move on to the next channel instead of raising.
            print("Request failed for", channel, "-", result.get('error', result))
            break
        page_token = result.get('nextPageToken', '')
        print(len(result['items']), page_token)
        videos[channel].extend(result['items'])
        count += len(result['items'])
        if not page_token:
            # No further pages for this channel.
            break
        parameters['pageToken'] = page_token

In [None]:
# Conversion to dataframes: flatten the per-channel search results into one
# row per video (channel slug, video id, publication date, title) and save
# the table as CSV.
import os

chn = []
ids = []
pub = []
titles = []
for c in channels:
    channel_slug = slugify(c)  # same for every video of this channel
    for v in videos[c]:
        snippet = v['snippet']
        chn.append(channel_slug)
        ids.append(v['id']['videoId'])
        pub.append(snippet['publishedAt'])
        titles.append(snippet['title'])

initial_df = pd.DataFrame({
    'channel': chn,
    'id': ids,
    'published_at': pub,
    'title': titles,
})
initial_df['published_at'] = pd.to_datetime(initial_df['published_at'])

# The output directory is not created anywhere before this point, so make
# sure it exists before writing.
os.makedirs("youtube-captions", exist_ok=True)
initial_df.to_csv("youtube-captions/initial.csv")

# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
initial_df.info()

In [None]:
# Fetch snippet, statistics and contentDetails for every video in
# `initial_df`, in batches of `batch_size` ids per request, and save the
# result as CSV.
import os

ids = list(initial_df['id'].values)
found_ids = []  # ids of the items actually returned by the API
categories = []
default_language = []
durations = []
license = []
viewCounts = []
likeCounts = []
dislikeCounts = []
favoriteCounts = []
commentCounts = []

batch_size = 50
video_details = "https://www.googleapis.com/youtube/v3/videos?id=%s&part=snippet,statistics,contentDetails&key=%s"


def _int_stat(statistics, key):
    """Return statistics[key] as an int, or -1 when the key is absent."""
    return int(statistics[key]) if key in statistics else -1


for i in range(0, len(ids), batch_size):
    ids_to_query = ','.join(ids[i:i + batch_size])
    q = video_details % (ids_to_query, api_key)
    r = requests.get(q)
    resultlist = json.loads(r.text)
    if 'items' not in resultlist:
        # No item list in the response (e.g. an error payload): report it and
        # continue with the next batch instead of raising KeyError.
        print("Request failed:", resultlist.get('error', resultlist))
        continue
    for result in resultlist['items']:
        snippet = result['snippet']
        contentDetails = result['contentDetails']
        statistics = result['statistics']

        # Record the id of the returned item rather than the requested one, so
        # every column has the same length and stays aligned even when the
        # response contains fewer items than were requested.
        found_ids.append(result['id'])
        categories.append(snippet['categoryId'])
        default_language.append(snippet.get('defaultAudioLanguage', '-'))
        durations.append(contentDetails['duration'])
        license.append(contentDetails['licensedContent'])
        # Views and favourites stay in the raw form the API returns; '-1'
        # marks a missing value.
        viewCounts.append(statistics.get('viewCount', '-1'))
        favoriteCounts.append(statistics.get('favoriteCount', '-1'))
        # Each counter is looked up independently: any of them can be absent
        # from a response, and -1 marks the missing ones.
        likeCounts.append(_int_stat(statistics, 'likeCount'))
        dislikeCounts.append(_int_stat(statistics, 'dislikeCount'))
        commentCounts.append(_int_stat(statistics, 'commentCount'))

details_df = pd.DataFrame({
    'id': found_ids,
    'category': categories,
    'language': default_language,
    'duration': durations,
    'license': license,
    'views': viewCounts,
    'likes': likeCounts,
    'dislikes': dislikeCounts,
    'favs': favoriteCounts,
    'comments': commentCounts,
})

# Make sure the output directory exists before writing.
os.makedirs("youtube-captions", exist_ok=True)
details_df.to_csv("youtube-captions/details.csv")

# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
details_df.info()

In [None]:
# Download the captions of every video found by the search, parse them into
# timed entries and save one JSON file per video under
# <base_dir>/<channel slug>/.
import os
from os.path import join

vids_subs = {}
errors = []  # ids of the videos whose captions could not be fetched
base_dir = 'youtube-captions'


def _fetch_captions(video_id):
    """Return the caption XML in the first preferred language that has any, else ''."""
    yt = YouTube('https://www.youtube.com/watch?v=' + video_id)
    for lang in language_preferences:
        caption = yt.captions.get_by_language_code(lang)
        if caption is None:
            continue
        xml = caption.xml_captions
        if xml:
            return xml
    return ''


def _parse_captions(xml):
    """Turn caption XML into a list of {'duration', 'start', 'content'} dicts."""
    soup = BeautifulSoup(xml, "lxml")
    return [
        {
            'duration': text.get('dur'),
            'start': text.get('start'),
            # The text is parsed a second time so that only the plain text of
            # any markup still embedded in it is kept.
            'content': BeautifulSoup(text.get_text(), "lxml").text,
        }
        for text in soup.find_all('text')
    ]


for channel in channels:
    vids_subs[channel] = []
    for video in videos[channel]:
        videoId = video['id']['videoId']
        title = video['snippet']['title']
        try:
            subtitles = _fetch_captions(videoId)
        except Exception:
            # Best effort: remember the failing video and keep going.
            # `Exception` (not a bare except) so Ctrl-C still stops the run.
            errors.append(videoId)
            continue
        if subtitles:
            vids_subs[channel].append({
                'id': videoId,
                'title': title,
                'captions_parsed': _parse_captions(subtitles),
            })

    ## Now saving the good stuff
    directory = join(base_dir, slugify(channel))
    os.makedirs(directory, exist_ok=True)
    print("Saving to", directory)
    for vid in vids_subs[channel]:
        # NOTE(review): the file name comes from the title only, so two videos
        # of one channel with the same title write to the same file.
        file_path = join(directory, slugify(vid['title']) + '.json')
        with open(file_path, 'w') as outfile:
            json.dump(vid, outfile, indent=4)
    # Release the parsed captions once they are on disk.
    del vids_subs[channel]
print("Done!")

In [None]:
# Clean-up pass kept as an inert string literal: assigning it to
# `cleaning_subs` executes nothing. The code inside the string would reopen
# every saved file ending in "json" under the listed directories, replace each
# caption's 'content' with its BeautifulSoup-extracted text, and write the
# file back in place.
# NOTE(review): the directory names in the string ('amlo', 'presidencia',
# 'epn') are hard-coded rather than derived from `channels` — confirm they
# match the folders on disk before running it.
cleaning_subs = """directories = ['amlo', 'presidencia', 'epn']
for d in directories:
    directory = join('youtube-captions', d)
    for file in os.listdir(directory):
        if file.endswith("json"):
            video = None
            file1 = join(directory, file)
            with open(file1, 'r') as captions_file:
                video = json.load(captions_file)
                captions = video['captions_parsed']
                for cap in captions:
                    try:
                        cap['content'] = BeautifulSoup(cap['content'], "lxml").get_text()
                    except:
                        print("Error", file1)
            with open(join(directory, file), 'w') as captions_file:
                json.dump(video, captions_file)
print("Done!")
"""