### YouTube API Data Extraction with Python
### Author: Tyler Blair

This script pulls playlist and video data from YouTube using the YouTube Data API. To use it, you first need to set up API credentials with Google, which can be done at console.developers.google.com.

The script reads the YouTube API key from the `YT_API_KEY` environment variable. If you are unfamiliar with setting environment variables, the steps are included in the README.md file on my GitHub (tblair7).

In [30]:
import os
import requests
import json
import datetime
from datetime import datetime, time, timedelta, tzinfo
import pandas as pd
import numpy as np
import re
import google.oauth2.credentials
import google_auth_oauthlib.flow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from google_auth_oauthlib.flow import InstalledAppFlow

##### The API key is read from the YT_API_KEY environment variable, which  ######
##### must be set before this point (os.environ.get returns None if it is ######
##### not). Alternatively, set the key explicitly here:
# api_key = 'your key'
api_key = os.environ.get('YT_API_KEY')

################### only parameters you should need to set ###################

playlistId = 'PLJpYtEF3No5Nc2264c7hNJehKSMf9Mcni' # ID of the playlist to pull; sent as the playlistId request parameter
playlistIdentifier = '2018sept' # label used as the prefix of the saved .csv file name
maxResults = 0 # 0 = do not send maxResults at all; any other value is sent as the API's maxResults parameter. NOTE(review): the original comment gave the range as 0-250 -- confirm the real per-page limit against the API docs

##############################################################################

# documentation of parameters you are able to use for playlistItems
# https://developers.google.com/youtube/v3/docs/playlistItems#properties

api_params_playlist = 'snippet, contentDetails' # "part" value sent with playlistItems requests, as a single string
api_params_videos = "id, contentDetails, statistics, snippet" # "part" value sent with videos requests, as a single string

# column headers kept from the playlistItems dataframe, and the names they
# are given afterwards (the two tuples pair up element by element)
params_playlist = 'videoId', 'videoPublishedAt', 'publishedAt', 'title'
params_playlist_rename = 'id', 'dateUploaded', 'dateFound', 'title'

# column headers kept from the videos dataframe, and the names they are
# given afterwards (the two tuples pair up element by element)
params_videos = 'id','channelId','viewCount','likeCount','dislikeCount', 'duration'
params_videos_rename = 'id', 'channelID','views', 'likes', 'dislikes', 'duration_secs'


################################## Constants #################################

# unit letters looked for in a duration string by song_length, and the number
# of seconds each unit is worth (same order in both lists)
times = ['H','M','S']
s_conv = [3600,60,1]
# empty starting frame for the playlist table built by the script below
data_playlist = pd.DataFrame([])

################################## Functions ##################################

def gen_params_playlist(playlistId, api_key, api_params_playlist):
    """Build the query parameters for a playlistItems request without a page token.

    Returns a dict mapping "playlistId", "key" and "part" (in that order)
    to the three arguments.
    """
    return dict(
        playlistId=playlistId,
        key=api_key,
        part=api_params_playlist,
    )

def gen_params_playlist_token(playlistId, api_key, api_params_playlist,token):
    """Build the query parameters for a playlistItems request for a given page.

    Same as the token-less variant, plus a "pageToken" entry holding
    ``token``. Keys are "playlistId", "key", "part", "pageToken", in that
    order.
    """
    field_names = ("playlistId", "key", "part", "pageToken")
    field_values = (playlistId, api_key, api_params_playlist, token)
    return dict(zip(field_names, field_values))
            
def gen_params_videos(ID, api_key, api_params_videos):
    """Build the query parameters for a videos request.

    Returns a dict mapping "id", "key" and "part" (in that order) to the
    three arguments.
    """
    return dict(
        id=ID,
        key=api_key,
        part=api_params_videos,
    )


def pull_playlist_data(playlist_parameters):
    """Fetch one page of playlistItems results and tabulate it.

    Parameters
    ----------
    playlist_parameters : dict
        Query-string parameters for the request (playlistId, key, part and
        optionally pageToken / maxResults).

    Returns
    -------
    df_playlist : pandas.DataFrame
        One row per returned item, with exactly the columns named in the
        module-level ``params_playlist``, in that order. A column the
        response did not contain is present but filled with NaN.
    playlist_results : dict
        The decoded JSON response, so the caller can look for
        'nextPageToken'.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (bad key, quota, unknown
        playlist, ...).
    """
    url = "https://www.googleapis.com/youtube/v3/playlistItems"
    # A timeout keeps a stalled connection from hanging the script forever.
    page = requests.get(url=url, params=playlist_parameters, timeout=30)
    # Fail with the HTTP error itself rather than a KeyError on 'items' below.
    page.raise_for_status()
    playlist_results = page.json()

    # pandas >= 1.0 exposes json_normalize at top level; older releases only
    # have it under pandas.io.json.
    json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize
    df = json_normalize(playlist_results['items'])

    # Keep only the last path component of each flattened column name
    # ('snippet.resourceId.videoId' -> 'videoId').
    df.columns = df.columns.map(lambda x: x.split('.')[-1])

    # Stripping the prefixes creates repeated names (videoId appears under
    # both snippet and contentDetails). Drop repeats by *name*; comparing
    # values instead could also drop differently-named columns that happen
    # to hold equal data.
    df = df.loc[:, ~df.columns.duplicated()]

    # Fix the column set and order so callers do not depend on whatever
    # order pandas produced.
    df_playlist = df.reindex(columns=list(params_playlist))
    return df_playlist, playlist_results


def pull_videos_data(videos_parameters):
    """Fetch the videos resource(s) for a request and tabulate the result.

    Parameters
    ----------
    videos_parameters : dict
        Query-string parameters for the request (id, key, part).

    Returns
    -------
    pandas.DataFrame
        One row per returned item, with every flattened field as a column
        named by the last component of its JSON path. Empty when the
        response contains no items.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    url = "https://www.googleapis.com/youtube/v3/videos"
    # A timeout keeps a stalled connection from hanging the script forever.
    page = requests.get(url=url, params=videos_parameters, timeout=30)
    # Fail with the HTTP error itself rather than a KeyError on 'items' below.
    page.raise_for_status()
    videos_results = page.json()

    # pandas >= 1.0 exposes json_normalize at top level; older releases only
    # have it under pandas.io.json.
    json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize
    df_videos = json_normalize(videos_results['items'])

    # 'statistics.viewCount' -> 'viewCount', etc.
    df_videos.columns = df_videos.columns.map(lambda x: x.split('.')[-1])
    return df_videos


def song_length(duration,times,s_conv):
    """Convert a duration string such as 'PT1H2M3S' into a number of seconds.

    Parameters
    ----------
    duration : str
        Text containing '<digits><unit>' groups, e.g. 'PT4M13S'.
    times : sequence of str
        Unit letters to look for, e.g. ['H', 'M', 'S'].
    s_conv : sequence of int
        Seconds per unit, in the same order as ``times``.

    Returns
    -------
    int
        Sum of value * seconds-per-unit over the units found; 0 when none
        of the units occurs in ``duration``.
    """
    song_time = 0

    for unit, seconds_per_unit in zip(times, s_conv):
        # Capture the full run of digits before the unit letter. The unit is
        # escaped so it is always matched literally.
        found = re.search(r'(\d+)' + re.escape(unit), duration)
        if found:
            song_time += int(found.group(1)) * seconds_per_unit
    return song_time

def date_parse(date):
    """Turn a timestamp string like '2018-09-01T12:34:56.000Z' into a datetime.

    Only characters 0-9 (the date) and 11-18 (the time of day) are used;
    the separator at position 10 and anything after the seconds are
    ignored. The result is a naive datetime.
    """
    day_part = date[:10]
    clock_part = date[11:19]
    stamp = '-'.join((day_part, clock_part))
    return datetime.strptime(stamp, '%Y-%m-%d-%H:%M:%S')


def time_diff_days(time1, time2):
    """Return the number of whole days in ``time1 - time2``.

    Parameters
    ----------
    time1, time2 : datetime-like
        Values whose difference is a timedelta (datetime.datetime or
        pandas.Timestamp).

    Returns
    -------
    int
        The ``days`` component of the difference: 0 for gaps shorter than a
        day, and negative when ``time1`` is earlier than ``time2``.
    """
    # Read the days field directly. Parsing str(timedelta) for digits loses
    # the sign of a negative difference.
    return (time1 - time2).days

############################### Playlist API Calls ###############################

# Request the first page. maxResults == 0 means "do not send the parameter".
playlist_parameters = gen_params_playlist(playlistId, api_key, api_params_playlist)

if maxResults == 0:
    print('No maximum number of results returned')
else:
    playlist_parameters['maxResults'] = maxResults

df_playlist, playlist_results = pull_playlist_data(playlist_parameters)
playlist_pages = [df_playlist]

# Follow nextPageToken until the API stops returning one.
while 'nextPageToken' in playlist_results:
    playlist_parameters = gen_params_playlist_token(playlistId, api_key, api_params_playlist,
                                                    playlist_results['nextPageToken'])
    # The page size has to be repeated on every request; otherwise only the
    # first page honours it.
    if maxResults != 0:
        playlist_parameters['maxResults'] = maxResults
    df_playlist, playlist_results = pull_playlist_data(playlist_parameters)
    playlist_pages.append(df_playlist)

# Collect the pages in a list and concatenate once (DataFrame.append was
# removed in pandas 2.0).
data_playlist = pd.concat(playlist_pages, ignore_index=True, sort=False)

# Select and rename columns by name, so the result does not depend on the
# column order pandas happened to produce.
data_playlist = data_playlist.reindex(columns=list(params_playlist))
data_playlist = data_playlist.rename(columns=dict(zip(params_playlist, params_playlist_rename)))

# Rows with a missing field (e.g. no upload date) are discarded. copy()
# makes the result an independent frame that is safe to assign into.
data_playlist = data_playlist.dropna(how='any').copy()

# Parse both timestamp columns into datetimes.
for date_column in ('dateUploaded', 'dateFound'):
    data_playlist[date_column] = data_playlist[date_column].map(date_parse)

length = len(data_playlist)

#################  Videos API requests and data manipulation #################

video_frames = []

# One videos request per distinct video id. Repeated playlist entries would
# otherwise be fetched twice and multiply rows in the later merge on 'id'.
for video_id in data_playlist['id'].drop_duplicates():
    parameters_vids = gen_params_videos(video_id, api_key, api_params_videos)
    df = pull_videos_data(parameters_vids)

    # No items came back for this id; nothing to record.
    if df.empty:
        continue

    data_videos = df.loc[:, df.columns.isin(list(params_videos))]
    # Drop repeated column *names* only. Comparing values on a one-row frame
    # would also discard e.g. dislikeCount whenever it equals likeCount.
    data_videos = data_videos.loc[:, ~data_videos.columns.duplicated()].copy()
    # Convert each duration string to seconds, passing the value itself
    # rather than the printed form of the whole column.
    data_videos['duration'] = [song_length(d, times, s_conv) for d in data_videos['duration']]
    video_frames.append(data_videos)

# Concatenate once at the end (DataFrame.append was removed in pandas 2.0).
if video_frames:
    data_videos_full = pd.concat(video_frames, ignore_index=True, sort=False)
else:
    data_videos_full = pd.DataFrame(columns=list(params_videos))

# reindex fixes the column order and tolerates a statistic the API did not
# return (it becomes a NaN column instead of raising KeyError).
# NOTE(review): the API docs indicate dislikeCount is no longer returned for
# public requests -- confirm.
data_videos_full = data_videos_full.reindex(columns=list(params_videos))
data_videos_full = data_videos_full.rename(columns=dict(zip(params_videos, params_videos_rename)))

# Join playlist rows with their video statistics.
full_data = pd.merge(data_playlist,data_videos_full, on='id', how='outer')

# Keep only rows that are complete. 'dislikes' is left out of the check so
# that a missing dislikes column does not wipe out every row.
required_columns = [column for column in full_data.columns if column != 'dislikes']
full_data = full_data.dropna(how='any', subset=required_columns)

full_data = full_data[['id','title','channelID','views','likes','dislikes','duration_secs','dateUploaded','dateFound']].copy()

length = len(full_data)

# Compute the upload-to-discovery gap from the rows of full_data itself.
# Indexing data_playlist by position would pair rows with the wrong dates
# as soon as any row has been dropped above.
days = np.array(
    [time_diff_days(found, uploaded)
     for found, uploaded in zip(full_data['dateFound'], full_data['dateUploaded'])],
    dtype=int,
)

full_data['discoveryTime_days'] = days

print(full_data)

# Saves full_data as '<playlistIdentifier>_<YYYY>_<MM>_<DD>.csv' in the
# current working directory.
# The suffix gets its own name so the `time` class imported from datetime is
# not overwritten.
date_suffix = datetime.now().strftime('_%Y_%m_%d')
name = playlistIdentifier + date_suffix
# to_csv opens and closes the file itself when given a path, so no separate
# open() call (and no unclosed handle) is needed.
full_data.to_csv('%s.csv' % name)




No maximum number of results returned
              id                                              title  \
0    In9xziHA5MM              Broods - Eyes A Mess (Official Audio)   
1    ATmNdOC00qc                 Gamper & Dadoni - Here To Love You   
2    MxKCEr8d5J4    Nevada & Loote - Don't Call Me (Official Audio)   
3    xsJLachhs-o  Rudimental & Major Lazer - Let Me Live (feat. ...   
4    _D1rrdFcj1U  Matoma & Enrique Iglesias – I Don't Dance (Wit...   
5    Uh-lxL34DZo  Lost Kings - Stuck (Official Video) ft. Tove S...   
6    0rH3oPVvsoM                  Hoodboi - Glide (ft. Tkay Maidza)   
7    yRrstcJKEis         Lemon - Your Friends (Dimond Saints Remix)   
8    k9DVIEgiP6o           Dimond Saints - Innocence (ft. Yaarrohs)   
9    pcNuIQapEu8                                      Ayelle - Talk   
10   4MlQw_YcKfA                  Alex Aiono - Big Mistake (Lyrics)   
11   H15umhRDY8E           Anna Clendening - Boys Like You (Lyrics)   
12   je3cp9yKQsE                   ILIR