In [686]:
################### YouTube API Data Extraction with Python ###################
############################# Author: Tyler Blair #############################

# This script will pull data from YouTube using their APIs. To do this,
# you will have to set up API credentials with Google, which can be easily
# done at console.developers.google.com

# Additionally, this script pulls the YouTube API key from your system's
# environmental variables. If you are unfamiliar with how to do this, I
# have included the steps in the README.md file on my GitHub (tblair7)

import os

import requests
import json
import datetime
from datetime import datetime, time, timedelta, tzinfo
import pandas as pd
import numpy as np


import google.oauth2.credentials

import google_auth_oauthlib.flow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from google_auth_oauthlib.flow import InstalledAppFlow

##### you'll need to have already set your API key as an environmental ######
##### variable before this point. If you haven't/don't want to do so   ######
##### you can simply set it explicitly here:
# api_key = ['your key']
api_key = os.environ.get('YT_API_KEY')

################### only parameters you should need to set ###################
# documentation of parameters you are able to use for playlistItems
# https://developers.google.com/youtube/v3/docs/playlistItems#properties

playlistId = 'PLJpYtEF3No5Nq0GqRHb1-HYvFYwXwk8JM' # ID of the whatever api type you're utilizing
playlistIdentifier = 'Test' # identifier for saving purposes
maxResults = 3 # 0-250, though I've set 0 to mean no maximum so I can use it for my playlist

api_params_playlist = 'snippet, contentDetails' # e.g., 'id, contentDetails, statistics' as a string
api_params_videos = "id, contentDetails, statistics, snippet" # parameters I wish to retrieve from my playlist in the end

# these are the column headers that are selected from the playlistItems df
params_playlist = 'videoId', 'videoPublishedAt', 'publishedAt', 'title'
params_playlist_rename = 'id', 'dateUploaded', 'dateFound', 'title'

# these are the column headers that are selected from the videos dataframe
params_videos = 'id','channelId','viewCount','likeCount','dislikeCount', 'duration'
params_videos_rename = 'id', 'channelID','views', 'likes', 'dislikes', 'duration_secs'


################################## Constants #################################


url_playlist = "https://www.googleapis.com/youtube/v3/playlistItems"
url_videos = "https://www.googleapis.com/youtube/v3/videos"

times = ['H','M','S']
s_conv = [3600,60,1]

################################## Functions ##################################

def gen_params(ID, api_params_videos, api_key):
    parameters = {"part": api_params_videos,
                  "id": ID,
                  "key": api_key}
    return parameters

def pull_YT_data(url, parameters):
    page = requests.get(url = url_videos,
                        params = parameters)
    j_results = json.loads(page.text)
    df = pd.io.json.json_normalize(j_results['items'])
    df.columns = df.columns.map(lambda x: x.split('.')[-1])
    return df


def song_length(duration,times,s_conv):
    song_time = 0
    
    for i in range(len(times)):
        my_regex = r'(\d.?' + times[i] + ')'
        pattern = re.search(my_regex, duration)
        if pattern:
            span_start = pattern.span(1)[0]
            span_end = pattern.span(1)[1]
            value = duration[span_start:span_end-1]
            #print(c[span_start:span_end])
            song_time = song_time + int(value)*s_conv[i]        
        else:
            None       
    return song_time

def date_parse(date):
    date = date[0:9] + '-' + date[11:18]
    date_conv = datetime.strptime(date, '%Y-%m-%d-%H:%M:%S')# T %H%M%S.%f Z')
    #dateee = datetime.strptime(a[11:18],'%H:%M:%S')
    return(date_conv)


def time_diff_days(time1, time2):
    timeDelta = (time1 - time2)
    a = re.search(r'\d*? ', str(timeDelta))
    days_span_start = a.span(0)[0]
    days_span_end = a.span(0)[1]
    days = int(str(timeDelta)[days_span_start:days_span_end])
    return(days)

############################### Playlist API Calls ###############################

# sets the parameters for the API request
parameters = {"part": api_params_playlist,
              "playlistId": playlistId,
              "key": api_key}

if maxResults == 0:
    print('No maximum number of results returned')
else:
    parameters.update(dict(maxResults = maxResults))



# pulls the data from YT and puts it in a usable format
page = requests.get(url = url_playlist,
                    params = parameters) # pulls the data
j_results = json.loads(page.text) # make somewhat readable
df = pd.io.json.json_normalize(j_results['items']) # formatted table, lots of redundant info
df.columns = df.columns.map(lambda x: x.split('.')[-1])

# truncates the data based on the params_playlist input from the beginning
data_playlist = df.loc[:, df.columns.isin(list(params_playlist))]
data_playlist = data_playlist.T.drop_duplicates(keep='first').T # drop_duplicates works on rows, so transpose, select row, transpose back
data_playlist.columns = list(params_playlist_rename) # assigns column names


for i in range(length):
    data_playlist.dateUploaded.iloc[i] = date_parse(data_playlist.dateUploaded.iloc[i])
    data_playlist.dateFound.iloc[i] = date_parse(data_playlist.dateFound.iloc[i])
    data_playlist.discoveryTime = (data_playlist.dateFound.iloc[i] - data_playlist.dateUploaded.iloc[i])

#################  Videos API requests and data manipulation #################


#length = np.arange(data_playlist.shape[0])
length = len(data_playlist)

data_videos_full = pd.DataFrame([])

for i in range(length):
    parameters_vids = gen_params(data_playlist.id[i], api_params_videos, api_key)
    df = pull_YT_data(url_videos,parameters_vids)
    data_videos = df.loc[:, df.columns.isin(list(params_videos))]
    data_videos = data_videos.T.drop_duplicates(keep='first').T
    #data_videos.duration = song_length(data_videos.duration,times,s_conv)
    data_videos_full = data_videos_full.append(data_videos)
    data_videos_full.duration.iloc[i] = song_length(data_videos_full.duration.iloc[i],times,s_conv)

data_videos_full = data_videos_full[['id','channelId','viewCount','likeCount','dislikeCount','duration']]
data_videos_full.columns = list(params_videos_rename)

full_data = pd.merge(data_playlist,data_videos_full, on='id', how='outer')
full_data = full_data[['id','title','channelID','views','likes','dislikes','duration_secs','dateUploaded','dateFound']]
full_data = full_data.reindex(columns = np.append(full_data.columns.values,'discoveryTime_days'))

days = np.zeros(length,dtype=int)

for i in range(length):
    days[i] = time_diff_days(data_playlist.dateFound.iloc[i], data_playlist.dateUploaded.iloc[i])

full_data.discoveryTime_days = days

# saves the data_playlist structure as a .csv with a name dictated by the playlistIdentifier variable and the time
time = datetime.now().strftime('_%Y_%m_%d')
name = playlistIdentifier + time
f = open('%s.csv' % name, 'w')
full_data.to_csv(f.name)







In [679]:
full_data_2 = full_data[['id','title','channelID','views','likes','dislikes','duration(s)','dateUploaded','dateFound','discoveryTime']]
print(full_data_2)

            id                                              title  \
0  _9IBbMW2o_o  Deadmau5 - 4x4=12 (Continuous Mix) (FULL 1 Hou...   
1  Ckk8zROo868                 SHY Martin - Lose You Too (lyrics)   
2  c4M36N5VXwI             20 minutes Countdown Timer Alarm Clock   

                  channelID     views   likes dislikes  duration(s)  \
0  UCqvPQst_TF6cI94mcZwNp-A  23132020  115133     4855         4195   
1  UCBYg9_11ErMsFFNR66TRuLA    277238    5356       57          188   
2  UCPyVp0yR4L66EDP3GuF5hJQ     49363     127       41         1200   

          dateUploaded            dateFound  discoveryTime  
0  2010-12-01 13:29:02  2018-08-02 01:33:01         2800.0  
1  2018-07-02 19:55:04  2018-08-02 01:33:04           30.0  
2  2015-12-01 23:41:00  2018-08-02 01:34:04          974.0  


In [644]:
a = re.search(r'\d*? ', str(timeDelta))
days_span_start = a.span(0)[0]
days_span_end = a.span(0)[1]
b = int(str(timeDelta)[days_span_start:days_span_end])


974


In [688]:
print(full_data)

            id                                              title  \
0  _9IBbMW2o_o  Deadmau5 - 4x4=12 (Continuous Mix) (FULL 1 Hou...   
1  Ckk8zROo868                 SHY Martin - Lose You Too (lyrics)   
2  c4M36N5VXwI             20 minutes Countdown Timer Alarm Clock   

                  channelID     views   likes dislikes  duration_secs  \
0  UCqvPQst_TF6cI94mcZwNp-A  23132038  115133     4855           4195   
1  UCBYg9_11ErMsFFNR66TRuLA    277358    5357       57            188   
2  UCPyVp0yR4L66EDP3GuF5hJQ     49364     127       41           1200   

          dateUploaded            dateFound  discoveryTime_days  
0  2010-12-01 13:29:02  2018-08-02 01:33:01                2800  
1  2018-07-02 19:55:04  2018-08-02 01:33:04                  30  
2  2015-12-01 23:41:00  2018-08-02 01:34:04                 974  


In [528]:
times = ['H','M','S']
s_conv = [3600,60,1]

song_time = 0
for i in range(len(times)):
    my_regex = r'(\d.?' + times[i] + ')'
    pattern = re.search(my_regex, c)
    if pattern:
        span_start = pattern.span(1)[0]
        span_end = pattern.span(1)[1]
        value = c[span_start:span_end-1]
        #print(c[span_start:span_end])
        song_time = song_time + int(value)*s_conv[i]
        print(song_time)
        
    else:
        None #print('no value')

180
188


In [687]:
j_results

{'kind': 'youtube#playlistItemListResponse',
 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/9PN_alQcwT3nIKoHVyoes5wU5EY"',
 'pageInfo': {'totalResults': 3, 'resultsPerPage': 3},
 'items': [{'kind': 'youtube#playlistItem',
   'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/CcaFmhDXQYJIS1T-QvbZUnpDjVA"',
   'id': 'UExKcFl0RUYzTm81TnEwR3FSSGIxLUhZdkZZd1h3azhKTS41NkI0NEY2RDEwNTU3Q0M2',
   'snippet': {'publishedAt': '2018-08-27T01:33:19.000Z',
    'channelId': 'UCC6-c2lD7fSObuYk478qtRQ',
    'title': 'Deadmau5 - 4x4=12 (Continuous Mix) (FULL 1 Hour 9 Mins)',
    'description': 'My YouTube - http://www.youtube.com/sv3rige\nBlog - http://www.lifeisabouthavingfun.com\nBlog - http://www.sv3rige.com',
    'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/_9IBbMW2o_o/default.jpg',
      'width': 120,
      'height': 90},
     'medium': {'url': 'https://i.ytimg.com/vi/_9IBbMW2o_o/mqdefault.jpg',
      'width': 320,
      'height': 180},
     'high': {'url': 'https://i.ytimg.com/vi/_9IBbMW2o_o/hqdefault.jpg'

In [498]:
times = ['H','M','S']
my_regex = r'(.*' + re.escape(times[1]) + r')'
print(my_regex)
hours = re.search(my_regex, pattern[1])
print(hours)

(.*M)
<_sre.SRE_Match object; span=(0, 2), match='9M'>


In [524]:
len(data_playlist)

3