In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import requests
import json
import urllib
import isodate
import os
from bs4 import BeautifulSoup
from urllib.parse import urlencode
from slugify import slugify
from pytube import YouTube

# Make sure the folder that receives every CSV written by this notebook exists.
# exist_ok=True keeps the call idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs("casey-neistat-analisys", exist_ok=True)

In [None]:
# YouTube Data API key. It is read from the environment so the secret is never
# saved inside the notebook; for a quick local run a key can be pasted as the
# fallback value instead.
api_key = os.environ.get("YOUTUBE_API_KEY", "")
assert api_key != "", "No API key: set the YOUTUBE_API_KEY environment variable"
# Channel whose uploads are analysed by the rest of the notebook.
channel_id = 'UCtinbF-Q-fVthA0qrFQTgXQ'

# Query for the channels endpoint (used to find the channel's uploads playlist).
playlists_parameters = {
    'part': 'contentDetails',
    'id': channel_id,
    'key': api_key
}

# Query for the videoCategories endpoint (category id -> name, US region).
categories_parameters = {
    'part': 'snippet',
    'regionCode': 'US',
    'key': api_key
}

# Base query for listing the channel's videos; 'playlistId' and 'pageToken'
# are added to this dict by the paging cell further down.
# NOTE(review): 'type', 'channelId' and 'order' look like search-endpoint
# parameters - confirm the playlistItems endpoint accepts them.
parameters = {
    'key': api_key,
    'part': 'snippet',
    'type': 'video',
    'channelId': channel_id,
    'maxResults': 50,
    'order': 'date'
}
# Upper bound on the number of result pages fetched by the paging loop.
max_pages = 100
query_string = urlencode(parameters)

In [None]:
# Download the video categories and store them as an id -> name lookup table.
get_categories_url = "https://www.googleapis.com/youtube/v3/videoCategories?" + urlencode(categories_parameters)
r = requests.get(get_categories_url, timeout=30)
# Surface quota / invalid-key errors here instead of a KeyError on 'items' below.
r.raise_for_status()
result = r.json()
categoryId = [int(category['id']) for category in result['items']]
categoryNames = [category['snippet']['title'] for category in result['items']]
categories_df = pd.DataFrame({'category': categoryId, 'name': categoryNames})
categories_df.to_csv("casey-neistat-analisys/categories_US.csv", encoding='utf-8')
# Last expression of the cell so the preview is actually rendered.
categories_df.head()

In [None]:
# Look up the channel and extract the id of its "uploads" playlist.
get_playlists_url = "https://www.googleapis.com/youtube/v3/channels?" + urlencode(playlists_parameters)
r = requests.get(get_playlists_url, timeout=30)
# Fail on HTTP errors (bad key, quota) before trying to read the payload.
r.raise_for_status()
result = r.json()

# An unknown channel id yields no items; report that clearly rather than
# failing with an IndexError on result['items'][0].
if not result.get('items'):
    raise ValueError("No channel returned for id %r" % playlists_parameters['id'])

playlist_id = result['items'][0]['contentDetails']['relatedPlaylists']['uploads']
print(playlist_id)

In [None]:
# Walk the uploads playlist page by page and collect every playlist item.
count = 0
videos = []
search_url = "https://www.googleapis.com/youtube/v3/playlistItems?"
parameters['playlistId'] = playlist_id
# Drop any token left over from a previous run of this cell so that paging
# always restarts from the first page.
parameters.pop('pageToken', None)
query_string = urlencode(parameters)
pages = max_pages
page_token = 'FIRST TIME!'  # any non-empty value, just to enter the loop
while pages > 0 and len(page_token) > 0:
    qurl = search_url + query_string
    r = requests.get(qurl, timeout=30)
    # Stop on HTTP errors instead of hitting a KeyError on 'items' below.
    r.raise_for_status()
    result = r.json()
    # The last page carries no 'nextPageToken'; the empty string ends the loop.
    page_token = result.get("nextPageToken", '')
    pages = pages - 1
    videos.extend(result['items'])
    count += len(result['items'])
    if page_token:
        parameters['pageToken'] = page_token
        query_string = urlencode(parameters)
print("Done, found", count)

In [None]:
# Conversion to dataframes: one row per playlist item with its video id,
# publication timestamp and title.
ids = [v['snippet']['resourceId']['videoId'] for v in videos]
pub = [v['snippet']['publishedAt'] for v in videos]
titles = [v['snippet']['title'] for v in videos]
initial_df = pd.DataFrame({
    'id': ids,
    'published_at': pub,
    'title': titles
})
initial_df['published_at'] = pd.to_datetime(initial_df['published_at'])
initial_df.to_csv("casey-neistat-analisys/casey_initial.csv", encoding='utf-8')
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would add a stray "None" line).
initial_df.info()

In [None]:
# Fetch snippet / statistics / contentDetails for every video id, in batches,
# and assemble them into details_df.
ids = initial_df['id'].tolist()
# Ids of the videos the API actually returned. The response may omit some of
# the requested ids, so the 'id' column is built from the responses themselves
# to keep every row aligned with its own statistics.
detail_ids = []
categories = []
default_language = []
durations = []
license = []
viewCounts = []
likeCounts = []
dislikeCounts = []
favoriteCounts = []
commentCounts = []
a=True  # not used anywhere in this cell; kept in case a later cell reads it
batch_size = 50
i = 0
video_details = "https://www.googleapis.com/youtube/v3/videos?id=%s&part=snippet,statistics,contentDetails&key=%s"
while i < len(ids):
    ids_to_query = ','.join(ids[i:i+batch_size])
    q = video_details % (ids_to_query, api_key)
    r = requests.get(q, timeout=30)
    # Surface HTTP errors instead of a KeyError on 'items' below.
    r.raise_for_status()
    resultlist = r.json()
    for result in resultlist['items']:
        snippet = result['snippet']
        contentDetails = result['contentDetails']
        statistics = result['statistics']

        detail_ids.append(result['id'])
        categories.append(snippet['categoryId'])
        # '-' marks videos without a declared audio language.
        default_language.append(snippet.get('defaultAudioLanguage', '-'))
        durations.append(contentDetails['duration'])
        license.append(contentDetails['licensedContent'])
        # Any statistic can be absent from the payload; each one is read
        # independently and -1 is the "missing" sentinel (the next cell maps
        # -1 back to NaN when it reloads the CSV).
        viewCounts.append(int(statistics.get('viewCount', -1)))
        favoriteCounts.append(int(statistics.get('favoriteCount', -1)))
        likeCounts.append(int(statistics.get('likeCount', -1)))
        dislikeCounts.append(int(statistics.get('dislikeCount', -1)))
        commentCounts.append(int(statistics.get('commentCount', -1)))

    i += batch_size

details_df = pd.DataFrame({
    'id': detail_ids,
    'category':categories,
    'language': default_language,
    'duration': durations,
    'license': license,
    'views': viewCounts,
    'likes': likeCounts,
    'dislikes': dislikeCounts,
    'favs': favoriteCounts,
    'comments': commentCounts
})

details_df.to_csv("casey-neistat-analisys/casey_detailed.csv", encoding='utf-8')
# info() prints its own report and returns None; no print() wrapper needed.
details_df.info()

In [None]:
# Reload both CSVs, de-duplicate, convert durations to seconds and join them
# into one frame indexed by publication time.
initial_df = pd.read_csv("casey-neistat-analisys/casey_initial.csv", index_col=0,
                         parse_dates=['published_at'], na_values=[-1, ''])
details_df = pd.read_csv("casey-neistat-analisys/casey_detailed.csv", index_col=0, na_values=[-1, ''])


initial_df = initial_df.drop_duplicates()
details_df = details_df.drop_duplicates()
# ISO-8601 duration strings -> length in seconds.
details_df['duration'] = details_df['duration'].apply(lambda iso: isodate.parse_duration(iso).total_seconds())

complete_df = pd.merge(left=initial_df, right=details_df, on='id')
# fillna returns a new frame; the original call discarded the result, so the
# missing values were never replaced. Assign it back to make the fill effective.
complete_df = complete_df.fillna(-1)
complete_df = complete_df.set_index('published_at')

print(complete_df.tail())
complete_df.to_csv("casey-neistat-analisys/casey_complete.csv", encoding='utf-8')

In [None]:
# Reload the merged data set and express its time index in US Pacific time.
complete_df = pd.read_csv("casey-neistat-analisys/casey_complete.csv", parse_dates=['published_at'], index_col=0)
# The stored timestamps are treated as UTC. Depending on the pandas version the
# parsed index is either naive or already tz-aware; tz_localize raises a
# TypeError on an aware index, so only localize when no timezone is attached.
if getattr(complete_df.index, 'tz', None) is None:
    complete_df = complete_df.tz_localize('UTC')
complete_df = complete_df.tz_convert('US/Pacific')
complete_df.head(10)

In [None]:
# Days between vlog_start and vlog_end on which no video was uploaded.
vlog_start = '2015-03-24'
vlog_end = '2016-11-19'
# Count the rows per calendar day; days without any upload get a count of 0.
daily_vlog_count = (
    complete_df.loc[vlog_start:vlog_end, ['views']]
    .resample('D')
    .count()
    .rename(columns={'views': 'videos'})
)
print(daily_vlog_count.loc[daily_vlog_count['videos'] == 0])
# Per-day upload counts restricted to the year 2015.
print(daily_vlog_count['videos'].loc['2015'])