In [17]:
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

import urllib.parse as p
import re
import os
import pickle
from google.auth.exceptions import RefreshError
import logging
import pandas as pd 

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

SCOPES = ["https://www.googleapis.com/auth/youtube.force-ssl"]

In [18]:
def youtube_authenticate():
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
    api_service_name = "youtube"
    api_version = "v3"
    client_secrets_file = r'D:\2A\2A\mods207\code_secret_client.json'
    creds = None
    # the file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first time
    if os.path.exists("token.pickle"):
        with open("token.pickle", "rb") as token:
            creds = pickle.load(token)
    # if there are no (valid) credentials availablle, let the user log in.
    if creds and creds.refresh_token:
        try:
            creds.refresh(Request())
        except RefreshError:
            logger.error("Credentials could not be refreshed, possibly the authorization was revoked by the user.")
            os.unlink('token.pickle')
            return
    else:
        flow = InstalledAppFlow.from_client_secrets_file(
            client_secrets_file, SCOPES)
        creds = flow.run_local_server(port=0)
    # Save the credentials for the next run
    with open('token.pickle', 'wb') as token:
        pickle.dump(creds, token)
            
            


    return build(api_service_name, api_version, credentials=creds)

# authenticate to YouTube API
youtube = youtube_authenticate()

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=649716511232-ntnmr9balna9gdt1cu9fo741lurh9lh7.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A52167%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fyoutube.force-ssl&state=JaOVbtGWC1ayGDkQB4HdDqTqidlhP6&access_type=offline


In [8]:
def get_video_id_by_url(url):
    """
    Return the Video ID from the video `url`
    """
    # split URL parts
    parsed_url = p.urlparse(url)
    # get the video ID by parsing the query of the URL
    video_id = p.parse_qs(parsed_url.query).get("v")
    if video_id:
        return video_id[0]
    else:
        raise Exception(f"Wasn't able to parse video URL: {url}")

# Comments

In [10]:
def get_comments(youtube, **kwargs):
    return youtube.commentThreads().list(
        part="snippet",
        **kwargs
    ).execute()

In [24]:
# URL can be a channel or a video, to extract comments
url = "https://www.youtube.com/watch?v=CpZBJZYJKFE&ab_channel=Cyprien"

def get_channel_videos_comments(path, channel):
    
    channel_csv = pd.read_csv(path + channel)
    channel_csv.drop_duplicates(inplace=True)  # drop ducplicates 
    channel_csv.dropna(inplace=True)  # drop nan
    videos = channel_csv['videos'].apply(lambda x : str(x)) # all video urls are an str type    
    df = pd.DataFrame() 
    for url in videos : 
        if "watch" in url:
            # that's a video
            video_id = get_video_id_by_url(url)
            print(video_id, 'looking for comments')
            params = {
                'videoId': video_id, 
                'maxResults': 10000,
                'order': 'relevance', # default is 'time' (newest)
            }
        else:
            continue
            
        # get the first 100 pages (100 API requests)
        n_pages = 100
        for i in range(n_pages):
            # make API call to get all comments from the channel (including posts & videos)
            response = get_comments(youtube, **params)
            items = response.get("items")
            # if items is empty, breakout of the loop
            if not items:
                break
            for item in items:
                comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
                updated_at = item["snippet"]["topLevelComment"]["snippet"]["updatedAt"]
                like_count = item["snippet"]["topLevelComment"]["snippet"]["likeCount"]
                comment_id = item["snippet"]["topLevelComment"]["id"]
                
                df = df.append({"ID" : comment_id, "Comment": comment, "Likes" : like_count,
                                "Updated At" : updated_at} ,   ignore_index=True)
                
            if "nextPageToken" in response:
                # if there is a next page
                # add next page token to the params we pass to the function
                params["pageToken"] =  response["nextPageToken"]
            else:
                # must be end of comments!!!!
                break
                
        print(df.shape[0], " nb of comments")
    return df  

In [15]:
path = 'channelVideos/'
all_videos = os.listdir(path)

In [25]:
channel = "channel_1_UCyWqModMQlbIo8274Wh_ZsQ.csv"
get_channel_videos_comments(path, channel)

CpZBJZYJKFE looking for comments
2000  nb of comments
G_h2c0igr1M looking for comments
4000  nb of comments
ZdIlTXH9ksY looking for comments
6000  nb of comments
JUrGRUW2iUo looking for comments
7117  nb of comments
hJ8SZJ5bGEY looking for comments
9117  nb of comments
xqcCxOsqE7I looking for comments
11117  nb of comments
MMA0QF7U1RI looking for comments
13117  nb of comments
NySFCwVYlCk looking for comments
15117  nb of comments
ebdEnpFgXdI looking for comments
17117  nb of comments
thLzv43YoI8 looking for comments
19117  nb of comments
kMf41XfU45A looking for comments
20837  nb of comments
rQSqxxCJkwI looking for comments
22837  nb of comments
dMMo4QaoVZo looking for comments
24837  nb of comments
iHLT7_WaN-A looking for comments
26837  nb of comments
LCrrT8fzCJE looking for comments
28837  nb of comments
O_KjfyB-bMs looking for comments


KeyboardInterrupt: 