### Imports

In [1]:
from googleapiclient.discovery import build
import pandas as pd
from google.colab import files, drive
import getpass

In [2]:
from google.colab import userdata

## User Input

In [3]:
api_key = userdata.get('YOUTUBE_API')

In [8]:
# Build the YouTube client
youtube = build('youtube', 'v3', developerKey=api_key)

## Get Video IDs for Playlist

In [5]:
from urllib.parse import urlparse, parse_qs

def get_video_id(url):
    parsed_url = urlparse(url)

    # If URL is a standard YouTube link (e.g., https://www.youtube.com/watch?v=VIDEO_ID)
    if parsed_url.hostname in ["www.youtube.com", "youtube.com"]:
        return parse_qs(parsed_url.query).get("v", [None])[0]

    # If URL is a shortened YouTube link (e.g., https://youtu.be/VIDEO_ID)
    elif parsed_url.hostname in ["youtu.be"]:
        return parsed_url.path.lstrip("/")

    return None

# Example Usage
youtube_url = "https://youtu.be/3YytHYb2qqg?si=9ZHDd_5DmN3f534M"
video_id = get_video_id(youtube_url)
print("Extracted Video ID:", video_id)

# Now you can pass video_ids to the next function
# next_function(video_ids)

Extracted Video ID: 3YytHYb2qqg


## Get All Comments

In [43]:
# Function to get replies for a specific comment
def get_replies(youtube, parent_id):
    replies = []
    next_page_token = None

    while True:
        reply_request = youtube.comments().list(
            part="snippet",
            parentId=parent_id,
            textFormat="plainText",
            maxResults=100,
            pageToken=next_page_token
        )
        reply_response = reply_request.execute()

        for item in reply_response.get('items', []):
            comment = item['snippet']
            replies.append({
                'CommentID': item['id'],  # Unique Comment ID
                'ParentID': parent_id,  # Parent Comment ID
                'Timestamp': comment['publishedAt'],
                'Username': comment['authorDisplayName'],
                'Comment': comment['textDisplay'],
                'Date': comment.get('updatedAt', comment['publishedAt']),
                'Likes': comment['likeCount']
            })

        next_page_token = reply_response.get('nextPageToken')
        if not next_page_token:
            break

    return replies

# Function to get all comments (including replies) for a single video
def get_comments_for_video(youtube, video_id):
    all_comments = []
    next_page_token = None

    while True:
        comment_request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            pageToken=next_page_token,
            textFormat="plainText",
            maxResults=100
        )
        comment_response = comment_request.execute()

        for item in comment_response.get('items', []):
            top_comment = item['snippet']['topLevelComment']['snippet']
            comment_id = item['snippet']['topLevelComment']['id']  # Extract comment ID
            all_comments.append({
                'CommentID': comment_id,
                'ParentID': None,  # Top-level comments have no parent
                'Timestamp': top_comment['publishedAt'],
                'Username': top_comment['authorDisplayName'],
                'Comment': top_comment['textDisplay'],
                'Date': top_comment.get('updatedAt', top_comment['publishedAt']),
                'Likes': top_comment['likeCount']
            })

            # Fetch replies if there are any
            if item['snippet']['totalReplyCount'] > 0:
                all_comments.extend(get_replies(youtube, comment_id))

        next_page_token = comment_response.get('nextPageToken')
        if not next_page_token:
            break

    return all_comments

# List to hold all comments from all videos
all_comments = []

video_comments = get_comments_for_video(youtube, video_id)
all_comments.extend(video_comments)

# Create DataFrame
comments_df = pd.DataFrame(all_comments)


In [44]:
comments_df.head()

Unnamed: 0,CommentID,ParentID,Timestamp,Username,Comment,Date,Likes
0,Ugz9iIF4cwCpLJd-Jl54AaABAg,,2024-07-15T16:13:38Z,@adityasaha4704,I thank you for my very existence,2024-07-15T16:13:38Z,0
1,UgxnDVp1OB0v5Wf8iD54AaABAg,,2023-09-18T19:14:31Z,@tippydippy6529,every couple of weeks I find myself watching t...,2023-09-18T19:14:30Z,2
2,UgxWtjYys9HNS7ZxRap4AaABAg,,2023-07-31T16:22:14Z,@wubbalubbadubdub777,ayahviv ! oued amizour days a rayan !,2023-07-31T16:22:14Z,0
3,Ugwu8Jy2t39SzRnUjoV4AaABAg,,2023-07-19T23:03:47Z,@angelagrullon2189,La vibes que transmite esta cancion mierda.......,2023-07-19T23:03:47Z,0
4,Ugyw_I56xAhWu9LctcV4AaABAg,,2023-07-17T16:54:37Z,@RamonSBK,Vagabond Reminds me everyday that anything is ...,2023-09-07T12:22:47Z,0


In [45]:
comments_df.columns

Index(['CommentID', 'ParentID', 'Timestamp', 'Username', 'Comment', 'Date',
       'Likes'],
      dtype='object')

In [46]:
comments_df.sort_values(by = "Date", inplace = True, ascending = False)

In [116]:
comments_df.shape

(597, 7)

In [47]:
comments_df.head()

Unnamed: 0,CommentID,ParentID,Timestamp,Username,Comment,Date,Likes
243,UgypwPJSMIyeogONpxl4AaABAg.9U-NgP3TZAA9ZTmUGwGMcT,UgypwPJSMIyeogONpxl4AaABAg,2022-03-12T12:47:01Z,@buckets3628,@Aryan ShimrayYou can take any meaning u want ...,2024-11-14T01:09:51Z,23
568,Ugzac0O_LaZWK1ZtsCd4AaABAg.9PbmDFOo1LGAA9hGl7xMKp,Ugzac0O_LaZWK1ZtsCd4AaABAg,2024-10-29T02:01:04Z,@reidcacaro2919,"@@massivegat5087agreed, the prelude is a maste...",2024-10-29T02:01:04Z,1
567,Ugzac0O_LaZWK1ZtsCd4AaABAg.9PbmDFOo1LGA9uTGinDLPl,Ugzac0O_LaZWK1ZtsCd4AaABAg,2024-10-22T18:42:08Z,@FatGuyEngineer,‚Äã@@massivegat5087 It's Third Arc (Baltic Sea) ...,2024-10-22T18:42:08Z,0
566,Ugzac0O_LaZWK1ZtsCd4AaABAg.9PbmDFOo1LGA9QPw1yJ3EQ,Ugzac0O_LaZWK1ZtsCd4AaABAg,2024-10-10T17:16:30Z,@joeyreash9067,My top 3 stories are 3. Vagabond 2. Berserk an...,2024-10-10T17:16:30Z,0
565,Ugzac0O_LaZWK1ZtsCd4AaABAg.9PbmDFOo1LGA61c1QOZo3B,Ugzac0O_LaZWK1ZtsCd4AaABAg,2024-07-18T12:35:46Z,@MR.X-l2u,@@Goofy_Benjamin11 Im reading that to really good,2024-07-18T12:35:46Z,0


In [118]:
comments_df.ParentID.nunique() # number of comments which are threaded ie replies exist

48

In [92]:
comments_df.shape

(597, 7)

### STRUCTURE THE DATA ACCORDING TO HIERARCHIES
### MAKE NEW DATAFRAME FOR THAT

In [119]:
hdf = pd.DataFrame([], columns = ['CommentID', 'Timestamp', 'Username', 'Comment', 'Date', 'Likes', 'reply_comment_id', 'replies'])

for i, x in enumerate(comments_df.ParentID):
  # if parent id is None add it into a dataframe
  if x == None:
    hdf.loc[len(hdf)] = [ comments_df.iloc[i, 0],  # comment id
                          comments_df.iloc[i, 2],  # timestamp
                          comments_df.iloc[i, 3],  # user
                          comments_df.iloc[i, 4],  # comment text
                          comments_df.iloc[i, 5],  # date
                          comments_df.iloc[i, 6],  # likes
                          "",                      # reply comment id
                          ""                       # replies
                        ]


In [120]:
hdf.sort_values(by = "Likes", ascending = False)

Unnamed: 0,CommentID,Timestamp,Username,Comment,Date,Likes,reply_comment_id,replies
190,Ugzac0O_LaZWK1ZtsCd4AaABAg,2021-07-10T11:23:55Z,@MR.X-l2u,Vegabond and Berserk are the two best manga ev...,2021-07-10T11:23:55Z,2892,,
180,UgzyQhJPJ-zHhKkDN0B4AaABAg,2021-08-30T17:34:52Z,@cheeseman2219,Musashi vs 70 Yoshioka men was the most memora...,2021-08-30T17:34:52Z,1176,,
192,Ugx_LdgI0vWOumWYS_d4AaABAg,2021-07-10T05:49:27Z,@abandonaccount8582,Vegabond is underrated more people should read...,2021-07-10T05:49:27Z,1158,,
161,UgypwPJSMIyeogONpxl4AaABAg,2021-10-27T10:47:26Z,@DavionX13,"Preoccupied with a single leaf, you won't see ...",2021-10-27T10:47:26Z,928,,
186,UgxMNe48oT4QElg2klN4AaABAg,2021-07-21T01:06:14Z,@sovereignrepublicofcopticx6397,The best part is that he was real.,2021-07-21T01:06:14Z,913,,
...,...,...,...,...,...,...,...,...
55,Ugydq8sQ0J5stCWajt94AaABAg,2021-12-19T13:57:22Z,@Ryan.jorgeBR,Nice üëè,2021-12-19T13:57:22Z,0,,
111,Ugydsz9JN6mW9hz9g8R4AaABAg,2021-11-12T16:35:09Z,@masterofdeath5498,–û—á–µ–Ω—å –∂–∞–ª—å —á—Ç–æ –Ω–µ –∑–∞–∫–æ–Ω—á–∏–ª–∏ –µ—ë.,2021-11-12T16:35:09Z,0,,
110,UgzGTHZUQL4XS12M7Zl4AaABAg,2021-11-12T18:39:27Z,@TeeRkee,Anime when?,2021-11-12T18:39:35Z,0,,
56,Ugz5i0vxPJAIu-Em5Vh4AaABAg,2021-12-18T12:46:19Z,@jaid2salvia961,This amv insipired me to watch vagabond . Ty bro,2021-12-18T12:46:19Z,0,,


## ADD REPLIES TO PARENT COMMENTS, NO NEED FOR SAVING HIERARCHIES, WE JUST NEED THE TEXT.


In [121]:
#hdf = pd.DataFrame([], columns = ['CommentID', 'Timestamp', 'Username', 'Comment', 'Date', 'Likes', 'reply_comment_id', 'replies'])
pids = list(hdf.CommentID)
iop = 0

for i, x in enumerate(comments_df.ParentID):
  if x:
    subids = [x.split(" || ") for x in hdf.reply_comment_id]
    #print(subids)
    if x in pids:
      iop +=1
      # hdfs index, get index of parent comment id from its df
      id = hdf[hdf.CommentID == x].index
      # on that index add our information
      hdf.loc[id, "reply_comment_id"] +=  " || " + comments_df.loc[i, "CommentID"]
      hdf.loc[id, "replies"] += " || " + comments_df.loc[i, "Comment"]
      #print(hdf.loc[id, "reply_comment_id"])

    else:
      print("here")
      for k, subcs in enumerate(subids):
        if x in subcs:
          hdf.loc[k, "reply_comment_id"] +=  " || " + comments_df.loc[i, "CommentID"]
          hdf.loc[k, "replies"] += " || " + comments_df.loc[i, "Comment"]

In [122]:
hdf.shape

(196, 8)

In [123]:
f"there are {len(hdf)} independent comments"

'there are 196 independent comments'

### Output to CSV

In [125]:
# Export whole dataset to the local machine as CSV File
csv_file = 'comments_data.csv'  # Name your file
hdf.to_csv(csv_file, index=False)

from google.colab import files

# Trigger a download to your local machine
files.download(csv_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>