# IMT 570 Secondary Data Collection - YouTube API

### Section 1: Configueration, Authentication, Preperation

In [2]:
!pip install --upgrade google-api-python-client --quiet

In [3]:
import re
import nltk
from nltk.stem import PorterStemmer

# Import relevant packages
import json
import googleapiclient
import googleapiclient.discovery
import googleapiclient.errors

In [6]:
api_key = "AIzaSyAFkh7VXLquY7VOKsKm2mJ_7RHM7n_PxwQ"
# Initializing Youtube API
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)

### Section 2 
Functions: 
- Getting videos based on query; 
- Getting comments in videos

In [11]:
def search_keyword_videos(query): # maximum 50 videos per call
    relevant_videos = []
    next_page_token = None
    while len(relevant_videos) < 50:
        request = youtube.search().list(
            part='snippet, id',
            q=query,  # Replace for 3 different RQs
            maxResults=50,
            pageToken=next_page_token,
            order='relevance'
        )
        response = request.execute()  
        stemmed_keywords = set(porter.stem(q) for q in query)
        
        for item in response.get('items', []):
            video_title = item['snippet']['title'].lower()
            if any(re.search(r'\b{}\b'.format(re.escape(porter.stem(keyword))), video_title) for keyword in stemmed_keywords):
                try:
                    video_id = item['id']['videoId']
                    # this is for viewCount
                    view_count = youtube.videos().list(
                    id=video_id,
                    part='statistics'
                    ).execute()

                    video_info = {
                        'Video ID': video_id,
                        'Video title': item['snippet']['title'],
                        'Video channel': item['snippet']['channelTitle'],
                        'Video creation time': item['snippet']['publishedAt'],
                        'Video number of views': view_count['items'][0]['statistics']['viewCount']       
                    }
                    relevant_videos.append(video_info)
                except KeyError:
                    continue
            if len(relevant_videos) >= 50:
                break
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    print(f"Found {len(relevant_videos)} keyword-related videos.")
    return relevant_videos

In [13]:
from googleapiclient.errors import HttpError

def extract_30_comments(video_id_list): 
    comments_per_video = []
    for vid in video_id_list:
        try:
            request = youtube.commentThreads().list(
                videoId = vid,
                part = "id,snippet,replies",
                textFormat = "plainText",
                order = "relevance",
                maxResults = 30 # any number between 1 - 100
            )
            response = request.execute()
            
            for item in response["items"]:
                comments = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
                has_reply = item["snippet"]["totalReplyCount"]
                if has_reply != 0:
                    replies = []
                    for i in range(len(item["replies"]["comments"])):
                        reply = item["replies"]["comments"][i]['snippet']["textDisplay"]
                        replies.append(reply)
                else:
                    replies = None
                comment_info = {
                    'Video ID': vid,
                    'Comment id': item['snippet']['topLevelComment']['id'],
                    # 'Comment title': item['snippet']['topLevelComment']['snippet']['textOriginal'],
                    'Comment creation time': item['snippet']['topLevelComment']['snippet']['publishedAt'],
                    'Comment number of likes': item['snippet']['topLevelComment']['snippet']['likeCount'],
                    'Comment content': comments,
                    'Replies': replies
                }
                comments_per_video.append(comment_info)
        except HttpError as e:
            if e.resp.status == 403:
                print(f"Comments are disabled for video with ID: {vid}")
                continue
            else:
                raise e  # Re-raise the exception if it's not a 403 error
    return comments_per_video

### Section 3: Queries for Research Questions 1 - 3

In [16]:
# How do students use GPT in study
gpt_study_how = ('gpt study techniques OR ' +
                 'gpt study methods OR ' +
                 'gpt study applications OR ' +
                 'student gpt utilization OR ' +
                 'gpt study strategies OR ' +
                 'gpt study practices OR ' +
                 'gpt study approaches OR ' +
                 'gpt study tools OR ' +
                 'gpt study resources OR ' +
                 'gpt study aids OR ' +
                 'gpt study assistance OR ' +
                 'gpt study support OR ' +
                 'gpt study applications OR ' +
                 'gpt study integration OR ' +
                 'gpt study incorporation OR ' +
                 'gpt study utilization OR ' +
                 'gpt study implementation OR ' +
                 'gpt study techniques OR ' +
                 'gpt study practices OR ' +
                 'gpt study habits OR ' +
                 'gpt study routines')

In [18]:
# GPT's effect on learning outcome
gpt_learning_impact = ('gpt learning impact OR ' +
                       'gpt learning influence OR ' +
                       'gpt learning consequences OR ' +
                       'gpt learning results OR ' +
                       'gpt learning effects OR ' +
                       'gpt learning outcomes OR ' +
                       'gpt learning benefits OR ' +
                       'gpt learning advantages OR ' +
                       'gpt learning improvements OR ' +
                       'gpt learning enhancements OR ' +
                       'gpt learning gains OR ' +
                       'gpt learning progress OR ' +
                       'gpt learning development OR ' +
                       'gpt learning advancement OR ' +
                       'gpt learning efficacy OR ' +
                       'gpt learning efficiency OR ' +
                       'gpt learning effectiveness OR ' +
                       'gpt learning performance OR ' +
                       'gpt learning success OR ' +
                       'gpt learning achievement OR ' +
                       'gpt learning proficiency OR ' +
                       'gpt learning mastery')

In [20]:
# # Subject differences in using GPT for study
gpt_subject_difference = ('gpt stem education OR ' +
                          'gpt stem subjects OR ' +
                          'gpt stem learning OR ' +
                          'gpt stem applications OR ' +
                          'gpt stem impact OR ' +
                          'gpt stem effectiveness OR ' +
                          'gpt stem outcomes OR ' +
                          'gpt stem benefits OR ' +
                          'gpt stem advantages OR ' +
                          'gpt stem performance OR ' +
                          'gpt stem achievement OR ' +
                          'gpt stem progress OR ' +
                          'gpt stem development OR ' +
                          'gpt stem efficacy OR ' +
                          'gpt stem efficiency OR ' +
                          'gpt stem success OR ' +
                          'gpt non-stem education OR ' +
                          'gpt non-stem subjects OR ' +
                          'gpt non-stem learning OR ' +
                          'gpt non-stem applications OR ' +
                          'gpt non-stem impact OR ' +
                          'gpt non-stem effectiveness OR ' +
                          'gpt non-stem outcomes')

### Section 4: Data Collection

#### How do students use GPT in study

In [24]:
porter = PorterStemmer()
relevant_videos_gpt_study_how = search_keyword_videos(gpt_study_how)

Found 50 keyword-related videos.


In [26]:
# Extract all video ids from the list of libraries return from search_keyword_videos function
gpt_study_how_videoids = [d['Video ID'] for d in relevant_videos_gpt_study_how if 'Video ID' in d]

In [28]:
# Extract Comments
gpt_study_how_comments = extract_30_comments(gpt_study_how_videoids)

#### GPT's effect on learning outcome

In [31]:
porter = PorterStemmer()
relevant_videos_gpt_learning_impact = search_keyword_videos(gpt_learning_impact)

Found 50 keyword-related videos.


In [32]:
# Extract all video ids from the list of libraries return from search_keyword_videos function
gpt_learning_impact_videoids = [d['Video ID'] for d in relevant_videos_gpt_learning_impact if 'Video ID' in d]

In [33]:
# Extract Comments
gpt_learning_impact_comments = extract_30_comments(gpt_learning_impact_videoids)

Comments are disabled for video with ID: --khbXchTeE


#### Subject differences in using GPT for study

In [35]:
porter = PorterStemmer()
relevant_videos_gpt_subject_difference = search_keyword_videos(gpt_subject_difference)

Found 50 keyword-related videos.


In [36]:
# Extract all video ids from the list of libraries return from search_keyword_videos function
gpt_subject_difference_videoids = [d['Video ID'] for d in relevant_videos_gpt_subject_difference if 'Video ID' in d]

In [37]:
# # Extract Comments
gpt_subject_difference_comments = extract_30_comments(gpt_subject_difference_videoids)

Comments are disabled for video with ID: HdIppwUJ0f8
Comments are disabled for video with ID: ttIOdAdQaUE
Comments are disabled for video with ID: EZ0IjBXMME8
Comments are disabled for video with ID: fjUUpMI2x1k
Comments are disabled for video with ID: fjUUpMI2x1k


### Creating DataFrames

In [41]:
import pandas as pd

In [42]:
# Video info dataframe
gpt_study_how_df = pd.DataFrame(relevant_videos_gpt_study_how)
gpt_learning_impact_df = pd.DataFrame(relevant_videos_gpt_learning_impact)
gpt_subject_difference_df = pd.DataFrame(relevant_videos_gpt_subject_difference)

# Comments info dataframe
gpt_study_how_comment_df = pd.DataFrame(gpt_study_how_comments)
gpt_learning_impact_comment_df = pd.DataFrame(gpt_learning_impact_comments)
gpt_subject_difference_comment_df = pd.DataFrame(gpt_subject_difference_comments)

In [49]:
# Join dataframes for cnn and fox by "Video ID"
how_df = pd.merge(gpt_study_how_comment_df, gpt_study_how_df, on = 'Video ID', how = 'left' )
impact_df = pd.merge(gpt_learning_impact_comment_df, gpt_learning_impact_df, on = 'Video ID', how = 'left' )
subject_df = pd.merge(gpt_subject_difference_comment_df, gpt_subject_difference_df, on = 'Video ID', how = 'left' )

In [51]:
how_df.head(2)

Unnamed: 0,Video ID,Comment id,Comment creation time,Comment number of likes,Comment content,Replies,Video title,Video channel,Video creation time,Video number of views
0,JZ_flEGANBw,UgwuCX475lju2-tlWHV4AaABAg,2023-09-01T14:58:22Z,526,A bunch of my professors have been freaking ou...,[Would love to talk to your professors about t...,How to learn anything fast using ChatGPT | Ful...,Cajun Koi Academy,2023-09-01T14:31:41Z,343664
1,JZ_flEGANBw,UgzClGkl0S_ao7Rd8Ah4AaABAg,2023-09-02T11:30:41Z,23,Plot Twist: All research and the script was ma...,,How to learn anything fast using ChatGPT | Ful...,Cajun Koi Academy,2023-09-01T14:31:41Z,343664


In [55]:
impact_df.head(2)

Unnamed: 0,Video ID,Comment id,Comment creation time,Comment number of likes,Comment content,Replies,Video title,Video channel,Video creation time,Video number of views
0,uml2fX7JDjI,UgwkVRSaF-AHSVy3JMF4AaABAg,2023-04-19T17:26:40Z,0,Thanks for sharing the link for the full blog ...,,How will ChatGPT impact education?,Harvard Online,2023-04-19T16:26:21Z,4601
1,uml2fX7JDjI,UgwGBtkqnu1De6VsRiJ4AaABAg,2023-11-26T23:12:09Z,0,I agree. Sometimes we become afraid of new le...,,How will ChatGPT impact education?,Harvard Online,2023-04-19T16:26:21Z,4601


In [63]:
subject_df.tail(100)

Unnamed: 0,Video ID,Comment id,Comment creation time,Comment number of likes,Comment content,Replies,Video title,Video channel,Video creation time,Video number of views
251,KgygRCdHbmc,UgwGnHPupUYlMXlNQqV4AaABAg,2023-01-17T16:54:26Z,0,"Great summary video! Thanks for your calming,...",[Glad it was helpful!],How Will Schools Respond to the A.I. Revolutio...,John Spencer,2023-01-10T02:01:40Z,92119
252,KgygRCdHbmc,UgzmnoyJNg43XVUI8kx4AaABAg,2023-04-09T10:47:46Z,0,I loved your video. Very interesting the disru...,,How Will Schools Respond to the A.I. Revolutio...,John Spencer,2023-01-10T02:01:40Z,92119
253,KgygRCdHbmc,UgwLptgAd6TQCfB3ux14AaABAg,2023-02-08T09:54:46Z,2,Hello Mr. John thank you for the wonderfully c...,[Glad it was helpful!],How Will Schools Respond to the A.I. Revolutio...,John Spencer,2023-01-10T02:01:40Z,92119
254,KgygRCdHbmc,Ugzeu08y8N7SkvWrABB4AaABAg,2023-07-28T13:35:31Z,1,Hey John - nice vid - have subbed. Can I ask w...,,How Will Schools Respond to the A.I. Revolutio...,John Spencer,2023-01-10T02:01:40Z,92119
255,KgygRCdHbmc,Ugy2grInrVXzU5n1BHJ4AaABAg,2023-05-04T14:49:21Z,1,This is an excellent outlook on AI and educati...,[Thank you!],How Will Schools Respond to the A.I. Revolutio...,John Spencer,2023-01-10T02:01:40Z,92119
...,...,...,...,...,...,...,...,...,...,...
346,0rjuQMW9obE,UgwCyHAyu3DAupeYSAF4AaABAg,2022-09-20T13:53:31Z,0,Curious what the speech-trees and facial expre...,,I Took a STEM Class Taught by an AI Robot,Mashable,2022-09-20T13:38:04Z,3241
347,0rjuQMW9obE,UgxoTa9YcB2zkYsUYGJ4AaABAg,2023-01-26T01:41:53Z,0,Wow!👍👍💗💗,,I Took a STEM Class Taught by an AI Robot,Mashable,2022-09-20T13:38:04Z,3241
348,0rjuQMW9obE,UgwgZlWDla0_IK2Zws54AaABAg,2022-09-20T15:59:43Z,3,But what is.. A woman,,I Took a STEM Class Taught by an AI Robot,Mashable,2022-09-20T13:38:04Z,3241
349,0rjuQMW9obE,Ugzo2n-Xs244CFABQcd4AaABAg,2022-09-20T16:17:33Z,2,A shameless gimmick that anyone with basic un...,[But we have teacher shortages all over the co...,I Took a STEM Class Taught by an AI Robot,Mashable,2022-09-20T13:38:04Z,3241


In [None]:
# yt_comments = pd.concat([how_df, impact_df, subject_df])