# YouTube API Data Retrieval
NOTE: Retrieving the YouTube data takes considerable time and API quota, so the cells in this notebook have not been re-run.

## 0. Preparation: Install Necessary Python Packages (google-api-python-client and youtube-transcript-api)

In [1]:
# !pip install google-api-python-client youtube-transcript-api
# !pip install lxml requests
# ! pip install -r requirements.txt
# ! pip install --upgrade google-api-python-client
# ! pip install textblob
# ! pip install selenium
#! pip install langdetect

In [2]:
import csv
import datetime
import json
import os
import re
import time

import numpy as np
import pandas as pd
import requests
from googleapiclient.discovery import build
from langdetect import DetectorFactory
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from lxml import html
from youtube_transcript_api import YouTubeTranscriptApi

## 1. Retrieve YouTube Video IDs
This is done using the YouTube Data API v3.

Initialize the API key (`api_key`). The key is created and enabled in the Google Cloud Console.

In [3]:
# Read the YouTube Data API key from the environment so the secret is never stored in the notebook.
# Set it before launching Jupyter, e.g. `export YOUTUBE_API_KEY=<your key>`.
# NOTE: the key that used to be hard-coded here has been exposed and should be revoked
# in the Google Cloud Console.
api_key = os.environ.get('YOUTUBE_API_KEY', '')
if not api_key:
    print("Warning: YOUTUBE_API_KEY is not set; YouTube API requests will fail.")

Define a function that retrieves the relevant YouTube video IDs.

Because of API quota limits, the videos are retrieved year by year so that the quota is not exhausted in a single query. If the quota is used up, a back-up API key is used to continue the data retrieval.

In [4]:
def retrieve_youtube_video_ids(api_key, published_after, published_before,
                               max_results=1500, query="disability"):
    """Collect IDs of YouTube videos matching a search query within a date window.

    Pages through ``search().list`` results, 50 per page, until ``max_results``
    IDs have been gathered or the API stops returning a ``nextPageToken``.

    Args:
        api_key: YouTube Data API v3 developer key.
        published_after: Lower bound passed as ``publishedAfter``
            (timestamp string such as '2016-01-01T00:00:00Z').
        published_before: Upper bound passed as ``publishedBefore`` (same format).
        max_results: Maximum number of video IDs to return.
        query: Search term passed as ``q``. Defaults to "disability".

    Returns:
        list: At most ``max_results`` video IDs, in the order the API returned them.
    """
    # Build the YouTube client
    youtube = build('youtube', 'v3', developerKey=api_key)

    video_ids = []

    # First page of the search; later pages are derived from it with list_next().
    request = youtube.search().list(
        part="id",
        type="video",
        maxResults=50,  # Maximum allowed per API documentation
        q=query,
        publishedAfter=published_after,
        publishedBefore=published_before
    )

    while request and len(video_ids) < max_results:
        response = request.execute()

        video_ids.extend(item['id']['videoId'] for item in response.get('items', []))

        # Stop as soon as there is nothing more to fetch or enough IDs were
        # collected, so no pause is spent before returning.
        if 'nextPageToken' not in response or len(video_ids) >= max_results:
            break

        request = youtube.search().list_next(request, response)

        # Pause between pages to avoid quota errors
        time.sleep(1)

    return video_ids[:max_results]

Run the function to retrieve relevant videos, and save them separately in json files.

In [5]:
# Small helper reused by the retrieval cells below.
def save_json(file_name, data):
    """Write ``data`` to ``file_name`` as JSON, replacing any existing file."""
    with open(file_name, mode='w') as handle:
        json.dump(data, fp=handle)

In [6]:
# Fetch the IDs of videos published during 2016, report the count, and store them as JSON.
video_ids16 = retrieve_youtube_video_ids(
    api_key,
    published_after='2016-01-01T00:00:00Z',
    published_before='2016-12-31T23:59:59Z',
)
print(f"Retrieved {len(video_ids16)} video IDs in 2016.")
save_json(file_name='Disability_video_ids16.json', data=video_ids16)

Retrieved 514 video IDs in 2016.


In [7]:
# Fetch the IDs of videos published during 2017, report the count, and store them as JSON.
video_ids17 = retrieve_youtube_video_ids(
    api_key,
    published_after='2017-01-01T00:00:00Z',
    published_before='2017-12-31T23:59:59Z',
)
print(f"Retrieved {len(video_ids17)} video IDs in 2017.")
save_json(file_name='Disability_video_ids17.json', data=video_ids17)

Retrieved 560 video IDs in 2017.


In [8]:
# Fetch the IDs of videos published during 2018, report the count, and store them as JSON.
video_ids18 = retrieve_youtube_video_ids(
    api_key,
    published_after='2018-01-01T00:00:00Z',
    published_before='2018-12-31T23:59:59Z',
)
print(f"Retrieved {len(video_ids18)} video IDs in 2018.")
save_json(file_name='Disability_video_ids18.json', data=video_ids18)

Retrieved 554 video IDs in 2018.


In [9]:
# Fetch the IDs of videos published during 2019, report the count, and store them as JSON.
video_ids19 = retrieve_youtube_video_ids(
    api_key,
    published_after='2019-01-01T00:00:00Z',
    published_before='2019-12-31T23:59:59Z',
)
print(f"Retrieved {len(video_ids19)} video IDs in 2019.")
save_json(file_name='Disability_video_ids19.json', data=video_ids19)

Retrieved 550 video IDs in 2019.


In [10]:
# Fetch the IDs of videos published during 2020, report the count, and store them as JSON.
video_ids20 = retrieve_youtube_video_ids(
    api_key,
    published_after='2020-01-01T00:00:00Z',
    published_before='2020-12-31T23:59:59Z',
)
print(f"Retrieved {len(video_ids20)} video IDs in 2020.")
save_json(file_name='Disability_video_ids20.json', data=video_ids20)

Retrieved 609 video IDs in 2020.


In [11]:
# Fetch the IDs of videos published during 2021, report the count, and store them as JSON.
video_ids21 = retrieve_youtube_video_ids(
    api_key,
    published_after='2021-01-01T00:00:00Z',
    published_before='2021-12-31T23:59:59Z',
)
print(f"Retrieved {len(video_ids21)} video IDs in 2021.")
save_json(file_name='Disability_video_ids21.json', data=video_ids21)

Retrieved 624 video IDs in 2021.


In [12]:
# Fetch the IDs of videos published during 2022, report the count, and store them as JSON.
video_ids22 = retrieve_youtube_video_ids(
    api_key,
    published_after='2022-01-01T00:00:00Z',
    published_before='2022-12-31T23:59:59Z',
)
print(f"Retrieved {len(video_ids22)} video IDs in 2022.")
save_json(file_name='Disability_video_ids22.json', data=video_ids22)

Retrieved 579 video IDs in 2022.


In [13]:
# Fetch the IDs of videos published during 2023, report the count, and store them as JSON.
video_ids23 = retrieve_youtube_video_ids(
    api_key,
    published_after='2023-01-01T00:00:00Z',
    published_before='2023-12-31T23:59:59Z',
)
print(f"Retrieved {len(video_ids23)} video IDs in 2023.")
save_json(file_name='Disability_video_ids23.json', data=video_ids23)

Retrieved 602 video IDs in 2023.


In [15]:
# Fetch the IDs of videos published during 2024, report the count, and store them as JSON.
video_ids24 = retrieve_youtube_video_ids(
    api_key,
    published_after='2024-01-01T00:00:00Z',
    published_before='2024-12-31T23:59:59Z',
)
print(f"Retrieved {len(video_ids24)} video IDs in 2024.")
save_json(file_name='Disability_video_ids24.json', data=video_ids24)

Retrieved 546 video IDs in 2024.


In [16]:
# Per-year ID files written by the retrieval cells (2016 through 2024).
file_names = [
    "Disability_video_ids16.json",
    "Disability_video_ids17.json",
    "Disability_video_ids18.json",
    "Disability_video_ids19.json",
    "Disability_video_ids20.json",
    "Disability_video_ids21.json",
    "Disability_video_ids22.json",
    "Disability_video_ids23.json",
    "Disability_video_ids24.json",
]

# Concatenate the contents of every yearly file, keeping the file order.
merged_files = []
for yearly_path in file_names:
    with open(yearly_path, 'r') as source:
        yearly_ids = json.load(source)
    merged_files += yearly_ids  # each file is expected to hold a list

# Store the combined list in a single file.
with open('Disability_video_ids_16-24.json', 'w') as target:
    json.dump(merged_files, target)

print("Files have been merged successfully.")

Files have been merged successfully.


## 2. Retrieve relevant parameters for future analysis

Using the Google API, the following elements are retrieved:
1. the title of the video;
2. the description of the video;
3. the published year of the video;
4. the channel of the video.

These are all publicly available through the official YouTube Data API, as described in its documentation (https://developers.google.com/youtube/v3/docs/?apix=true).

Load the video ids retrieved in the first step.

In [21]:
# Function to fetch and save video details to a CSV file
def fetch_and_save_video_details(input_json, output_csv, completion_message):
    """Fetch snippet metadata for every video ID in a JSON file and write it to CSV.

    Args:
        input_json: Path to a JSON file containing a list of video IDs.
        output_csv: Path of the CSV file to create (overwritten if it exists).
        completion_message: Format string with one ``{}`` placeholder; it is
            filled with ``output_csv`` and printed when the CSV has been written.

    Notes:
        Uses the module-level ``api_key``. A video for which the API returns no
        item is still written to the CSV, with only the 'Video ID' column filled.
    """
    fieldnames = ['Video ID', 'Title', 'Published Year', 'Description', 'Channel']

    # Load video IDs from JSON file
    with open(input_json, 'r') as f:
        disability_video_ids = json.load(f)

    # Build the client once; it does not depend on the video being queried.
    youtube = build('youtube', 'v3', developerKey=api_key)

    video_details = []  # one dict per video, keyed by the CSV column names
    missing_ids = []    # IDs for which the API returned no item

    # NOTE(review): one request is issued per video ID; the API's `id` parameter
    # may accept several comma-separated IDs, which would cut quota usage —
    # confirm against the videos.list documentation before batching.
    for video_id in disability_video_ids:
        response = youtube.videos().list(
            part='snippet',
            id=video_id
        ).execute()

        items = response.get('items')
        if items:
            snippet = items[0]['snippet']
            video_details.append({
                'Video ID': video_id,
                'Title': snippet['title'],
                # The first four characters of publishedAt are the year.
                'Published Year': snippet['publishedAt'][:4],
                'Description': snippet['description'],
                'Channel': snippet['channelTitle']
            })
        else:
            # Only keys listed in `fieldnames` may appear in a row: DictWriter
            # raises ValueError on unknown keys, which would discard every
            # record fetched so far. Missing columns are filled by `restval`.
            missing_ids.append(video_id)
            video_details.append({'Video ID': video_id})

    # Explicit UTF-8 so titles/descriptions with non-ASCII characters are written
    # regardless of the platform's default encoding.
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval='')
        writer.writeheader()
        writer.writerows(video_details)

    if missing_ids:
        print(f"{len(missing_ids)} video(s) returned no details: {missing_ids}")

    # Print completion message with the output file name included
    print(completion_message.format(output_csv))

In [22]:
# Read the YouTube Data API key from the environment so the secret is never stored in the notebook.
# Set it before launching Jupyter, e.g. `export YOUTUBE_API_KEY=<your key>`.
# NOTE: the key that used to be hard-coded here has been exposed and should be revoked
# in the Google Cloud Console.
api_key = os.environ.get('YOUTUBE_API_KEY', '')
if not api_key:
    print("Warning: YOUTUBE_API_KEY is not set; YouTube API requests will fail.")

In [19]:
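# Back-up API key, used when the primary key's quota is exhausted.
# Keep it out of the notebook as well: load it from a separate environment variable.
# api_key = os.environ['YOUTUBE_API_KEY_BACKUP']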

In [23]:
# Download snippet metadata for the 2016 video IDs and write it to CSV.
fetch_and_save_video_details(
    input_json='Disability_video_ids16.json',
    output_csv='Disability_videos_16.csv',
    completion_message="Video details for 2016 fetched and saved successfully in {}.",
)

Video details for 2016 fetched and saved successfully in Disability_videos_16.csv.


In [24]:
# Download snippet metadata for the 2017 video IDs and write it to CSV.
fetch_and_save_video_details(
    input_json='Disability_video_ids17.json',
    output_csv='Disability_videos_17.csv',
    completion_message="Video details for 2017 fetched and saved successfully in {}.",
)

Video details for 2017 fetched and saved successfully in Disability_videos_17.csv.


In [25]:
# Download snippet metadata for the 2018 video IDs and write it to CSV.
fetch_and_save_video_details(
    input_json='Disability_video_ids18.json',
    output_csv='Disability_videos_18.csv',
    completion_message="Video details for 2018 fetched and saved successfully in {}.",
)

Video details for 2018 fetched and saved successfully in Disability_videos_18.csv.


In [26]:
# Download snippet metadata for the 2019 video IDs and write it to CSV.
fetch_and_save_video_details(
    input_json='Disability_video_ids19.json',
    output_csv='Disability_videos_19.csv',
    completion_message="Video details for 2019 fetched and saved successfully in {}.",
)

Video details for 2019 fetched and saved successfully in Disability_videos_19.csv.


In [27]:
# Download snippet metadata for the 2020 video IDs and write it to CSV.
fetch_and_save_video_details(
    input_json='Disability_video_ids20.json',
    output_csv='Disability_videos_20.csv',
    completion_message="Video details for 2020 fetched and saved successfully in {}.",
)

Video details for 2020 fetched and saved successfully in Disability_videos_20.csv.


In [28]:
# Download snippet metadata for the 2021 video IDs and write it to CSV.
fetch_and_save_video_details(
    input_json='Disability_video_ids21.json',
    output_csv='Disability_videos_21.csv',
    completion_message="Video details for 2021 fetched and saved successfully in {}.",
)

Video details for 2021 fetched and saved successfully in Disability_videos_21.csv.


In [29]:
# Download snippet metadata for the 2022 video IDs and write it to CSV.
fetch_and_save_video_details(
    input_json='Disability_video_ids22.json',
    output_csv='Disability_videos_22.csv',
    completion_message="Video details for 2022 fetched and saved successfully in {}.",
)

Video details for 2022 fetched and saved successfully in Disability_videos_22.csv.


In [30]:
# Download snippet metadata for the 2023 video IDs and write it to CSV.
fetch_and_save_video_details(
    input_json='Disability_video_ids23.json',
    output_csv='Disability_videos_23.csv',
    completion_message="Video details for 2023 fetched and saved successfully in {}.",
)

Video details for 2023 fetched and saved successfully in Disability_videos_23.csv.


In [31]:
# Download snippet metadata for the 2024 video IDs and write it to CSV.
fetch_and_save_video_details(
    input_json='Disability_video_ids24.json',
    output_csv='Disability_videos_24.csv',
    completion_message="Video details for 2024 fetched and saved successfully in {}.",
)

Video details for 2024 fetched and saved successfully in Disability_videos_24.csv.


## 3. Data Cleaning and Pre-processing

Combine all yearly CSV files into the final video-details CSV.

In [32]:
# Per-year detail files written by fetch_and_save_video_details (2016 through 2024).
file_names = [
    'Disability_videos_16.csv',
    'Disability_videos_17.csv',
    'Disability_videos_18.csv',
    'Disability_videos_19.csv',
    'Disability_videos_20.csv',
    'Disability_videos_21.csv',
    'Disability_videos_22.csv',
    'Disability_videos_23.csv',
    'Disability_videos_24.csv',
]

# Read each yearly file, then stack them into one frame with a fresh 0..n-1 index.
yearly_frames = [pd.read_csv(csv_path) for csv_path in file_names]
combined_df = pd.concat(yearly_frames, ignore_index=True)

# Write the stacked frame out without the index column.
combined_df.to_csv('Disability_videos_combined.csv', index=False)

print("All files have been combined into Disability_videos_combined.csv.")

All files have been combined into Disability_videos_combined.csv.


The total sample consists of 5138 videos.

Shuffle the rows for labeling

In [33]:
# Fixed seed so the shuffle (and the labeled/unlabeled split taken from the
# shuffled order further down) is the same on every run.
# NOTE(review): rows labeled from an earlier, unseeded run will not match this order.
SHUFFLE_SEED = 42

# Load the combined CSV file
df = pd.read_csv('Disability_videos_combined.csv')

# Shuffle all rows (frac=1) and renumber the index from 0
shuffled_df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Save the shuffled DataFrame back to CSV
shuffled_df.to_csv('Disability_videos_combined_shuffled.csv', index=False)

Drop rows with missing (NA) descriptions and non-English descriptions

In [34]:
# Discard every row whose 'Description' value is missing.
shuffled_df = shuffled_df.dropna(axis='index', how='any', subset=['Description'])

4636 videos are left after dropping the videos with a missing description.

Drop the non-English descriptions.
The langdetect library (https://pypi.org/project/langdetect/) is used to detect the language of each description.

In [36]:
# langdetect is non-deterministic by default; fixing the seed before the first
# detection makes repeated runs classify the same text the same way.
DetectorFactory.seed = 0


def is_english(text):
    """Return True when langdetect identifies ``text`` as English.

    Non-string values, and text for which ``detect`` raises
    LangDetectException, yield False.
    """
    if not isinstance(text, str):
        return False
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

# Keep only the rows whose description is detected as English.
english_mask = shuffled_df['Description'].apply(is_english)
shuffled_df = shuffled_df[english_mask]

4349 videos are left after dropping the non-English descriptions.

Save the final video details csv into Disability_videos_combined_shuffled_cleaned.csv

In [37]:
# Write the cleaned sample to disk without the index column.
cleaned_csv_path = 'Disability_videos_combined_shuffled_cleaned.csv'
shuffled_df.to_csv(cleaned_csv_path, index=False)

Split the csv file into two parts, one for labeling and training, one for future classification.

In [38]:
# Number of leading rows of the shuffled, cleaned sample that go into the labeled part.
labeled_row_count = 251
labeled_csv = 'Disability_videos_combined_shuffled_cleaned_labeled.csv'
unlabeled_csv = 'Disability_videos_combined_shuffled_cleaned_unlabeled.csv'

# Positional split: the first rows form the labeled set, the remainder the unlabeled set.
shuffled_df_labeled = shuffled_df.iloc[:labeled_row_count]
shuffled_df_unlabeled = shuffled_df.iloc[labeled_row_count:]

shuffled_df_labeled.to_csv(labeled_csv, index=False)
shuffled_df_unlabeled.to_csv(unlabeled_csv, index=False)

print(f"CSV file split into {labeled_csv} and {unlabeled_csv}.")

CSV file split into Disability_videos_combined_shuffled_cleaned_labeled.csv and Disability_videos_combined_shuffled_cleaned_unlabeled.csv.
