In [1]:
import json
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

`get_number_of_video_with_subs` queries YouTube search with the channel name in the search field and the `Subtitles/CC` feature filter enabled, then counts how many of the returned videos are from the queried channel.  
Note: On the initial page load, YouTube returns only a limited number of videos (usually about 20), so the number returned here is not the total number of videos with subtitles on the channel. It is only used to check whether the channel has videos with subtitles.

In [2]:
def _extract_yt_initial_data(html_content):
    """Return the parsed ``ytInitialData`` object embedded in a page.

    Scans every ``<script>`` tag for the one containing ``var ytInitialData``
    and decodes the JSON on the right-hand side of the assignment.
    Returns ``None`` when no such script is present.
    """
    soup = BeautifulSoup(html_content, features='html.parser')
    for script in soup.find_all('script'):
        if script.string is not None and 'var ytInitialData' in script.string:
            # The script reads `var ytInitialData = {...};` - keep the right-hand
            # side and drop surrounding whitespace plus the trailing semicolon.
            payload = script.string.split("=", 1)[1].strip().rstrip(";")
            return json.loads(payload)
    return None


def _channel_handle_of(video):
    """Return the uploader handle of a ``videoRenderer`` entry, or ``None``.

    The handle is the ``canonicalBaseUrl`` of the first byline run with its
    leading "/" removed. Entries that lack any of the nested keys yield
    ``None`` instead of raising, so one odd result cannot abort the count.
    """
    try:
        byline = video["videoRenderer"]["longBylineText"]["runs"][0]
        return byline["navigationEndpoint"]["browseEndpoint"]["canonicalBaseUrl"][1:]
    except (KeyError, IndexError, TypeError):
        return None


def get_number_of_video_with_subs(channel_username):
    """Count first-page search results that belong to ``channel_username``.

    Requests the YouTube search page for ``channel_username`` (with the
    ``sp`` filter value used by this notebook for Subtitles/CC) and counts
    the ``videoRenderer`` results whose channel handle equals
    ``channel_username``, compared case-insensitively.

    Args:
        channel_username: Channel handle to search for, e.g. ``"@name"``.

    Returns:
        The number of matching videos found in the initial page data.

    Raises:
        ValueError: If the page does not contain ``ytInitialData``.
        requests.RequestException: On timeouts, connection errors or a
            non-success HTTP status.
    """
    # Percent-encode the handle so characters such as "&", "#" or spaces cannot
    # corrupt the query string. "@" is kept literal so ordinary handles produce
    # exactly the same URL as before.
    query = quote(channel_username, safe="@")
    url = f"https://www.youtube.com/results?search_query={query}&sp=EgIoAQ%253D%253D"
    # A timeout keeps one stalled request from hanging a whole crawl.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    data = _extract_yt_initial_data(response.text)
    if data is None:
        raise ValueError(f"ytInitialData not found in search page for {channel_username!r}")

    sections = data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]["sectionListRenderer"]["contents"]
    wanted = channel_username.lower()
    number_of_video = 0
    for section in sections:
        # Non-result sections (e.g. continuation markers) carry no item list.
        if "itemSectionRenderer" not in section:
            continue
        for video in section["itemSectionRenderer"]["contents"]:
            # Search results may contain ads or placeholders, so skip them.
            if "videoRenderer" not in video:
                continue
            handle = _channel_handle_of(video)
            if handle is not None and handle.lower() == wanted:
                number_of_video += 1

    return number_of_video


In [3]:
# Quick check on a single channel handle (this issues a live request to YouTube).
get_number_of_video_with_subs("@truevisionsofficial")

1

In [4]:
# Extract the channel handle (the last path segment) from each channel URL.
with open("data_sample/channelcrawler_th_top100.txt", "r") as f:
    channel_url_list = f.read().split()

# Strip a trailing "/" first so ".../@name/" still yields "@name" rather than "".
channel_username_list = [churl.rstrip("/").rsplit("/", 1)[1] for churl in channel_url_list]

# A channel is kept only when more than this many first-page results belong to it.
MIN_VIDS_WITH_SUBS = 10

result = {}             # handle -> number of matching videos found
ch_with_subs = []       # handles whose count exceeds MIN_VIDS_WITH_SUBS
failed_channels = []    # handles whose lookup raised, for inspection or retry
for channel_username in tqdm(channel_username_list):
    try:
        num_vids_with_subs = get_number_of_video_with_subs(channel_username)
    except Exception as e:
        print(e)
        print(channel_username)
        # Skip this channel: falling through would record the previous
        # channel's count (or hit an undefined name on the first iteration).
        failed_channels.append(channel_username)
        continue
    result[channel_username] = num_vids_with_subs
    if num_vids_with_subs > MIN_VIDS_WITH_SUBS:
        ch_with_subs.append(channel_username)


  0%|          | 0/100 [00:00<?, ?it/s]

In [5]:
# Per-channel counts collected by the crawl loop (handle -> number of matching videos).
result

{'@mtr9999': 0,
 '@108lifeth': 0,
 '@skizztv56': 0,
 '@bongamerz': 0,
 '@morningnewstv3': 0,
 '@bosskerati': 20,
 '@zbingz': 20,
 '@matichontv': 0,
 '@nontakans': 0,
 '@notisgodchannel': 0,
 '@genelab': 11,
 '@hampmn': 0,
 '@kiddeetv': 0,
 '@4sthailand': 0,
 '@deksorkrao': 4,
 '@workpointofficial': 19,
 '@oganic': 0,
 '@thaich8news': 0,
 '@skylaxy': 1,
 '@familygangofficial': 0,
 '@jokerfamily2015': 0,
 '@peacheatlaek': 20,
 '@kidssongnamo': 0,
 '@aoffymaxim': 1,
 '@overact': 7,
 '@hehaatvchannel': 0,
 '@janesergo': 0,
 '@ch3thailand': 15,
 '@mr.b-2023': 0,
 '@tvthunderofficial': 0,
 '@topbytoast': 0,
 '@tonmaimusicstudio': 2,
 '@oppathuchy': 16,
 '@teromusic': 12,
 '@wiwawawowtv': 0,
 '@manshowhowweare': 0,
 '@one31official': 0,
 '@newxlives': 0,
 '@smallroomofficial': 3,
 '@thanakornzaadoat': 0,
 '@fairydollchannel': 0,
 '@ruok1': 0,
 '@whattheducktube': 0,
 '@wongklomfc1': 0,
 '@iambuilder': 0,
 '@tigerzap': 0,
 '@tintinandthegang': 0,
 '@majorgroup': 1,
 '@mcotofficial': 0,
 '@mark

In [6]:
# Number of channels whose count exceeded the threshold in the crawl loop.
len(ch_with_subs)

16

In [7]:
# Handles of the channels whose count exceeded the threshold in the crawl loop.
ch_with_subs

['@bosskerati',
 '@zbingz',
 '@genelab',
 '@workpointofficial',
 '@peacheatlaek',
 '@ch3thailand',
 '@oppathuchy',
 '@teromusic',
 '@mymatenate',
 '@samsearnofficial',
 '@thaipbs',
 '@spicydisc',
 '@fedfeclip',
 '@gmmtv',
 '@indysongkids',
 '@sunbeary']

# Write Channel URLs to text file

In [8]:
# Save the selected channels as full URLs, one per line (no trailing newline).
with open("channel_with_subs.txt", "w") as out_file:
    ch_with_subs_url = []
    for handle in ch_with_subs:
        ch_with_subs_url.append(f"https://youtube.com/{handle}")
    out_file.write("\n".join(ch_with_subs_url))