In [1]:
import os, sys, re
import requests
import numpy as np
from urllib.parse import urljoin
from bs4 import BeautifulSoup

In [2]:
def saveFullHtmlPage(url, pagepath='page', session=requests.Session(), html=None):
    """Save web page html and supported contents        
        * pagepath : path-to-page   
        It will create a file  `'path-to-page'.html` and a folder `'path-to-page'_files`
    """
    def savenRename(soup, pagefolder, session, url, tag, inner):
        if not os.path.exists(pagefolder): # create only once
            os.mkdir(pagefolder)
        for res in soup.findAll(tag):   # images, css, etc..
            if res.has_attr(inner): # check inner tag (file object) MUST exists  
                try:
                    filename, ext = os.path.splitext(os.path.basename(res[inner])) # get name and extension
                    filename = re.sub('\W+', '', filename) + ext # clean special chars from name
                    fileurl = urljoin(url, res.get(inner))
                    filepath = os.path.join(pagefolder, filename)
                    # rename html ref so can move html and folder of files anywhere
                    res[inner] = os.path.join(os.path.basename(pagefolder), filename)
                    if not os.path.isfile(filepath): # was not downloaded
                        with open(filepath, 'wb') as file:
                            filebin = session.get(fileurl)
                            file.write(filebin.content)
                except Exception as exc:
                    print(exc, file=sys.stderr)
    if not html:
        html = session.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    path, _ = os.path.splitext(pagepath)
    pagefolder = path+'_files' # page contents folder
    tags_inner = {'img': 'src', 'link': 'href', 'script': 'src'} # tag&inner tags to grab
    for tag, inner in tags_inner.items(): # saves resource files and rename refs
        savenRename(soup, pagefolder, session, url, tag, inner)
    with open(path+'.html', 'wb') as file: # saves modified html doc
        file.write(soup.prettify('utf-8'))

In [3]:
saveFullHtmlPage('https://www.youtube.com/results?search_query=love', 'love')


In [4]:
with open("love.html") as fp:
        soup = BeautifulSoup(fp, 'html.parser')

In [5]:
print(type(soup.select("body")[0].select("script")[13]))
# "videoIds": ["

<class 'bs4.element.Tag'>


In [6]:
a = soup.select("body")[0]


In [7]:
paragraphs = []
for x in a:
    paragraphs.append(str(x))

In [8]:
paragraphs

['\n',
 '<script nonce="6qdkeKuW7aHgsfRx9zi36A">\n   if (window.ytcsi) {window.ytcsi.tick(\'bs\', null, \'\');}\n  </script>',
 '\n',
 '<script nonce="6qdkeKuW7aHgsfRx9zi36A">\n   ytcfg.set(\'initialBodyClientWidth\', document.body.clientWidth);\n  </script>',
 '\n',
 '<script nonce="6qdkeKuW7aHgsfRx9zi36A">\n   if (window.ytcsi) {window.ytcsi.tick(\'ai\', null, \'\');}\n  </script>',
 '\n',
 '<iframe name="passive_signin" src="https://accounts.google.com/ServiceLogin?service=youtube&amp;uilel=3&amp;passive=true&amp;continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Faction_handle_signin%3Dtrue%26app%3Ddesktop%26hl%3Den%26next%3D%252Fsignin_passive%26feature%3Dpassive&amp;hl=en" style="display: none">\n</iframe>',
 '\n',
 '<ytd-app>\n<ytd-masthead class="shell" id="masthead" logo-type="YOUTUBE_LOGO" slot="masthead">\n<div class="ytd-searchbox-spt" id="search-container" slot="search-container">\n</div>\n<div class="ytd-searchbox-spt" id="search-input" slot="search-input">\n<input autocapi

In [9]:
for i, a in enumerate(paragraphs):
    if "\"videoIds\":[\"" in a:
        b = paragraphs[31]

In [10]:
"\"videoIds\":[\"" in b

True

In [11]:
match=(re.search("videoIds", b))


In [12]:
match


<re.Match object; span=(18296, 18304), match='videoIds'>

In [13]:
b[7998: 8009]

'C5LLTp6hgY4'

In [14]:
result = [_.start() for _ in re.finditer("\"videoIds\":", b)]
result

[18295,
 18342,
 21220,
 21267,
 26540,
 26587,
 29438,
 29485,
 34076,
 34123,
 36999,
 37046,
 41586,
 41633,
 44503,
 44550,
 49863,
 49910,
 52627,
 52674,
 58072,
 58119,
 60891,
 60938,
 65642,
 65689,
 68540,
 68587,
 73440,
 73487,
 76417,
 76464,
 81235,
 81282,
 84135,
 84182,
 89250,
 89297,
 92172,
 92219,
 97068,
 97115,
 99993,
 100040,
 104663,
 104710,
 107614,
 107661,
 112388,
 112435,
 115315,
 115362,
 119819,
 119866,
 122773,
 122820,
 127889,
 127936,
 130812,
 130859,
 135527,
 135574,
 138468,
 138515,
 143005,
 143052,
 145920,
 145967,
 151605,
 151652,
 154503,
 154550,
 164180,
 164227,
 167105,
 167152,
 172139,
 172186,
 175049,
 175096,
 179919,
 179966,
 182900,
 182947,
 187513,
 187560,
 190460,
 190507,
 195423,
 195470,
 198348,
 198395,
 202796,
 202843,
 205713,
 205760,
 210578,
 210625,
 213478,
 213525,
 218209,
 218256,
 221190,
 221237,
 226204,
 226251,
 229102,
 229149,
 233663,
 233710,
 236611,
 236658,
 241486,
 241533,
 244418,
 244465,

In [15]:
b[7985]+ b[7986] + b[7987]+ b[7988]+ b[7989]+ b[7990]+ b[7991]+ b[7992]+ b[7993]+ b[7994]+ b[7995]+ b[7996]

'eraction/?ai'

In [16]:
result

[18295,
 18342,
 21220,
 21267,
 26540,
 26587,
 29438,
 29485,
 34076,
 34123,
 36999,
 37046,
 41586,
 41633,
 44503,
 44550,
 49863,
 49910,
 52627,
 52674,
 58072,
 58119,
 60891,
 60938,
 65642,
 65689,
 68540,
 68587,
 73440,
 73487,
 76417,
 76464,
 81235,
 81282,
 84135,
 84182,
 89250,
 89297,
 92172,
 92219,
 97068,
 97115,
 99993,
 100040,
 104663,
 104710,
 107614,
 107661,
 112388,
 112435,
 115315,
 115362,
 119819,
 119866,
 122773,
 122820,
 127889,
 127936,
 130812,
 130859,
 135527,
 135574,
 138468,
 138515,
 143005,
 143052,
 145920,
 145967,
 151605,
 151652,
 154503,
 154550,
 164180,
 164227,
 167105,
 167152,
 172139,
 172186,
 175049,
 175096,
 179919,
 179966,
 182900,
 182947,
 187513,
 187560,
 190460,
 190507,
 195423,
 195470,
 198348,
 198395,
 202796,
 202843,
 205713,
 205760,
 210578,
 210625,
 213478,
 213525,
 218209,
 218256,
 221190,
 221237,
 226204,
 226251,
 229102,
 229149,
 233663,
 233710,
 236611,
 236658,
 241486,
 241533,
 244418,
 244465,

In [17]:
result = np.add(result, 13).tolist()


In [18]:
result

[18308,
 18355,
 21233,
 21280,
 26553,
 26600,
 29451,
 29498,
 34089,
 34136,
 37012,
 37059,
 41599,
 41646,
 44516,
 44563,
 49876,
 49923,
 52640,
 52687,
 58085,
 58132,
 60904,
 60951,
 65655,
 65702,
 68553,
 68600,
 73453,
 73500,
 76430,
 76477,
 81248,
 81295,
 84148,
 84195,
 89263,
 89310,
 92185,
 92232,
 97081,
 97128,
 100006,
 100053,
 104676,
 104723,
 107627,
 107674,
 112401,
 112448,
 115328,
 115375,
 119832,
 119879,
 122786,
 122833,
 127902,
 127949,
 130825,
 130872,
 135540,
 135587,
 138481,
 138528,
 143018,
 143065,
 145933,
 145980,
 151618,
 151665,
 154516,
 154563,
 164193,
 164240,
 167118,
 167165,
 172152,
 172199,
 175062,
 175109,
 179932,
 179979,
 182913,
 182960,
 187526,
 187573,
 190473,
 190520,
 195436,
 195483,
 198361,
 198408,
 202809,
 202856,
 205726,
 205773,
 210591,
 210638,
 213491,
 213538,
 218222,
 218269,
 221203,
 221250,
 226217,
 226264,
 229115,
 229162,
 233676,
 233723,
 236624,
 236671,
 241499,
 241546,
 244431,
 244478

In [19]:
settt = set()
for n in result:
    settt.add(b[n:n+11])

In [20]:
settt = list(settt)

In [21]:
settt

['KC8kOyaxX5I',
 'seeYKY17780',
 'UZWmtxLiiFE',
 'KHPpW_ScIc8',
 'DRg7cbsFJ34',
 'dNtcLPDSwB0',
 'omEuqt19Uhs',
 'UT-dIXigLFE',
 'ox7RsX1Ee34',
 '0pObLdN9ICg',
 '9PBZy9j3H3I',
 'kg_Ne7FknWc',
 'KV2ssT8lzj8',
 'w6NxHj3L_XY',
 'e7pV6Eo1I0k',
 'iz3BtgPmBU0',
 '2pQN-hJQiso',
 'w9_-SCFNeag',
 'Jg1j5Ed_zt4',
 'bgjUzhdmmF0',
 'Kg7UWnNF_rA',
 'f_HmF84G7ZY',
 'tmSzRx9RYLk',
 'm4QklgrKNCE',
 '4-3xejuhTM8',
 'LEBUgI0WeOI',
 'Uq2G-r1wpvY',
 'qWsk-kp8g7E',
 'PxeQfYWikvI',
 'D-YDEyuDxWU',
 'OIKC2ggCed4',
 'KTkW636B8xg',
 'VvMHkN3dOfU',
 'vpUb3mOhRnk',
 'zn6n8BT5_WQ',
 '8WYHDfJDPDc',
 'ZqK2As0qimM',
 '3SaVOpgSa6M']

In [22]:
for i in range(len(settt)):
    settt[i] = "youtube.com/watch?v=" + settt[i]

In [23]:
settt

['youtube.com/watch?v=KC8kOyaxX5I',
 'youtube.com/watch?v=seeYKY17780',
 'youtube.com/watch?v=UZWmtxLiiFE',
 'youtube.com/watch?v=KHPpW_ScIc8',
 'youtube.com/watch?v=DRg7cbsFJ34',
 'youtube.com/watch?v=dNtcLPDSwB0',
 'youtube.com/watch?v=omEuqt19Uhs',
 'youtube.com/watch?v=UT-dIXigLFE',
 'youtube.com/watch?v=ox7RsX1Ee34',
 'youtube.com/watch?v=0pObLdN9ICg',
 'youtube.com/watch?v=9PBZy9j3H3I',
 'youtube.com/watch?v=kg_Ne7FknWc',
 'youtube.com/watch?v=KV2ssT8lzj8',
 'youtube.com/watch?v=w6NxHj3L_XY',
 'youtube.com/watch?v=e7pV6Eo1I0k',
 'youtube.com/watch?v=iz3BtgPmBU0',
 'youtube.com/watch?v=2pQN-hJQiso',
 'youtube.com/watch?v=w9_-SCFNeag',
 'youtube.com/watch?v=Jg1j5Ed_zt4',
 'youtube.com/watch?v=bgjUzhdmmF0',
 'youtube.com/watch?v=Kg7UWnNF_rA',
 'youtube.com/watch?v=f_HmF84G7ZY',
 'youtube.com/watch?v=tmSzRx9RYLk',
 'youtube.com/watch?v=m4QklgrKNCE',
 'youtube.com/watch?v=4-3xejuhTM8',
 'youtube.com/watch?v=LEBUgI0WeOI',
 'youtube.com/watch?v=Uq2G-r1wpvY',
 'youtube.com/watch?v=qWsk-k