In [None]:
import requests
import json
import re
from tqdm import tqdm
from urllib.parse import unquote # decode percent-encoded (e.g. '%26' -> '&')
import subprocess
import os

In [None]:
# --- User settings ---------------------------------------------------------
# Watch-page URL of the content to download. The commented-out line below is
# an alternative URL kept for quick switching.
content_url = 'https://www.youtube.com/watch?v=Opp9nqiN5m0'
# content_url = 'https://www.youtube.com/watch?v=NyUTYwZe_l4'
# Directory that receives the intermediate streams and the merged result.
save_dir = r'D:\Videos'
# Output filename without extension. When left as None, a later cell derives
# it from the title of the content.
save_filename = None # filename without extension; None means "use the content title"

In [None]:
# Header values sent with every request made through the shared session.
http_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
http_referer = 'https://www.youtube.com/'

# One session is reused for the page fetch and for every ranged block request.
session = requests.Session()
session.headers['User-Agent'] = http_ua
session.headers['Referer'] = http_referer

def legalized_filename(filename):
    """Return ``filename`` with characters that are unsafe in file names replaced.

    Paired straight double quotes become a pair of curly quotes. Every other
    reserved character is swapped for its full-width look-alike so the title
    stays readable. ASCII control characters (including newlines and tabs) are
    dropped because they cannot appear in a Windows file name.

    Args:
        filename: The raw title to turn into a file name (no extension).

    Returns:
        The sanitised name. Text without reserved characters is returned
        unchanged.
    """
    # Handle quote *pairs* first so the closing quote gets the closing glyph;
    # any unpaired quote left over is mapped to an opening curly quote below.
    filename = re.sub(r'"([^"]*)"', r'“\1”', filename)

    replacements = {
        '/': '／', '\\': '＼', '?': '？',
        '|': '｜', ':': '：',
        '"': '“', '*': '＊',
        '<': '＜', '>': '＞',
    }
    # Mapping a character to None deletes it during translate().
    replacements.update({chr(code): None for code in range(0x20)})
    return filename.translate(str.maketrans(replacements))

In [None]:
def params_str_to_dict(p: str):
    """Split a ``k1=v1&k2=v2`` parameter string into a dict.

    Only the first ``=`` of each item separates key from value, so values may
    themselves contain ``=``. Keys and values are returned exactly as written
    (no percent-decoding). When a key repeats, the last occurrence wins.

    Empty items (from an empty string, ``&&`` or a trailing ``&``) are skipped,
    and an item without ``=`` maps to an empty string instead of raising.

    Args:
        p: The parameter string, without a leading ``?``.

    Returns:
        A dict mapping each key to its raw value.
    """
    params = {}
    for item in p.split('&'):
        if not item:
            continue
        key, _, value = item.partition('=')
        params[key] = value
    return params

def yt_convert_param_n(n):
    """Transform the 16-character ``n`` query parameter into its 14-character form.

    The steps are: pick 14 of the 16 input characters in a fixed order,
    subtract a running key (modulo 64) over a 64-symbol alphabet, then reorder
    the result with a second fixed permutation.

    Args:
        n: The original parameter value; must be exactly 16 characters, all
            drawn from ``A-Z a-z 0-9 - _``.

    Returns:
        The transformed 14-character string.

    Raises:
        AssertionError: If ``n`` is not 16 characters long.
        ValueError: If a selected character is not in the alphabet.
    """
    assert len(n) == 16

    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
    first_order = (12, 3, 2, 1, 14, 0, 13, 4, 11, 10, 9, 7, 6, 5)
    second_order = (3, 9, 7, 0, 6, 5, 4, 13, 2, 1, 11, 10, 12, 8)

    picked = [n[pos] for pos in first_order]

    # The key starts as the alphabet positions of the seed and is extended
    # with every decoded value, so position i >= 6 is keyed by output i - 6.
    key = [alphabet.index(ch) for ch in "QoGvzh"]
    positions = [alphabet.index(ch) for ch in picked]

    decoded = []
    for step, position in enumerate(positions):
        value = (position - key[step]) % 64
        decoded.append(value)
        key.append(value)

    return "".join(alphabet[decoded[pos]] for pos in second_order)


def yt_convert_param_sig(sig):
    """Unscramble a percent-encoded signature string.

    The decoded characters go through a fixed sequence of steps: reverse,
    swap the first character with index 57, drop the first three, reverse,
    drop the first three again, swap the first character with index 20.

    Args:
        sig: The signature as it appears in the cipher string (still
            percent-encoded).

    Returns:
        The unscrambled signature, six characters shorter than the decoded
        input.

    Raises:
        IndexError: If the decoded input is too short for the swap positions.
    """
    chars = list(unquote(sig))

    def swap_with_head(seq, idx):
        # Exchange the first element with the one at ``idx`` in place.
        seq[0], seq[idx] = seq[idx], seq[0]

    chars.reverse()
    swap_with_head(chars, 57)
    chars = chars[3:]
    chars.reverse()
    chars = chars[3:]
    swap_with_head(chars, 20)

    return ''.join(chars)


def download_stream(stream, path):
    """Download one stream to ``path`` using consecutive ranged requests.

    The request URL comes either from ``stream['url']`` or, when that key is
    absent, from ``stream['signatureCipher']`` (in which case the unscrambled
    signature is added under the parameter name given by its ``sp`` field).
    If the URL carries an ``n`` parameter it is transformed before use.

    Args:
        stream: Stream description dict. Must contain ``contentLength`` and
            either ``url`` or ``signatureCipher``.
        path: Destination file; opened in binary write mode, so an existing
            file is overwritten.

    Raises:
        AssertionError: If the stream has neither ``url`` nor
            ``signatureCipher``.
        ValueError: If the stream URL has no query string.
        requests.HTTPError: If the server answers a block request with an
            error status.
    """
    if 'url' in stream:
        sig_params = None
        full_url = unquote(stream['url'].replace(r'\u0026', '&'))
    else:
        assert 'signatureCipher' in stream, "stream has neither 'url' nor 'signatureCipher'"
        sig_params = params_str_to_dict(stream['signatureCipher'].replace(r'\u0026', '&'))
        full_url = unquote(sig_params.pop('url'))

    # Split on the first '?' only, so a literal '?' inside a parameter value
    # does not break the unpacking.
    url, params_str = full_url.split('?', 1)
    params = params_str_to_dict(unquote(params_str))
    if sig_params is not None:
        params[sig_params['sp']] = yt_convert_param_sig(sig_params['s'])

    # Nothing to transform when the URL carries no 'n' parameter.
    if 'n' in params:
        params['n'] = yt_convert_param_n(params['n'])

    BLOCK_SIZE = 4 << 20  # 4 MiB per ranged request
    total_size = int(stream['contentLength'])

    with open(path, 'wb') as f, \
            tqdm(total=total_size, unit='B', unit_scale=True) as pbar:
        start = 0
        while start < total_size:
            end = min(start + BLOCK_SIZE, total_size)
            params['range'] = f"{start}-{end-1}"  # inclusive byte range
            # The context manager releases the streamed connection, and the
            # status check keeps an error page from being written to the file.
            with session.post(url, params=params, data=b'x\x00', stream=True) as r:
                r.raise_for_status()
                for chunk in r.iter_content(chunk_size=4096):
                    f.write(chunk)
                    pbar.update(len(chunk))
            start = end

In [None]:
# Fetch the watch page and extract the player response object embedded in it.
r = session.get(content_url)
r.raise_for_status()
content_html = r.text

m = re.search(r'var ytInitialPlayerResponse\s*=\s*{', content_html)
if m is None:
    raise RuntimeError('ytInitialPlayerResponse was not found in the page')
info_start_pos = m.end() - 1  # index of the opening '{'

# raw_decode parses exactly one JSON value and reports where it ended, so a
# '};' inside a string value can no longer truncate the object (searching for
# the first '};' could).
info, info_length = json.JSONDecoder().raw_decode(content_html[info_start_pos:])
info_last_pos = info_start_pos + info_length - 1  # index of the closing '}'

if save_filename is None:
    # NOTE: this used to pass the title through eval() inside a triple-quoted
    # literal. The JSON parser has already decoded every escape sequence, so
    # eval() was unnecessary, broke on titles containing quotes or
    # backslashes, and executed text supplied by a remote page.
    content_title = info['videoDetails']['title']
    save_filename = legalized_filename(content_title)
    print(save_filename)

In [None]:
# Split the adaptive formats into video-only and audio-only lists by MIME type.
streams = info['streamingData']['adaptiveFormats']
video_streams = list(filter(lambda st: 'video' in st['mimeType'], streams))
audio_streams = list(filter(lambda st: 'audio' in st['mimeType'], streams))

# Every stream must land in exactly one of the two lists.
assert len(streams) == len(video_streams) + len(audio_streams)

def video_cmp_key(v):
    """Build the ranking key for a video stream (larger is better).

    Streams are ranked by pixel count first, then frame rate, and finally by
    whether the MIME type mentions ``vp9``.

    Args:
        v: Stream dict with ``width``, ``height``, ``fps`` and ``mimeType``.

    Returns:
        A ``(pixel_count, fps, is_vp9)`` tuple.
    """
    pixel_count = v['width'] * v['height']
    frame_rate = v['fps']
    is_vp9 = 'vp9' in v['mimeType']
    return (pixel_count, frame_rate, is_vp9)

def audio_cmp_key(a):
    """Build the ranking key for an audio stream (larger is better).

    Streams are ranked by sample rate, then quality label, then whether the
    MIME type mentions ``webm``, and finally by bitrate.

    The sample rate is converted with ``int()`` so that the comparison is
    numeric whether the field arrives as a number or as a digit string
    (as strings, ``'8000'`` would sort above ``'48000'``).
    NOTE(review): the field appears to be a string in practice — confirm.

    Args:
        a: Stream dict with ``audioSampleRate``, ``audioQuality``,
            ``mimeType`` and ``bitrate``.

    Returns:
        A ``(sample_rate, quality_rank, is_webm, bitrate)`` tuple.
    """
    # LOW and MEDIUM keep their original ranks; the extra labels slot in around
    # them, and any unrecognised label ranks below all known ones instead of
    # raising KeyError.
    quality_rank = {
        'AUDIO_QUALITY_ULTRALOW': 0,
        'AUDIO_QUALITY_LOW': 1,
        'AUDIO_QUALITY_MEDIUM': 4,
        'AUDIO_QUALITY_HIGH': 5,
    }
    return (
        int(a['audioSampleRate']),
        quality_rank.get(a['audioQuality'], -1),
        'webm' in a['mimeType'],
        a['bitrate'],
    )

def video_desc(v):
    """Format a one-line, human-readable summary of a video stream.

    Args:
        v: Stream dict with ``qualityLabel``, ``width``, ``height``, ``fps``
            and ``mimeType``.

    Returns:
        Text of the form ``<label> <width>x<height>@<fps>fps  <mime type>``.
    """
    label = v['qualityLabel']
    width, height = v['width'], v['height']
    fps = v['fps']
    mime = v['mimeType']
    return f"{label} {width}x{height}@{fps}fps  {mime}"

def audio_desc(a):
    """Format a one-line, human-readable summary of an audio stream.

    Args:
        a: Stream dict with ``audioSampleRate``, ``bitrate`` and ``mimeType``.

    Returns:
        Text of the form ``<rate>Hz <bitrate/1000, one decimal>Kbps  <mime type>``.
    """
    sample_rate = a['audioSampleRate']
    kbps = a['bitrate'] / 1e3
    mime = a['mimeType']
    return f"{sample_rate}Hz {kbps:.1f}Kbps  {mime}"

# Pick the highest-ranked stream of each kind. max() returns the first entry
# among equally ranked candidates.
video = max(video_streams, key=video_cmp_key)
audio = max(audio_streams, key=audio_cmp_key)
# Uncomment to list every candidate stream before the selection is printed:
# for v in video_streams:
#     print('video:', video_desc(v))
# for a in audio_streams:
#     print('audio:', audio_desc(a))
# Show which streams were chosen.
print('select video:', video_desc(video))
print('select audio:', audio_desc(audio))

In [None]:
# Create the target directory up front; opening the output files would
# otherwise fail with FileNotFoundError when it does not exist yet.
os.makedirs(save_dir, exist_ok=True)

# The two streams are stored side by side as intermediates for the merge step.
video_path = os.path.join(save_dir, save_filename + '-video.webm')
audio_path = os.path.join(save_dir, save_filename + '-audio.webm')
download_stream(video, video_path)
download_stream(audio, audio_path)

In [None]:
# Mux the downloaded video and audio into a single file without re-encoding.
output_path = os.path.join(save_dir, save_filename + '.webm')
ret = subprocess.run([
    'ffmpeg',
    '-i', video_path,
    '-i', audio_path,
    '-c', 'copy',   # copy both streams as they are
    '-y',           # overwrite an existing output file
    output_path
])

# Raise explicitly instead of using assert: assertions are stripped when Python
# runs with -O, which would let the intermediates be deleted after a failed
# merge.
if ret.returncode != 0:
    raise RuntimeError(
        f"ffmpeg exited with code {ret.returncode}; "
        f"intermediate files were kept: {video_path}, {audio_path}"
    )

# The intermediates are only removed once the merged file has been written.
os.remove(video_path)
os.remove(audio_path)