Skip to content

Commit

Permalink
[sites:youtube] Updated extraction regex to allow for }; in data (along with corresponding tests)
Browse files Browse the repository at this point in the history
  • Loading branch information
xenova committed Jul 4, 2021
1 parent a854e80 commit 69e4ecb
Showing 1 changed file with 34 additions and 2 deletions.
36 changes: 34 additions & 2 deletions chat_downloader/sites/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,11 +260,43 @@ def __init__(self, **kwargs):
'expected_result': {
'error': VideoUnplayable,
}
},

# Potential parsing errors
{
'name': "Parsing error with '};' inside yt initial data (1)",
'params': {
'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
},
'expected_result': {
'error': NoChatReplay,
}
},
{
'name': "Parsing error with '};' inside yt initial data (2)",
'params': {
'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
},
'expected_result': {
'error': NoChatReplay,
}
},
{
'name': 'Title with JS-like syntax "};"',
'params': {
'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
},
'expected_result': {
'error': NoChatReplay,
}
}
]

# Trailing context that must follow the `;` which terminates an embedded JSON
# assignment: optional whitespace, then `var meta`, a closing `</script` tag,
# or a newline.
_YT_INITIAL_BOUNDARY_RE = r'\s*(?:var\s+meta|</script|\n)'

# The JSON object is captured lazily (`{.+?}`), so a bare `\s*;` terminator
# would end the capture at the first `};` that occurs *inside* the data (for
# example in a video title). Appending the boundary forces the lazy match to
# keep extending until the `;` is followed by the expected trailing context.
_YT_INITIAL_DATA_RE = (
    r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
    + _YT_INITIAL_BOUNDARY_RE
)
_YT_INITIAL_PLAYER_RESPONSE_RE = (
    r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
    + _YT_INITIAL_BOUNDARY_RE
)

# Captures the object literal passed to `ytcfg.set(...)`.
_YT_CFG_RE = r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;'

_YT_HOME = 'https://www.youtube.com'
Expand Down

0 comments on commit 69e4ecb

Please sign in to comment.