Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

YouTube sign algo auto update based on information provided by @patrickslin. #1208

Closed
sulge opened this issue Aug 7, 2013 · 9 comments
Closed

Comments

@sulge
Copy link

@sulge sulge commented Aug 7, 2013

# -*- coding: utf-8 -*-
import urllib2, re

def printDBG(s):
    """Emit the debug message *s* on standard output."""
    debugMessage = s
    print(debugMessage)

class CVevoSignAlgoExtractor:
    """Extract and run the signature algorithm found in a player script.

    For a given player URL the script is downloaded, the function used in
    the ``signature=...`` assignment is located together with the local
    functions it calls, the JavaScript is translated to Python source by
    simple textual rewrites, and the result is compiled and executed on the
    encrypted signature.  Successfully used code objects are cached per
    player URL in ``self.algoCache``.
    """

    # Maximum recursion depth while following nested local function calls.
    MAX_REC_DEPTH = 5

    # (compiled pattern, replacement template) pairs applied, in this order,
    # to every line of the translated function body.
    _JS_TO_PY_RULES = (
        # a.split("") -> list(a)
        (re.compile(r'(\w+)\.split\(""\)'), r'list(\1)'),
        # a.length -> len(a)
        (re.compile(r'(\w+)\.length'), r'len(\1)'),
        # a.slice(3) -> a[3:]   and   a.slice(b) -> a[b:]
        (re.compile(r'(\w+)\.slice\((\w+)\)'), r'\1[\2:]'),
        # a.join("") -> "".join(a)
        (re.compile(r'(\w+)\.join\(("[^"]*")\)'), r'\2.join(\1)'),
    )

    def __init__(self):
        # Maps playerUrl -> compiled code object of the extracted algorithm.
        self.algoCache = {}
        self._cleanTmpVariables()

    def _cleanTmpVariables(self):
        """Reset the per-call scratch state."""
        self.fullAlgoCode = ''
        self.allLocalFunNamesTab = []
        self.playerData = ''

    def _jsToPy(self, jsFunBody):
        """Translate one JavaScript function body into Python source text.

        The translation is purely textual: keywords and punctuation are
        replaced first, then the rewrite rules in ``_JS_TO_PY_RULES`` are
        applied to each resulting line.  Every occurrence on a line is
        rewritten, not just the first one.

        Returns the Python source as a single string (tab indented).
        """
        pythonFunBody = jsFunBody.replace('function', 'def')
        pythonFunBody = pythonFunBody.replace('{', ':\n\t')
        pythonFunBody = pythonFunBody.replace('}', '')
        pythonFunBody = pythonFunBody.replace(';', '\n\t')
        pythonFunBody = pythonFunBody.replace('var ', '')
        pythonFunBody = pythonFunBody.replace('.reverse()', '[::-1]')

        convertedLines = []
        for line in pythonFunBody.split('\n'):
            for pattern, replacement in self._JS_TO_PY_RULES:
                line = pattern.sub(replacement, line)
            convertedLines.append(line)
        return "\n".join(convertedLines)

    def _getLocalFunBody(self, funName):
        """Return the JavaScript source of function *funName*, or ''.

        The function is searched in ``self.playerData``.  Only functions
        with at least one parameter and a body without nested braces are
        matched.
        """
        # re.escape keeps characters of the name from being read as regex syntax.
        pattern = r'(function %s\([^)]+?\)\{[^}]+?\})' % re.escape(funName)
        match = re.search(pattern, self.playerData)
        if match:
            return match.group(1)
        return ''

    def _getAllLocalSubFunNames(self, mainFunBody):
        """Return the set of function names called inside *mainFunBody*.

        Method calls (``a.join(...)``) are not matched because the name must
        be preceded by a space, ``=``, ``(`` or ``,``.
        """
        calledNames = re.findall(r'[ =(,](\w+?)\([^)]*?\)', mainFunBody)
        # first item is name of main function, so omit it
        return set(calledNames[1:])

    def _downloadPlayerData(self, playerUrl):
        """Download and decode the player script; return None on failure."""
        # Local imports so the class works with both Python 2 and Python 3.
        from contextlib import closing
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib2 import urlopen  # Python 2

        try:
            # closing() guarantees the connection is released on both versions.
            with closing(urlopen(playerUrl)) as response:
                rawData = response.read()
            return rawData.decode('utf-8', 'ignore')
        except Exception:
            printDBG('Unable to download playerUrl webpage')
            return None

    def _wrapAlgoCode(self, mainFunName):
        """Wrap the collected functions into ``extractedSignatureAlgo()``.

        Every line of ``self.fullAlgoCode`` is indented by one tab so the
        translated functions become local functions of the wrapper, which
        returns ``mainFunName(param)``.  The generated module-level statement
        reads ``inSignature`` and stores the result in ``outSignature``.
        """
        indentedLines = ['\t' + line for line in self.fullAlgoCode.split('\n')]
        return ('def extractedSignatureAlgo(param):'
                + '\n'.join(indentedLines)
                + '\n\treturn %s(param)' % mainFunName
                + '\noutSignature = extractedSignatureAlgo( inSignature )\n')

    def _prepareAlgoCodeObj(self, playerUrl):
        """Build and compile the algorithm for *playerUrl*.

        Returns the compiled code object, or None when the script cannot be
        downloaded, the main function name cannot be found, or the generated
        source does not compile.
        """
        playerData = self._downloadPlayerData(playerUrl)
        if playerData is None:
            return None
        self.playerData = playerData

        # get main function name
        # NOTE(review): `[^)]` accepts exactly one character between the
        # parentheses; confirm that a single-character argument is intended.
        match = re.search(r"signature=(\w+?)\([^)]\)", self.playerData)
        if not match:
            printDBG('Can not get main signature function name')
            return None
        mainFunName = match.group(1)
        printDBG('Main signature function name = "%s"' % mainFunName)

        self._getfullAlgoCode(mainFunName)
        self.fullAlgoCode = self._wrapAlgoCode(mainFunName)

        printDBG("---------------------------------------")
        printDBG("|    ALGO FOR SIGNATURE DECRYPTION    |")
        printDBG("---------------------------------------")
        printDBG(self.fullAlgoCode)
        printDBG("---------------------------------------")

        try:
            return compile(self.fullAlgoCode, '', 'exec')
        except Exception:
            printDBG('decryptSignature compile algo code EXCEPTION')
            return None

    def decryptSignature(self, s, playerUrl):
        """Decrypt signature *s* with the algorithm of the given player.

        s         -- encrypted signature string
        playerUrl -- URL of the player script containing the algorithm

        Returns the decrypted signature, or '' when any step fails.
        """
        printDBG("decrypt_signature sign_len[%d] playerUrl[%s]" % (len(s), playerUrl))

        # clear local data
        self._cleanTmpVariables()
        try:
            if playerUrl in self.algoCache:
                printDBG('Algo taken from cache')
                algoCodeObj = self.algoCache[playerUrl]
            else:
                algoCodeObj = self._prepareAlgoCodeObj(playerUrl)
                if algoCodeObj is None:
                    return ''

            # SECURITY: the code executed below is generated from a downloaded
            # script.  Emptying __builtins__ only limits the names the code can
            # see; it is not a real sandbox.  Use with trusted player URLs only.
            vGlobals = {"__builtins__": None, 'len': len, 'list': list}

            # local variables to pass encrypted sign and get decrypted sign
            vLocals = {'inSignature': s, 'outSignature': ''}

            try:
                exec(algoCodeObj, vGlobals, vLocals)
            except Exception:
                printDBG('decryptSignature exec code EXCEPTION')
                return ''

            printDBG('Decrypted signature = [%s]' % vLocals['outSignature'])
            # if algo seems ok and not in cache, add it to cache
            if playerUrl not in self.algoCache and '' != vLocals['outSignature']:
                printDBG('Algo from player [%s] added to cache' % playerUrl)
                self.algoCache[playerUrl] = algoCodeObj

            return vLocals['outSignature']
        finally:
            # free not needed data, also on the early-return paths
            self._cleanTmpVariables()

    # Note, this method is using a recursion
    def _getfullAlgoCode( self, mainFunName, recDepth = 0 ):
        """Append *mainFunName* and the local functions it calls to fullAlgoCode.

        Called functions are processed first (recursively, at most
        MAX_REC_DEPTH levels deep), then the translated body of
        *mainFunName* itself is appended to ``self.fullAlgoCode``.
        """
        if self.MAX_REC_DEPTH <= recDepth:
            printDBG('_getfullAlgoCode: Maximum recursion depth exceeded')
            return

        funBody = self._getLocalFunBody(mainFunName)
        if not funBody:
            return

        for funName in self._getAllLocalSubFunNames(funBody):
            if funName not in self.allLocalFunNamesTab:
                self.allLocalFunNamesTab.append(funName)
                printDBG("Add local function %s to known functions" % funName)
                self._getfullAlgoCode(funName, recDepth + 1)

        # convert code from javascript to python
        self.fullAlgoCode += '\n' + self._jsToPy(funBody) + '\n'

EXAMPLE OF USE

# Two sample encrypted signatures used by the calls below.
sig = 'E911918ECB1C18AED66D42DD2B9D0FB1AEF8999E812.FE13EF2964BE256F8C6E3F6103F1DBE70C1FB5A8A66'
sig2 = 'FE13EF2964BE256F8C6E3F6103F1DBE70C1FB5A8A66.E911918ECB1C18AED66D42DD2B9D0FB1AEF8999E812'

# One extractor instance serves all calls, so its algoCache is shared by them.
Obj = CVevoSignAlgoExtractor()
for _signature, _playerUrl in (
    (sig, 'http://s.ytimg.com/yts/jsbin/html5player-vflM2EmfJ.js'),
    (sig2, 'http://s.ytimg.com/yts/jsbin/html5player-vflM2EmfJ.js'),
    (sig, 'http://s.ytimg.com/yts/jsbin/html5player-vflRFcHMl.js'),
):
    Obj.decryptSignature(_signature, _playerUrl)
@firmanelhakim
Copy link

@firmanelhakim firmanelhakim commented Aug 9, 2013

Thank you! Now it's fully automatic 👍

@phihag
Copy link
Contributor

@phihag phihag commented Aug 11, 2013

@sulge Why have you closed this issue and deleted your code? I'd like to integrate it into mainline youtube-dl. Can you elaborate? Or has your account been hacked? In that case, I'll just restore the messages.

@phihag phihag reopened this Aug 11, 2013
@sulge
Copy link
Author

@sulge sulge commented Aug 12, 2013

I thought you're not interested. Of course, you can use this code.

@andrewhilson
Copy link

@andrewhilson andrewhilson commented Aug 30, 2013

@phihag any chance this can be reviewed and possibly implemented?

@phihag
Copy link
Contributor

@phihag phihag commented Aug 30, 2013

@andrewhilson In this form, the code is not that useful (it contains lots of prints, is unrelated to youtube-dl's actual inner workings, is stylistically not nice, doesn't run on Python 3, is not in the proper form of a pull request, doesn't do proper error handling, poses a potential security risk, unnecessarily constructs Python source code and contains unhelpful comments). I'm currently writing up a similar approach.

@zincli
Copy link

@zincli zincli commented Sep 2, 2013

Hey @sulge, how can I find the html5player-XX.js URL programmatically? Some videos do not even have an HTML5 version of the page.

@AWainb
Copy link

@AWainb AWainb commented Sep 5, 2013

@zincli, I came across this post while I was searching for information/solution to the "s=" issue for a different project. Although I have not yet had a chance to fully test the results when compiling the full video url, I was able to find the link to the html5player...js file from within the source code of the video's "watch" link (e.g. http://www.youtube.com/watch?v=VIDEO_ID). There appears to be only one .js file in each video page which follows the pattern of the urls that are used within the "Example of Use" section above. Like you, I had to guess at it, and as I have not yet tested the results of the fully compiled video url, I'd say give the below code a run to at least get the js file and see if it works with your urls.

One awkward thing I noticed while testing this is that, so far, all my urls return the exact same js file (http://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js) each time. I don't believe this is a problem, but it's at least worth mentioning.

import re, urllib2

#def js_by_url(url):
#     cache = urllib2.urlopen(url)

def js_by_id(video_id):
    """Return the player .js URL referenced by a video's watch page.

    video_id -- the video identifier placed after ``watch?v=``

    The watch page is downloaded and searched for the first ``"js": "..."``
    entry.  Returns the URL with backslash escapes removed, or None when no
    such entry is found.
    """
    # Local imports so the function runs on both Python 2 and Python 3.
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib2 import urlopen  # Python 2

    # Replace with your preferred connection method, only need the page source #
    # closing() releases the connection even when read() raises.
    with closing(urlopen("http://www.youtube.com/watch?v=%s" % (video_id))) as cache:
        source = cache.read()
    # On Python 3 the body is bytes; decode it so the str pattern below applies.
    if not isinstance(source, str):
        source = source.decode('utf-8', 'ignore')
    # Simple pattern match to parse the *.js link from the video page #
    match = re.search('"js": "(.*?)"', source)
    # Return result if match is found. If not, return None #
    if match:
        # Remove escapes (\) from parsed url #
        return match.group(1).replace("\\", "")
    return None

# Example call: print the player .js URL found for a sample video id.
print(js_by_id("PmTz7oYrKTs"))
@E-FL
Copy link

@E-FL E-FL commented Sep 11, 2013

Hi guys,
I had the same issue with the signature and solved it my own way (..but similar solution)
Looks like the javascript is replaced from time to time, do you know how long this file is kept 'fresh'?

@plfort plfort mentioned this issue Sep 21, 2013
0 of 4 tasks complete
@jaimeMF
Copy link
Collaborator

@jaimeMF jaimeMF commented Oct 6, 2013

This has been solved with #1481, thanks for your contributions.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Linked pull requests

Successfully merging a pull request may close this issue.

8 participants
You can’t perform that action at this time.