# Docstring Extractor

I need to build a function that, when given a Python file,
returns all pairs of docstrings and the functions they came from.

In [191]:
import ast
import re
from glob import glob

from IPython.display import HTML

In [16]:
# Glob pattern matching every Python source file in the dataset directory.
PY_FILES_PATTERN = 'D:/data/documentation-generator/py_files/*.py'

py_files = glob(PY_FILES_PATTERN)
print(f'Python Files: {len(py_files)}')

Python Files: 12169


In [17]:
py_file = 'D:/data/documentation-generator/py_files/Crypto.py'

# Read the whole source file into memory as text.
# - The `with` block closes the handle on exit, so no explicit close() is needed.
# - Python source defaults to UTF-8, so decode it explicitly instead of relying
#   on the platform's locale encoding (e.g. cp1252 on Windows).
with open(py_file, 'r', encoding='utf-8') as file:
    data = file.read()

print(data)

"""
Django's standard crypto functions and utilities.
"""
import hashlib
import hmac
import random
import time

from django.conf import settings
from django.utils.encoding import force_bytes

# Use the system PRNG if possible
try:
    random = random.SystemRandom()
    using_sysrandom = True
except NotImplementedError:
                  'on your system. Falling back to Mersenne Twister.')
    using_sysrandom = False


def salted_hmac(key_salt, value, secret=None):
    """
    Return the HMAC-SHA1 of 'value', using a key generated from key_salt and a
    secret (which defaults to settings.SECRET_KEY).

    A different key_salt should be passed in for every application of HMAC.
    """
    if secret is None:
        secret = settings.SECRET_KEY

    key_salt = force_bytes(key_salt)
    secret = force_bytes(secret)

    # We need to generate a derived key from our base key.  We can do this by
    # passing the key_salt and our base key through a pseudo-random function and
    # SHA1 works

We use `ast` to parse the Python file. This is handy for extracting the function names, as well as the docstrings.

Once we have the function names, we can use regex-like string handling to extract each function definition from the source text.

In [171]:
# Parse the source text held in `data` into a module-level syntax tree.
parsed_data = ast.parse(source=data)

In [172]:
# Keep only the function definitions that sit directly in the module body
# (statements nested inside classes or other functions are not in `.body`).
function_definitions = list(
    filter(lambda node: isinstance(node, ast.FunctionDef), parsed_data.body)
)

In [178]:
# Parallel lists: index i in both refers to function_definitions[i].
function_names = [definition.name for definition in function_definitions]
# ast.get_docstring returns None for a function that has no docstring.
function_docstrings = list(map(ast.get_docstring, function_definitions))

0 salted_hmac Return the HMAC-SHA1 of 'value', using a key generated from key_salt and a
secret (which defaults to settings.SECRET_KEY).

A different key_salt should be passed in for every application of HMAC.
1 get_random_string Return a securely generated random string.

The default length of 12 with the a-z, A-Z, 0-9 character set returns
a 71-bit value. log_2((26+26+10)^12) =~ 71 bits
2 constant_time_compare Return True if the two strings are equal, False otherwise.
3 pbkdf2 Return the hash of password using pbkdf2.


In [177]:
def function_extractor(pyfile: str, function_name: str):
    """
    Given a function name, return that function from pyfile in the form of a string.
    Note: This removes the docstring.

    The function is located from the syntax tree (``ast``) rather than by
    searching for the text ``def <name>`` and the first ``return``. As a result:

    * the name must match exactly (``get_random`` no longer matches
      ``get_random_string``);
    * the whole body is returned, including code after an early ``return`` and
      bodies that contain no ``return`` at all;
    * functions without a docstring are returned unchanged;
    * any docstring literal is removed (``'''``, ``\"\"\"``, prefixed or
      single-line), leaving the surrounding whitespace in place.

    If several functions share the name (e.g. methods of different classes),
    the one that appears first in the file is returned. Decorator lines are not
    included. Requires Python 3.8+ (uses the ``end_lineno`` node attributes).

    :param pyfile: source text of the python file to search for function_name in.
    :param function_name: exact name of the function you are searching for.
    :returns: python function as a string, starting at its ``def`` (or
        ``async def``) keyword and ending at the end of its last line, without
        the trailing newline. Returns '' when pyfile cannot be parsed or does
        not define function_name.
    """
    try:
        tree = ast.parse(pyfile)
    except (SyntaxError, ValueError):
        # Not valid Python for this interpreter (or contains null bytes).
        return ''

    matches = [
        node for node in ast.walk(tree)
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
        and node.name == function_name
    ]
    if not matches:
        return ''
    # ast.walk is breadth-first; pick the textually first definition explicitly.
    node = min(matches, key=lambda n: (n.lineno, n.col_offset))

    # Character index at which each (1-based) source line begins. Only the line
    # terminators recognised by the Python parser are used, so the indices line
    # up with the line numbers stored on the ast nodes.
    line_starts = [0] + [m.end() for m in re.finditer(r'\r\n|\r|\n', pyfile)]

    def char_index(lineno, col_offset):
        """Convert an ast (line, UTF-8 byte column) position to an index into pyfile."""
        line_start = line_starts[lineno - 1]
        line_end = line_starts[lineno] if lineno < len(line_starts) else len(pyfile)
        prefix = pyfile[line_start:line_end].encode('utf-8')[:col_offset]
        return line_start + len(prefix.decode('utf-8', errors='ignore'))

    start = char_index(node.lineno, node.col_offset)
    # Run to the end of the function's last physical line.
    end = line_starts[node.end_lineno] if node.end_lineno < len(line_starts) else len(pyfile)

    first_statement = node.body[0]
    has_docstring = (
        isinstance(first_statement, ast.Expr)
        and isinstance(first_statement.value, ast.Constant)
        and isinstance(first_statement.value.value, str)
    )
    if has_docstring:
        doc_start = char_index(first_statement.lineno, first_statement.col_offset)
        doc_end = char_index(first_statement.end_lineno, first_statement.end_col_offset)
        function = pyfile[start:doc_start] + pyfile[doc_end:end]
    else:
        function = pyfile[start:end]

    # Drop only the line terminator that follows the function's last line.
    return function.rstrip('\r\n')

In [169]:
get_random_string = function_extractor(data, 'get_random_string')
print(get_random_string)

def get_random_string(length=12,
                      allowed_chars='abcdefghijklmnopqrstuvwxyz'
                                    'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'):
    
    if not using_sysrandom:
        # This is ugly, and a hack, but it makes things better than
        # the alternative of predictability. This re-seeds the PRNG
        # using a value that is hard for an attacker to predict, every
        # time a random string is required. This may change the
        # properties of the chosen random sequence slightly, but this
        # is better than absolute predictability.
        random.seed(
            hashlib.sha256(
                ('%s%s%s' % (random.getstate(), time.time(), settings.SECRET_KEY)).encode()
            ).digest()
        )
    return ''.join(random.choice(allowed_chars) for i in range(length))


In [194]:
# Build [function source without docstring, docstring] pairs, one per top-level
# function. The docstring entry is None for functions that have no docstring.
function_dataset = [
    [function_extractor(data, fname), docstring]
    for fname, docstring in zip(function_names, function_docstrings)
]

# Show the first pair as a sanity check; guard against files that define no
# top-level functions, where indexing [0] would raise IndexError.
if function_dataset:
    example_function, example_docstring = function_dataset[0]
    print('EXAMPLE\n')
    print(f'Input Function:\n{example_function}\n')
    print(f'Target Docstring:\n{example_docstring}')
else:
    print('No top-level functions found.')

EXAMPLE

Input Function:
def salted_hmac(key_salt, value, secret=None):
    
    if secret is None:
        secret = settings.SECRET_KEY

    key_salt = force_bytes(key_salt)
    secret = force_bytes(secret)

    # We need to generate a derived key from our base key.  We can do this by
    # passing the key_salt and our base key through a pseudo-random function and
    # SHA1 works nicely.
    key = hashlib.sha1(key_salt + secret).digest()

    # If len(key_salt + secret) > sha_constructor().block_size, the above
    # line is redundant and could be replaced by key = key_salt + secret, since
    # the hmac module does the same thing for keys longer than the block size.
    # However, we need to ensure that we *always* do this.
    return hmac.new(key, msg=force_bytes(value), digestmod=hashlib.sha1)

Target Docstring:
Return the HMAC-SHA1 of 'value', using a key generated from key_salt and a
secret (which defaults to settings.SECRET_KEY).

A different key_salt should be passed in for ev

Handling for classes: collect the methods defined in each top-level class.

In [98]:
# Top-level class definitions in the module.
class_definitions = [node for node in parsed_data.body if isinstance(node, ast.ClassDef)]

# One list of method nodes per class, in the same order as class_definitions.
method_definitions = []

for class_def in class_definitions:
    # An ast.ClassDef node is not iterable; its statements live in `.body`.
    method_definitions.append(
        [node for node in class_def.body if isinstance(node, ast.FunctionDef)]
    )