In [13]:
import requests  # Import requests to fetch web pages
from bs4 import BeautifulSoup as bs  # Import BeautifulSoup for parsing HTML
import re  # Import regex for pattern matching
import pandas as pd  # Import pandas for data handling

# Extract and Store Function Names & Usage
<br>
From the Python Documentation - Random Functions webpage: https://docs.python.org/3/library/random.html


In [18]:
# Load HTML code from a URL
page = requests.get("https://docs.python.org/3/library/random.html")  # Fetch webpage content
soup = bs(page.text, "html.parser")  # Parse the HTML content correctly

# Find all function names in the HTML page
names = soup.body.find_all('dt')  # Locate all <dt> tags, which contain function names
function_names = re.findall(r'id="random.\w+', str(names))  # Extract function names using regex
function_names = [item[4:] for item in function_names]  # Remove 'id=' prefix from names

# Find all function descriptions in the HTML page
description = soup.body.find_all('dd')  # Locate all <dd> tags, which contain descriptions
function_usage = []  # Initialize an empty list to store function descriptions

for item in description:
    item = item.text.strip()  # Extract text content from the tag and remove extra spaces
    item = item.replace('\n', ' ')  # Remove newline characters for cleaner text
    function_usage.append(item)  # Append cleaned description to the list
    
min_length = min(len(function_names), len(function_usage))
function_names, function_usage = function_names[:min_length], function_usage[:min_length]

# Print sample results
print('List of function names:', function_names[:5])  # Display first 5 function names
print('\nFunction description:', function_usage[0] if function_usage else "No description found")  # Display first function description safely
print('\nNumber of function names:', len(function_names))  # Count function names
print('Number of function descriptions:', len(function_usage))  # Count descriptions


# Ensure both lists have the same length




List of function names: ['random.seed', 'random.getstate', 'random.setstate', 'random.randbytes', 'random.randrange']

Function description: Initialize the random number generator. If a is omitted or None, the current system time is used.  If randomness sources are provided by the operating system, they are used instead of the system time (see the os.urandom() function for details on availability). If a is an int, it is used directly. With version 2 (the default), a str, bytes, or bytearray object gets converted to an int and all of its bits are used. With version 1 (provided for reproducing random sequences from older versions of Python), the algorithm for str and bytes generates a narrower range of seeds.  Changed in version 3.2: Moved to the version 2 scheme which uses all of the bits in a string seed.   Changed in version 3.11: The seed must be one of the following types: None, int, float, str, bytes, or bytearray.

Number of function names: 31
Number of function descriptions: 31


# Store Data inside a DataFrame
<br>
After ensuring the lenghts of both lists match

In [21]:
#create a dataframe
data = pd.DataFrame({'function name': function_names, 'function usage': function_usage})
data.index = range(1, len(data) + 1)  # Start index from 1 instead of 0
data.head() # Display the first few rows of the dataframe

Unnamed: 0,function name,function usage
1,random.seed,Initialize the random number generator. If a i...
2,random.getstate,Return an object capturing the current interna...
3,random.setstate,state should have been obtained from a previou...
4,random.randbytes,Generate n random bytes. This method should no...
5,random.randrange,Return a randomly selected element from range(...


In [20]:
#Target specific attributes
example = soup.body.find_all('div', attrs={'id': 'bookkeeping-functions'})  # Locate the <div> tag with id 'bookkeeping-functions'

# Export Data into a csv file
<br>
The file will be saved in the same directory as the notebook

In [22]:
# Save data as a CSV file
data.to_csv('my_file.csv')