In [37]:
### Goal: Scraping function names from Python Documentation, using Regex to refine the results
### Pass the refined datapoints to a DataFrame and save to a CSV file
import urllib.request
from bs4 import BeautifulSoup as bs
import re
import pandas as pd

### Extracting HTML from Python Documentation

page = urllib.request.urlopen("https://docs.python.org/3/library/random.html")
soup = bs(page)
print (soup)

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/><meta content="Docutils 0.17.1: http://docutils.sourceforge.net/" name="generator"/>
<meta content="random — Generate pseudo-random numbers" property="og:title"/>
<meta content="website" property="og:type"/>
<meta content="https://docs.python.org/3/library/random.html" property="og:url"/>
<meta content="Python documentation" property="og:site_name"/>
<meta content="Source code: Lib/random.py This module implements pseudo-random number generators for various distributions. For integers, there is uniform selection from a range. For sequences, there is uniform s..." property="og:description"/>
<meta content="https://docs.python.org/3/_static/og-image.png" property="og:image"/>
<meta content="Python documentation" property="og:image:alt"/>
<meta content="Source code: Lib/random.py This module implements pseudo-random number generators for various distributions. For i

In [35]:
#This extracts the entire HTML source code for the requested website.
#But we want function names. So we inspect source code and find that functions are wriiten inside <dt></dt>

names = soup.body.findAll('dt')

#Using findall to extract patterns
function_names = re.findall('id="random.\w+', str(names))

#Cleaning up the extra double quote
function_names=[item.replace('"','') for item in function_names]

function_names=[item[3:] for item in function_names]

print (function_names)

#Extracting Function descriptions using <dd>
description=soup.body.findAll('dd')
function_usage=[]

for item in description:
    item=item.text
    item=item.replace('\n','') #Remove the \n line break
    function_usage.append(item)

print (function_names)
print (function_usage)

#Verifying that lists are both of the same length

print(len(function_names))
print(len(function_usage))

['random.seed', 'random.getstate', 'random.setstate', 'random.randbytes', 'random.randrange', 'random.randint', 'random.getrandbits', 'random.choice', 'random.choices', 'random.shuffle', 'random.sample', 'random.random', 'random.uniform', 'random.triangular', 'random.betavariate', 'random.expovariate', 'random.gammavariate', 'random.gauss', 'random.lognormvariate', 'random.normalvariate', 'random.vonmisesvariate', 'random.paretovariate', 'random.weibullvariate', 'random.Random', 'random.SystemRandom']
['random.seed', 'random.getstate', 'random.setstate', 'random.randbytes', 'random.randrange', 'random.randint', 'random.getrandbits', 'random.choice', 'random.choices', 'random.shuffle', 'random.sample', 'random.random', 'random.uniform', 'random.triangular', 'random.betavariate', 'random.expovariate', 'random.gammavariate', 'random.gauss', 'random.lognormvariate', 'random.normalvariate', 'random.vonmisesvariate', 'random.paretovariate', 'random.weibullvariate', 'random.Random', 'random.S

In [36]:
#Creating the dataframe

data=pd.DataFrame({'function name': function_names, 'function use':function_usage})
data


Unnamed: 0,function name,function use
0,random.seed,Initialize the random number generator.If a is...
1,random.getstate,Return an object capturing the current interna...
2,random.setstate,state should have been obtained from a previou...
3,random.randbytes,Generate n random bytes.This method should not...
4,random.randrange,Return a randomly selected element from range(...
5,random.randint,Return a random integer N such that a <= N <= ...
6,random.getrandbits,Returns a non-negative Python integer with k r...
7,random.choice,Return a random element from the non-empty seq...
8,random.choices,Return a k sized list of elements chosen from ...
9,random.shuffle,Shuffle the sequence x in place.To shuffle an ...


In [39]:
data.to_csv('pythondoc.csv')