In [26]:
# Forget why I need this. Folder creation?
import os 

 # Used to parse webpages
import requests               as req
from bs4 import BeautifulSoup as bs4
from bs4 import element       as bel
import re

# Used for asynchronous webpage fetching
import concurrent.futures as cof 
from requests_futures.sessions import FuturesSession

# Used for a good-looking display/diagnostic system 
from IPython import display as dis
import ipywidgets as ipw

# Used for delays and tracking execution time
import time

# Used for some very minor string processing
from string import printable

# Used to send connection over tor
# Not needed right now
# import stem





In [2]:
# Loads the pycodestyle_magic IPython extension.
# NOTE(review): the recorded output of this cell is a ModuleNotFoundError, so the
# package was not installed in the environment that produced this run.
%load_ext pycodestyle_magic

ModuleNotFoundError: No module named 'pycodestyle_magic'

## Method Definitions

In [2]:
class regexes:
    """
    Namespace holding the pre-compiled patterns used by the page parsers.

    Attributes:
        tagloc: Matches any text that contains the word "Replacing".
        matguid: Matches "matguid=" followed by exactly 32 lowercase
        hexadecimal digits; group 1 is the hexadecimal string.
    """
    # Compiled once at class creation so every caller shares the same objects.
    tagloc = re.compile(r'.*Replacing.*')
    matguid = re.compile(r'matguid=([0-9a-f]{32})')


class button_grid:
    """
    An object that draws a grid of un-clickable buttons, which can be set to
    different colors. It is used to display the status of a batch of pages
    in the download queue as they update and complete. NOTE: there is no
    separate function to draw the grid. It draws itself on instantiation
    within the currently selected cell.

    Args:
        iterator (iterator): Used to create the buttons, and give them
        appropriate labels. These same labels are used as keys to update
        the buttons, so they must be hashable.

    Attributes:
        buttonlist (dict): Maps each item of the iterator to the button
        widget that was created for it.

    """
    def __init__(self, iterator):

        self.buttonlist = dict()
        for i in iterator:
            # No `value` argument: Button has no such trait, and traitlets
            # reports unrecognized constructor arguments as deprecated.
            self.buttonlist[i] = ipw.Button(
                description=str(i),
                layout=ipw.Layout(width='42px', height='25px'),
                disabled=True)
        container = ipw.Box(tuple(self.buttonlist.values()))
        # Let the buttons wrap onto as many rows as the cell width requires.
        container.layout.flex_flow = "row wrap"
        dis.display(container)

    def setval(self, index, value):
        """
        A method required by all display systems. Responsible for
        setting the item at index to value. In this case, index
        comes from the iterator, and value is any css specified
        color keyword.

        Args:
            index (int): Technically any hashable, but only integers
            are used in this notebook. The key of the button in the
            grid whose color should change.

            value (string): The css specified color to set the
            selected button to.

        Raises:
            KeyError: If index was not produced by the iterator the
            grid was built from.
        """
        self.buttonlist[index].style.button_color = value



In [3]:
# The following 2 methods are legacy code.
# They both are replaced by the pageprocess method below.

# ----Extracts the list of materials and ids from the webpage.----
def MatWebTableFetch(thispage, pagenum, display=None):
    """
    Extract the list items that hold the material links from one page of
    the matweb.com index.

    Args:
        thispage (requests.Response): A response object containing
        one of the index pages, from which the links will be extracted

        pagenum (int): An index corresponding to the page number, and
        its location in the selected display object

        display (object): An object that implements the setval(index, value)
        method. Used to show status updates/when a page has been downloaded
        successfully.

    Returns:
        tuple: (display, items). display is the object that was passed in,
        or None once a bad status code has been reported on it. items is
        the list of <li> tags found in the first <ul> that follows the
        locator text, or an empty list when the locator text or the <ul>
        is missing.

    Side effects:
        Sets the item at location pagenum on the display to maroon if the
        fetched page has a status code other than 200, and prints a
        diagnostic when the page does not contain exactly one locator
        text or exactly one following <ul>.
    """
    if (thispage.status_code != 200 and display):
        display.setval(pagenum, 'maroon')
        display = None

    PageHTML = bs4(thispage.text, 'html.parser')
    # regexes.tagloc is the shared, pre-compiled locator pattern.
    Tagloc = PageHTML.find_all(text=regexes.tagloc)
    if len(Tagloc) != 1:
        # TODO
        # Maybe this, and the check on the table below, should throw
        # exceptions instead?
        print(f"Did not find single tag locator."
              f" Found {len(Tagloc)} of them in p.{pagenum}")
    if not Tagloc:
        # Nothing to anchor on; hand back an empty list instead of
        # failing with an IndexError.
        return display, []

    FetchedTable = Tagloc[0].find_all_next('ul')
    if len(FetchedTable) != 1:
        # Report the number of tables, not the length of the locator.
        print(f"Did not find a single table."
              f" Found {len(FetchedTable)} of them in p.{pagenum}")
    if not FetchedTable:
        return display, []

    return display, FetchedTable[0].find_all("li")


# ----Extracts the entries from the HTML formatted list----
def GetMatNameGUID(listoflitags, pagenum, display=None):
    """
    Build a dictionary of material GUIDs and material names from the list
    items of one index page.

    Args:
        listoflitags (iterable): The <li> tags to process. Each one is
        expected to contain an <a> tag whose href holds a "matguid="
        value written in hexadecimal.

        pagenum (int): An index corresponding to the page number, and
        its location in the selected display object

        display (object): An object that implements the setval(index, value)
        method, or None to disable status updates.

    Returns:
        tuple: (display, outputdict). outputdict maps each GUID, converted
        to an int, to the material name with <sub> tags rewritten as
        LaTeX subscripts. display is returned as None once a problem has
        been reported on it.

    Raises:
        IndexError: If an href does not contain "matguid=".
        ValueError: If the text after "matguid=" is not valid hexadecimal.

    Side effects:
        Sets the item at location pagenum on the display to purple when no
        materials were found, or to fuchsia when fewer than 250 were found.
    """
    outputdict = dict()
    for item in listoflitags:
        # The URL contains a matguid value. This must be extracted
        extractedURL = str(item.a["href"])
        GUIDstring = extractedURL.split("matguid=", 1)[1]
        extractedGUID = int(GUIDstring, 16)
        # The material name is formatted using <sub> tags.
        # These must be removed. I replace them with LaTeX syntax.
        matname = str()
        for child in item.a.children:
            # Exact type checks: subclasses of NavigableString are skipped.
            if type(child) is bel.NavigableString:
                matname += str(child)
            if type(child) is bel.Tag:
                if child.name == 'sub':
                    # str.replace returns a new string, so the result has
                    # to be assigned back for the substitution to stick.
                    workingtag = str(child)
                    workingtag = workingtag.replace('<sub>', '_{')
                    workingtag = workingtag.replace('</sub>', '}')
                    matname += workingtag
        outputdict[extractedGUID] = matname

    # NOTE(review): 250 is presumably the number of materials on a full
    # index page -- confirm against the site.
    if display and len(outputdict) == 0:
        display.setval(pagenum, 'purple')
        display = None
    elif display and len(outputdict) < 250:
        display.setval(pagenum, 'fuchsia')
        display = None

    return display, outputdict

In [75]:
# ----Takes a webpage, and makes a dict of material GUIDs and names from it----


def pageprocess(thispage: req.Response, *args, display=None, **kwargs):
    """
    Turn one index page into a dictionary of material GUIDs and names.

    Args:
        thispage (req.Response): The downloaded index page. Its text is
        parsed and its url is quoted in error messages.

        display: Accepted but not used by this function.

        *args, **kwargs: Accepted but not used by this function.

    Returns:
        dict: Maps each matguid, converted from hexadecimal to int, to the
        material name with sub/sup markup rewritten in LaTeX syntax.

    Raises:
        ValueError: If a material name contains a tag other than sub, sup
        or font.
    """
    # Opening LaTeX token for each tag that is rendered as "<token>...}".
    latex_open = {'sub': '_{', 'sup': '^{'}

    def Rstringify(tag):
        """
        Recursively flatten the children of tag into a LaTeX-style string.

        Args:
            tag: The tag whose children are converted.

        Raises:
            ValueError: If a child tag is not one of sub, sup or font.

        Returns:
            String, with no HTML tags in it.
        """
        fragments = []
        for node in tag.children:
            node_type = type(node)
            if node_type is bel.NavigableString:
                fragments.append(str(node))
                continue
            if node_type is not bel.Tag:
                # Anything that is neither plain text nor a tag is dropped.
                continue
            if node.name in latex_open:
                fragments.append(latex_open[node.name])
                fragments.append(Rstringify(node))
                fragments.append('}')
            elif node.name == 'font':
                # font tags carry no meaning here; keep only their content.
                fragments.append(Rstringify(node))
            else:
                raise ValueError(
                    """
                        Don't know how to process tag
                        {0}
                        for page
                        {1}
                        """.format(str(node), thispage.url)
                )
        return "".join(fragments)

    collected = dict()
    soup = bs4(thispage.text, 'html.parser')
    for anchor in soup.select("body > form > div > ul > li > a"):
        match = regexes.matguid.search(anchor['href'])
        key = int(match[1], 16)
        collected[key] = Rstringify(anchor)
    return collected

In [32]:
def list_of_pages(allmats="http://www.matweb.com/search/GetAllMatls.aspx"):
    """
    Accumulates the URLs of all pages that contain individual materials.
    Returns them all in a list

    Args:
       allmats: the url of the page that contains all page links.

    Returns:
        list: One absolute URL per link found on the index page, in
        document order. Each href is resolved against allmats.

    Raises:
        ValueError: If the index page is answered with a status code
        other than 200.
    """
    # Local import so this cell does not depend on the notebook's import cell
    # for a name it did not originally need.
    from urllib.parse import urljoin

    index_page = req.get(allmats)
    if index_page.status_code != 200:
        raise ValueError("Failed to fetch page {0}!".format(allmats))
    index_soup = bs4(index_page.text, 'html.parser')
    selection = index_soup.select("body > form > div > ul > li > a")
    # urljoin resolves relative, root-relative and absolute hrefs alike.
    return [urljoin(allmats, sel_tag['href']) for sel_tag in selection]
    

In [6]:

class request_batch:
    """
    Fetches a list of URLs in fixed-size batches through a FuturesSession.

    Args:
        url_list (list): The URLs to request, in order.

        Executor: The executor handed to the FuturesSession. When None, a
        ProcessPoolExecutor with `workers` workers is created.

        session: The requests session handed to the FuturesSession. When
        None, a new req.Session() is created for this instance.

        batchsize (int): How many URLs are queued per batch. Must be at
        least 1.

        workers (int): Worker count for the default executor. Defaults to
        batchsize.

        batch_wait: Seconds to sleep after each batch.

        wait_on: The `return_when` condition passed to
        concurrent.futures.wait after each batch is queued.

    Attributes:
        futures (list): Every future created by run(), in submission order.

    Raises:
        ValueError: If batchsize is smaller than 1.
    """
    def __init__(self, url_list, Executor=None, session=None,
                 batchsize=5, workers=None, batch_wait=0,
                 wait_on=cof.ALL_COMPLETED):

        # A batch size of zero would never consume any URL in run().
        if batchsize < 1:
            raise ValueError("batchsize must be at least 1")

        self.workers = batchsize if workers is None else workers

        if Executor is None:
            self.Executor = cof.ProcessPoolExecutor(max_workers=self.workers)
        else:
            self.Executor = Executor

        # Created here rather than in the signature: a default argument is
        # evaluated once, which would share one session across instances.
        self.session = req.Session() if session is None else session
        self.url_list = url_list
        self.batchsize = batchsize
        self.batch_wait = batch_wait
        self.wait_on = wait_on

        self.fsession = FuturesSession(executor=self.Executor,
                                       session=self.session)

        self.futures = list()

    def run(self):
        """
        Queue every URL of url_list, one batch at a time.

        After a batch is queued, waits on all futures collected so far
        according to wait_on, then sleeps batch_wait seconds. The new
        futures are appended to self.futures; url_list is left untouched.
        """
        urls = list(self.url_list)
        for start in range(0, len(urls), self.batchsize):
            # Queue up the next batch of urls
            batch = urls[start:start + self.batchsize]
            self.futures.extend(self.fsession.get(url) for url in batch)

            # Waiting for wait_on, then additionally wait batch_wait seconds
            cof.wait(self.futures, return_when=self.wait_on)
            time.sleep(self.batch_wait)

In [74]:
# Stolen from the requests-futures examples list. 
# Probably unused, after moving pageprocess handling out of futures loop.

class ElapsedFuturesSession(FuturesSession):
    """
    A FuturesSession that stamps timing information on every response.

    Each response receives two attributes from a response hook:
    `starttime`, the time.perf_counter_ns() value read when request() was
    called, and `cpuendtime`, the time.perf_counter_ns() value read when
    the hook ran.

    Args:
        hooks (dict): Optional keyword argument. It is removed before the
        remaining arguments are forwarded to FuturesSession and installed
        on the session afterwards.

        All other arguments are forwarded to FuturesSession unchanged.

    Attributes:
        timings (dict): Created empty; nothing in this class fills it.
    """

    def __init__(self, *args, **kwargs):
        session_hooks = kwargs.pop("hooks", None)
        super().__init__(*args, **kwargs)
        self.timings = dict()
        # Installed after base initialisation so that it is not replaced by
        # the hooks the base class sets up for itself.
        if session_hooks:
            self.hooks.update(session_hooks)

    def request(self, method, url, hooks=None, *args, **kwargs):
        """
        Issue a request whose response is stamped with timing attributes.

        Args:
            method, url: Forwarded to FuturesSession.request.

            hooks (dict): Optional per-request hooks. Its 'response' entry
            may be a single callable or a list/tuple of callables. The
            dict passed in is not modified.

            *args, **kwargs: Forwarded to FuturesSession.request.

        Returns:
            Whatever FuturesSession.request returns.
        """
        starttime = time.perf_counter_ns()

        def timing(r, *args, **kwargs):
            r.starttime = starttime
            r.cpuendtime = time.perf_counter_ns()

        # Work on a copy: a shared default or the caller's own dict must not
        # accumulate one timing hook per call.
        hooks = dict(hooks) if hooks else {}
        existing = hooks.get('response')
        if existing is None:
            hooks['response'] = [timing]
        elif isinstance(existing, (list, tuple)):
            # needs to be first so we don't time other hooks execution
            hooks['response'] = [timing, *existing]
        else:
            hooks['response'] = [timing, existing]

        # The hooks have to be forwarded, otherwise the timing hook is
        # never attached to the request.
        return super(ElapsedFuturesSession, self) \
            .request(method, url, *args, hooks=hooks, **kwargs)


## Setup and execution

### Fetching material URLs

In [33]:
# Fetch the index page and display the URL of every page it links to.
list_of_pages()

['http://www.matweb.com/search/GetAllMatls.aspx?p=1',
 'http://www.matweb.com/search/GetAllMatls.aspx?p=2',
 'http://www.matweb.com/search/GetAllMatls.aspx?p=3',
 'http://www.matweb.com/search/GetAllMatls.aspx?p=4',
 'http://www.matweb.com/search/GetAllMatls.aspx?p=5',
 'http://www.matweb.com/search/GetAllMatls.aspx?p=6',
 'http://www.matweb.com/search/GetAllMatls.aspx?p=7',
 'http://www.matweb.com/search/GetAllMatls.aspx?p=8',
 'http://www.matweb.com/search/GetAllMatls.aspx?p=9',
 'http://www.matweb.com/search/GetAllMatls.aspx?p=10',
 'http://www.matweb.com/search/GetAllMatls.aspx?p=11',
 'http://www.matweb.com/search/GetAllMatls.aspx?p=12',
 'http://www.matweb.com/search/GetAllMatls.aspx?p=13',
 'http://www.matweb.com/search/GetAllMatls.aspx?p=14',
 'http://www.matweb.com/search/GetAllMatls.aspx?p=15',
 'http://www.matweb.com/search/GetAllMatls.aspx?p=16',
 'http://www.matweb.com/search/GetAllMatls.aspx?p=17',
 'http://www.matweb.com/search/GetAllMatls.aspx?p=18',
 'http://www.matweb

In [69]:


# NOTE(review): `session`, `threadlist`, `matdict` and `timinglist` are assigned
# here but not used again within this cell.
session = FuturesSession(max_workers=10)
threadlist = list()
matdict = dict()
timinglist = dict()

# Trial run: only the first five index pages are requested.
downlink = request_batch(list_of_pages()[0:5])
print("Starting at:", time.asctime(), "Perf Timer is:", time.perf_counter_ns())
downlink.run()
print("Loaded pages at:",time.asctime())
# Each entry is the bound `result` method of a future, not its return value;
# the following cells call the entries, e.g. `results[0]()`.
results = [item.result for item in downlink.futures]



Starting at: Sat Jul 27 22:16:37 2019 Perf Timer is: 323157201446872
Loaded pages at: Sat Jul 27 22:16:48 2019


In [70]:
# Debugging aid: `results[0]` is a bound `result` method, so it is called first
# and the attributes of the object it returns are listed.
dir(results[0]())

['__attrs__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_content',
 '_content_consumed',
 'apparent_encoding',
 'close',
 'content',
 'cookies',
 'elapsed',
 'encoding',
 'headers',
 'history',
 'is_permanent_redirect',
 'is_redirect',
 'iter_content',
 'iter_lines',
 'json',
 'links',
 'next',
 'ok',
 'raise_for_status',
 'raw',
 'reason',
 'request',
 'status_code',
 'text',
 'url']

In [76]:
# Parse the first downloaded page into a dictionary of matguid -> material name.
pageprocess(results[0]())

{308299377034770498966218527841843447465: '1,1,1 Trichloroethane (CH_{3}CCl_{3})',
 185846970040132891333945369232468432247: '1,1,1,2-Tetrabromoethane (acetylene tetrabromide), C_{2}H_{2}Br_{4}',
 132072966350615208431913507885923642200: '1,1,2,2-Tetrabromoethane (acetylene tetrabromide), C_{2}H_{2}Br_{4}',
 160911891841013759395956773677888732433: '1,1,2,2-Tetrachloroethane, C_{2}H_{2}Cl_{4}',
 295655230884376365629535121793886344451: '1,1,2,2-Tetrachloroethylene, C_{2}Cl_{4}',
 306767293827407123486817910621908801232: '1,1,2-Trichloroethane (Vinyl trichloride), C_{2}H_{3}Cl_{3}',
 106749951651857141379938782543465780123: '1,1,2-Trichlorotrifluoroethane, C_{2}Cl_{3}F_{3}',
 35790860161642593411824697696055118808: '1,1,2-Trichlorotrifluoro-ethane, C_{2}Cl_{3}F_{3}',
 204391087400063905542817466690080352976: '1,1-Dichloroethane (CH_{3}CHCl_{2})',
 278053408085766456088263818112602717785: '1,2,3-Trichloropropane, C_{3}H_{5}Cl_{3}',
 75155025159734338017108011637407132235: '1,2,4-Trichlor

In [38]:

matdict = dict()
threadlist = list()
retry = list()
# Copied, so that recording failed pages in `retry` never changes the list
# of pages being fetched.
targetpages = list(retry) # + list(range(500,523))

#341, 

allmatsURL = "http://www.matweb.com/search/GetAllMatls.aspx"
DownloadPool = FuturesSession(max_workers=13)
progress = button_grid(targetpages)


# Submit requests
for page in targetpages:
    pending = DownloadPool.get(allmatsURL, params={"p": page}, timeout=40)
    # Remember which page the future belongs to; as_completed yields them
    # in completion order, not submission order.
    pending.page = page
    threadlist.append(pending)


# Process returns
for completed in cof.as_completed(threadlist):

    index = completed.page
    displaysys = progress
    error = completed.exception()
    if error is not None:
        print(error)
        print(type(error))
        # isinstance also covers the subclasses of Timeout.
        if isinstance(error, req.exceptions.Timeout):
            displaysys.setval(index, 'teal')
        else:
            print(f"Page number {index} raised this exception: {error}")
            displaysys.setval(index, 'red')
        # There is no response to parse: calling result() would re-raise the
        # exception and abort the whole loop, so queue the page for a retry
        # and move on.
        retry.append(index)
        continue

    displaysys, tables = MatWebTableFetch(thispage=completed.result(),
                                          pagenum=index,
                                          display=displaysys)

    displaysys, newmats = GetMatNameGUID(tables,
                                         pagenum=index,
                                         display=displaysys)

    # Either helper hands back None once it has reported a problem.
    if displaysys is None:
        retry.append(index)
    else:
        displaysys.setval(index, 'lightgreen')

    matdict.update(newmats)
        
    
    

Box(layout=Layout(flex_flow='row wrap'))

In [119]:
# Write every collected material as "<hex guid> , <name>", one per line.
# The encoding is given explicitly so that names containing non-ASCII
# characters do not depend on the platform's default encoding.
with open('materialdump.txt', 'w', encoding='utf-8') as data:
    for key, entry in matdict.items():
        data.write(f"{hex(key)} , {entry}\n")