In [4]:
# webpage parser
import requests               as req
from bs4 import BeautifulSoup as bs4
from bs4 import element       as bel

# Used for a good-looking display/diagnostic system 
from IPython import display as dis
import ipywidgets as ipw

# Used for asynchronous webpage fetching
import concurrent.futures as cof 
from requests_futures.sessions import FuturesSession

# Used to process and hold data in an efficient way
import pandas as pan

# Used to bypass IP-ban
from stem import process as tpr

# Used for delays and stuff
import time

# Used to create folders and stuff
import os

# Used for a function that compares html trees
import mmh3 as mmh

# Used to parse the cells once I have a hold of them. 
import re

# Used for various random shuffling/random selection processes
import random as ra

In [1]:
# URL of a MatWeb datasheet page — presumably the printable view, judging by
# the path; not used in the visible cells, TODO confirm where it is consumed.
printurl = "http://www.matweb.com/search/datasheet_print.aspx"
# CSV file that is read into the `samplematerials` DataFrame.
guidpath = "materialdump.csv"
# Folder name for fetched pages — presumably the output directory for saved
# HTML; verify against the cells that write files.
outpath = "pagefetch"

In [5]:


# Read the headerless dump into two named columns. Column 0 goes through
# int(..., 0), i.e. the base is inferred from the literal's prefix, so each
# GUID is parsed into a Python int instead of a float.
samplematerials = pan.read_csv(
    guidpath,
    header=None,
    names=["GUID", "Database_title"],
    converters={0: lambda raw: int(raw, 0)},
)
samplematerials

Unnamed: 0,GUID,Database_title
0,43130625942163489998690947383868747649,3M Fluorinert™ FC-40 Electronic Liquid
1,307585611052568925740805591876357198776,3M Fluorinert™ FC-43 Electronic Liquid
2,268469469179882343023565952335793721412,3M Fluorinert™ FC-70 Electronic Liquid
3,301124261880456456456866588602462618481,3M Fluorinert™ FC-72 Electronic Liquid
4,128599432461509673218328583769946166538,3M Fluorinert™ FC-770 Electronic Liquid
5,47999019616973892578738322954385992613,3M FM001 Indoor UL Recognized Clear Polyester...
6,147045154763180353154048419978303045581,3M FM002 Indoor UL Recognized Clear Polyester...
7,187142889824498279744955635760609593527,3M FM009 Indoor UL Recognized Clear Polyester...
8,91896533215546320458982735638659781930,3M FM00G Indoor UL Recognized Clear Polyester...
9,222308646300166733750464238568246588770,3M FM00N Indoor UL Recognized Clear Polyester...


In [4]:
class request_batch:
    """Fetch a list of URLs in fixed-size batches through a ``FuturesSession``.

    Parameters
    ----------
    url_list : iterable of str
        URLs to request. ``run`` works on a copy and never modifies it.
    Executor : concurrent.futures.Executor, optional
        Executor handed to ``FuturesSession``. When omitted, a
        ``ProcessPoolExecutor`` with ``workers`` workers is created.
    session : requests.Session, optional
        Session wrapped by the ``FuturesSession``. When omitted, a fresh
        ``requests.Session`` is created for this instance (a default built in
        the signature would be created once and shared by every instance).
    batchsize : int
        Number of URLs queued per batch; must be at least 1.
    workers : int, optional
        Worker count for the default executor; defaults to ``batchsize``.
    batch_wait : float
        Seconds to sleep after each batch has been waited on.
    wait_on : str
        ``return_when`` value passed to ``concurrent.futures.wait``.

    Raises
    ------
    ValueError
        If ``batchsize`` is smaller than 1 (``run`` could never make progress).
    """

    def __init__ (self, url_list, Executor=None, session=None,
                  batchsize=5, workers=None, batch_wait=0, 
                  wait_on=cof.ALL_COMPLETED):
        # Validate first so that nothing is allocated for an unusable config.
        if batchsize < 1:
            raise ValueError(f"batchsize must be at least 1, got {batchsize}")

        self.workers = batchsize if workers is None else workers

        if Executor is None:
            self.Executor = cof.ProcessPoolExecutor(max_workers=self.workers)
        else:
            self.Executor = Executor

        self.wait_on    = wait_on
        # One session per instance unless the caller explicitly shares one.
        self.session    = req.Session() if session is None else session
        self.url_list   = url_list
        self.batchsize  = batchsize
        self.batch_wait = batch_wait

        self.fsession   = FuturesSession(executor=self.Executor, session=self.session)

    def run(self):
        """Queue every URL, ``batchsize`` at a time, and return all futures.

        After each batch is queued, waits on *all* futures queued so far
        according to ``wait_on`` and then sleeps ``batch_wait`` seconds.

        Returns
        -------
        list
            The futures returned by ``fsession.get``, in ``url_list`` order.
        """
        # Guard again: the attribute may have been changed after construction.
        if self.batchsize < 1:
            raise ValueError(f"batchsize must be at least 1, got {self.batchsize}")

        pending = list(self.url_list)
        futures = []
        while pending:
            # Queue up the next batch and drop it from the pending list.
            batch, pending = pending[:self.batchsize], pending[self.batchsize:]
            futures += [self.fsession.get(url) for url in batch]
            print(f"url_list is now {len(pending)} items long")

            # Wait for wait_on, then additionally wait batch_wait seconds.
            cof.wait(futures, return_when=self.wait_on)
            time.sleep(self.batch_wait)
        return futures


In [6]:
def generate_tor_session(port):
    """Launch a Tor process and build a ``requests`` session routed through it.

    Prints the IP reported by httpbin.org for a direct request and for a
    request through the new session, so the two can be compared.

    Parameters
    ----------
    port : int or str
        Local port used as Tor's ``SocksPort`` and as the SOCKS proxy port.

    Returns
    -------
    tuple
        ``(session, tor_process)`` — the configured ``requests.Session`` and
        the object returned by ``launch_tor_with_config``.

    Raises
    ------
    Exception
        Whatever the Tor launch or the HTTP requests raise. If the failure
        happens after Tor has started, the Tor process is killed before the
        exception propagates so it is not left running.
    """
    print("STARTING...\n\n")
    config = {
        'SocksPort': str(port),
    }
    tor_process = tpr.launch_tor_with_config(config=config,
                                             take_ownership=True,
                                             close_output=True)

    try:
        session = req.Session()
        print("LOCAL IP IS:")
        # Deliberately uses the plain module-level request (no proxy).
        print(req.get("http://httpbin.org/ip").text,"\n\n\n")

        # socks5h: host names are resolved on the proxy side as well.
        proxy = f'socks5h://localhost:{port}'
        session.proxies = {'http': proxy, 'https': proxy}
        session.headers['User-agent'] = "Mozilla/5.0 (X11; Linux x86_64)"
        # An empty allow-list means no domain may set cookies on this session.
        session.cookies.get_policy().set_allowed_domains([])
        print("REMOTE IP IS:")
        print(session.get("http://httpbin.org/ip").text,"\n\n\n")
    except BaseException:
        # Do not leave the Tor subprocess behind when setup fails part-way.
        tor_process.kill()
        raise
    return session, tor_process




In [9]:
def save_array_of_page_soups (array, foldername = "pagefetch", filename="Page{0}.html"):
    """Write each soup in ``array`` to its own prettified HTML file.

    Parameters
    ----------
    array : iterable
        Objects exposing ``prettify()`` (e.g. BeautifulSoup documents).
    foldername : str
        Target directory; created if missing, reused if it already exists so
        the cell can be re-run.
    filename : str
        ``str.format`` template that receives the zero-based position of the
        soup as its single positional argument.
    """
    os.makedirs(foldername, exist_ok=True)
    for index, soup in enumerate(array):
        # Keep the template untouched: build the per-page path in a new name
        # and format with the integer index (not a one-element list).
        target = os.path.join(foldername, filename.format(index))
        # Page text can contain non-ASCII characters; do not depend on the
        # platform's default encoding.
        with open(target, 'w', encoding='utf-8') as file:
            file.write(soup.prettify())
            
            

In [86]:
class _Regexes:
    """Holder for the compiled regular expressions used by the page parsers."""

    def __init__(self):
        # Matches any single-line text that contains the word "Replacing".
        self.tagloc = re.compile(r'.*Replacing.*')


# Shared instance read by the parsing helpers as ``regexes.tagloc``. The class
# has its own name so this binding no longer overwrites the class itself.
regexes = _Regexes()


class button_grid:
    """Wrapping row of small disabled buttons used as a coloured status grid.

    One button is created per element of ``iterator``; its label is
    ``str(element)`` and the element itself is the key used by ``setval``.
    The grid is displayed as soon as the object is constructed.
    """

    def __init__(self,iterator):
        # ``Button`` has no ``value`` trait, so that keyword is not passed
        # (unrecognised keyword arguments are deprecated by traitlets).
        self.buttonlist = {
            key: ipw.Button(description=str(key),
                            layout=ipw.Layout(width='42px', height='25px'),
                            disabled=True)
            for key in iterator
        }
        container = ipw.Box(tuple(self.buttonlist.values()))
        container.layout.flex_flow = "row wrap"
        dis.display(container)

    def setval (self, index, value):
        """Set the background colour of the button stored under ``index``.

        Raises ``KeyError`` if no button was created for ``index``.
        """
        self.buttonlist[index].style.button_color = value

        
#### Extracts the list of materials and ids from the webpage. 
def MatWebTableFetch (thispage,pagenum,display=None):
    """Locate the material list in a fetched page and return its ``<li>`` tags.

    The list is found by searching for the text matched by ``regexes.tagloc``
    and taking the first ``<ul>`` that follows it.

    Parameters
    ----------
    thispage : response object
        Must expose ``status_code`` and ``text``.
    pagenum : hashable
        Page identifier used in diagnostics and as the ``display`` key.
    display : object with ``setval(index, colour)``, optional
        Status grid. On a non-200 status the page is marked ``'maroon'`` and
        ``None`` is returned in its place so later stages leave that mark
        alone.

    Returns
    -------
    tuple
        ``(display, li_tags)``.

    Raises
    ------
    IndexError
        If no locator text or no following ``<ul>`` is found.
    """
    if thispage.status_code != 200 and display:
        display.setval(pagenum,'maroon')
        display = None

    page_html = bs4(thispage.text,'html.parser')
    locators = page_html.find_all(text=regexes.tagloc)
    if len(locators) != 1:
        print(f"Did not find single tag locator. Found {len(locators)} of them in p.{pagenum}")
    locator = locators[0]

    tables = locator.find_all_next('ul')
    if len(tables) != 1:
        # Report the number of tables found, not the length of the locator.
        print(f"Did not find a single table. Found {len(tables)} of them in p.{pagenum}")
    return display, tables[0].find_all("li")

#### Extracts the entries from the HTML formatted list
def GetMatNameGUID (lisctoflitags,pagenum, display=None, min_entries=250):
    """Turn the ``<li>`` tags of a result page into a ``{GUID: name}`` dict.

    Parameters
    ----------
    lisctoflitags : iterable
        ``<li>`` tags whose ``a`` child links to a URL containing
        ``matguid=<hex>``. (The spelling of the parameter name is kept for
        callers; the body now actually reads this parameter.)
    pagenum : hashable
        Page identifier used as the ``display`` key.
    display : object with ``setval(index, colour)``, optional
        Status grid. An empty result marks the page ``'purple'``, a result
        with fewer than ``min_entries`` items marks it ``'fuchsia'``; in both
        cases ``None`` is returned in its place.
    min_entries : int
        Number of entries below which a page is flagged as short.

    Returns
    -------
    tuple
        ``(display, outputdict)`` where keys are the GUIDs parsed as base-16
        integers and values are the material names with ``<sub>`` markup
        rewritten to LaTeX ``_{...}``.
    """
    outputdict = dict()
    for item in lisctoflitags:
        # The URL contains a matguid value. Keep only that value, dropping any
        # query parameters that may follow it.
        extractedURL = str(item.a["href"])
        GUIDstring = extractedURL.split("matguid=", 1)[1].split("&", 1)[0]
        extractedGUID = int(GUIDstring, 16)

        # The material name is formatted using <sub> tags. Plain strings are
        # kept as they are; <sub> tags are replaced with LaTeX syntax; any
        # other child is dropped.
        parts = []
        for child in item.a.children:
            if type(child) is bel.NavigableString:
                parts.append(str(child))
            elif type(child) is bel.Tag and child.name == 'sub':
                parts.append(str(child).replace('<sub>','_{').replace('</sub>','}'))
        outputdict[extractedGUID] = "".join(parts)

    if display and len(outputdict) == 0:
        display.setval(pagenum,'purple')
        display = None
    elif display and len(outputdict) < min_entries:
        display.setval(pagenum,'fuchsia')
        display = None

    return display,outputdict
    





In [None]:
for completed in cof.as_completed(threadlist):
    
    index = completed.page
    displaysys = progress
    if completed.exception():
        print(completed.exception())
        print(type(completed.exception()))
        if type(completed.exception()) in (req.exceptions.Timeout, req.exceptions.ReadTimeout):
            retry.append(index)
            displaysys.setval(index,'teal')
            displaysys=None
            continue
        print(f"Page number {index} raised this exception: {completed.exception()}")
        displaysys.setval(index,'red')
        displaysys = None