## Imports and value setup

In [1]:
# webpage parser
import requests               as req
from bs4 import BeautifulSoup as bs4
from bs4 import element       as bel

# Used for a good-looking display/diagnostic system 
from IPython import display as dis
import ipywidgets as ipw

# Used to bypass IP-ban
from stem import process as tpr

## Object and method definitions

In [2]:
def get_tables(soup):
    """Return the two ``<table>`` elements of a parsed datasheet page.

    Parameters
    ----------
    soup : object exposing ``find_all``
        Parsed page; every ``<table>`` element in it is collected.

    Returns
    -------
    list
        The two tables in document order. The warning text below labels
        index 0 as the "properties" table and index 1 as the "material" table.

    Raises
    ------
    ValueError
        If the page does not contain exactly two tables.
    """
    tables = soup.find_all('table')
    if len(tables) != 2:
        raise ValueError("Wrong number or wrong type of tables!")
    for index, table in enumerate(tables):
        # A table without a class attribute yields None; fall back to an empty
        # list so the membership test warns instead of raising TypeError.
        css_classes = table.get('class', None) or []
        if "tabledataformat" not in css_classes:
            bugloc = "material" if index else "properties"
            print(f' WARNING! {bugloc} table has wrong class {table.get("class",None)} ')
    return tables

def get_properties_types(properties_t):
    """Collect the text of every ``<th>`` cell that carries no class attribute.

    The class attribute of every header cell is printed first as a
    diagnostic; the returned list holds ``.string`` of the unclassed cells,
    in document order.
    """
    header_cells = properties_t.find_all("th")
    print([cell.get('class', None) for cell in header_cells])
    plain_headers = []
    for cell in header_cells:
        if cell.get('class', None) is None:
            plain_headers.append(cell.string)
    return plain_headers
                  

UsageError: Cell magic `%%pycodestyle` not found.


In [3]:
# Scratch list for debugging: raw tags get appended here so they can be
# inspected after a parsing run.
dbug = list()
class _rangetype:
    """A ``lower - upper`` value range with a precomputed midpoint.

    ``lower`` and ``upper`` are stored exactly as given (so ``str()`` shows
    the original text), while the ordering check and ``average`` use their
    numeric value. Bounds may therefore be numbers or numeric strings.

    Raises
    ------
    ValueError
        If ``upper`` is numerically smaller than ``lower``, or a string bound
        is not a valid number.
    TypeError
        If a bound cannot be converted to ``float`` at all (e.g. ``None``).
    """
    __slots__ = ['upper','lower','average']

    def __init__(self, lower, upper):
        # Compare numerically: as plain strings '9' would sort after '10'.
        low_num = float(lower)
        high_num = float(upper)
        if high_num < low_num:
            raise ValueError("Second argument must be larger than first")
        self.upper = upper
        self.lower = lower
        # Parenthesised on purpose: `upper + lower / 2` only halves `lower`.
        self.average = (low_num + high_num) / 2

    def __set__(self, instance, value):
        # NOTE(review): __set__ is the descriptor hook; it only fires when an
        # instance of this class is stored as a class attribute of another
        # class. It does not stop `some_range.lower = x`.
        raise AttributeError("Can't change these values!")

    def __str__(self):
        return f'{self.lower} - {self.upper}'

# Handles the numerous different formats present in the materials properties.
class mat_property:
    """A single material property, parsed from a table row or built from values.

    Construct it either with ``extract=<row>`` (an object whose ``find_all``
    returns the row's ``dataCell`` / ``dataComment`` ``<td>`` cells) or with
    at least ``mvalue`` and ``munits``.

    Attributes (all declared in ``__slots__``):
      value, units, condition -- ``_unittype`` wrappers (``units`` and
                                 ``condition`` may be None)
      comparator      -- leading text of a bounded cell (e.g. '>='), kept
                         verbatim; None if absent
      comments        -- text of the row's ``dataComment`` cell; None when
                         built from explicit values
      data_extractors -- maps a cell's child-tag signature to the static
                         method that understands that layout
    """
    __slots__ = ['units','value','condition','comparator','comments','data_extractors']

    class _unittype:
        """Holds the metric and (optional) english form of one quantity.

        ``same`` is True when no english form was supplied; calling the
        object then always yields the metric form.
        """
        __slots__ = ['metric','english','bounds','same']

        def __init__(self, metric, english=None, bounds=None):
            self.metric = metric
            self.bounds = bounds
            # Always fill the slot so reading .english never raises.
            self.english = english
            self.same = english is None

        def __call__(self, system='m'):
            """Return the stored form: 'b' -> bounds, 'e' -> english, else metric."""
            if system == 'b':
                return self.bounds
            if system == 'e' and not self.same:
                return self.english
            return self.metric

    # Child-tag signatures seen in data cells:
    # {'a|None|a|None|span', 'a|None|span', 'None|a|None|span', 'None', 'None|span'}
    # This is the setup. ['>='|'<='|'>'|'<'] {'<a...</a>'|[0-9]*} [UNIT]|[- {'<a...</a>'|[0-9]*} [UNIT]]
    #
    # One extractor is defined per variant and registered in a dict keyed by
    # the merged signature string; extract_data_cell fetches the appropriate
    # function using the signature of the cell at hand.
    # Every extractor returns (value, units, condition, comparator).
    #
    ## NOTE: These methods are likely going to be completely replaced by CSS selectors.

    @staticmethod
    def _text(node):
        """Return the stripped text of a node as a plain str, or None if it has none."""
        raw = getattr(node, 'string', node)
        if raw is None:
            return None
        return str(raw).strip()

    @staticmethod
    def _as_number(text):
        """Parse ``text`` as a float (thousands commas ignored); None if not numeric."""
        try:
            return float(text.replace(',', ''))
        except (AttributeError, ValueError):
            return None

    @staticmethod
    def _range_or_text(low, high):
        """Build a ``_rangetype`` from two numeric strings, else return 'low - high' text."""
        low_num = mat_property._as_number(low)
        high_num = mat_property._as_number(high)
        if low_num is None or high_num is None:
            return f'{low} - {high}'
        return _rangetype(low_num, high_num)

    @staticmethod
    def None_ (tag):
        """Cell holding only text: a bare value, or a 'low - high' range."""
        val = str(tag[0]).strip()
        # Require the spaced separator so a negative number such as '-5'
        # is not mistaken for a range.
        if ' - ' in val:
            low, high = (piece.strip() for piece in val.split(' - ', 1))
            value = mat_property._range_or_text(low, high)
        else:
            value = val

        return value, None, None, None

    # These are the methods to extract data from cells
    @staticmethod
    def None_span_(td_datacell):
        """Cell of the form 'VALUE UNITS' followed by a <span> condition."""
        text = mat_property._text(td_datacell[0]) or ""
        # Split on the first space only: units may themselves contain spaces.
        value, _, units = text.partition(" ")
        units = units.strip() or None
        condition = mat_property._text(td_datacell[1])
        return value, units, condition, None

    @staticmethod
    def a_None_span_ (td_datacell):
        """Cell of the form <a>VALUE</a>, unit text, <span> condition."""
        value = mat_property._text(td_datacell[0])
        # Stored as a plain str so no reference to the parse tree is kept.
        units = str(td_datacell[1]).strip() or None
        condition = mat_property._text(td_datacell[2])
        return value, units, condition, None

    @staticmethod
    def None_a_None_span_ (td_datacell):
        """Comparator text followed by an <a>VALUE</a> / units / <span> cell."""
        # The tail has exactly the 'a_None_span_' layout; reuse that extractor.
        value, units, condition, _ = mat_property.a_None_span_(td_datacell[1:])
        bounds = str(td_datacell[0])
        return value, units, condition, bounds

    @staticmethod
    def a_None_a_None_span_ (td_datacell):
        """Range cell: <a>LOW</a>, separator text, <a>HIGH</a>, units, <span>."""
        low = mat_property._text(td_datacell[0])
        high = mat_property._text(td_datacell[2])
        units = str(td_datacell[3]).strip() or None
        condition = mat_property._text(td_datacell[4])
        return mat_property._range_or_text(low, high), units, condition, None

    def extract_data_cell (self,datacell):
        """Dispatch one data cell to the extractor matching its child-tag signature.

        Returns the extractor's (value, units, condition, comparator) tuple.
        Raises ValueError when no extractor is registered for the signature.
        The raw cell is also appended to the module-level ``dbug`` list.
        """
        dbug.append(datacell)
        classline = "_".join([str(child.name) for child in datacell.children])+'_'
        try:
            # A missing signature is a KeyError on the dict lookup.
            parsefunc = self.data_extractors[classline]
        except KeyError:
            raise ValueError(f"No Valid parser for typelist {classline}. Full cell is \n{datacell.prettify()}")
        return parsefunc(datacell.contents)

    def __init__(self, mvalue=None, munits=None,
                 evalue=None, eunits=None,
                 mcond=None, econd=None, extract=None ):
        """Build the property from a table row (``extract``) or explicit values.

        Raises ValueError when both or neither of the two input styles is
        supplied, when the row has other than 1-2 data cells, or when it
        does not have exactly one comment cell.
        """
        has_row = extract is not None
        # `is not None` rather than truthiness: a value of 0 is legitimate.
        has_params = mvalue is not None and munits is not None
        if has_row == has_params:
            raise ValueError("Provide either both metric and english units, or a list of td tags to extract from. Not both.")

        # Give every slot a value up front so later reads never raise.
        self.value = None
        self.condition = None
        self.units = None
        self.comparator = None
        self.comments = None
        self.data_extractors = {
            'None_'                : mat_property.None_,
            'None_span_'           : mat_property.None_span_,
            'a_None_span_'         : mat_property.a_None_span_,
            'None_a_None_span_'    : mat_property.None_a_None_span_,
            'a_None_a_None_span_'  : mat_property.a_None_a_None_span_,
        }

        if has_row:
            datacells = extract.find_all("td", class_="dataCell")
            if len(datacells) == 1:
                value, units, cond, comp = self.extract_data_cell(datacells[0])
                self.value = self._unittype(value)
                if units:
                    self.units = self._unittype(units)
                if cond:
                    self.condition = self._unittype(cond)
                if comp:
                    self.comparator = comp

            elif len(datacells) == 2:
                # The first datacell is metric data, the second is english.
                # Unpack and assign properly.
                metric  = self.extract_data_cell(datacells[0])
                english = self.extract_data_cell(datacells[1])
                self.value = self._unittype(metric[0], english[0])
                self.units = self._unittype(metric[1], english[1])
                if metric[2]:
                    self.condition = self._unittype(metric[2], english[2])
                # Keep the comparator instead of silently dropping it.
                self.comparator = metric[3] or english[3]
            else:
                raise ValueError(f"{len(datacells)} datacells found (need 1-2)")

            commentcells = extract.find_all("td", class_="dataComment")
            if len(commentcells) == 1:
                self.comments = mat_property._text(commentcells[0])
            else:
                raise ValueError(f"{len(commentcells)} comments found (need 1)")

        # If parameters are given instead of table cell, build based on those instead.
        else:
            self.value = self._unittype(mvalue, evalue)
            self.units = self._unittype(munits, eunits)
            if mcond:
                self.condition = self._unittype(mcond, econd)

    def __str__(self):
        """Render one line per unit system; a single line when there is no english form."""
        def print_system(value, units, condition, comparator, system = None, prefix = None):
            res = ""
            res += f'{system}: ' if system else ""
            res += comparator if comparator else ""
            # str(): the stored value may be a number or a range object.
            res += str(value(prefix))
            res += f' {units(prefix)}' if (units is not None and units(prefix)) else ""
            res += f' Cond: {condition(prefix)}\n' if (condition is not None and condition(prefix)) else "\n"
            return res

        if self.units is None or self.units.same:
            res = print_system(self.value, self.units, self.condition, self.comparator)
        else:
            res  = print_system(self.value, self.units, self.condition, self.comparator, system = 'Metric')
            res += print_system(self.value, self.units, self.condition, self.comparator, system = 'English', prefix = 'e')
        return res
    
            

In [4]:
def parse_properties(properties_t):
    """Group the rows of the properties table by their section header.

    Every ``<tr>`` containing a ``<th>`` starts a new section named after the
    text of its first header cell. Each following ``<td>`` row is parsed into
    a ``mat_property`` and stored under the text of its first cell; rows whose
    first cell has a ``colspan`` attribute are skipped.

    Returns
    -------
    dict
        ``{section name: {property name: mat_property}}``. A table with no
        header and no data rows yields an empty dict.
    """
    collected = dict()
    next_key   = None
    next_vdict = dict()
    for row in properties_t.find_all("tr"):

        if row.th:
            rowcells = row.find_all("th")
            if next_key:
                # A new header closes the section collected so far.
                collected[next_key] = next_vdict
                next_vdict = dict()

            next_key = rowcells[0].text

        elif row.td:
            if row.td.has_attr('colspan'):
                continue
            parsed_property = mat_property(extract=row)
            print(parsed_property)
            next_vdict[row.td.text] = parsed_property
        else:
            print("Warning! unable to parse row! Dumping...")
            print(row.prettify())

    # Flush the last section, but do not invent a {None: {}} entry when the
    # table had neither a header nor any data rows.
    if next_key is not None or next_vdict:
        collected[next_key] = next_vdict
    return collected
    

In [5]:
def parse_material (material_t):
    """Extract the material name and its free-text notes from the material table.

    Row handling:
      * a row containing a ``<th>`` supplies the material name (stripped);
      * a row with exactly two ``<td>`` cells is a note: the first string of
        the first cell is the key, and the children of the second cell are
        flattened into text (links become ``[href]<text>``, a ``<p>`` becomes
        a newline plus tab);
      * a row whose first cell carries the thin-border style is skipped.

    Returns
    -------
    tuple
        ``(namestring, notesdict)``.

    Raises
    ------
    ValueError
        For a note child that is not text, a link with an href, or a ``<p>``;
        and for any row that matches none of the cases above.
    """
    namestring = ""
    notesdict = dict()
    # Not every parser inserts a <tbody>; fall back to the table itself.
    body = material_t.tbody if material_t.tbody is not None else material_t
    for table_row in body.find_all("tr"):
        cells = table_row.find_all("td")
        first_cell = table_row.td
        if table_row.th: #May not work, check
            namestring = str(table_row.th.string).strip()
        elif len(cells) == 2:

            property_name = list(first_cell.strings)[0]
            property_string = ""
            for entry in cells[1].children:
                if type(entry) is bel.NavigableString:
                    property_string += str(entry)
                # .get(): an anchor without href must reach the ValueError
                # below rather than raise KeyError here.
                elif entry.name == 'a' and entry.get('href'):
                    property_string += f"[{entry['href']}]<{entry.string}>"
                elif entry.name == 'p':
                    property_string += "\n\t"
                else:
                    raise ValueError(f"Cannot handle this content: \n {entry.prettify()}")
            notesdict[property_name] = property_string
        # Guard against rows with no <td> at all before reading the style.
        elif first_cell is not None and 'border:thin 1px black; padding:0xp;' in first_cell.get('style',[]):
            continue
        else:
            raise ValueError(f"No function for children {table_row.contents}")
    return  namestring, notesdict
            

## Network setup and fetch

#### Tor setup, diagnostics, and debugging

In [6]:
# Create the Tor-backed session and its process handle; both are used by the cells below.
# NOTE(review): generate_tor_session is not defined in the visible part of this
# notebook (the recorded output of this cell is a NameError) — define or import it
# in an earlier cell. Presumably 9965 is the local port to use — confirm.
tor_session, tor_process = generate_tor_session(9965)

NameError: name 'generate_tor_session' is not defined

In [16]:
# If you need a new tor session, kill the old one first, or you'll need to find a new port
# NOTE(review): on a top-to-bottom run this cell executes right after the session
# is created, while the cells below still use tor_session — run it by hand only
# when a fresh session is actually needed.
tor_process.kill()

In [15]:
# Probe one datasheet through the Tor session to check whether this IP is banned.
# The first GUID is rendered as 32 zero-padded lowercase hex digits.
guid = format(samplematerials.GUID[0], "032x")
print(guid)
test = tor_session.get(printurl, params=dict(matguid=guid))
test.url

00342ed4346d455db09f8359f8cd73aa


'http://www.matweb.com/search/datasheet_print.aspx?matguid=00342ed4346d455db09f8359f8cd73aa'

In [16]:
# Diagnostics: show the cookies currently held by the Tor session and how many
# GUIDs samplematerials.GUID contains.
print(tor_session.cookies.items())
print(len(samplematerials.GUID))

[]
130298


#### Page batch fetching

In [17]:

# Build one print-view URL per GUID for the first 100 entries, then hand the
# list to request_batch (batchsize=20) to be fetched over the Tor session.
url_list = [
    printurl + "?matguid=" + format(material_guid, "032x")
    for material_guid in samplematerials.GUID[:100]
]
batch = request_batch(url_list, session=tor_session, batchsize=20)
results = batch.run()

url_list is now 80 items long
url_list is now 60 items long
url_list is now 40 items long
url_list is now 20 items long
url_list is now 0 items long


## Page Processing

In [18]:
# Parse the text of every fetched page; the first parsed page is kept as a test sample.
# NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
# installed — consider passing one explicitly for reproducible parse trees.
soups = [bs4(fetched.result().text) for fetched in results]
# print(soups[0].prettify())
testsoup = soups[0]