In [18]:
from bs4 import BeautifulSoup as bs4
from bs4 import element as elm
import requests as req
import libs.tree_visualizer as tv
import libs.tor_session as ts
import bleach
import base64 as b64

In [19]:
import urllib.parse as parse

In [20]:
# Shared HTTP session: plain-http traffic is routed through a SOCKS4 proxy
# and every request presents a mobile Safari user agent.
proxy_ses = req.Session()
proxy_ses.proxies = dict(http='socks4://178.249.219.91:4145')
proxy_ses.headers.update({
    'User-agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/603.1.23 (KHTML, like Gecko) Version/10.0 Mobile/14E5239e Safari/602.1",
})

In [21]:
# MatWeb addresses a material by a GUID written as lowercase hex.
# A base64-encoded GUID can be converted to that form with:
#     b64.b16encode(b64.decodebytes(b"i9DTEEuJTq22QBLrOtFVdw==")).decode('ascii').lower()
# (That conversion used to be assigned to mat_id here and was immediately
# overwritten by the literal below, so it never had any effect.)
mat_id = "dd0e92f43d514879a50cbd01117da56e"
url = f"http://www.matweb.com/search/datasheet_print.aspx?matguid={mat_id}"
url

'http://www.matweb.com/search/datasheet_print.aspx?matguid=dd0e92f43d514879a50cbd01117da56e'

Format:
`{"Category": {"PropertyName": [[metricValue, englishValue, comment], [metricValue, englishValue, comment], [...]]}}`

In [25]:
# Proxies cannot be trusted: responses fetched through them get tampered with
# (man-in-the-middled) regularly.
# Between this, an inability to execute JavaScript with the libraries I have here,
# and disabling certain features in MongoDB to again prevent script injection,
# I should be appropriately paranoid.
def san_matweb_table(tabletext):
    """Reduce raw HTML to a small whitelist of tags, attributes and protocols.

    Parameters:
        tabletext: HTML text as downloaded.

    Returns:
        The text as returned by ``bleach.clean``: tags outside the whitelist
        are stripped (not escaped) and HTML comments are removed.
    """
    ### Tags, categorized for convenience ###
    table = ['tr', 'td', 'th', 'table', 'tbody']
    structure = ['body', 'div', 'a', 'p', 'span']
    formatting = ['b', 'strong', 'i', 'em', 'mark', 'small', 'del', 'ins', 'sub', 'sup']

    tags = table + structure + formatting
    # NOTE(review): 'style' is whitelisted but no CSS rules are; how bleach
    # treats the attribute's content depends on the installed version -- confirm.
    attributes = ['href', 'title', 'class', 'colspan', 'id', 'style']
    protocols = ['http']

    # Everything is passed by keyword: the protocol whitelist was previously
    # built but never handed to bleach, so its default protocol list applied.
    return bleach.clean(
        tabletext,
        tags=tags,
        attributes=attributes,
        protocols=protocols,
        strip=True,
        strip_comments=True,
    )
    
    

In [43]:
class ContentError(ValueError):
    """Signals that a piece of parsed markup did not have the expected content.

    Attributes:
        tag: the element that could not be handled.
        parserObject: whatever was parsing the element when it failed.
        result: optional value the parser produced before failing.
        note: optional free-text explanation.
    """

    def __init__(self, tag, parserObject, result=None, note = None):
        super().__init__()
        self.tag = tag
        self.parserObject = parserObject
        self.result = result
        self.note = note

    def __str__(self):
        """Render the note, the tag's ancestor names and attributes, the parser and the result."""
        ancestry = '>'.join(ancestor.name for ancestor in self.tag.parents)

        pieces = []
        if self.note:
            pieces.append(f"'{self.note}'\n")
        pieces.append(f"Path: {ancestry} Attributes:\n")
        pieces.append(f"{str(self.tag.attrs)}\n")
        pieces.append(f"Failed when parsed by {self.parserObject}\n")
        if self.result:
            pieces.append(f"returned {str(self.result)}.")
        return "".join(pieces)
    

class Value:
    """One reading from a property table: a value, optional units, optional condition.

    Construct it either from plain values (``Value("Density", 2.7, "g/cc")``)
    or from a parsed table cell (a ``bs4`` ``Tag``), in which case the fields
    are extracted by ``_parse``.
    """

    # Class-level defaults, visible on any instance that does not set its own.
    units =  None
    unitID = None
    value = 0
    condition = None

    @staticmethod
    def _unit_id_from_href(href):
        """Return the run of digits that follows ``fromID=`` in *href* as an int.

        Returns None when *href* has no ``fromID=`` or no digits follow it.
        """
        digits = ""
        for ch in href.partition("fromID=")[2]:
            if not ch.isdecimal():
                break
            digits += ch
        return int(digits) if digits else None

    @staticmethod
    def _parse(tag):
        """Extract ``(units, unitID, value, condition)`` from a data cell.

        Parameters:
            tag: the cell element. Its first descendant with class
                ``unitlink`` and its first descendant with class
                ``dataCondition`` are treated specially.

        Raises:
            ContentError: if collecting the cell text raises ``KeyError``.
        """
        u_tag = tag.find(attrs={"class":"unitlink"})
        c_tag = tag.find(attrs={"class":"dataCondition"})

        # Drop the two special elements by identity, so an unrelated child
        # that merely compares equal is never removed in their place.
        children = [child for child in tag.children
                    if child is not u_tag and child is not c_tag]

        # Text of everything that is left in the cell.
        s_tag = ""
        try:
            for child in children:
                if isinstance(child,elm.NavigableString):
                    s_tag += str(child)
                else:
                    s_tag += "".join(child.stripped_strings)
        except KeyError as e:
            raise ContentError(tag, Value._parse, e,
                               note="The weird error happened again") from e

        condition = "".join(c_tag.stripped_strings) if c_tag else None

        if u_tag:
            # A missing href or missing digits yields unitID None, not a crash.
            unitID = Value._unit_id_from_href(u_tag.get('href') or "")
            # NOTE(review): the link text is stored as the value and the
            # remaining cell text as the units. Confirm against the page
            # markup -- the opposite assignment may have been intended.
            units  = s_tag
            value = u_tag.text
        else:
            value = s_tag
            units = None
            unitID = None

        return units,unitID,value,condition

    def __init__(self,name,value=None, units=None):
        """Build a reading from plain values or from a cell element.

        Parameters:
            name: a ``str`` selects plain-value construction (the string
                itself is not stored); a ``bs4`` ``Tag`` is parsed with
                ``_parse`` and kept as ``self.tree``. Any other type leaves
                the instance on the class-level defaults.
            value: float, int, str, or a 2-tuple giving a range.
            units: unit string; required in plain-value construction.

        Raises:
            ValueError: if *value* is a tuple whose length is not 2.
            TypeError: if *value* or *units* has an unsupported type.
        """
        # If just plugging in values
        if isinstance(name,str):
            if isinstance(value,(float,int,tuple,str)) and isinstance(units,(str)):
                if isinstance(value,tuple) and len(value) != 2:
                    raise ValueError("If a range is provided, a maximum and minimum must be given")
                self.units = units
                self.value = value
            else:
                raise TypeError("Value must be a number or tuple, units must be a string")
        elif isinstance(name,elm.Tag):
            self.tree = name
            self.units,self.unitID,self.value,self.condition = Value._parse(name)

    def j_struct(self):
        """Return the reading as a plain dict of JSON-friendly values.

        A tuple value (a range) is converted to a list; the other fields are
        returned as stored.
        """
        value = list(self.value) if isinstance(self.value, tuple) else self.value
        return {
            "value": value,
            "units": self.units,
            "unitID": self.unitID,
            "condition": self.condition,
        }

    def __str__(self):
        """Two lines: ``"<value> <units>"`` and the condition."""
        return f"{self.value} {self.units or ''}\n{self.condition}"
    


In [29]:

# TODO:
# error handling (check presence of required fields)
# JSON object conversion method (ensure proper escaping of chars for security)

class Info:
    """Header table of a data sheet: name, categories, key words, notes, other fields."""

    # Class-level defaults; __init__ rebinds every one of them per instance.
    # Categories as a list of (name, ID) tuples
    categories = list()
    notes= ""
    name = ""
    keywords = list()
    # Other fields, with limited processing.
    other = dict()

    # Row labels that __init__ pops from the parsed table.
    _REQUIRED_FIELDS = ("Categories:", "Key Words:", "Material Notes:")

    @staticmethod
    def _parse_info(info_tag):
        """Split the table into its header text and a label -> cell mapping.

        Parameters:
            info_tag: element whose ``tr`` rows are scanned.

        Returns:
            ``(name, info_dict)``. *name* is the stripped text of the last
            row that has a ``th`` ("" when there is none). *info_dict* maps
            the stripped text of the first cell of each two-cell row to the
            second cell element; rows with any other cell count are skipped.
        """
        name = ""  # stays empty when the table has no header row
        info_dict = dict()
        for row in info_tag.select('tr'):
            if row.th is not None:
                name = row.th.get_text().strip()
                continue
            cells = row.select('td')
            if len(cells) != 2:
                continue
            info_dict[cells[0].get_text().strip()] = cells[1]
        return name,info_dict

    @staticmethod
    def _parse_categories(tag):
        """Return ``[(link text, numeric ID), ...]`` for every ``<a>`` in *tag*.

        The ID is the first value of the last query parameter of the link's
        ``href``.
        """
        categories = list()
        for cat in tag.find_all("a"):
            href = cat['href']
            # Parse only the query component so a path or '#fragment' cannot
            # leak into the key or value; an href without '?' is parsed as-is.
            query = parse.urlparse(href).query or href
            catID = int(parse.parse_qs(query).popitem()[1][0])
            categories.append((cat.string, catID))
        return categories

    def __init__(self,tag):
        """Parse the info table *tag*.

        Raises:
            KeyError: if any of the rows "Categories:", "Key Words:" or
                "Material Notes:" is missing.
        """
        self.name,info_dict= self._parse_info(tag)

        missing = [field for field in self._REQUIRED_FIELDS if field not in info_dict]
        if missing:
            raise KeyError(f"Info table is missing required field(s): {', '.join(missing)}")

        self.categories = self._parse_categories(info_dict.pop("Categories:"))
        # get_text() also copes with a cell that holds more than one text node,
        # where .string would be None.
        key_words = info_dict.pop("Key Words:").get_text()
        self.keywords = [e.strip() for e in key_words.split(';')]
        self.notes = info_dict.pop("Material Notes:").prettify()
        # Whatever was not consumed above is kept unprocessed.
        self.other = info_dict

    def __str__(self):
        """Multi-line summary of name, categories, key words, notes and other field names."""
        prefix="\n-- "
        category_text = '; '.join([f'{cat} ({c_id})' for cat,c_id in self.categories])
        notes_text = prefix.join(self.notes.splitlines())
        return f"""
###{self.name}###
Categories: {category_text}
Key Words: {'; '.join(self.keywords)}
Material Notes: {notes_text}

 Other:
     {list(self.other.keys())}

"""
            

In [30]:
# TODO
# Count number of categories, and fields in each
# Strip forbidden characters from category and property names.
# JSON conversion method.

class Properties:
    """Property table of a data sheet, grouped by category.

    ``prop_dict`` maps category name -> property title -> list of entries.
    Each entry is a list of ``Value`` objects followed, when the row has a
    non-empty comment, by that comment string.
    """

    @staticmethod
    def _parse_prop_row(row):
        """Read one property row.

        Returns:
            ``(title, values, comment, trace)``: the title text (None when
            the row has no non-empty class-less cell), the ``Value`` objects
            of the ``dataCell`` cells, the text of the ``dataComment`` cell
            ("" when absent or empty), and a debug string that records the
            order in which the cells were seen.
        """
        title = None
        values = list()
        comment = ""
        trace = "prop row. Order: "

        for cell in row.children:
            # Text nodes between cells are str subclasses and carry no
            # attributes; indexing them by 'class' would raise TypeError.
            if isinstance(cell, str):
                continue

            # Look the class up without try/except so that a KeyError raised
            # while building a Value is not mistaken for "this is a title".
            classes = cell.get('class')
            if classes is None:
                # A cell without a class attribute is the title.
                if any(cell.stripped_strings):
                    trace += "title -> "
                    title = "|".join(cell.stripped_strings)
            elif "dataCell" in classes:
                trace += "data -> "
                values.append(Value(cell))
            elif 'dataComment' in classes:
                comment = "".join(cell.stripped_strings)
                if comment:
                    trace += f"comment ({len(comment)}) -> "
            else:
                print(f"FAULT. Cell had class {cell['class']}, which was unrecognized. cell was:")
                print(cell.prettify())

        return title, values, comment, trace

    @staticmethod
    def _parse_props(prop_tag,debug=False):
        """Build the nested property dict from the ``tr`` rows of *prop_tag*.

        Each row is classified in this order:
          1. a row with a ``th`` opens a new category, named by the header
             text with "Properties" removed;
          2. a row without any ``td`` is skipped;
          3. a row whose first ``td`` has ``colspan="4"`` is a separator and
             clears the current category and property;
          4. anything else is a property row. A row with a title starts a
             new property; a row without one adds another entry to the
             current property.

        Parameters:
            prop_tag: element containing the rows.
            debug: when true, print a trace line per row.
        """
        prop_dict = dict()
        category = None  # dict of the category currently being filled
        entries = None   # entry list of the property currently being filled

        for count,row in enumerate(prop_tag.select('tr')):
            if debug:
                print(f"Row number {count}... ",end="")

            if row.th is not None:
                rowname = row.th.get_text().replace("Properties","").strip()
                if debug:
                    print(f"Header row {rowname}")
                category = dict()
                prop_dict[rowname] = category
                continue

            if row.td is None:
                if debug:
                    print("Row without cells. Skipped")
                continue

            if row.td.get('colspan',None) == "4":
                if debug:
                    print("Blank Row. Resetting pointer")
                category = None
                entries = None
                continue

            title, values, comment, trace = Properties._parse_prop_row(row)
            if title:
                entries = list()
                category[title] = entries
            entries.append(values+([comment] if comment else []))
            if debug:
                print(trace, f"({title})")

        return prop_dict

    @classmethod
    def _test(cls,tag):
        """Parse *tag* with debug tracing switched on and return the dict."""
        return cls._parse_props(tag,debug=True)

    def __init__ (self, tag):
        """Parse the property table *tag* into ``self.prop_dict``."""
        self.prop_dict = self._parse_props(tag)

    def __str__ (self):
        """Text listing: a banner per category, a heading per property, and
        the ``Value`` objects of each entry side by side in 30-character columns."""
        chunks = []
        for key,entries in self.prop_dict.items():
            chunks.append(f"\n\n===---{key}---===\n")
            for name, dat in entries.items():
                chunks.append(f"\n--> {name}:\n")
                for idx,entry in enumerate(dat):
                    vals = [str(et).splitlines() for et in entry if isinstance(et,Value)]
                    for lines in zip(*vals):
                        chunks.append("- "+str(idx)+" "+"  ".join([line.ljust(30) for line in lines])+"\n")
        return "".join(chunks)



            
                    
                    
                
               
    
    


In [46]:
# TODO
# Mongo Loading Method



class Material:
    """A material identified by its GUID, whose data sheet is downloaded on demand.

    Attributes:
        matID: the GUID as a lowercase hex string.
        fetched: True once ``download`` has succeeded.
        info, properties: set by ``download``; reading them earlier raises
            ``AttributeError``.
    """

    # Print view of the data sheet for one material.
    DATASHEET_URL = "http://www.matweb.com/search/datasheet_print.aspx?matguid={matID}"

    @staticmethod
    def _parse_mat(soup):
        """Split a sanitized page into its ``Info`` and ``Properties`` objects.

        Raises:
            ContentError: if the page does not contain exactly two tables
                directly under a ``div`` whose id contains "DataSheet". The
                page is printed first to help diagnosis.
        """
        tables = soup.select('div[id*=DataSheet]>table')
        if len(tables) != 2 :
            print(soup.prettify())
            raise ContentError(soup, Material._parse_mat, f"{len(tables)} tables",
                               "Wrong number of tables!")
        info_object = Info(tables[0])
        prop_object = Properties(tables[1])

        return info_object, prop_object

    def download(self,session,reload=False):
        """Fetch and parse the data sheet through *session*.

        Does nothing when the sheet is already fetched, unless *reload* is
        true. *session* only needs a ``get(url)`` method whose result has a
        ``text`` attribute.
        """
        if self.fetched and not reload:
            return

        resp = session.get(self.DATASHEET_URL.format(matID=self.matID))
        print(f"Got {resp}")
        info,prop=self._parse_mat(bs4(san_matweb_table(resp.text), 'html.parser'))

        self.info = info
        self.properties = prop
        self.fetched = True

    def make_task(self,session, reload = False):
        """Placeholder; currently does nothing."""
        pass

    def __init__(self, matID, fetchwith = None):
        """Create the material and optionally download it right away.

        Parameters:
            matID: the GUID. A ``str`` is used as is; anything else is
                passed through ``b64.decodebytes`` and re-encoded as
                lowercase hex.
            fetchwith: optional session; when given, ``download`` is called
                with it immediately.
        """
        print(f"Received {matID}")
        if isinstance(matID,str):
            print("Used as is")
            self.matID = matID
        else:
            print("Converted")
            self.matID = b64.b16encode(b64.decodebytes(matID)).decode('ascii').lower()
        self.fetched   = False
        if fetchwith is not None:
            print("Fetching")
            self.download(fetchwith)
        else:
            print("Not Fetching")

    def __getattr__ (self,name):
        """Explain why ``info`` / ``properties`` are missing before a download.

        Python calls this only after normal lookup has failed, so it must
        always raise; returning None would make every misspelled attribute
        look like it exists.
        """
        # 'fetched' is read from __dict__ directly: going through normal
        # attribute access would re-enter this method when it is not set yet.
        if name in ("info","properties") and not self.__dict__.get("fetched", False):
            raise AttributeError("Haven't downloaded page yet!")
        raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}")

    def __str__(self):
        """The info summary followed by the property listing."""
        return str(self.info) + "\n" + str(self.properties)
        

In [26]:
# Show which address the outside world sees, to confirm the proxy is in use.
# (As a bare expression in the middle of the cell this result was discarded.)
print(proxy_ses.get("http://checkip.amazonaws.com/").text)

# Download the data sheet, sanitize it, and pick out its tables.
resp = proxy_ses.get(url)
matsoup = bs4(san_matweb_table(resp.text), 'html.parser')
tables = matsoup.select('div[id*=DataSheet]>table')
len(tables)

In [47]:
# Create the material without downloading anything yet.
test = Material(matID="dd0e92f43d514879a50cbd01117da56e")

Received dd0e92f43d514879a50cbd01117da56e
Used as is
Not Fetching


In [50]:
# Fetch and parse the data sheet through the proxied session.
test.download(session=proxy_ses)

Got <Response [200]>




In [54]:
# Info summary followed by the property listing.
print(str(test))


###1120 Aluminum Composition Spec###
Categories: Metal (9); Nonferrous Metal (177); Aluminum Alloy (178); 1000 Series Aluminum (201)
Key Words: UNS A91120; Aluminium 1120; AA1120; Al1120
Material Notes: <td>
--  This data sheet has only a limited amount of property data.  Most MatWeb aluminum entries, especially entries with a specific temper in the name, have much more property data.  Data points with the AA note have been provided by the Aluminum Association, Inc. and are NOT FOR DESIGN.
--  <p>
--   <b>
--    Composition Notes
--   </b>
--   : The aluminum content for unalloyed aluminum not made by a refining process is the difference between 100.00 percent and the sum of all other analyzed metallic elements present in amounts of 0.010 percent of more each, expressed to the second decimal before determining the sum.  For alloys and unalloyed aluminum not made by a refining process, when the specified maximum limit is 0.XX, an observed value or a calculated value greater than 0.005 

In [23]:
# Parse the second table of the page as the property table and list it.
props = Properties(tag=tables[1])
print(str(props))



{'Physical': {'Density': [[<__main__.Value object at 0x000002D8FE41DD88>, <__main__.Value object at 0x000002D8FE41D448>, 'AA; Typical']]}, 'Component Elements': {'Aluminum, Al': [[<__main__.Value object at 0x000002D8FE41D388>, <__main__.Value object at 0x000002D8FE41DA08>, 'Specified']], 'Boron, B': [[<__main__.Value object at 0x000002D8FE41D988>, <__main__.Value object at 0x000002D8FE41DC08>]], 'Chromium, Cr': [[<__main__.Value object at 0x000002D8FE41DB48>, <__main__.Value object at 0x000002D8FE41D048>]], 'Copper, Cu': [[<__main__.Value object at 0x000002D8FE459A08>, <__main__.Value object at 0x000002D8FE459388>]], 'Gallium, Ga': [[<__main__.Value object at 0x000002D8FE459B48>, <__main__.Value object at 0x000002D8FE459A48>]], 'Iron, Fe': [[<__main__.Value object at 0x000002D8FE459848>, <__main__.Value object at 0x000002D8FE459448>]], 'Magnesium, Mg': [[<__main__.Value object at 0x000002D8FE459B88>, <__main__.Value object at 0x000002D8FE459CC8>]], 'Manganese, Mn': [[<__main__.Value 

In [19]:
# Re-parse the property table with debug tracing on, then list every entry:
# one banner per category, one heading per property, and the Value objects of
# each entry side by side in 30-character columns.
for key, entries in props._test(tables[1]).items():
    print(f"\n\n===---{key}---===")
    for name, dat in entries.items():
        print(f"\n--> {name}:")
        for idx, entry in enumerate(dat):
            columns = []
            for item in entry:
                if isinstance(item, Value):
                    columns.append(str(item).splitlines())
            for lines in zip(*columns):
                padded = [line.ljust(30) for line in lines]
                print("-", idx, "  ".join(padded))
            

Row number 0... Blank Row. Resetting pointer
Row number 1... Header row Physical
Row number 2... 

prop row. Order: title -> data -> data -> comment (11) ->  (Density)
Row number 3... Blank Row. Resetting pointer
Row number 4... Header row Component Elements
Row number 5... prop row. Order: title -> data -> data -> comment (9) ->  (Aluminum, Al)
Row number 6... prop row. Order: title -> data -> data ->  (Boron, B)
Row number 7... prop row. Order: title -> data -> data ->  (Chromium, Cr)
Row number 8... prop row. Order: title -> data -> data ->  (Copper, Cu)
Row number 9... prop row. Order: title -> data -> data ->  (Gallium, Ga)
Row number 10... prop row. Order: title -> data -> data ->  (Iron, Fe)
Row number 11... prop row. Order: title -> data -> data ->  (Magnesium, Mg)
Row number 12... prop row. Order: title -> data -> data ->  (Manganese, Mn)
Row number 13... prop row. Order: title -> data -> data ->  (Other, each)
Row number 14... prop row. Order: title -> data -> data ->  (Other

In [89]:
# Parse the first table of the page as the info table and show its summary.
print(str(Info(tag=tables[0])))



###1120 Aluminum Composition Spec###
Categories: Metal (9); Nonferrous Metal (177); Aluminum Alloy (178); 1000 Series Aluminum (201)
Key Words: UNS A91120; Aluminium 1120; AA1120; Al1120
Material Notes: <td>
--  This data sheet has only a limited amount of property data.  Most MatWeb aluminum entries, especially entries with a specific temper in the name, have much more property data.  Data points with the AA note have been provided by the Aluminum Association, Inc. and are NOT FOR DESIGN.
--  <p>
--   <b>
--    Composition Notes
--   </b>
--   : The aluminum content for unalloyed aluminum not made by a refining process is the difference between 100.00 percent and the sum of all other analyzed metallic elements present in amounts of 0.010 percent of more each, expressed to the second decimal before determining the sum.  For alloys and unalloyed aluminum not made by a refining process, when the specified maximum limit is 0.XX, an observed value or a calculated value greater than 0.005 

In [None]:
# Dump the info-table fields that Info keeps unprocessed in `.other`.
# NOTE(review): this cell used to read `stuff.info_dict`; no `stuff` is
# defined in this notebook and Info has no `info_dict` attribute, so it is
# assumed the leftover fields of the info table were meant -- confirm.
for key,value in Info(tables[0]).other.items():
    print(key,"=\n",value,"\n-------------------------\n")