# Install dependencies

In [7]:
import sys
def installModule(projectName:str, moduleName:str=None):
    '''Installs and loads the given module if not already installed'''
    if moduleName is None:
        moduleName=projectName
    if moduleName not in sys.modules:
        !python -m pip install --no-input $projectName
        print(f'{projectName} installed')
    else:
        print(f'{projectName} found')
    %reload_ext $moduleName

installModule('jupyter-xml')
installModule('jupyter-rdfify')
installModule('SPARQLWrapper')
installModule('tabulate')
installModule('spacy')

jupyter-xml found


jupyter-rdfify found


SPARQLWrapper found
tabulate found
spacy found


# Download Models

In [None]:
!python -m spacy download en_core_web_sm   # efficent
!python -m spacy download en_core_web_trf   # accurat

# Chemistry Example Wikidata Query
see http://wiki.bitplan.com/index.php/PyLoDStorage#15_Random_substances_with_CAS_number

## Extract text from website

In [20]:
from newspaper import Article
url="https://www.engr.psu.edu/ce/courses/ce584/concrete/library/construction/curing/Composition%20of%20cement.htm"
article = Article(url)
article.download()
article.parse()
text=article.text

# NLP with spacy
Try to identify [chemical compounds](https://www.wikidata.org/wiki/Q11173)

In [23]:
import spacy
from tabulate import tabulate
# Load English tokenizer, tagger, parser and NER
nlp = spacy.load('en_core_web_trf')
doc = nlp(text)
nouns=[chunk.text for chunk in doc.noun_chunks]

print(f"Found nouns:\n {nouns}")

foundEntities=[{"Text":entity.text, "Entity Tag":entity.label_} for entity in doc.ents]
print(tabulate(foundEntities, headers="keys"))

Found nouns:
 ['Composition', 'cement', 'Introduction', 'Portland cement', 'its strength', 'chemical reactions', 'the cement', 'water', 'The process', 'hydration', 'a complex process', 'the chemical composition', 'cement', '\n\nManufacture', 'cement', 'Portland cement', 'the following materials', 'Lime', 'calcium oxide', 'CaO', 'limestone', 'chalk', 'shells', 'shale', 'calcareous rock', 'Silica', 'SiO', 'sand', 'old bottles', 'clay', 'argillaceous rock', 'sand', 'old bottles', 'clay', 'argillaceous rock', 'Alumina', 'Al', 'O', 'bauxite', 'recycled aluminum', 'clay', 'O', 'bauxite', 'recycled aluminum', 'clay', 'Iron', 'Fe 2 O', 'clay', 'iron ore', 'scrap iron', 'fly ash', '\n\nO', 'clay', 'iron ore', 'scrap iron', 'fly ash', 'Gypsum', 'CaSO', '.2H', 'limestone', 'Chemical shorthand', 'the complex chemical nature', 'cement', 'a shorthand form', 'the chemical compounds', 'The shorthand', 'the basic compounds', 'lime', 'silica', 'alumina', 'Compound Formula Shorthand form %', 'weight1', '

# Query wikidata for mentioned Chemical Compounds
The NER (Named Entity Recognition) seems not to detect the chemical compounds. Thus we use the found nouns to query wikidata for the referenced compounds

<div class="alert alert-block alert-warning">
<b>ToDo:</b> Optimize query some components that are referenced do not have all queried properties. Making the properties OPTIONAL results in an timeout.
</div>

In [25]:
from SPARQLWrapper import SPARQLWrapper, JSON, CSV
from tabulate import tabulate

queryValueFormatTags='%s,\n'*((len(nouns)//100)) + '%s'
q = """
SELECT DISTINCT ?substance ?substanceLabel ?formula ?structure ?CAS
WHERE { 
  ?substance wdt:P31 wd:Q11173.
  ?substance wdt:P231 ?CAS.
  ?substance wdt:P274 ?formula.
  ?substance wdt:P117  ?structure.
  ?substance rdfs:label ?substanceLabel
  FILTER(str(?substanceLabel) in ( %s ))
}
LIMIT 50

""" % queryValueFormatTags
values=tuple([', '.join([f'"{noun.strip()}"' for noun in nouns[n:n+100]]) for n in range((len(nouns)//100)+1)])
q = q % values
sparql = SPARQLWrapper("http://query.wikidata.org/sparql")
sparql.setQuery(q)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
table = [[result[column]["value"] for column in result] for result in results["results"]["bindings"]]
print(tabulate(table))

--------------------------------------  --------------------------------------------------------------------------  ---------  ----  ------
http://www.wikidata.org/entity/Q116269  http://commons.wikimedia.org/wiki/Special:FilePath/SiO2.svg                 7631-86-9  SiO₂  Silica
http://www.wikidata.org/entity/Q116269  http://commons.wikimedia.org/wiki/Special:FilePath/SiO2.svg                 7631-86-9  SiO₂  silica
http://www.wikidata.org/entity/Q283     http://commons.wikimedia.org/wiki/Special:FilePath/H2O%202D%20labelled.svg  7732-18-5  H₂O   water
http://www.wikidata.org/entity/Q283     http://commons.wikimedia.org/wiki/Special:FilePath/H2O%202D%20labelled.svg  7732-18-5  H₂O   water
http://www.wikidata.org/entity/Q283     http://commons.wikimedia.org/wiki/Special:FilePath/H2O%202D%20labelled.svg  7732-18-5  H₂O   water
http://www.wikidata.org/entity/Q283     http://commons.wikimedia.org/wiki/Special:FilePath/H2O%202D%20labelled.svg  7732-18-5  H₂O   water
http://www.wikidata.org/

<div class="alert alert-block alert-warning">
<b>ToDo:</b> Decide which SPARQL query framework sould be used and simplfy the access and interface for the usage in jupyter notebooks
</div>