# Retrieval of species data for Wikidata

In [None]:
import time
import pandas as pd
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element, SubElement, Comment, tostring
from SPARQLWrapper import SPARQLWrapper, JSON, XML

## Preparation
### Namings

In [None]:
# Paths to files (all relative to the notebook's working directory)
# CSV cache of species identifiers; written by collectAndSaveIdentifiers()
path_ids = '../data/wikidata/wd_identifier_list_1.csv'
# XML output file; parsed on start-up if present and rewritten by appendXML()
path_XML = '../xml/wd_species_target.xml'
# CSV checkpoint of the collected attributes; rewritten by appendDataframe()
path_df  = '../data/wikidata/wd_species_df.csv'

In [None]:
# Maps SPARQL result variables (keys of the merged result dict) to XML tag names.
#   - plain string value: the variable is written under that single tag
#   - nested dict value:  'SubElement' names the grouping tag and
#                         'SubSubElement' names the tag used for each value
# Variables that do not appear here are ignored by appendXML().
mapping_dict = {
   'resource': 'ID', 
   # raw entity URIs are not exported; only their labels are (see *Label keys below)
   #'family',
   #'class',
   #'order',
   'taxonName': {
       'SubElement': 'Scientific_Names',
       'SubSubElement': 'Scientific_Name'
   },
   'taxonCommonName': {
       'SubElement': 'Common_Names',
       'SubSubElement': 'Common_Name'
   },
   #'differentFrom',
   #'endemicTo',
   #'conservationStatus',
   'resourceLabel': {
       'SubElement': 'Labels',
       'SubSubElement': 'Label'
   }, 
   # NOTE(review): 'Familys' looks like a misspelling of 'Families'; it is an
   # emitted XML tag name, so confirm against the target schema before renaming
   'familyLabel': {
       'SubElement': 'Familys',
       'SubSubElement': 'Family'
   },
   # NOTE(review): the class label is exported under 'Category' tags - confirm intended
   'classLabel': {
       'SubElement': 'Categories',
       'SubSubElement': 'Category'
   },
   'orderLabel': {
       'SubElement': 'Orders',
       'SubSubElement': 'Order'
   },
   'differentFromLabel': {
       'SubElement': 'Different_Species',
       'SubSubElement': 'Different_From'
   },
   'endemicToLabel': {
       'SubElement': 'Endemic_Regions',
       'SubSubElement': 'Endemic_To'
   },
   'conservationStatusLabel': 'Conservation_Status'
}

### Queries

In [None]:
# SPARQL query returning every ?resource whose taxon rank (wdt:P105) is
# species (wd:Q7432). Used by collectAndSaveIdentifiers() to build the
# identifier list.
# NOTE(review): there is no LIMIT clause, so the full result set is requested
# in one go - confirm the endpoint can answer this within its time limit.
species_query = """

SELECT ?resource

WHERE {
  ?resource wdt:P105 wd:Q7432.
}
"""


In [None]:
# SPARQL template returning the attributes of ONE species.
# The single ``{}`` placeholder is filled with the entity URI through
# str.format(); every literal SPARQL brace is therefore doubled.
# The taxonomy block walks up the parent-taxon chain (wdt:P171+) and keeps the
# ancestors whose taxon rank (wdt:P105) is family, order and class respectively.
attribute_query = """

SELECT DISTINCT
  ?resource
  ?family
  ?class
  ?order
  ?taxonName ?taxonCommonName
  ?differentFrom ?endemicTo ?conservationStatus


  ?resourceLabel
  ?familyLabel
  ?classLabel
  ?orderLabel
  ?differentFromLabel ?endemicToLabel ?conservationStatusLabel


WHERE
{{

  VALUES ?resource {{<{}>}}

  ## optional variables

    OPTIONAL {{?resource wdt:P171+ ?family.
              ?family wdt:P105 wd:Q35409.

           OPTIONAL {{?family wdt:P171+ ?order.
                     ?order wdt:P105 wd:Q36602.

                    OPTIONAL {{?order wdt:P171+ ?class.
                              ?class wdt:P105 wd:Q37517.}}}}}}

  # Taxon name - plain string values carry no language tag, so a language
  # filter here would reject every value and leave ?taxonName unbound
  OPTIONAL {{?resource wdt:P225 ?taxonName.}}
  # Taxon common name (language-tagged, keep English only)
  OPTIONAL {{?resource wdt:P1843 ?taxonCommonName.
              FILTER (langMatches( lang(?taxonCommonName), "en" ) )}}
  # Different from
  OPTIONAL {{?resource wdt:P1889 ?differentFrom.}}
  # endemic to
  OPTIONAL {{?resource wdt:P183 ?endemicTo.}}
  # conservation status
  OPTIONAL {{?resource wdt:P141 ?conservationStatus.}}

  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
}}

"""

### Function Definitions 

In [None]:
def runQuery(query, format):
    """Run a SPARQL query against the Wikidata query service.

    Parameters
    ----------
    query : str
        Complete SPARQL query text.
    format : str
        SPARQLWrapper return-format constant (e.g. ``JSON`` or ``XML``).

    Returns
    -------
    The response converted by SPARQLWrapper according to ``format``.

    Raises
    ------
    Exception
        Any error raised while sending the query or converting the response
        is propagated unchanged, so the caller sees the real cause.
    """
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

    sparql.setQuery(query)
    sparql.setReturnFormat(format)

    # No try/except here: the previous handler called an undefined function,
    # which replaced every real failure with a NameError.
    return sparql.query().convert()
  

In [None]:
def collectAndSaveIdentifiers():
    """Query all species identifiers, cache them as CSV and return them.

    Runs ``species_query``, extracts the ``resource`` value of every result
    binding, writes the values to ``path_ids`` as a single-column CSV
    (column ``resources``, no index) and returns them as a list.
    """
    # fetch all species resources from the endpoint
    response = runQuery(species_query, JSON)

    # keep only the identifier value of each binding
    identifier_list = [
        binding['resource']['value']
        for binding in response['results']['bindings']
    ]

    # persist the list so later runs can skip the query
    pd.DataFrame(identifier_list, columns=['resources']).to_csv(path_ids, index=False)

    return identifier_list

In [None]:
def instantiateDataframe(identifier_list):
    """Load the checkpoint dataframe or create an empty one.

    Reads the CSV at ``path_df`` if it can be loaded. Otherwise the attribute
    query is run once for the first identifier, purely to learn the result
    variable names, and an empty dataframe with those columns is returned.

    Parameters
    ----------
    identifier_list : list of str
        Species identifiers; only the first entry is used, and only when no
        checkpoint could be loaded (it must then be non-empty).

    Returns
    -------
    pandas.DataFrame
    """
    try:
        return pd.read_csv(path_df)
    except (OSError, ValueError):
        # OSError: file missing or unreadable. ValueError: covers pandas'
        # EmptyDataError / ParserError for an empty or malformed checkpoint.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not caught.
        pass

    # no usable checkpoint: derive the columns from the query header
    results = runQuery(attribute_query.format(identifier_list[0]), JSON)
    columns = results['head']['vars']
    return pd.DataFrame(columns=columns)

In [None]:
def convertResultsToDict(results):
    """Merge all result rows of one resource into a single dictionary.

    Every value is lower-cased. An attribute seen with one distinct value
    maps to that string; an attribute seen with several distinct values maps
    to a list of them in first-seen order, without duplicates.

    Parameters
    ----------
    results : dict
        SPARQL JSON result; ``results['results']['bindings']`` is a list of
        rows, each mapping a variable name to a dict with a ``'value'`` entry.

    Returns
    -------
    dict
        Variable name -> ``str`` or ``list`` of ``str``. Empty if there are
        no rows.
    """
    r_dict = {}

    for result_set in results['results']['bindings']:
        for attribute, cell in result_set.items():
            new_value = cell['value'].lower()

            # first occurrence: store the plain string
            if attribute not in r_dict:
                r_dict[attribute] = new_value
                continue

            old_value = r_dict[attribute]
            if isinstance(old_value, list):
                # already multi-valued: append unseen values only
                if new_value not in old_value:
                    old_value.append(new_value)
            elif old_value != new_value:
                # second distinct value: promote the string to a list
                r_dict[attribute] = [old_value, new_value]

    return r_dict
    


In [None]:
def appendDataframe(r_dict, df_results):
    """Append one merged result as a new row and checkpoint the dataframe.

    Parameters
    ----------
    r_dict : dict
        Merged attributes of one species (variable name -> str or list).
    df_results : pandas.DataFrame
        Rows collected so far.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the extra row appended and a continuous index.
        The same data is written to ``path_df`` (without the index).
    """
    # a list holding one dict yields exactly one row; use the constructor,
    # which is the documented entry point for a list of records
    df_new_entry = pd.DataFrame([r_dict])

    # ignore_index keeps row labels unique instead of repeating label 0
    df_results = pd.concat([df_results, df_new_entry], ignore_index=True)

    # save after every row so an interrupted run can resume from this file
    df_results.to_csv(path_df, index=False)

    return df_results

In [None]:
def appendXML(root, r_dict, mapping_dict):
    """Add one ``Species`` element to ``root`` and write the tree to disk.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root element collecting all species; modified in place.
    r_dict : dict
        Merged attributes of one species (variable name -> str or list of str).
    mapping_dict : dict
        Variable name -> XML tag name (str), or -> dict with the keys
        ``'SubElement'`` (grouping tag) and ``'SubSubElement'`` (tag used for
        each value). Variables missing from this mapping are skipped.

    Returns
    -------
    xml.etree.ElementTree.Element
        The same ``root`` that was passed in.

    Notes
    -----
    The tree built from ``root`` is written to ``path_XML`` on every call.
    """
    # for each new resource, create new species
    species = SubElement(root, 'Species')

    for key, value in r_dict.items():
        if key not in mapping_dict:
            continue

        mapping = mapping_dict[key]
        # treat single strings and lists of strings uniformly
        values = [value] if isinstance(value, str) else list(value)

        if isinstance(mapping, dict):
            # grouped attribute: one wrapper holding one child per value
            wrapper = SubElement(species, mapping['SubElement'])
            for item in values:
                SubElement(wrapper, mapping['SubSubElement']).text = item
        else:
            # flat attribute (ID, conservation status, ...): one element per
            # value, because element text must be a string to be serialisable
            for item in values:
                SubElement(species, mapping).text = item

    # write the tree of the root that was passed in, not a notebook global
    ET.ElementTree(root).write(path_XML)

    return root

## Retrieval & Processing

### Retrieval of the list of species


In [None]:
%%time

# Try reading the cached list of identifiers.
try:
    retrieved_identifiers = list(pd.read_csv(path_ids)['resources'])
except (OSError, KeyError, ValueError):
    # cache missing/unreadable (OSError), no 'resources' column (KeyError),
    # or empty/malformed CSV (pandas parse errors are ValueError subclasses)
    retrieved_identifiers = []

# A cached list of more than 100 entries is considered the full list; anything
# shorter is treated as a mock/example and replaced by a fresh query.
# The query runs outside the try block so that a failing query surfaces its
# error once instead of being silently retried.
if len(retrieved_identifiers) > 100:
    identifier_list = retrieved_identifiers
else:
    identifier_list = collectAndSaveIdentifiers()

In [None]:
# number of species identifiers available for processing (shown as cell output)
len(identifier_list)

### Retrieval of attributes for each species & processing to Dataframe and XML

In [None]:
# Resume from the existing XML file if there is one, otherwise create a root
try:
    tree = ET.parse(path_XML)
    root_target = tree.getroot()
except (OSError, ET.ParseError):
    # no XML file yet, or it cannot be read/parsed: start a new document
    root_target = ET.Element('animals_and_plants')

# Dataframe: load the CSV checkpoint or create an empty frame
df_results = instantiateDataframe(identifier_list)

In [None]:
%%time

# Resume support: skip as many identifiers as there are rows in df_results.
# Assumes one row per processed identifier, stored in identifier_list order
# (appendDataframe adds exactly one row per call) - TODO confirm for old checkpoints.
offset = len(df_results)

for identifier in identifier_list[offset:]:
    
    # retrieve the attributes of this species (identifier fills the query placeholder)
    results = runQuery(attribute_query.format(identifier), JSON)
    
    # merge all result rows of this species into a single dict
    r_dict = convertResultsToDict(results)
    
    # append the new entry to the dataframe and save the CSV checkpoint
    df_results = appendDataframe(r_dict, df_results)

    # append the species to the XML tree and rewrite the XML file
    root_target = appendXML(root_target, r_dict, mapping_dict)
    
    # wait 60 seconds before the next query
    # NOTE(review): presumably to respect the endpoint's rate limits - confirm
    time.sleep(60)
    
# print the complete XML tree to standard output
ET.dump(root_target)

In [None]:
# display the collected attribute dataframe (notebook cell output)
df_results