In [21]:
import math
import json
import requests
import os
import pandas as pd
import mdutils
import urllib.parse
from wikidataintegrator import wdi_core

In [83]:
project = "wikiproject-biodiversity"
license = "cc0,cc-by"
language = "en"

In [84]:
url = f"https://api.inaturalist.org/v1/observations/species_counts?photo_license={license}&quality_grade=research&per_page=500&project_id={project}"
print(url)

https://api.inaturalist.org/v1/observations/species_counts?photo_license=cc0,cc-by&quality_grade=research&per_page=500&project_id=wikiproject-biodiversity


In [86]:
result = json.loads(requests.get(url).text)
for page in range(1, math.ceil(result["total_results"]/500)+1):
    try:
        print(page)
        nextpageresult = json.loads(requests.get(url+"&page="+str(page)).text)
        if "results" in nextpageresult:
            for obs in nextpageresult["results"]:
                result["results"].append(obs)
    except Exception as e:
        print(e)
        print(url+"&page="+str(page))
        continue

1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [87]:
table = dict()
for record in result["results"]:
    if record["taxon"]["id"] not in table.keys():
        table[record["taxon"]["id"]] = dict()
    table[record["taxon"]["id"]]["taxon_name"] = record["taxon"]["name"]
    # for photo in record["observation_photos"]:
    #    if "photos" not in table[record["taxon"]["id"]].keys():
    #        table[record["taxon"]["id"]]["photos"] = []
    #    table[record["taxon"]["id"]]["photos"].append(photo["photo"]["url"])

to_verify = []
for taxon in table.keys():
    to_verify.append(table[taxon]['taxon_name'])


In [116]:
from urllib.parse import quote
def verify_wikidata(taxon_names):
    verified = []
    i = 1
    for chunks in [taxon_names[i:i + 50] for i in range(0, len(taxon_names), 50)]:
        names=" ".join('"{0}"'.format(w.replace("\"", "")) for w in chunks)
        query = f"""
             SELECT DISTINCT ?taxon_name (COUNT(?item) AS ?item_count) (COUNT(?article) AS ?article_count)   WHERE {{
                VALUES ?taxon_name {{{names}}} 
                {{ ?qitem wdt:P225 ?taxon_name .
                   ?item p:P31 [
                                ps:P31 wd:Q55983715 ;
                                pq:P642 ?qitem ;
                            ] .
                  OPTIONAL {{?article schema:about ?item ;
                         schema:isPartOf 	<{"https://"+language+".wikipedia.org/"}> }}
                }}
                UNION
                {{?item wdt:P225 ?taxon_name .
                  OPTIONAL {{?article schema:about ?item ;
                         schema:isPartOf 	<{"https://"+language+".wikipedia.org/"}> }}
                }}
                UNION 
                {{?item wdt:P566 ?basionym ;
                          wdt:P225 ?taxon_name .
                OPTIONAL {{?article schema:about ?basionym ;
                         schema:isPartOf 	<{"https://"+language+".wikipedia.org/"}> }}
                }}
                UNION
                {{
                  ?item wdt:P225 ?taxon_name .
                  ?wikidata_item wdt:P460 ?item .
                  OPTIONAL {{?article schema:about ?wikidata_item ;
                         schema:isPartOf 	<{"https://"+language+".wikipedia.org/"}> }}}}
           }} GROUP BY ?taxon_name  
            """
        try:
            results = wdi_core.WDFunctionsEngine.execute_sparql_query(query)
            
            for result in results["results"]["bindings"]:
                #print(result["taxon_name"], result["article_count"]["value"])
                if result["article_count"]["value"]=='0':
                    verified.append(result["taxon_name"]["value"])
        except Exception as e:
            print(url)
            print(e)
            continue
    return verified

In [113]:
len(to_verify)

6895

In [117]:
verified = verify_wikidata(to_verify)
verified

['Streptopelia senegalensis',
 'Cornu aspersum',
 'Rhinella alata',
 'Corvus monedula',
 'Veronica urticifolia',
 'Pseudofumaria lutea',
 'Prenanthes purpurea',
 'Eucnide lobata',
 'Polytrichum formosum',
 'Pentanema squarrosum',
 'Anthomyia illocata',
 'Ortilia ithra',
 'Bryonia cretica',
 'Knautia dipsacifolia',
 'Baccharis trimera',
 'Rhopalapion longirostre',
 'Solidago chilensis',
 'Hydrobates melania',
 'Chlorestes julie',
 'Petasites paradoxus',
 'Orobanche gracilis',
 'Berberis aquifolium',
 'Jacobaea alpina',
 'Valeriana tripteris',
 'Euphaea ochracea',
 'Lactophrys triqueter',
 'Streptopelia chinensis',
 'Diplacus aurantiacus',
 'Urile pelagicus',
 'Polyura bharata',
 'Austroeupatorium inulifolium',
 'Coccothraustes vespertinus',
 'Pachliopta aristolochiae',
 'Palicourea tomentosa',
 'Hydrobates homochroa',
 'Camponotus rufipes',
 'Lissachatina fulica',
 'Sibovia sagata',
 'Pontederia crassipes',
 'Serica brunnea',
 'Araniella alpica',
 'Pentanema verbascifolium',
 'Diclipter

In [118]:
len(verified)

911

In [132]:
import time
for taxon_name in verified:
    time.sleep(1)
    try:
        photos_url = f"https://api.inaturalist.org/v1/observations?photo_license={license}&project_id={project}&taxon_name={taxon_name}&order=desc&order_by=created_at"
        results = json.loads(requests.get(photos_url).text)
        for record in results["results"]:
            if record["taxon"]["id"] not in table.keys():
                continue
            for photo in record["observation_photos"]:
                if "photos" not in table[record["taxon"]["id"]].keys():
                    table[record["taxon"]["id"]]["photos"] = []
                table[record["taxon"]["id"]]["photos"].append(photo["photo"]["url"])
    except Exception as e:
        print(e)
        print(url+"&page="+str(page))
        


In [133]:
table

{48484: {'taxon_name': 'Harmonia axyridis'},
 3017: {'taxon_name': 'Columba livia'},
 4940: {'taxon_name': 'Egretta thula'},
 47219: {'taxon_name': 'Apis mellifera'},
 13858: {'taxon_name': 'Passer domesticus'},
 258813: {'taxon_name': 'Clogmia albipunctata'},
 49133: {'taxon_name': 'Vanessa atalanta'},
 4956: {'taxon_name': 'Ardea herodias'},
 51702: {'taxon_name': 'Coccinella septempunctata'},
 55925: {'taxon_name': 'Geranium robertianum'},
 144455: {'taxon_name': 'Ardea alba'},
 56065: {'taxon_name': 'Galium odoratum'},
 199840: {'taxon_name': 'Haemorhous mexicanus'},
 4954: {'taxon_name': 'Ardea cinerea'},
 3751: {'taxon_name': 'Eudocimus albus'},
 6317: {'taxon_name': 'Calypte anna'},
 133034: {'taxon_name': 'Daphne mezereum'},
 56061: {'taxon_name': 'Alliaria petiolata'},
 6930: {'taxon_name': 'Anas platyrhynchos'},
 62060: {'taxon_name': 'Palomena prasina'},
 55766: {'taxon_name': 'Euphorbia cyparissias'},
 5206: {'taxon_name': 'Buteo lineatus'},
 5212: {'taxon_name': 'Buteo jam

In [None]:
missing_en_wp_by_user = pd.DataFrame(columns=["taxon", "iNaturalist_taxon_id", "iNaturalist_observation_id", "reviewers"])

for observation in result["results"]:
    if observation["taxon"]["name"] in verified:
        missing_en_wp_by_user.loc[len(missing_en_wp_by_user)] = [observation["taxon"]["name"], observation["taxon"]["id"], observation["id"], len(observation["reviewed_by"])]
missing_en_wp_by_user

In [None]:
import shutil
shutil.rmtree('./new_articles/')
os.makedirs(os.path.dirname('./new_articles/'), exist_ok=True)

In [134]:
for taxon in table.keys():
    if table[taxon]["taxon_name"] in verified:
        if "photos" in table[taxon].keys():
            mdFile = mdutils.MdUtils(file_name='new_articles/'+table[taxon]["taxon_name"].replace(" ", "_"),title=table[taxon]["taxon_name"])
            mdFile.new_line(f"[iNaturalist taxon id: {taxon}](https://www.inaturalist.org/taxa/{taxon})")
            for image in table[taxon]["photos"]:
            mdFile.new_line(f"![{table[taxon]['taxon_name']}]({image.replace('square', 'medium')})")
        mdFile.create_md_file()

KeyError: 'photos'

In [None]:
import yaml

with open(r'_toc.yml') as file:
    # The FullLoader parameter handles the conversion from YAML
    # scalar values to Python the dictionary format
    species = yaml.load(file, Loader=yaml.FullLoader)

species["parts"][0]["chapters"] = []
for file in os.listdir('./new_articles/'):
    species["parts"][0]["chapters"].append({'file': 'new_articles/'+file})
with open(r'_toc.yml', 'w+') as file:
    documents = yaml.dump(species, file)

In [None]:
from ghp_import import ghp_import
ghp_import("_build/html", nojekyll=True, push=True, force=True)