# Add topics to scholarly articles on Wikidata

The goal of this bot is to add main subject (P921) to scholarly articles.

The bot searches the labels of scholarly articles for a list of keywords. Each keyword is associated with an item that can be assumed to be one of the main subjects when the keyword is found in the title of the publication.

For example :

- An article with "Stainless Steel" in the title or description is assumed to have a main subject (P921) → stainless steel (Q172587)
- An article with "Ebola virus" in the title or description is assumed to have a main subject (P921) → Ebola virus (Q10538943)

The metadata of scholarly articles on Wikidata is virtually impossible to maintain by hand, because the rate at which these articles are created exceeds the capacity and willingness of the community. Automation of this kind is therefore the only way we can maintain this data.

The code is released under CC BY SA. Author is Thibdx.
Feel free to fork and adapt the code if you know what you are doing and already have a bot account.

More on : https://www.wikidata.org/wiki/Wikidata:AddScholarTopics_Bot

*NB : sorry for the heavy 'string {}'.format(var) instead of f'string {var}', the bot has been adapted to run on older Python on Toolforge.*

In [None]:
import pywikibot
from pywikibot import pagegenerators as pg
from bs4 import BeautifulSoup
import re
import datetime


# Connection to Wikidata. The data repository is the object claims and items are built against.
wikidata_site = pywikibot.Site("wikidata", "wikidata")
wikidata_repo = wikidata_site.data_repository()

# Global variables. Shall not be changed.
Pid = 'P921'  # Property ID of "main subject", the property this bot sets
bot_page = 'Wikidata:AddScholarTopics_Script' # Wiki page where the user-contributed dicts are stored
topics_dict_id = 'topics_dict' # The id of the <pre> block that contains the topics dict
exclusions_dict_id = 'exclusions_dict' # The id of the <pre> block that contains the exclusions dict
regex_dict_id = 'regex_dict' # The id of the <pre> block that contains the regex dict
type_filter = 'haswbstatement:P31=Q13442814' # Search filter that keeps only scholarly articles. It may be widened to other scientific publications in the future.

In [None]:
# We want to retrieve topics_dict, exclusions_dict and regex_dict from the bot's page on Wikidata (see bot_page above).
# We use BeautifulSoup to locate the text: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

def get_dict(bot_page,dict_id):

    '''
    This function gathers one of the user-contributed dicts from the Wikidata page @bot_page.

    @bot_page is a string : the title of the wiki page holding the dicts
    @dict_id is a string : the id attribute of the <pre> tag that wraps the wanted dict

    Returns the parsed dict, or False when the page does not exist, when no
    <pre id=@dict_id> tag is found, or when the content is not a valid dict literal.
    Callers must test the result with "is False", since an empty dict is a valid result.

    Here is the structure of the topic dict :
    It reads : Items having low dose naltrexone in the title are assumed to have Q5259325 as a main topic.

    topics_dict = {
        'low dose naltrexone':'Q5259325',
        'Behçet disease':'Q911427',
        'Ehlers Danlos':'Q1141499',
        'fibromyalgia':'Q540571',
    }

    Here is the structure of an exclusion list :
    It reads : Scholarly article Q46788624 does not have Q797668 or Q5384031 as main topics.
    This may be useful in very specific cases.

    exclusions_dict = {
        'Q46788624':['Q797668','Q5384031']
    }
    '''

    # Imported locally so that this function does not rely on a file-level import being added.
    import ast

    # The dicts are user contributed in this Wikidata page.
    # Always verify the consistency of new data added to the dicts before launching the bot.
    page = pywikibot.Page(wikidata_site, bot_page)

    print('\nTrying to get the dict at id = \"{dict_id}\" in {bot_page}'.format(dict_id = dict_id, bot_page = bot_page))

    # If the page is not found, it may have been moved or the variable may be set to a wrong page.
    if not page.exists():
        print('\n!!!!!!!!!!!! \nCould not find {}\n'.format(bot_page))
        return False

    soup = BeautifulSoup(page.text,'lxml')

    # The dicts are located inside <pre id=...> </pre> tags.
    # soup.find returns None when the tag is missing : double check the page's wikicode in that case.
    pre_tag = soup.find('pre' , attrs = { 'id' : dict_id })
    if pre_tag is None:
        print('\n!!!!!!!!!!!! \nFound {} but not the dict\n'.format(bot_page))
        return False

    dict_txt = pre_tag.text
    print('\nFound bot\'s page with content :\n{}'.format(dict_txt))

    # SECURITY : the page can be edited by anyone, so its content must never go through eval().
    # ast.literal_eval only accepts Python literals (dicts, lists, strings, numbers...) and
    # cannot execute code. The text is stripped because older Python versions reject
    # leading indentation in literal_eval.
    try:
        parsed = ast.literal_eval(dict_txt.strip())
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        print('\n!!!!!!!!!!!! \nFound {} & dict, but syntax is invalid...\n'.format(bot_page))
        return False

    # A valid literal that is not a dict (list, string...) would break the callers later on.
    if not isinstance(parsed, dict):
        print('\n!!!!!!!!!!!! \nFound {} & dict, but syntax is invalid...\n'.format(bot_page))
        return False

    return parsed

#Test
#print(get_dict(bot_page,topics_dict_id))
#print(get_dict(bot_page,exclusions_dict_id))

In [None]:
# The following query gathers scholarly articles that match a keyword.
# Only articles that do not yet have the associated main topic are gathered.

def get_articles(keyword,target_Qid,Pid,type_filter):

    """
    This function builds a generator over the items that :
        - are returned by the Wikidata search for "@keyword @type_filter"
        - and where @Pid is not yet set to @target_Qid

    @keyword is a string (ie. fibromyalgia)
    @target_Qid is a string (ie. Q12345)
    @Pid is a string (ie. P921)
    @type_filter is a string appended to the search (ie. haswbstatement:P31=Q13442814)

    Returns the generator produced by pg.WikidataSPARQLPageGenerator.
    """

    # The keyword is user contributed and ends up inside a double-quoted SPARQL string literal.
    # Backslashes and double quotes are escaped so that such a keyword can neither break
    # the query nor change its meaning. Backslashes must be escaped first.
    escaped_keyword = keyword.replace('\\', '\\\\').replace('"', '\\"')

    query = '''
    SELECT DISTINCT ?item ?itemLabel 
      WHERE {{
        hint:Query hint:optimizer "None".
        SERVICE wikibase:mwapi {{
        bd:serviceParam wikibase:api "Search";
                      wikibase:endpoint "www.wikidata.org";
                      mwapi:srsearch "{keyword} {type_filter}".
        ?title wikibase:apiOutput mwapi:title.
      }}
    BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?item)
    FILTER NOT EXISTS {{ ?item wdt:{Pid} wd:{target_Qid}. }}
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }} 
    }}
    '''.format(
        keyword = escaped_keyword,
        type_filter = type_filter,
        Pid = Pid,
        target_Qid = target_Qid
    )

    # The message shows the keyword as the user wrote it, not its escaped form.
    print('Getting articles with \"{keyword}\" in the label but {Pid} not yet settled to {target_Qid}...'.format(
        keyword = keyword,
        Pid = Pid,
        target_Qid = target_Qid
    ))

    generator = pg.WikidataSPARQLPageGenerator(query, site=wikidata_site)

    print('Generator generated...')

    return generator
    
#Testing        
#test_generator = get_articles('Ehlers Danlos','Q1141499',Pid,type_filter)
#for item in test_generator :
#    print(str(item) + ' - ' + str(item.get()['labels']['en']))

In [None]:
def is_not_excluded(item,target_Qid,exclusions_dict):
    '''
    Tell whether @target_Qid may be added to @item.

    @item is an object exposing its Qid as item.id
    @target_Qid is a string (ie. Q12345)
    @exclusions_dict maps an article Qid to the list of Qids that must not be added to it

    Returns False (and prints a notice) when the item/target pair is listed
    in @exclusions_dict, True otherwise.
    '''
    article_Qid = item.id

    # Articles without any entry have nothing excluded.
    if article_Qid not in exclusions_dict:
        return True

    pair_is_listed = any(excluded == target_Qid for excluded in exclusions_dict[article_Qid])
    if pair_is_listed:
        print('{target_Qid} is in exclusions dict and will not be added to {item_Qid}.'.format(target_Qid = target_Qid, item_Qid = article_Qid))
        return False

    return True
        

In [None]:
def get_item_label(item,lang = 'en'):
    '''
    Return the label of @item in language @lang.

    @item is an object exposing get() and a labels mapping
    @lang is a language code string (default : en)

    When the item has no label in @lang, the placeholder
    string 'No label in <lang>' is returned instead.
    '''
    # The item data must be loaded with get() before the labels can be read.
    item.get()

    labels = item.labels
    if lang not in labels:
        return 'No label in {}'.format(lang)
    return labels[lang]

In [None]:
# import re # for testing ony this cell

def error_shield(item_label,keyword,regex_dict):
    '''
    Last check before a claim is set : confirm that @keyword really is in @item_label.

    @item_label is a string : the label of the article
    @keyword is a string : the keyword the article was searched with
    @regex_dict maps a lowercased keyword to a list of regular expressions

    When the lowercased @keyword has an entry in @regex_dict, the label is accepted
    as soon as ANY of the listed regexes matches it (case-insensitive).
    Otherwise the lowercased keyword must be a plain substring of the lowercased label.

    Returns True when the label is accepted, False otherwise.
    '''
    keyword = keyword.lower()

    if keyword in regex_dict:
        # Every regex is an alternative spelling, so all of them are tried, not only the first.
        # re.IGNORECASE is used instead of lowercasing the pattern, because lowercasing
        # would turn escapes such as \B or \S into \b or \s and change their meaning.
        return any(
            re.search(regex, item_label, re.IGNORECASE) is not None
            for regex in regex_dict[keyword]
        )

    # The keyword is plain text and not a pattern : a substring test keeps characters
    # such as + ( ) . from being read as regex syntax.
    return keyword in item_label.lower()

# Test
# item_label = 'Il y a des vitamines dans la bananne'
# keyword = 'Vitamin B'
# regex_dict = { 'vitamin b':['vitamine?s?[ \-_]?b[1-9]*','b[1-9]*[ \-_]?vitamine?s?'] }
# error_shield(item_label,keyword,regex_dict)


In [None]:

def set_claim_item(item,Pid,target_Qid,log_file):

    """
    This is a generic function to set a claim when the property type is item.

    @item is a pywikibot.ItemPage object
    @Pid is a string (ie: P921)
    @target_Qid is a string (ie: Q12345)
    @log_file is an opened, writable text file : one line is appended per claim

    Returns the pywikibot.Claim that was added to @item.
    """

    claim = pywikibot.Claim(wikidata_repo, Pid)

    target_item = pywikibot.ItemPage(wikidata_repo, target_Qid, 0)
    claim.setTarget(target_item)

    # Each label is looked up once and shared by the edit summary and the log entry.
    log_dict = {
        'item_id' : str(item.id),
        'item_label' : str(get_item_label(item)),
        'property_id' : str(Pid),
        'target_item_id' : str(target_item.id),
        'target_item_label' : str(get_item_label(target_item)),
        'time' : str(datetime.datetime.now()),
    }

    # str.format ignores the keys the template does not use ('time').
    message = 'Item {item_id} ( {item_label} ) > setting {property_id} to {target_item_id} ( {target_item_label} )'.format(**log_dict)

    # The message is also used as the edit summary on Wikidata.
    item.addClaim(claim, summary=message)

    print(message)

    # The log line is written only once the claim has been added.
    log_file.write('\n{}'.format(str(log_dict)))

    return claim

#Testing this on Wikidata sandbox
#wikidata_sandbox_item = pywikibot.ItemPage(wikidata_repo, 'Q4115189', 0)
#sandbox_id = 'Q842193'
#set_claim_item(wikidata_sandbox_item,Pid,sandbox_id,log_file)


In [None]:
def set_claims_for_generator(generator,keyword,Pid,target_Qid,exclusions_dict,regex_dict,log_file):

    """
    Set a claim with @Pid set to @target_Qid for the items in the @generator
    that are not excluded and whose label really contains the keyword.

    @generator : pg.WikidataSPARQLPageGenerator object
    @keyword : string, the keyword the items were searched with
    @Pid : string (ex: P921)
    @target_Qid : string (ex: Q12345)
    @exclusions_dict : dict, item Qid -> list of target Qids that must not be added
    @regex_dict : dict, lowercased keyword -> list of regexes accepted in the label
    @log_file : opened, writable text file passed on to set_claim_item

    Returns the number of claims that were actually set.
    """

    # Only the items that receive a claim are counted, so that the summary
    # message and the returned value match what was really done.
    claims_set = 0
    for item in generator:
        #print(item) #uncomment this for debug only
        if not is_not_excluded(item,target_Qid,exclusions_dict) :
            print('{} is in exclusions dict and will not be added.'.format(target_Qid))
            continue

        item_label = get_item_label(item)
        if not error_shield(item_label,keyword,regex_dict) :
            print('The keyword {} or its defined regexs are not in the label. Skipping...'.format(keyword))
            continue

        set_claim_item(item,Pid,target_Qid,log_file)
        claims_set += 1

    print('{i} claims were setteled with {Pid} to {target_Qid}'.format(i = claims_set, Pid = Pid, target_Qid = target_Qid))
    return claims_set
    #beware, after a for loop, a generator is not usable anymore.
    
# Testing (please interrupt the operation before the end if many items are to be settled)
# keyword = 'Ehlers Danlos'
# target_Qid = 'Q1141499'
# generator = get_articles(keyword,target_Qid,Pid)
# set_claims_for_generator(generator,Pid,target_Qid,exclusion_dict)

In [None]:
def add_scholar_topics(Pid,bot_page,type_filter):

    """
    This is the main function of this script.
    It iterates over the topics dict + over the SPARQL results and adds the main topic to each article.

    @Pid is a string : the property to set (ie. P921)
    @bot_page is a string : the wiki page holding the topics, exclusions and regex dicts
    @type_filter is a string : the search filter passed on to get_articles

    Returns the number of keyword/topic pairs that were processed,
    or 0 when one of the dicts could not be loaded (nothing is edited in that case).
    """
    topics_dict = get_dict(bot_page,topics_dict_id)
    exclusions_dict = get_dict(bot_page,exclusions_dict_id)
    regex_dict = get_dict(bot_page,regex_dict_id)

    # get_dict reports a failure with False. The test is "is False" because an empty dict is valid.
    # The bot stops rather than running without its exclusions or regex safeguards.
    if topics_dict is False or exclusions_dict is False or regex_dict is False:
        print('\n!!!!!!!!!!!! \nCould not load every dict from {}. Nothing was done.\n'.format(bot_page))
        return 0

    i=0
    # "with" closes the log even when an edit or a query raises.
    # Labels can hold any Unicode character, hence the explicit encoding.
    with open("addScholarTopics.log", "a", encoding="utf-8") as log_file:
        for keyword, target_Qid in topics_dict.items():
            i+=1
            generator = get_articles(keyword,target_Qid,Pid,type_filter)

            set_claims_for_generator(generator,keyword,Pid,target_Qid,exclusions_dict,regex_dict,log_file)

    print('\nDone ! {} keyword/topic pairs were checked and settled...'.format(i))

    return i

In [None]:
# OK... Let's run the script!
# This call loads the dicts from bot_page and adds Pid claims on Wikidata (real edits) for every matching article.

add_scholar_topics(Pid,bot_page,type_filter)