This script allow to get all the papers in openAlex that are including one or more keywords (in a list) in their title or abstract.
The keywords are considered here by ngrams.

In [1]:
import requests
import pandas as pd
from tqdm import tqdm

# open keywords database (AI)

In [7]:
AIkwclass = pd.read_csv('/../classification.txt',sep=';',names=['keyword','class'])
AIkwclass

Unnamed: 0,keyword,class
0,evolutionary algorithm,optimization
1,simulated annealing,optimization
2,hierarchical clustering,cluster analysis
3,principal component analysis,cluster analysis
4,unsupervised learning,machine learning
...,...,...
527,multi swarm optimization,optimization
528,fast and frugal trees,AI general
529,chess playing computer,robotics
530,bias variance tradeoff,optimization


# scrape papers for one keyword/ngram

In [2]:
def add_request_abstract(string, ngram):
    if string[-1] != '=':
        return string + ',abstract.search:' + ngram.replace(' ','%')
    else:
        return string + 'abstract.search:' + ngram.replace(' ','%')
    
def add_request_title(string, ngram):
    if string[-1] != '=':
        return string + ',title.search:' + ngram.replace(' ','%')
    else:
        return string + 'title.search:' + ngram.replace(' ','%')
    
def add_request_fulltext(string, ngram):
    if string[-1] != '=':
        return string + ',fulltext.search:' + ngram.replace(' ','%')
    else:
        return string + 'fulltext.search:' + ngram.replace(' ','%')
    
def add_publicationYear(string, year):
    if string[-1] != '=':
        return string + f',publication_year:{year}'
    else:
        return string + f'publication_year:{year}'
    
# from the key 'abstract_inverted_index' key per paper
def rebuild_abstract(word_dict):
    if word_dict != None:
        # search the highest index
        max_index = 0
        for ind in word_dict.values():
            local_max = max(ind)
            if local_max > max_index:
                max_index = local_max
        abstract = [0]*(max_index+1)
        for i in word_dict.keys():
            for j in word_dict[i]:
                abstract[j] = i
        abstract = ' '.join(abstract)
        return abstract
    else:
        return('')

In [3]:
is_work = 'https://api.openalex.org/works?'
authentification = 'mailto=myemail@adress.com'
# need to have a doi, an abstract and at least one reference
filtering = 'filter=has_doi:true,has_abstract:true,has_references:true'
results_per_page = 'per-page=100'
cursor = 'cursor=*'

In [4]:
baseURL = '&'.join([is_work,
                    authentification,
                    results_per_page,
                    cursor,
                    filtering])
baseURL = add_request_abstract(baseURL, 'chess playing computer')
baseURL = add_request_title(baseURL, 'chess playing computer')
baseURL = add_publicationYear(baseURL, 2020)

In [5]:
baseURL

'https://api.openalex.org/works?&mailto=myemail@adress.com&per-page=100&cursor=*&filter=has_doi:true,has_abstract:true,has_references:true,abstract.search:chess%playing%computer,title.search:chess%playing%computer,publication_year:2020'

In [6]:
#request for one keyword
cc = requests.get(baseURL).json()

In [9]:
ccdf = pd.DataFrame(cc['results'])

In [16]:
ccdf['abstract_inverted_index'][0]

{'It': [0],
 'is': [1],
 'been': [2, 22],
 '46': [3],
 'years': [4],
 'since': [5],
 'the': [6, 44, 72, 78],
 'first': [7],
 'computer': [8],
 'chess': [9],
 'world': [10],
 'championship': [11],
 'was': [12],
 'held': [13],
 'in': [14, 16],
 'Stockholm': [15],
 '1974,': [17],
 'and': [18],
 'there': [19],
 'have': [20, 75],
 'now': [21],
 '24': [23],
 'more': [24],
 'with': [25],
 'over': [26, 77],
 '700': [27],
 'games': [28],
 'played.': [29],
 'This': [30],
 'paper': [31],
 'investigates': [32],
 'a': [33, 38],
 'number': [34],
 'of': [35, 46, 71],
 'questions': [36],
 'that': [37],
 'curious': [39],
 'soul': [40],
 'might': [41],
 'ask': [42],
 'regarding': [43],
 'results': [45],
 'these': [47],
 'games,': [48],
 'especially': [49],
 'how': [50, 69],
 'they': [51],
 'ended.': [52],
 'The': [53],
 'data': [54, 67],
 'provided': [55],
 'herein': [56],
 'attempts': [57],
 'to': [58],
 'answer': [59],
 'several': [60],
 'related': [61],
 'questions.': [62],
 'In': [63],
 'addition,':

In [15]:
rebuild_abstract(ccdf['abstract_inverted_index'][0])

'It is been 46 years since the first computer chess world championship was held in Stockholm in 1974, and there have now been 24 more with over 700 games played. This paper investigates a number of questions that a curious soul might ask regarding the results of these games, especially how they ended. The data provided herein attempts to answer several related questions. In addition, it presents data showing how ratings of the top engines have increased over the years.'

# scrape papers for mutiple keywords/ngrams

In [51]:
is_work = 'https://api.openalex.org/works?'
authentification = 'mailto=myemail@adress.com'
# need to have a doi, an abstract and at least one reference
filtering = 'filter=has_doi:true,has_abstract:true,has_references:true'
results_per_page = 'per-page=100'

In [68]:
kw_list = [i[1:] for i in AIkwclass['keyword']]

In [81]:
papers = []
for kw in tqdm(kw_list):
    cursor = '*'
    l = 1
    while (cursor != None) & (l != 0):
        baseURL = '&'.join([is_work,
                        authentification,
                        results_per_page,
                        f'cursor={cursor}',
                        filtering])
        baseURL = add_request_abstract(baseURL, kw)
        baseURL = add_request_title(baseURL, kw)
        #baseURL = add_request_fulltext(baseURL, kw)
        response = requests.get(baseURL).json()
        papers.extend(response['results'])
        cursor = response['meta']['next_cursor']
        l = len(response['results'])
    # break # for one keyword testing

  0%|          | 0/532 [01:36<?, ?it/s]
