#### This notebook downloads full-text articles from the Elsevier API using the list of DOIs obtained from CrossRef. Note that your API credentials (`apikey` and `insttoken`) must be stored in a `config.json` file.

In [2]:
from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch
import json
import pandas as pd
import random
import re

from datasets import load_dataset

In [3]:
## Load configuration
# config.json must provide the 'apikey' and 'insttoken' entries read below.
# The context manager guarantees the file is closed even if parsing fails.
with open("config.json") as con_file:
    config = json.load(con_file)

## Initialize client
client = ElsClient(config['apikey'])
client.inst_token = config['insttoken']

In [3]:
# Heuristic window sizes, in characters.
# How much text after the start of the Conclusion(s) heading is kept, so that
# a few sentences of the conclusion make it into the record.
_CONCLUSION_TAIL_CHARS = 750
# How far before the References marker the extracted text stops.
_REFERENCE_MARGIN_CHARS = 1000

# The section number of the Conclusion(s) heading varies, so any number is
# accepted. The plural form is tried first, then the singular.
_CONCLUSION_PATTERNS = (
    re.compile(r'\d+\sConclusions'),
    re.compile(r'\d+\sConclusion'),
)


def _find_heading(text, heading):
    """Return the index of the second occurrence of *heading* in *text*.

    Falls back to the first occurrence when there is only one, and returns
    -1 when *heading* does not occur at all.
    """
    # NOTE(review): the second occurrence is preferred presumably because the
    # first one belongs to an outline at the top of the text -- confirm.
    first = text.find(heading)
    if first == -1:
        return -1
    second = text.find(heading, first + 1)
    return first if second == -1 else second


def _extract_body(text):
    """Return the part of *text* between the Introduction and the end marker.

    The end marker is the Conclusion(s) heading plus a short tail or, when no
    such heading exists, a point shortly before the References marker.
    """
    # Some articles start with "Introduction", some with "1 Introduction".
    if '1 Introduction' in text:
        intro_heading = '1 Introduction'
    else:
        intro_heading = 'Introduction'
    # Without any Introduction heading, start at the beginning rather than
    # letting str.find's -1 be interpreted as "last character" by the slice.
    start = max(_find_heading(text, intro_heading), 0)

    for pattern in _CONCLUSION_PATTERNS:
        match = pattern.search(text)
        if match is not None:
            # The matched heading occurs at least once, so this is never -1.
            conclusions_start = _find_heading(text, match.group())
            end = conclusions_start + _CONCLUSION_TAIL_CHARS
            return text[start:end].strip()

    # If the conclusions section is not present, use "References [1]" or
    # "References" to locate the end of the article.
    if 'References [1]' in text:
        reference_start = text.find('References [1]')
    else:
        reference_start = _find_heading(text, 'References')
    if reference_start == -1:
        # No end marker at all: keep everything after the Introduction.
        return text[start:].strip()
    # Clamp at 0 so a marker near the top cannot turn into a negative index
    # that silently counts from the end of the text.
    end = max(reference_start - _REFERENCE_MARGIN_CHARS, 0)
    return text[start:end].strip()


def _write_record(outfile, title, abstract, text, doi):
    """Write one record to *outfile* as a single newline-terminated JSON line."""
    json.dump({'title': title, 'abstract': abstract, 'text': text, 'doi': doi}, outfile)
    # JSON Lines needs one record per line; json.dump adds no terminator.
    outfile.write("\n")


def create_dataset(doi,outfile):
    """Download the article for *doi* and append one JSON record to *outfile*.

    The record has the keys 'title', 'abstract', 'text' and 'doi'. 'text' is
    the article body from the Introduction up to a little past the start of
    the Conclusion(s) section (or shortly before the References when there is
    no such section). When the document cannot be read, or its full text is
    not a string, 'text' holds a short explanatory message instead.

    Args:
        doi: DOI of the article to fetch.
        outfile: Writable text file object that receives the record.
    """
    # Defaults so that every exit path can write a complete record.
    title = ""
    abstract = ""

    doi_doc = FullDoc(doi = doi)
    if not doi_doc.read(client):
        _write_record(outfile, title, abstract, "Read document failed.", doi)
        return

    # Metadata is best effort: a missing field must not abort the download.
    # Exception (not a bare except) keeps Ctrl-C able to stop a long run.
    try:
        title = doi_doc.title
    except Exception:
        title = ""
    try:
        abstract = doi_doc.data['coredata']['dc:description']
    except Exception:
        abstract = ""
    try:
        text = doi_doc.data['originalText']
    except Exception:
        text = None

    if isinstance(text, str):
        filtered_text = _extract_body(text)
    else:
        filtered_text = "Text is not string."
    _write_record(outfile, title, abstract, filtered_text, doi)


#### Here we read the list of DOIs; the code below downloads the full-text articles and writes them to a JSON Lines (`.jsonl`) file.

In [4]:
# Read the 2023 DOI table and keep its 'DOI' column as a plain Python list.
df_2023 = pd.read_csv('./DOI_Elsevier_magnetic/2023_Elsevier_data.csv')
DOI_list = df_2023['DOI'].tolist()

In [303]:
# Creates "./data/2023_magnetic_corpus.jsonl" with one record per DOI.
# Mode 'w' truncates the corpus, so re-running this cell starts from scratch;
# the progress log is opened in append mode and keeps earlier runs.
total = len(DOI_list)
with open('data/2023_magnetic_corpus.jsonl', 'w') as outfile:
    with open("data/2023_magnetic_corpus_progress.txt", "a") as progress:
        for count, doi in enumerate(DOI_list, start=1):
            progress.write(f"working on doi:{doi}, step {count}/{total} \n")
            # Flush right away so the log always names the DOI in flight,
            # which is what reopening the file on every step used to achieve.
            progress.flush()

            create_dataset(doi, outfile)