In [1]:
import numpy as np
import pandas as pd
import re
import csv
import json

# Importing publications

The publications from each facility are imported.\
The publications were downloaded from the databases of the various facilities.\
For HZB, an API call is used to download the publications\

ESRF/ILL: https://epn-library.esrf.fr/ \
DLS: https://publications.diamond.ac.uk/ \
HZB: https://www.helmholtz-berlin.de/pubbin/publikationen.pl \

Only the ESRF section of the code is complete; the rest are work in progress.

## ESRF

There are four types of ESRF publications. We shall designate them as follows:
<br>
Type 1: Publications with ESRF authors and describing ESRF experiments
<br>
Type 2: Publications without ESRF authors and describing ESRF experiments
<br>
Type 3: Publications with ESRF authors and not describing ESRF experiments
<br>
Type 4: Articles citing the ESRF, no ESRF authors
<br>
This 'Type' information will be included alongside the other metadata in the dataframes.

In [None]:
# Import all ESRF publication files
# All four publication types are read from the same folder; set it once here.
esrf_dir = '{insert path here}/Datasets/ESRF'


def _read_esrf_export(filename):
    """Read one tab-separated ESRF publication export located in ``esrf_dir``.

    The first line of the file is skipped (skiprows=1) and the line after it
    is used as the header row (header=0).
    """
    return pd.read_csv(esrf_dir + '/' + filename, sep='\t', encoding='utf-8', skiprows=1, header=0)


pub_esrf_type1 = _read_esrf_export('ESRF publications with ESRF authors and describing ESRF experiment (Oct 2024).csv')
# Type 2 is delivered as two files (2015 onwards / before 2015) that are stacked together.
# Both are now read from esrf_dir; previously the "bef 2015" file was the only one
# read from a path without the '{insert path here}' prefix.
pub_esrf_type2 = pd.concat(
    [
        _read_esrf_export('ESRF Publications without ESRF authors and describing an ESRF experiment (2015 onwards).csv'),
        _read_esrf_export('ESRF Publications without ESRF authors and describing an ESRF experiment (bef 2015).csv'),
    ],
    ignore_index=True,
)
pub_esrf_type3 = _read_esrf_export('ESRF Publications with ESRF authors and not describing an ESRF experiment.csv')
pub_esrf_type4 = _read_esrf_export('ESRF articles citing ESRF, no ESRF author.csv')

# Add 'Type' column
pub_esrf_type1['Type'] = 1
pub_esrf_type2['Type'] = 2
pub_esrf_type3['Type'] = 3
pub_esrf_type4['Type'] = 4

# Concatenate all four types
pub_esrf = pd.concat([pub_esrf_type1, pub_esrf_type2, pub_esrf_type3, pub_esrf_type4], ignore_index=True)

# Add 'Facility repository' column
pub_esrf['Facility repository'] = 'ESRF'

In [None]:
# Write the combined ESRF publications table to disk as CSV, without the row index
esrf_publications_out = '{insert path here}/Publications_ESRF'
pub_esrf.to_csv(esrf_publications_out, index=False)

## ILL

There are four types of ILL publications. We shall designate them as follows:
<br>
Type 1: Publications with ILL authors and describing ILL experiments
<br>
Type 2: Publications without ILL authors and describing ILL experiments
<br>
Type 3: Publications with ILL authors and not describing ILL experiments
<br>
Type 4: Articles citing the ILL, no ILL authors
<br>
This 'Type' information will be included alongside the other metadata in the dataframes.

In [None]:
# Import all ILL publication files
# Every ILL export shares the same layout: tab-separated, latin-1 encoded,
# first line skipped, next line used as the header.
ill_read_options = dict(sep='\t', encoding='latin-1', skiprows=1, header=0)

pub_ill_type1 = pd.read_csv('{insert path here}/Datasets/ILL/Publications with ILL authors and describing ILL experiment.txt', **ill_read_options)
pub_ill_type2 = pd.read_csv('{insert path here}/Datasets/ILL/Publications without ILL author and describing an ILL experiment.txt', **ill_read_options)
pub_ill_type3 = pd.read_csv('{insert path here}/Datasets/ILL/Publications with ILL authors and not describing ILL experiment.txt', **ill_read_options)
pub_ill_type4 = pd.read_csv('{insert path here}/Datasets/ILL/Articles citing the ILL, no ILL author.txt', **ill_read_options)

# Tag each frame with its publication type (1-4) in a 'Type' column
ill_frames = [pub_ill_type1, pub_ill_type2, pub_ill_type3, pub_ill_type4]
for type_label, ill_frame in enumerate(ill_frames, start=1):
    ill_frame['Type'] = type_label

# Stack the four types into one table with a fresh index
pub_ill = pd.concat(ill_frames, ignore_index=True)

# Record which facility repository the rows came from
pub_ill['Facility repository'] = 'ILL'

## DLS

In [None]:
# Import and concat all DLS publications
# One CSV per publication category; they are read in this order and stacked.
dls_csv_paths = [
    '{insert path here}/Datasets/DLS/DLS annual review highlight.csv',
    '{insert path here}/Datasets/DLS/DLS book chapter.csv',
    '{insert path here}/Datasets/DLS/DLS conference paper.csv',
    '{insert path here}/Datasets/DLS/DLS editor note.csv',
    '{insert path here}/Datasets/DLS/DLS journal paper (2002-2010).csv',
    '{insert path here}/Datasets/DLS/DLS journal paper (2011-2020).csv',
    '{insert path here}/Datasets/DLS/DLS journal paper (2021-2024).csv',
    '{insert path here}/Datasets/DLS/DLS magazine article.csv',
    '{insert path here}/Datasets/DLS/DLS poster.csv',
    '{insert path here}/Datasets/DLS/DLS report.csv',
    '{insert path here}/Datasets/DLS/DLS science highlight.csv',
    '{insert path here}/Datasets/DLS/DLS thesis.csv',
]
pub_dls = pd.concat([pd.read_csv(csv_path) for csv_path in dls_csv_paths], ignore_index=True)

# Add 'Facility repository' column
pub_dls['Facility repository'] = 'DLS'

## HZB

In [487]:
import requests
from bs4 import BeautifulSoup
import os
import time

# Base URL for the form submission.
# This must be defined: extract_metadata_by_year() posts the search form to it.
search_url = "https://www.helmholtz-berlin.de/pubbin/publikationen.pl"

# Years for which to download metadata
years = list(range(1981, 2025))  # Adjust the range of years as needed

# Directory to save metadata (created if it does not exist yet)
save_directory = 'Datasets/HZB'
os.makedirs(save_directory, exist_ok=True)

# Function to extract metadata for a specific year
def extract_metadata_by_year(year):
    """Download the HZB publication search result page for one year and save its text.

    Posts the search form to the module-level ``search_url`` and, when the
    response status is 200, writes all text of the returned HTML page to
    ``metadata_<year>.txt`` inside the module-level ``save_directory``.
    On any other status code an error line is printed and nothing is written.

    Parameters
    ----------
    year : value convertible with str()
        Year submitted in the 'jahr' form field and used in the output file name.

    Returns
    -------
    None
    """
    print(f"Searching for metadata in year {year}...")

    # Fields submitted with the search form
    search_form = {
        'jahr': str(year),  # Year
        'JOB': 'start search',  # Start the search
        'sprache':'en',
        'typ_1':'1',
        'typ_2':'1',
        'typ_3':'1',
        'typ_5':'1',
    }

    reply = requests.post(search_url, data=search_form)

    if reply.status_code == 200:
        # Keep every piece of text on the page; narrowing to the result list
        # is left to the later parsing step.
        page_text = BeautifulSoup(reply.text, 'html.parser').get_text()

        metadata_filename = os.path.join(save_directory, f'metadata_{year}.txt')
        with open(metadata_filename, 'w', encoding='utf-8') as out_file:
            out_file.write(page_text)

        print(f"Metadata for year {year} saved to {metadata_filename}")
    else:
        print(f"Error: Failed to retrieve results for year {year}")

# Download the metadata page of every configured year, one request at a time
for target_year in years:
    extract_metadata_by_year(target_year)
    time.sleep(5)  # fixed 5-second pause between consecutive requests


Searching for metadata in year 1981...
Metadata for year 1981 saved to Datasets/HZB/metadata_1981.txt
Searching for metadata in year 1982...
Metadata for year 1982 saved to Datasets/HZB/metadata_1982.txt
Searching for metadata in year 1983...
Metadata for year 1983 saved to Datasets/HZB/metadata_1983.txt
Searching for metadata in year 1984...
Metadata for year 1984 saved to Datasets/HZB/metadata_1984.txt
Searching for metadata in year 1985...
Metadata for year 1985 saved to Datasets/HZB/metadata_1985.txt
Searching for metadata in year 1986...
Metadata for year 1986 saved to Datasets/HZB/metadata_1986.txt
Searching for metadata in year 1987...
Metadata for year 1987 saved to Datasets/HZB/metadata_1987.txt
Searching for metadata in year 1988...
Metadata for year 1988 saved to Datasets/HZB/metadata_1988.txt
Searching for metadata in year 1989...
Metadata for year 1989 saved to Datasets/HZB/metadata_1989.txt
Searching for metadata in year 1990...
Metadata for year 1990 saved to Datasets/HZ

In [652]:
import re

# Function to extract relevant information from a text file
def extract_publications(file_path):
    """Return the lines found between 'select none' and 'Export as' in a text file.

    The file is read as UTF-8. The first (shortest) span enclosed by the two
    marker strings is taken, surrounding whitespace is stripped, and the
    remainder is split into lines. Blank lines inside the span are kept.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.

    Returns
    -------
    list of str
        Lines of the extracted section, or an empty list when the markers
        are not found.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        page_text = handle.read()

    # DOTALL lets '.' span newlines, so the section may cover many lines
    found = re.search(r'select none(.*?)Export as', page_text, re.DOTALL)
    if found is None:
        return []

    return found.group(1).strip().splitlines()
        

# file_path = 'Datasets/HZB/metadata_2016.txt'  # Replace with your file path
# lines = extract_publications(file_path)




In [653]:
# Loop through all HZB files (one saved page per year) and collect their lines
publications_hzb=[]
for i in range(1981,2025):
    file_path = 'Datasets/HZB/metadata_'+str(i)+'.txt'  # Replace with your file path
    lines = extract_publications(file_path)
    # extend() appends in place; "list = list + lines" re-copied the whole
    # accumulated list on every iteration
    publications_hzb.extend(lines)

In [656]:
# Extract all DOIs from publications_hzb
# Pattern: "doi:" + one whitespace character, then a run of non-whitespace
# characters that must be followed by "Open" or by the end of the entry.
doi_pattern=r"doi:\s(?P<doi>\S+)(?=Open|$)"
doi_hits=(re.search(doi_pattern,entry) for entry in publications_hzb)
# Entries without a matching DOI are skipped
hzb_doi=[hit.group('doi') for hit in doi_hits if hit]

# Get Proposals 
Now that we have the publications, the next step is to get the proposals.\
For ESRF, this can be done through the API

## ESRF Proposals
The ESRF publications imported above are linked to their corresponding proposals through the proposal number.\
We want to get a complete list of ESRF proposals and map them to the corresponding papers.\
https://icatplus.esrf.fr/api/Documents provides a list of experimental sessions, which can then be mapped to the associated proposals and then to the associated publications.\
Note that multiple experimental sessions can be associated with a single proposal.

In [2]:
import requests

In [66]:
# Import ESRF publications
# Replace the '{insert pathname}' placeholder with the folder that contains the
# Datasets directory on your machine. A machine-specific absolute path used to be
# hard-coded here, which made the cell fail everywhere else.
pub_esrf=pd.read_csv('{insert pathname}/Datasets/ESRF/Publications_ESRF')    # Import file

In [67]:
# Keep only the publications that have a DOI

has_doi = pub_esrf['DOI'].notna()
pub_esrf = pub_esrf[has_doi]

In [65]:
# Get list of experimental session documents
api_request="https://icatplus.esrf.fr/api/Documents"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
http_response=requests.get(api_request, timeout=60)
http_response.raise_for_status()
response=http_response.json()
documents_esrf=response

In [68]:
# Extract DOI, Title, and Summary (abstract) information from documents_esrf into session_esrf
# session_esrf is a list of dicts, one per document that has a DOI; dict keys are: 'doi', 'title', 'summary'

session_esrf = [
    {
        'doi': document['doi'],
        'title': document['title'],
        'summary': document['summary'],
    }
    for document in documents_esrf
    if document['doi'] is not None        # documents without a DOI are left out
]

In [69]:
# Function to fetch Proposal number information of a single session using API call

def fetch_data(session):
    """Look up the proposal number of one session and store it on the session dict.

    Requests ``https://icatplus.esrf.fr/doi/<doi>/reports`` for the session's
    DOI and copies the 'proposal' field of the first returned report into
    ``session['proposal']``.

    Parameters
    ----------
    session : dict
        Must contain the key 'doi'. It is modified in place on success.

    Returns
    -------
    dict or int
        The same ``session`` dict (now with a 'proposal' key) on success,
        or 0 when the API call does not return status 200 or returns no reports.
    """
    api_request="https://icatplus.esrf.fr/doi/"
    # timeout: avoid blocking the whole loop on a single stalled request
    response2 = requests.get(api_request+session['doi']+'/reports', timeout=60)

    # Check for valid API call status code
    if response2.status_code!=200:
        return 0        # return 0 if API call result is invalid

    reports_esrf=response2.json()
    if not reports_esrf:
        # No report available for this DOI: nothing to take the proposal from
        return 0

    session['proposal']=reports_esrf[0]['proposal']
    return session     # session with proposal number added

In [70]:
# Run fetch_data on every session in session_esrf and keep only the sessions
# for which it did not return the failure value 0 (i.e. a proposal number was added)

session_esrf_valid = [
    session
    for session in session_esrf
    if fetch_data(session) != 0
]

In [71]:
# Use Regular Expressions to match the proposal number of each session in
# session_esrf_valid against the 'Proposal number' column of pub_esrf, and store
# the DOIs of the matching publications under 'publications DOI'.

# Loop-invariant: only publications that carry a proposal number can match
pub_with_proposal = pub_esrf[pub_esrf['Proposal number'].notna()]

for session in session_esrf_valid:
    if 'proposal' in session:
        # - (?i): case-insensitive comparison
        # - re.escape: the proposal number is literal text, not a regex
        # - (?!\d): the match must not be followed by another digit, so that
        #   e.g. 'HC-591' no longer matches publications of 'HC-5914'
        # str.match anchors at the start of the field only, so a field that
        # begins with the proposal number and continues with other text still matches.
        pattern = r'(?i)' + re.escape(session['proposal']) + r'(?!\d)'
        match = pub_with_proposal['Proposal number'].str.match(pattern)
        session['publications DOI'] = pub_with_proposal.loc[match, 'DOI'].tolist()

In [None]:
# Convert session_esrf_valid to DataFrame, drop the session DOI column, apply upper case
# to all proposal names for standardisation, and drop proposal duplicates (recall that a
# single proposal can be linked to multiple sessions; the first occurrence is kept)
df = pd.DataFrame(session_esrf_valid).drop(columns='doi')
df['proposal'] = df['proposal'].str.upper()
df = df.drop_duplicates('proposal')

# Export DataFrame to json file
# The index is written explicitly: index=False is not supported together with the
# default orient, and later cells address rows by their original label (e.g. 2825).
df.to_json('{insert pathname here}/Datasets/ESRF/Proposals_ESRF',index=True)

# Get OpenAlex citation metadata for proposals
Now that we have the proposals with the DOIs of the publications linked to the proposals, we want to use the OpenAlex API call on the DOIs to get the corresponding OpenAlex IDs. This is because the topic classification model takes in OpenAlex IDs and not DOIs as input features.

In [None]:
# Load the ESRF proposals table from its JSON file
proposals_esrf_path = '{insert pathname here}/Datasets/ESRF/Proposals_ESRF'
prop_esrf = pd.read_json(proposals_esrf_path)

In [86]:
# Display the imported proposals table
prop_esrf

Unnamed: 0,title,summary,proposal,publications DOI
0,mx415,TEST,MX-415,[]
1,mx1937,Munich Crystallography BAG,MX-1937,[]
2,mx1888,IBS BAG,MX-1888,[]
3,mx1936,Cambridge MRC Block allocation,MX-1936,[10.1038/nature25462]
4,mx1944,FRANKFURT/HOMBURG BAG: ATOMIC MECHANISMS OF AC...,MX-1944,[]
...,...,...,...,...
6911,Exploring Stability of Hydrogen-filled Ices fo...,Gas hydrates have received considerable attent...,HC-5914,[]
6912,Analysis of the antiferromagnetic structure in...,The antiferromagnetic structure of the Au-Al-T...,IH-HC-4055,[]
6914,Investigation of the healing potential of AlMg...,"In industry applications, parts are subjecte...",MA-6418,[]
6915,Investigating the Strain Variation in SiGeSn,Quantum Devices are often manufactured using s...,IH-MA-550,[]


In [87]:
import requests
from pyalex import Works, Authors, Sources, Institutions, Topics, Publishers, Funders, config
# E-mail address set on pyalex's shared config object; presumably sent along with
# OpenAlex API requests to identify the caller — TODO confirm in the pyalex docs
config.email = 'terence.tan@wadham.ox.ac.uk'

In [92]:
# Function for applying the API call to each row of the proposal DataFrame prop_esrf

def get_openalex_id(row):
    """Add the OpenAlex IDs of a proposal's publications to its row.

    For every DOI in ``row['publications DOI']`` one OpenAlex works query is
    issued (filtered by DOI, selecting only the 'id' field). The 'id' of the
    first result is collected; DOIs that return no result are skipped, so the
    resulting list can be shorter than the DOI list.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'publications DOI' as an iterable of DOIs.

    Returns
    -------
    The same ``row`` with the list stored under 'openalex_ids'.
    """
    # One lookup per DOI, evaluated lazily in the original DOI order
    lookups = (
        Works().filter(doi=doi).select('id').get()
        for doi in row['publications DOI']
    )
    row['openalex_ids'] = [hits[0]['id'] for hits in lookups if hits]

    return row

In [93]:
# Split the proposals into those with at least one publication DOI and those with none

doi_list_lengths = prop_esrf['publications DOI'].str.len()
prop_esrf_non_empty = prop_esrf[doi_list_lengths != 0]
prop_esrf_empty = prop_esrf[doi_list_lengths == 0]

In [None]:
# Run get_openalex_id on every row of prop_esrf_non_empty (row-wise apply)
prop_esrf_non_empty = prop_esrf_non_empty.apply(get_openalex_id, axis='columns')

In [95]:
# Sanity check
# Every proposal should have as many OpenAlex IDs as it has publication DOIs

doi_counts = prop_esrf_non_empty['publications DOI'].apply(len)
openalex_counts = prop_esrf_non_empty['openalex_ids'].apply(len)
mismatched_proposals = prop_esrf_non_empty[doi_counts != openalex_counts]

print("Number of proposals with different number of publications DOI and OpenAlex IDs:")
print(len(mismatched_proposals))

Number of proposals with different number of publications DOI and OpenAlex IDs:
1


There is a mismatch for one of the proposals, so we investigate why this is the case

In [99]:
# Show the proposals whose number of OpenAlex IDs differs from their number of DOIs
doi_counts = prop_esrf_non_empty['publications DOI'].apply(len)
openalex_counts = prop_esrf_non_empty['openalex_ids'].apply(len)
prop_esrf_non_empty[doi_counts != openalex_counts]

Unnamed: 0,title,summary,proposal,publications DOI,openalex_ids
2825,Release of metals and dissolution of mineral f...,Pathogenic mechanisms of absestos-related dise...,LS-3076,[10.13133/2239-1002/18090],[]


This particular publication DOI (10.13133/2239-1002/18090) does not exist in the OpenAlex data repository. We shall move this proposal from prop_esrf_non_empty to prop_esrf_empty.

In [None]:
# Move Row 2825 to prop_esrf_empty
# .loc[[2825]] selects exactly that one row label. A label slice such as
# .loc[2825:2826] includes BOTH endpoints and would also copy row 2826 if present.
prop_esrf_empty=pd.concat([prop_esrf_empty,prop_esrf_non_empty.loc[[2825]]])

# Drop Row 2825 from prop_esrf_non_empty
prop_esrf_non_empty=prop_esrf_non_empty.drop(2825)

In [None]:
# Give every row of prop_esrf_empty its own empty list in the 'openalex_ids' column
prop_esrf_empty['openalex_ids'] = [[] for _ in range(len(prop_esrf_empty))]

In [None]:
# Write both proposal tables to JSON, keeping the row index:
# one file for proposals with OpenAlex IDs, one for those without

with_ids_path = '{insert pathname here}/Datasets/ESRF/Proposals_ESRF_openalexID'
without_ids_path = '{insert pathname here}/Datasets/ESRF/Proposals_ESRF_no_openalexID'

prop_esrf_non_empty.to_json(with_ids_path, index=True)
prop_esrf_empty.to_json(without_ids_path, index=True)