In [1]:
# Import the OpenAlex model. Make sure the predictor.py file is in the same directory. Also, make sure to go into the file itself to set the path to the correct directories for the model weights etc.
import predictor

Loaded target vocab
Loaded inverse target vocab
Loaded citation features vocab.
Loaded gold citation mapping
Loaded gold citation L1
Loaded non-gold citation L1
Model initialized


In [2]:
import pandas as pd
import numpy as np
import json

# Classifying the proposals

Now that we have initialised the model, we import the proposal datafiles we got from the 'Data Processing' Jupyter notebook.\
We have to transform the proposal DataFrames to the specified format for input into the model.\
For proposals with no referenced works, we will set the input features as an empty list.\
Note that the model's journal name field (`journal_display_name`) is filled from the proposal `subject` column, which is blank in the rows shown below.

In [34]:
# <--- INSERT YOUR FILEPATH HERE: location of the combined ESRF proposals JSON file
filepath = '/Users/fdp54928/Library/CloudStorage/OneDrive-Nexus365/GitHub Repositories/synchrotron-proposals-topic-classification/Datasets/ESRF/Checkpoint_data/Proposals_ESRF_combined.json'

# Load the proposals into a DataFrame
df_combined = pd.read_json(filepath)

In [35]:
# Display the loaded proposals table as a visual check of its columns and row count.
df_combined

Unnamed: 0,proposal,summary,title,subject,instrument,experiment_session_doi,pdf_document_name,referenced_works_doi,referenced_works_openalex_ids,publications_doi,publications_openalex_ids,combined_openalex_ids
0,A01-2-1247,The proposal falls within the general research...,Crystal structure of multiferroic KNi_1-xCo_xP...,,[BM01],[10.15151/ESRF-ES-670011307],[],[],[],[],[],[]
1,A01-2-1248,Having achieved successful results with metal-...,Metal-organic and covalent organic polyhedra f...,,[BM01],[10.15151/ESRF-ES-670011305],[],[],[],[],[],[]
2,A01-2-1249,The overall aim of the project is deciphering ...,The redox structure of haem- and flavoproteins...,,[BM01],[10.15151/ESRF-ES-670011413],[98064_A.pdf],[],[],[],[],[]
3,A01-2-1254,"In this study, we will investigate structural ...","Nickelates – phase transitions, distortions an...",,[BM01],[10.15151/ESRF-ES-748027553],[],[],[],[],[],[]
4,A01-2-1255,We developed a crystallization strategy that p...,Understanding the structure of two-dimensional...,,[BM01],[10.15151/ESRF-ES-670011338],[],[],[],[10.1038/s41563-023-01669-z],[https://openalex.org/W4386923995],[https://openalex.org/W4386923995]
...,...,...,...,...,...,...,...,...,...,...,...,...
5379,XA-11,Steel slag is one of the most common wastes pr...,Time-resolved X-ray Tomography imaging and Ram...,,[BM05],[10.15151/ESRF-ES-1647778275],[],[],[],[],[],[]
5380,XA-5,The aim of this proposal is to elucidate the i...,ReMade Proposal\r\nImpact of metals blend and ...,,[ID22],[10.15151/ESRF-ES-1424924468],[],[],[],[],[],[]
5381,XA-6,Recycling spent Li-ion batteries has attracted...,ReMade Proposal\r\nOperando investigation stru...,,[ID31],[10.15151/ESRF-ES-1436201044],[],[],[],[],[],[]
5382,XA-7,This proposals combines our expertise in metal...,ReMade Proposal\r\nTuning the sorption propert...,,[ID31],[10.15151/ESRF-ES-1352264747],[],[],[],[],[],[]


## Topic prediction with just PDF metadata
These predictions are possible for new proposals that have just been submitted. This is the most representative scenario for a live topic-prediction service at the beamtime application stage, since only the title, abstract, subject, and PDF metadata are available at that point.

In [36]:
# Map the proposal columns onto the field names used for the model input.
# Here the PDF-derived referenced works act as the model's `referenced_works`.
df1 = df_combined.rename(
    columns={
        'summary': 'abstract_inverted_index',
        'referenced_works_openalex_ids': 'referenced_works',
        'subject': 'journal_display_name',
    }
)

In [37]:
# Replace all null values in the title and abstract columns with a blank string.
# The filled frame is assigned back instead of calling `fillna(..., inplace=True)` on
# `df1[col]`: that form is chained in-place assignment, which emits a warning on
# recent pandas 2.x and leaves `df1` unchanged under Copy-on-Write (pandas 3.0).
df1 = df1.fillna({'abstract_inverted_index': '', 'title': ''})

# Build the model input batch: one dict per proposal with the four input fields
batch = df1[['title', 'abstract_inverted_index', 'referenced_works', 'journal_display_name']].to_dict(orient='records')

# Set the 'inverted' input feature on every record.
# NOTE(review): presumably False tells the predictor the abstract is plain text rather
# than an inverted index - confirm in predictor.py.
# The loop variable is named `record` so the builtin `dict` is not shadowed.
for record in batch:
    record['inverted'] = False

Now that we have formatted the input data correctly, we can proceed with topic classification.

In [38]:
# Classify the proposals using their PDF referenced works.
# The model is called with one proposal at a time; each response is a JSON string
# whose first element is kept as that proposal's prediction.
results = []
for record in batch:
    raw_response = predictor.full_model_prediction([record])
    prediction = json.loads(raw_response)[0]
    results.append(prediction)

In [39]:
# Add proposal metadata and the topic predictions to each record of the batch.

# Output key -> source column in df1. Insertion order is the key order in the exported JSON.
metadata_columns = {
    'proposal_number': 'proposal',
    'instrument': 'instrument',
    'experiment_session_doi': 'experiment_session_doi',
    'pdf_document_name': 'pdf_document_name',
    'referenced_works_doi': 'referenced_works_doi',
}
# Convert each column to a list once, up front. Calling `.to_list()` inside the loop
# rebuilt the full list on every iteration (quadratic in the number of proposals).
metadata_values = {key: df1[column].to_list() for key, column in metadata_columns.items()}

n = len(batch)
# Guard against stale kernel state: there must be exactly one prediction per record.
assert len(results) == n, f"expected {n} predictions, got {len(results)}"
for i in range(n):
    for key, values in metadata_values.items():
        batch[i][key] = values[i]
    batch[i]['topic_predictions_pdf_metadata_only'] = results[i]



In [40]:
# Write the annotated records to disk as indented JSON
filepath = '/Users/fdp54928/Library/CloudStorage/OneDrive-Nexus365/GitHub Repositories/synchrotron-proposals-topic-classification/Datasets/ESRF/Predictions/Proposals_ESRF_Predictions_PDF_metadata_only.json'
with open(filepath, mode='w') as out_file:
    json.dump(batch, out_file, indent=2)

## Topic prediction with just publications

In [41]:
# Map the proposal columns onto the field names used for the model input.
# Here the publications' OpenAlex IDs act as the model's `referenced_works`.
df2 = df_combined.rename(
    columns={
        'summary': 'abstract_inverted_index',
        'publications_openalex_ids': 'referenced_works',
        'subject': 'journal_display_name',
    }
)

In [42]:
# Replace all null values in the title and abstract columns with a blank string.
# The filled frame is assigned back instead of calling `fillna(..., inplace=True)` on
# `df2[col]`: that form is chained in-place assignment, which emits a warning on
# recent pandas 2.x and leaves `df2` unchanged under Copy-on-Write (pandas 3.0).
df2 = df2.fillna({'abstract_inverted_index': '', 'title': ''})

# Build the model input batch: one dict per proposal with the four input fields
batch2 = df2[['title', 'abstract_inverted_index', 'referenced_works', 'journal_display_name']].to_dict(orient='records')

# Set the 'inverted' input feature on every record.
# NOTE(review): presumably False tells the predictor the abstract is plain text rather
# than an inverted index - confirm in predictor.py.
# The loop variable is named `record` so the builtin `dict` is not shadowed.
for record in batch2:
    record['inverted'] = False

In [43]:
# Classify the proposals using their publications.
# The model is called with one proposal at a time; each response is a JSON string
# whose first element is kept as that proposal's prediction.
results2 = []
for record in batch2:
    raw_response = predictor.full_model_prediction([record])
    prediction = json.loads(raw_response)[0]
    results2.append(prediction)

In [44]:
# Add proposal metadata and the topic predictions to each record of the batch.

# Output key -> source column in df2. Insertion order is the key order in the exported JSON.
metadata_columns = {
    'proposal_number': 'proposal',
    'instrument': 'instrument',
    'experiment_session_doi': 'experiment_session_doi',
    'pdf_document_name': 'pdf_document_name',
    'publications_doi': 'publications_doi',
}
# Convert each column to a list once, up front. Calling `.to_list()` inside the loop
# rebuilt the full list on every iteration (quadratic in the number of proposals).
metadata_values = {key: df2[column].to_list() for key, column in metadata_columns.items()}

n = len(batch2)
# Guard against stale kernel state: there must be exactly one prediction per record.
assert len(results2) == n, f"expected {n} predictions, got {len(results2)}"
for i in range(n):
    for key, values in metadata_values.items():
        batch2[i][key] = values[i]
    batch2[i]['topic_predictions_publications_only'] = results2[i]

In [45]:
# Write the annotated records to disk as indented JSON
filepath = '/Users/fdp54928/Library/CloudStorage/OneDrive-Nexus365/GitHub Repositories/synchrotron-proposals-topic-classification/Datasets/ESRF/Predictions/Proposals_ESRF_Predictions_publications_only.json'
with open(filepath, mode='w') as out_file:
    json.dump(batch2, out_file, indent=2)

## Topic prediction with both PDF metadata and publications
These predictions are only possible for the ESRF once users have completed their experiments, published the results, and informed the ESRF of those publications so that they can be recorded.

In [46]:
# Map the proposal columns onto the field names used for the model input.
# Here the combined OpenAlex IDs act as the model's `referenced_works`.
df3 = df_combined.rename(
    columns={
        'summary': 'abstract_inverted_index',
        'combined_openalex_ids': 'referenced_works',
        'subject': 'journal_display_name',
    }
)

In [47]:
# Replace all null values in the title and abstract columns with a blank string.
# The filled frame is assigned back instead of calling `fillna(..., inplace=True)` on
# `df3[col]`: that form is chained in-place assignment, which emits a warning on
# recent pandas 2.x and leaves `df3` unchanged under Copy-on-Write (pandas 3.0).
df3 = df3.fillna({'abstract_inverted_index': '', 'title': ''})

# Build the model input batch: one dict per proposal with the four input fields
batch3 = df3[['title', 'abstract_inverted_index', 'referenced_works', 'journal_display_name']].to_dict(orient='records')

# Set the 'inverted' input feature on every record.
# NOTE(review): presumably False tells the predictor the abstract is plain text rather
# than an inverted index - confirm in predictor.py.
# The loop variable is named `record` so the builtin `dict` is not shadowed.
for record in batch3:
    record['inverted'] = False

In [48]:
# Classify the proposals using both PDF metadata and publications.
# The model is called with one proposal at a time; each response is a JSON string
# whose first element is kept as that proposal's prediction.
results3 = []
for record in batch3:
    raw_response = predictor.full_model_prediction([record])
    prediction = json.loads(raw_response)[0]
    results3.append(prediction)

In [51]:
# Add proposal metadata and all three sets of topic predictions to each record.
# Requires `batch` and `batch2` to already carry their prediction keys, i.e. the
# two earlier annotation cells must have been run first.

# Output key -> source column in df3. Insertion order is the key order in the exported JSON.
metadata_columns = {
    'proposal_number': 'proposal',
    'instrument': 'instrument',
    'experiment_session_doi': 'experiment_session_doi',
    'pdf_document_name': 'pdf_document_name',
    'referenced_works_doi': 'referenced_works_doi',
    'referenced_works_openalex_ids': 'referenced_works_openalex_ids',
    'publications_doi': 'publications_doi',
    'publications_openalex_ids': 'publications_openalex_ids',
    # df3 holds the combined IDs under the renamed 'referenced_works' column
    'combined_openalex_ids': 'referenced_works',
}
# Convert each column to a list once, up front. Calling `.to_list()` inside the loop
# rebuilt the full list on every iteration (quadratic in the number of proposals).
metadata_values = {key: df3[column].to_list() for key, column in metadata_columns.items()}

n = len(batch3)
# Guard against stale kernel state: every list indexed below must cover all n records.
assert len(results3) == n, f"expected {n} predictions, got {len(results3)}"
assert len(batch) == n and len(batch2) == n, "batch, batch2 and batch3 must have the same length"
for i in range(n):
    # Remove the 'referenced_works' key to avoid duplication (it is re-added below as
    # 'combined_openalex_ids'). `pop` with a default keeps this cell re-runnable,
    # whereas `del` raised KeyError on a second execution.
    batch3[i].pop('referenced_works', None)
    for key, values in metadata_values.items():
        batch3[i][key] = values[i]
    batch3[i]['topic_predictions_pdf_metadata_only'] = batch[i]['topic_predictions_pdf_metadata_only']
    batch3[i]['topic_predictions_publications_only'] = batch2[i]['topic_predictions_publications_only']
    batch3[i]['topic_predictions_combined'] = results3[i]

In [52]:
# Write the annotated records to disk as indented JSON
filepath = '/Users/fdp54928/Library/CloudStorage/OneDrive-Nexus365/GitHub Repositories/synchrotron-proposals-topic-classification/Datasets/ESRF/Predictions/Proposals_ESRF_Predictions_combined_all.json'
with open(filepath, mode='w') as out_file:
    json.dump(batch3, out_file, indent=2)