In [126]:
import numpy as np
import pandas as pd
import requests

from pyalex import Works, config
# Contact e-mail set on pyalex's `config` object.
# NOTE(review): presumably forwarded to OpenAlex with each request — confirm against the pyalex docs.
config.email = 'terence.tan@wadham.ox.ac.uk'

# Exploring the results

Now that we have produced topic classifications for the experimental proposals, we explore the results. \
The first thing we want to do is to compare our topic predictions with the topics for the referenced works.

In [128]:
# Load the ESRF proposals table that carries our topic predictions.
predictions_path = '/Users/fdp54928/Library/CloudStorage/OneDrive-Nexus365/GitHub Repositories/synchrotron-proposals-topic-classification/Datasets/ESRF/Proposals_ESRF_Predictions'
prop_esrf_pred = pd.read_json(predictions_path)

## Compare our topic predictions with the topics of the referenced works

In [129]:
# Boolean mask selecting the proposals flagged as having referenced works
has_refs = prop_esrf_pred['has referenced works'] == 1

# OpenAlex IDs of the referenced works, one entry per selected proposal
openalex_ids = prop_esrf_pred.loc[has_refs, 'referenced_works'].to_list()

# Our topic predictions for the same proposals (same row order as openalex_ids)
topics_pred = prop_esrf_pred.loc[has_refs, 'topic predictions'].to_list()


In [None]:
# Get the topics for referenced works
#
# topics_list[i][j] is the `topics` field of the j-th referenced work of the
# i-th proposal in `openalex_ids`.
#
# Each distinct work is requested from OpenAlex only once: works cited by
# several proposals are served from `topics_cache`, so the same list object may
# appear under more than one proposal (the cells below only read it).
# The loop variables deliberately avoid the name `id`, which would shadow the
# builtin `id()` for the rest of the kernel session.

topics_cache = {}       # OpenAlex work key -> that work's `topics` field
topics_list = []

for work_urls in openalex_ids:
    work_topics = []
    for work_url in work_urls:
        work_key = work_url.split('/')[-1]      # keep only the last path segment of the stored ID
        if work_key not in topics_cache:
            topics_cache[work_key] = Works()[work_key]['topics']
        work_topics.append(topics_cache[work_key])
    topics_list.append(work_topics)

We check how many of our primary topic predictions match the topics of the referenced works. This will tell us how much weight the title/abstract/journal name information has in the model.

In [130]:
# Count the number of our primary topic predictions that match the primary topic of at least one of the referenced works

n = len(topics_pred)
count = 0

for idx, predictions in enumerate(topics_pred):
    # The first prediction is treated as the primary one; compare IDs as strings
    topic_pred = str(predictions[0]['topic_id'])

    # For each referenced work, take its first (primary) topic and extract the
    # numeric part of its OpenAlex ID (the text after the last 'T').
    # A referenced work with an empty topics list is skipped instead of
    # raising IndexError on `[0]`.
    if any(
        work_topics and work_topics[0]['id'].split('T')[-1] == topic_pred
        for work_topics in topics_list[idx]
    ):
        count += 1

print('Number of primary topic predictions that matches the primary topics of at least one of the referenced works:', count, 'out of', n)

Number of primary topic predictions that matches the primary topics of at least one of the referenced works: 202 out of 534


In [131]:
# Count the number of our primary topic predictions that match at least one of the topics of any of the referenced works

def check_matches(topic_pred,topics):
    """Return 1 if `topic_pred` equals the ID of any topic in `topics`, else 0.

    topic_pred -- predicted topic ID as a string.
    topics     -- iterable of dicts whose 'id' value ends in 'T<number>';
                  the text after the last 'T' is what gets compared.
    """
    found = any(entry['id'].split('T')[-1] == topic_pred for entry in topics)
    return int(found)


n = len(topics_pred)
count = 0

for idx, predictions in enumerate(topics_pred):
    # Primary (first) prediction for this proposal, compared as a string
    primary_id = str(predictions[0]['topic_id'])
    # One hit among the referenced works is enough; any() stops at the first
    matched = any(
        check_matches(primary_id, work_topics) == 1
        for work_topics in topics_list[idx]
    )
    if matched:
        count += 1

print('Number of primary topic predictions that matches any of the top 3 topics of at least one of the referenced works:', count, 'out of', n)

Number of primary topic predictions that matches any of the top 3 topics of at least one of the referenced works: 282 out of 534


Now we check if any one of our predictions matches any one of the topics of the referenced works.

In [132]:
def check_any_matches(topic_pred_any,topics):
    """Return 1 if any topic in `topics` has an ID contained in `topic_pred_any`, else 0.

    topic_pred_any -- container of predicted topic IDs (strings).
    topics         -- iterable of dicts whose 'id' value ends in 'T<number>';
                      the text after the last 'T' is looked up in `topic_pred_any`.
    """
    published_ids = (entry['id'].split('T')[-1] for entry in topics)
    return int(any(published_id in topic_pred_any for published_id in published_ids))


n = len(topics_pred)
count = 0

for idx, predictions in enumerate(topics_pred):
    # All predicted topic IDs for this proposal, as strings
    predicted_ids = [str(prediction['topic_id']) for prediction in predictions]
    # Count the proposal once as soon as one referenced work shares a topic
    matched = any(
        check_any_matches(predicted_ids, work_topics) == 1
        for work_topics in topics_list[idx]
    )
    if matched:
        count += 1

print('Number of times any one of our topic predictions matches any of the topics of the referenced works:', count, 'out of', n)


Number of times any one of our topic predictions matches any of the topics of the referenced works: 350 out of 534


In [None]:
def fetch_data(doi, session=None):
    """Look up the proposal number behind an ESRF experiment-session DOI.

    Queries ``https://icatplus.esrf.fr/doi/<doi>/reports`` and stores the
    'proposal' value of the first returned entry under ``session['proposal']``.

    doi     -- DOI string, e.g. '10.15151/ESRF-ES-670011338'.
    session -- optional mapping to annotate in place (anything supporting
               item assignment); a new dict is created when omitted.
               The function previously wrote to an undefined name `session`,
               which raised NameError on every successful request.

    Returns the annotated mapping, or 0 when the status code is not 200 or
    the response contains no entries.
    """
    api_request = "https://icatplus.esrf.fr/doi/"
    response = requests.get(api_request + doi + '/reports')

    # Check for valid API call status code
    if response.status_code != 200:
        return 0        # return 0 if API call result is invalid

    reports_esrf = response.json()
    if not reports_esrf:
        return 0        # nothing to read the proposal number from

    if session is None:
        session = {}
    session['proposal'] = reports_esrf[0]['proposal']
    return session      # mapping with the proposal number added

In [135]:
# Show the experiment-session DOI entries of every proposal as a plain Python list
prop_esrf_pred.loc[:, 'Experiment session DOI'].tolist()

[['10.15151/ESRF-ES-670011338'],
 ['10.15151/ESRF-ES-745262790'],
 ['10.15151/ESRF-ES-1106933962'],
 ['10.15151/ESRF-ES-1361431177', '10.15151/ESRF-ES-1415102792'],
 ['10.15151/ESRF-ES-1578692758'],
 ['10.15151/ESRF-ES-1138700094'],
 ['10.15151/ESRF-ES-1173445226', '10.15151/ESRF-ES-1269292736'],
 ['10.15151/ESRF-ES-788657847'],
 ['10.15151/ESRF-ES-697010253'],
 ['10.15151/ESRF-ES-1186974744'],
 ['10.15151/ESRF-ES-1083184488'],
 ['10.15151/ESRF-ES-938664774'],
 ['10.15151/ESRF-ES-991603319', '10.15151/ESRF-ES-955503876'],
 ['10.15151/ESRF-ES-1202136798'],
 ['10.15151/ESRF-ES-972238181'],
 ['10.15151/ESRF-ES-1352731648'],
 ['10.15151/ESRF-ES-945455342'],
 ['10.15151/ESRF-ES-1024537748'],
 ['10.15151/ESRF-ES-953229928'],
 ['10.15151/ESRF-ES-1028113854'],
 ['10.15151/ESRF-ES-1108936877'],
 ['10.15151/ESRF-ES-1192345315', '10.15151/ESRF-ES-1365446575'],
 ['10.15151/ESRF-ES-1081709123'],
 ['10.15151/ESRF-ES-1343996281'],
 ['10.15151/ESRF-ES-695262570'],
 ['10.15151/ESRF-ES-946449872'],
 ['1

In [139]:
# Try the reports endpoint by hand for a single experiment-session DOI
api_request = "https://icatplus.esrf.fr/doi/"
example_doi = '10.15151/ESRF-ES-144554304'
reports_url = api_request + example_doi + '/reports'
response = requests.get(reports_url)

In [140]:
# Display the JSON body of the response decoded into Python objects
response.json()

[{'proposal': 'MD-1181',
  'underEmbargo': False,
  'nbReports': 1,
  'reports': ['90573_B.pdf']}]