# Notebook to try to get identifiers directly from links that can be used in the Semantic Scholar APIs to get the abstracts of the papers. 

Note: This requires the combined_for_analysis_sensationalism_jargon_categories_domain_labels.csv file to be in the data folder. This file can be copied form the projects google drive folder. 

### Data that can be used to get details of a paper from the semantic scholar API:

URLS: 

- semanticscholar.org
- arxiv.org
- aclweb.org
- acm.org
- biorxiv.org

ID: 

- DOI
- Microsoft Academic Graph
- Association for Computational Linguistics
- PubMed/Medline
- PubMed Central

In [2]:
import pandas as pd

In [3]:
inp_df = pd.read_csv('./data/combined_for_analysis_sensationalism_jargon_categories_domain_labels .csv')
inp_df.head()

Unnamed: 0.1,Unnamed: 0,year,month,title,link_flair_text,domain,score,num_comments,sensationalism_score,jargon_proportion,...,is_top_domain_social_media,is_top_domain_scientific,is_top_domain_news,is_top_domain_repo,is_top_domain_scam,is_top_domain_unknown,is_top_domain_indecisive,is_top_domain_less_than_2,label_voting_lm,label_voting_manual
0,0,2018,3,The Bradford pear has created an ecological ni...,Biology,greenvilleonline.com,2,4,0.533427,0.181818,...,False,False,False,False,False,False,False,False,less_than_2,less_than_2
1,1,2018,3,Astronomers say they've discovered evidence of...,Astronomy,aftau.org,46,11,0.511129,0.068966,...,False,False,False,False,False,False,False,False,scientific,scientific
2,2,2018,3,The first detection of 21 cm signal from hydro...,Astronomy,news.mit.edu,1,0,0.412405,0.083333,...,False,False,False,False,False,False,False,False,news,news
3,3,2018,3,Universe's First Stars Detected? Get the Facts.,Astronomy,news.nationalgeographic.com,1,1,0.561694,0.0,...,False,False,False,False,False,False,False,False,news,news
4,4,2018,3,Simple telescope picks up hint of the Universe...,Astronomy,arstechnica.com,3,2,0.492255,0.333333,...,False,False,False,False,False,False,False,False,news,news


### Most common domains

We only want to get the identifiers for the most common domains in the dataset so that we can justify the bias in our sample. Getting the identifiers for all the domains would give us a biased result, but it would be harder to identify/justify the bias. Hence, we will only select the top domains from scientific, news and repo categories.

In [4]:
# Most common labels in scientific:
most_common_scientific = ['nature.com', 'sciencedirect.com', 'sciencedaily.com', 'phys.org', 'bioengineer.org', 'pnas.org', 'onlinelibrary.wiley.com', 'sciencemag.org', 'journals.plos.org', 'cell.com']
# Most common labels in news:
most_common_news = ['eurekalert.org', 'psypost.org', 'theguardian.com', 'bbc.com', 'sciencealert.com', 'sciencenews.org', 'nytimes.com', 'newscientist.com', 'inverse.com', 'livescience.com']
# Most common labels in repo:
most_common_repos = ['ncbi.nlm.nih.gov', 'doi.org', 'frontiersin.org', 'tandfonline.com', 'pubs.acs.org', 'nejm.org', 'arxiv.org', 'pubmed.ncbi.nlm.nih.gov', 'bmj.com', 'symbiosisonlinepublishing.com']

In [5]:
# Filter the data to only include the most common domains from scientific, news and repo categories
inp_df = inp_df[inp_df['domain'].isin(most_common_scientific + most_common_news + most_common_repos)]
print(f"Number of rows in the filtered data: {inp_df.shape[0]}")
inp_df.head()


Number of rows in the filtered data: 56507


Unnamed: 0.1,Unnamed: 0,year,month,title,link_flair_text,domain,score,num_comments,sensationalism_score,jargon_proportion,...,is_top_domain_social_media,is_top_domain_scientific,is_top_domain_news,is_top_domain_repo,is_top_domain_scam,is_top_domain_unknown,is_top_domain_indecisive,is_top_domain_less_than_2,label_voting_lm,label_voting_manual
5,5,2018,3,Firearm Injuries Drop 20 Percent When Gun Owne...,Biology,nejm.org,84,22,0.530595,0.0,...,False,False,False,True,False,False,False,False,repo,repo
7,7,2018,3,An Even-Weirder-Than-Usual Tardigrade Just Tur...,Biology,livescience.com,23,5,0.545099,0.0,...,False,False,True,False,False,False,False,False,news,news
8,8,2018,3,This Woman Is Her Own Twin: What Is Chimerism?,Biology,livescience.com,1,1,0.601028,0.0,...,False,False,True,False,False,False,False,False,news,news
17,17,2018,3,Biodiversity loss raises risk of 'extinction c...,Biology,sciencedaily.com,1,1,0.517565,0.142857,...,False,True,False,False,False,False,False,False,scientific,scientific
19,19,2018,3,Supplementation with probiotics during late pr...,Health,journals.plos.org,8,1,0.482136,0.314286,...,False,True,False,False,False,False,False,False,scientific,scientific


In [6]:
"""
The following types of IDs are supported: # from https://api.semanticscholar.org/api-docs/#tag/Paper-Data/operation/get_graph_get_paper

<sha> - a Semantic Scholar ID, e.g. 649def34f8be52c8b66281af98ae884c09aef38b
CorpusId:<id> - a Semantic Scholar numerical ID, e.g. CorpusId:215416146
DOI:<doi> - a Digital Object Identifier, e.g. DOI:10.18653/v1/N18-3011
ARXIV:<id> - arXiv.rg, e.g. ARXIV:2106.15928
MAG:<id> - Microsoft Academic Graph, e.g. MAG:112218234
ACL:<id> - Association for Computational Linguistics, e.g. ACL:W12-3903
PMID:<id> - PubMed/Medline, e.g. PMID:19872477
PMCID:<id> - PubMed Central, e.g. PMCID:2323736
URL:<url> - URL from one of the sites listed below, e.g. URL:https://arxiv.org/abs/2106.15928v1
"""
identifiers_semantic_scholar = {
    'DOI': 'DOI',
    'Microsoft Academic Graph': 'MAG',
    'Association for Computational Linguistics': 'ACL',
    'PubMed / Medline': 'PMID',
    'PubMed Central': 'PMCID',
    'URL': 'URL',
    'arXiv': 'ARXIV',
    'Semantic Scholar': 'sha',
    'Semantic Scholar ID': 'CorpusId',
}


In [7]:
import re

In [8]:
# Get all doi from links
doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+'
doi_patterns = [
            r'10\.\d{4,9}/[-._;()/:\w]+',
            r'doi:\s*(10\.\d{4,9}/[-._;()/:\w]+)',
            r'doi\.org/(10\.\d{4,9}/[-._;()/:\w]+)'
]

def get_doi_from_link(link):
    for pattern in doi_patterns:
        doi = re.search(pattern, link, re.IGNORECASE)
        if doi:
            return doi.group()
    return None

inp_df['doi'] = inp_df['url'].apply(get_doi_from_link)

print(f"Number of DOIs: {inp_df['doi'].notnull().sum()}")
inp_df.head()

Number of DOIs: 6974


Unnamed: 0.1,Unnamed: 0,year,month,title,link_flair_text,domain,score,num_comments,sensationalism_score,jargon_proportion,...,is_top_domain_scientific,is_top_domain_news,is_top_domain_repo,is_top_domain_scam,is_top_domain_unknown,is_top_domain_indecisive,is_top_domain_less_than_2,label_voting_lm,label_voting_manual,doi
5,5,2018,3,Firearm Injuries Drop 20 Percent When Gun Owne...,Biology,nejm.org,84,22,0.530595,0.0,...,False,False,True,False,False,False,False,repo,repo,10.1056/NEJMc1712773
7,7,2018,3,An Even-Weirder-Than-Usual Tardigrade Just Tur...,Biology,livescience.com,23,5,0.545099,0.0,...,False,True,False,False,False,False,False,news,news,
8,8,2018,3,This Woman Is Her Own Twin: What Is Chimerism?,Biology,livescience.com,1,1,0.601028,0.0,...,False,True,False,False,False,False,False,news,news,
17,17,2018,3,Biodiversity loss raises risk of 'extinction c...,Biology,sciencedaily.com,1,1,0.517565,0.142857,...,True,False,False,False,False,False,False,scientific,scientific,
19,19,2018,3,Supplementation with probiotics during late pr...,Health,journals.plos.org,8,1,0.482136,0.314286,...,True,False,False,False,False,False,False,scientific,scientific,10.1371/journal.pmed.1002507


In [9]:
# get all PMID and PMCID from links
# Help from ChatGPT to get the patterns right and verified (testing with a few examples) with regex101.com

# PMID: example looks like this : https://pubmed.ncbi.nlm.nih.gov/22110403/ where 22110403 is the PMID
pmid_pattern = r'pmid=(\d+)|pubmed(?:\/|%2F)(\d+)' 
# PMCID: example looks like this : https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5793096/#__ffn_sectitle where PMC5793096 is the PMCID
pmcid_pattern = r'https:\/\/www\.ncbi\.nlm\.nih\.gov\/pmc\/articles\/(PMC\d+)\/'

def get_pmid_from_link(link):
    pmid = re.search(pmid_pattern, link, re.IGNORECASE)
    if pmid:
        return pmid.group(1) if pmid.group(1) else pmid.group(2)
    return None

def get_pmcid_from_link(link):
    pmcid = re.search(pmcid_pattern, link, re.IGNORECASE)
    if pmcid:
        return pmcid.group(1)
    return None

inp_df['pmid'] = inp_df['url'].apply(get_pmid_from_link)
inp_df['pmcid'] = inp_df['url'].apply(get_pmcid_from_link)

print(f"Number of PMIDs: {inp_df['pmid'].notnull().sum()}")
print(f"Number of PMCIDs: {inp_df['pmcid'].notnull().sum()}")


Number of PMIDs: 593
Number of PMCIDs: 733


In [10]:
"""
Add new column identifier with the prefixes from `identifiers_semantic_scholar` followed by the appropriate identifier value (doi, pmid, pmcid, etc.)
"""
def get_identifier(row):
    if row['doi']:
        return f"{identifiers_semantic_scholar['DOI']}:{row['doi']}"
    if row['pmid']:
        return f"{identifiers_semantic_scholar['PubMed / Medline']}:{row['pmid']}"
    if row['pmcid']:
        return f"{identifiers_semantic_scholar['PubMed Central']}:{row['pmcid']}"
    return None

inp_df['identifier'] = inp_df.apply(get_identifier, axis=1)


In [11]:
inp_df = inp_df[inp_df['identifier'].notnull()]
print(f"Number of rows with identifiers: {inp_df.shape[0]}")

Number of rows with identifiers: 8299


In [12]:
# Remove the columns that are not needed
inp_df = inp_df.drop(columns=['doi', 'pmid', 'pmcid'])
inp_df.head()

Unnamed: 0.1,Unnamed: 0,year,month,title,link_flair_text,domain,score,num_comments,sensationalism_score,jargon_proportion,...,is_top_domain_scientific,is_top_domain_news,is_top_domain_repo,is_top_domain_scam,is_top_domain_unknown,is_top_domain_indecisive,is_top_domain_less_than_2,label_voting_lm,label_voting_manual,identifier
5,5,2018,3,Firearm Injuries Drop 20 Percent When Gun Owne...,Biology,nejm.org,84,22,0.530595,0.0,...,False,False,True,False,False,False,False,repo,repo,DOI:10.1056/NEJMc1712773
19,19,2018,3,Supplementation with probiotics during late pr...,Health,journals.plos.org,8,1,0.482136,0.314286,...,True,False,False,False,False,False,False,scientific,scientific,DOI:10.1371/journal.pmed.1002507
25,25,2018,3,Study finds that bee venom could be a useful p...,Medicine,ncbi.nlm.nih.gov,27,9,0.507827,0.333333,...,False,False,True,False,False,False,False,repo,repo,PMCID:PMC5793096
31,31,2018,3,"The interplay of gene flow, population size va...",Biology,onlinelibrary.wiley.com,6,0,0.433548,0.352941,...,True,False,False,False,False,False,False,scientific,scientific,DOI:10.1111/evo.13435/abstract
33,33,2018,3,Undisclosed Conflicts of Interests among Biome...,Social Science,ncbi.nlm.nih.gov,263,21,0.511708,0.086957,...,False,False,True,False,False,False,False,repo,repo,PMID:29400625


In [13]:
inp_df.to_csv('./data/filtered_data_with_identifiers.csv', index=False)
