# Notebook to try to get identifiers directly from links that can be used in the Semantic Scholar APIs to get the abstracts of the papers. 

Note: This requires the combined_for_analysis_sensationalism_jargon_categories_domain_labels.csv file to be in the data folder. This file can be copied form the projects google drive folder. 

### Data that can be used to get details of a paper from the semantic scholar API:

URLS: 

- semanticscholar.org
- arxiv.org
- aclweb.org
- acm.org
- biorxiv.org

ID: 

- DOI
- Microsoft Academic Graph
- Association for Computational Linguistics
- PubMed/Medline
- PubMed Central

In [1]:
import pandas as pd

In [2]:
inp_df = pd.read_csv('../data/combined_for_analysis_sensationalism_jargon_categories_domain_labels .csv')
inp_df.head()

Unnamed: 0.1,Unnamed: 0,year,month,title,link_flair_text,domain,score,num_comments,sensationalism_score,jargon_proportion,...,is_top_domain_social_media,is_top_domain_scientific,is_top_domain_news,is_top_domain_repo,is_top_domain_scam,is_top_domain_unknown,is_top_domain_indecisive,is_top_domain_less_than_2,label_voting_lm,label_voting_manual
0,0,2018,3,The Bradford pear has created an ecological ni...,Biology,greenvilleonline.com,2,4,0.533427,0.181818,...,False,False,False,False,False,False,False,False,less_than_2,less_than_2
1,1,2018,3,Astronomers say they've discovered evidence of...,Astronomy,aftau.org,46,11,0.511129,0.068966,...,False,False,False,False,False,False,False,False,scientific,scientific
2,2,2018,3,The first detection of 21 cm signal from hydro...,Astronomy,news.mit.edu,1,0,0.412405,0.083333,...,False,False,False,False,False,False,False,False,news,news
3,3,2018,3,Universe's First Stars Detected? Get the Facts.,Astronomy,news.nationalgeographic.com,1,1,0.561694,0.0,...,False,False,False,False,False,False,False,False,news,news
4,4,2018,3,Simple telescope picks up hint of the Universe...,Astronomy,arstechnica.com,3,2,0.492255,0.333333,...,False,False,False,False,False,False,False,False,news,news


### Most common domains

We only want to get the identifiers for the most common domains in the dataset so that we can justify the bias in our sample. Getting the identifiers for all the domains would give us a biased result, but it would be harder to identify/justify the bias. Hence, we will only select the top domains from scientific, news and repo categories.

In [3]:
# Most common labels in scientific:
most_common_scientific = ['nature.com', 'sciencedirect.com', 'sciencedaily.com', 'phys.org', 'bioengineer.org', 'pnas.org', 'onlinelibrary.wiley.com', 'sciencemag.org', 'journals.plos.org', 'cell.com']
# Most common labels in news:
most_common_news = ['eurekalert.org', 'psypost.org', 'theguardian.com', 'bbc.com', 'sciencealert.com', 'sciencenews.org', 'nytimes.com', 'newscientist.com', 'inverse.com', 'livescience.com']
# Most common labels in repo:
most_common_repos = ['ncbi.nlm.nih.gov', 'doi.org', 'frontiersin.org', 'tandfonline.com', 'pubs.acs.org', 'nejm.org', 'arxiv.org', 'pubmed.ncbi.nlm.nih.gov', 'bmj.com', 'symbiosisonlinepublishing.com']

In [4]:
# Filter the data to only include the most common domains from scientific, news and repo categories
inp_df = inp_df[inp_df['domain'].isin(most_common_scientific + most_common_news + most_common_repos)]
print(f"Number of rows in the filtered data: {inp_df.shape[0]}")
inp_df.head()


Number of rows in the filtered data: 56507


Unnamed: 0.1,Unnamed: 0,year,month,title,link_flair_text,domain,score,num_comments,sensationalism_score,jargon_proportion,...,is_top_domain_social_media,is_top_domain_scientific,is_top_domain_news,is_top_domain_repo,is_top_domain_scam,is_top_domain_unknown,is_top_domain_indecisive,is_top_domain_less_than_2,label_voting_lm,label_voting_manual
5,5,2018,3,Firearm Injuries Drop 20 Percent When Gun Owne...,Biology,nejm.org,84,22,0.530595,0.0,...,False,False,False,True,False,False,False,False,repo,repo
7,7,2018,3,An Even-Weirder-Than-Usual Tardigrade Just Tur...,Biology,livescience.com,23,5,0.545099,0.0,...,False,False,True,False,False,False,False,False,news,news
8,8,2018,3,This Woman Is Her Own Twin: What Is Chimerism?,Biology,livescience.com,1,1,0.601028,0.0,...,False,False,True,False,False,False,False,False,news,news
17,17,2018,3,Biodiversity loss raises risk of 'extinction c...,Biology,sciencedaily.com,1,1,0.517565,0.142857,...,False,True,False,False,False,False,False,False,scientific,scientific
19,19,2018,3,Supplementation with probiotics during late pr...,Health,journals.plos.org,8,1,0.482136,0.314286,...,False,True,False,False,False,False,False,False,scientific,scientific


In [5]:
"""
The following types of IDs are supported: # from https://api.semanticscholar.org/api-docs/#tag/Paper-Data/operation/get_graph_get_paper

<sha> - a Semantic Scholar ID, e.g. 649def34f8be52c8b66281af98ae884c09aef38b
CorpusId:<id> - a Semantic Scholar numerical ID, e.g. CorpusId:215416146
DOI:<doi> - a Digital Object Identifier, e.g. DOI:10.18653/v1/N18-3011
ARXIV:<id> - arXiv.rg, e.g. ARXIV:2106.15928
MAG:<id> - Microsoft Academic Graph, e.g. MAG:112218234
ACL:<id> - Association for Computational Linguistics, e.g. ACL:W12-3903
PMID:<id> - PubMed/Medline, e.g. PMID:19872477
PMCID:<id> - PubMed Central, e.g. PMCID:2323736
URL:<url> - URL from one of the sites listed below, e.g. URL:https://arxiv.org/abs/2106.15928v1
"""
identifiers_semantic_scholar = {
    'DOI': 'DOI',
    'Microsoft Academic Graph': 'MAG',
    'Association for Computational Linguistics': 'ACL',
    'PubMed / Medline': 'PMID',
    'PubMed Central': 'PMCID',
    'URL': 'URL',
    'arXiv': 'ARXIV',
    'Semantic Scholar': 'sha',
    'Semantic Scholar ID': 'CorpusId',
}


In [6]:
import re

In [7]:
# Get all doi from links
# Help from ChatGPT to get the patterns right and verified (testing with a few examples) with regex101.com
doi_patterns = [
            r'10\.\d{4,9}/[-._;()/:A-Z0-9]+' ,
            r'10\.\d{4,9}/[-._;()/:\w]+',
            r'doi:\s*(10\.\d{4,9}/[-._;()/:\w]+)',
            r'doi\.org/(10\.\d{4,9}/[-._;()/:\w]+)'
]

def get_doi_from_link(link):
    for pattern in doi_patterns:
        doi = re.search(pattern, link, re.IGNORECASE)
        if doi:
            doi_group = doi.group()
            if doi_group.endswith('/full'):
                return doi_group[:-5]
            return doi_group
    return None

inp_df['doi'] = inp_df['url'].apply(get_doi_from_link)

print(f"Number of DOIs: {inp_df['doi'].notnull().sum()}")
inp_df.head()

Number of DOIs: 6974


Unnamed: 0.1,Unnamed: 0,year,month,title,link_flair_text,domain,score,num_comments,sensationalism_score,jargon_proportion,...,is_top_domain_scientific,is_top_domain_news,is_top_domain_repo,is_top_domain_scam,is_top_domain_unknown,is_top_domain_indecisive,is_top_domain_less_than_2,label_voting_lm,label_voting_manual,doi
5,5,2018,3,Firearm Injuries Drop 20 Percent When Gun Owne...,Biology,nejm.org,84,22,0.530595,0.0,...,False,False,True,False,False,False,False,repo,repo,10.1056/NEJMc1712773
7,7,2018,3,An Even-Weirder-Than-Usual Tardigrade Just Tur...,Biology,livescience.com,23,5,0.545099,0.0,...,False,True,False,False,False,False,False,news,news,
8,8,2018,3,This Woman Is Her Own Twin: What Is Chimerism?,Biology,livescience.com,1,1,0.601028,0.0,...,False,True,False,False,False,False,False,news,news,
17,17,2018,3,Biodiversity loss raises risk of 'extinction c...,Biology,sciencedaily.com,1,1,0.517565,0.142857,...,True,False,False,False,False,False,False,scientific,scientific,
19,19,2018,3,Supplementation with probiotics during late pr...,Health,journals.plos.org,8,1,0.482136,0.314286,...,True,False,False,False,False,False,False,scientific,scientific,10.1371/journal.pmed.1002507


In [8]:
# get all PMID and PMCID from links
# Help from ChatGPT to get the patterns right and verified (testing with a few examples) with regex101.com

# PMID: example looks like this : https://pubmed.ncbi.nlm.nih.gov/22110403/ where 22110403 is the PMID
# it can also look like this: https://pubmed.ncbi.nlm.nih.gov/35964870/
pmid_pattern = r'pmid=(\d+)|pubmed(?:\/|%2F)(\d+)|pubmed\.ncbi\.nlm\.nih\.gov\/(\d+)\/?' 
# PMCID: example looks like this : https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5793096/#__ffn_sectitle where PMC5793096 is the PMCID
pmcid_pattern = r'https?:\/\/www\.ncbi\.nlm\.nih\.gov\/(?:labs\/)?pmc\/articles\/(PMC\d+)\/?'

def get_pmid_from_link(link):
    pmid = re.search(pmid_pattern, link, re.IGNORECASE)
    if pmid:
        return next(filter(None, pmid.groups()))
    return None

def get_pmcid_from_link(link):
    pmcid = re.search(pmcid_pattern, link, re.IGNORECASE)
    if pmcid:
        full_pmcid = pmcid.group(1)
        return full_pmcid[3:] if full_pmcid.startswith('PMC') else full_pmcid
    return None

inp_df['pmid'] = inp_df['url'].apply(get_pmid_from_link)
inp_df['pmcid'] = inp_df['url'].apply(get_pmcid_from_link)

print(f"Number of PMIDs: {inp_df['pmid'].notnull().sum()}")
print(f"Number of PMCIDs: {inp_df['pmcid'].notnull().sum()}")


Number of PMIDs: 1109
Number of PMCIDs: 789


In [9]:
# Get all ARXIV identifiers from links, if the domain is arxiv.org, then add a column called arxiv that has the url value
inp_df['arxiv'] = inp_df[inp_df['domain'] == 'arxiv.org']['url']

print(f"Number of ARXIVs: {inp_df['arxiv'].notnull().sum()}")

Number of ARXIVs: 483


In [10]:
"""
Add new column identifier with the prefixes from `identifiers_semantic_scholar` followed by the appropriate identifier value (doi, pmid, pmcid, etc.)
"""
def get_identifier(row):
    if row['doi']:
        return f"{identifiers_semantic_scholar['DOI']}:{row['doi']}"
    if row['pmid']:
        return f"{identifiers_semantic_scholar['PubMed / Medline']}:{row['pmid']}"
    if row['pmcid']:
        return f"{identifiers_semantic_scholar['PubMed Central']}:{row['pmcid']}"
    if row['arxiv']:
        if pd.notna(row['arxiv']):
            return f"{identifiers_semantic_scholar['URL']}:{row['arxiv']}"
    return None

inp_df['identifier'] = inp_df.apply(get_identifier, axis=1)


In [11]:
out_pf = inp_df[inp_df['identifier'].notnull()]
print(f"Number of rows with identifiers: {out_pf.shape[0]}")

Number of rows with identifiers: 9354


In [12]:
# Remove the columns that are not needed
out_pf = out_pf.drop(columns=['doi', 'pmid', 'pmcid', 'arxiv'])
out_pf.head()

Unnamed: 0.1,Unnamed: 0,year,month,title,link_flair_text,domain,score,num_comments,sensationalism_score,jargon_proportion,...,is_top_domain_scientific,is_top_domain_news,is_top_domain_repo,is_top_domain_scam,is_top_domain_unknown,is_top_domain_indecisive,is_top_domain_less_than_2,label_voting_lm,label_voting_manual,identifier
5,5,2018,3,Firearm Injuries Drop 20 Percent When Gun Owne...,Biology,nejm.org,84,22,0.530595,0.0,...,False,False,True,False,False,False,False,repo,repo,DOI:10.1056/NEJMc1712773
19,19,2018,3,Supplementation with probiotics during late pr...,Health,journals.plos.org,8,1,0.482136,0.314286,...,True,False,False,False,False,False,False,scientific,scientific,DOI:10.1371/journal.pmed.1002507
25,25,2018,3,Study finds that bee venom could be a useful p...,Medicine,ncbi.nlm.nih.gov,27,9,0.507827,0.333333,...,False,False,True,False,False,False,False,repo,repo,PMCID:5793096
31,31,2018,3,"The interplay of gene flow, population size va...",Biology,onlinelibrary.wiley.com,6,0,0.433548,0.352941,...,True,False,False,False,False,False,False,scientific,scientific,DOI:10.1111/evo.13435/abstract
33,33,2018,3,Undisclosed Conflicts of Interests among Biome...,Social Science,ncbi.nlm.nih.gov,263,21,0.511708,0.086957,...,False,False,True,False,False,False,False,repo,repo,PMID:29400625


In [13]:
out_pf.to_csv('../data/filtered_data_with_identifiers.csv', index=False)

In [14]:
inp_df.shape

(56507, 27)

In [15]:
out_pf.shape

(9354, 23)

In [16]:
# print the rows that are in inp_df but not in out_pf that have label_voting_manual == repo
remainders_df = inp_df[~inp_df['url'].isin(out_pf['url']) & (inp_df['label_voting_manual'] == 'repo')]
# drop unnecessary columns
remainders_df = remainders_df.drop(columns=['doi', 'pmid', 'pmcid', 'arxiv'])
# drop all is_top_domain rows
remainders_df = remainders_df.drop(columns=['is_top_domain_news', 'is_top_domain_repo', 'is_top_domain_scientific', 'is_top_domain_scam', 'is_top_domain_unknown', 'is_top_domain_indecisive', 'is_top_domain_less_than_2'])
remainders_df

Unnamed: 0.1,Unnamed: 0,year,month,title,link_flair_text,domain,score,num_comments,sensationalism_score,jargon_proportion,url,id,is_top_domain_social_media,label_voting_lm,label_voting_manual,identifier
441,441,2018,3,Neuromyelitis Optica Spectrum Disorder as a Pa...,Health,symbiosisonlinepublishing.com,1,0,0.394180,0.000000,https://symbiosisonlinepublishing.com/neurolog...,82vivw,False,repo,repo,
442,442,2018,3,Central Post-stroke Pain and Pharmacological T...,Health,symbiosisonlinepublishing.com,1,0,0.389311,0.000000,https://symbiosisonlinepublishing.com/neurolog...,82viy3,False,repo,repo,
443,443,2018,3,Atypical McArdle’s disease with asymmetric wea...,Health,symbiosisonlinepublishing.com,1,0,0.413239,0.125000,https://symbiosisonlinepublishing.com/neurolog...,82vizn,False,repo,repo,
444,444,2018,3,Dynamic F-Wave Study Aids in Diagnosis of Neur...,Health,symbiosisonlinepublishing.com,1,0,0.385285,0.000000,https://symbiosisonlinepublishing.com/neurolog...,82vj1d,False,repo,repo,
445,445,2018,3,Meningioma: The Unusual Growth in a Transsexua...,Health,symbiosisonlinepublishing.com,1,0,0.461184,0.090909,https://symbiosisonlinepublishing.com/neurolog...,82vj33,False,repo,repo,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195651,195651,2019,10,The production of unnecessary infant &amp; tod...,Environment,bmj.com,3,6,0.582619,0.205128,https://www.bmj.com/content/367/bmj.l5646,dcyrm2,False,repo,repo,
195730,195730,2019,10,The production of unnecessary infant and toddl...,Environment,bmj.com,30,38,0.582105,0.205128,https://www.bmj.com/content/367/bmj.l5646,ddcd5u,False,repo,repo,
195846,195846,2019,10,Support for breastfeeding is an environmental ...,Environment,bmj.com,1,0,0.532830,0.428571,https://www.bmj.com/content/367/bmj.l5646,deaq7v,False,repo,repo,
195951,195951,2019,10,Current knowledge is insufficient to determine...,Health,bmj.com,8,4,0.470955,0.238095,https://www.bmj.com/content/366/bmj.l5275.long,dewj3w,False,repo,repo,


In [17]:
# get the count of the domains in remainders_df
remainders_df['domain'].value_counts()

domain
symbiosisonlinepublishing.com    385
bmj.com                          379
ncbi.nlm.nih.gov                  21
doi.org                           16
pubs.acs.org                       4
frontiersin.org                    4
nejm.org                           3
tandfonline.com                    2
pubmed.ncbi.nlm.nih.gov            2
Name: count, dtype: int64

In [18]:
remainders_df[remainders_df['domain'] == ('doi.org')]

Unnamed: 0.1,Unnamed: 0,year,month,title,link_flair_text,domain,score,num_comments,sensationalism_score,jargon_proportion,url,id,is_top_domain_social_media,label_voting_lm,label_voting_manual,identifier
6694,6694,2021,6,Autistic adults’ experiences of diagnostic dis...,Psychology,doi.org,144,17,0.451307,0.133333,https://doi.org/10.1177%2F23969415211022955,nsw60m,False,repo,repo,
8507,8507,2021,6,Wikipedia articles about women (who meet Wikip...,Social Science,doi.org,45,7,0.516609,0.0,https://doi.org/10.1177%2F14614448211023772,o9mr00,False,repo,repo,
21866,21866,2020,11,"In pet rabbits more than 6 years old, the prev...",Animal Science,doi.org,42,0,0.474504,0.098039,https://doi.org/10.1177%2F0300985820973460,jxnmxw,False,repo,repo,
36360,36360,2021,5,Defining and Measuring Meditation-Related Adve...,Psychology,doi.org,9,7,0.434813,0.0,https://doi.org/10.1177%2F2167702621996340,nn4csj,False,repo,repo,
63365,63365,2020,8,Journalists’ Twitter use shows them talking wi...,Social Science,doi.org,517,81,0.567829,0.114286,https://doi.org/10.1177%2F2056305120926639,i4tcau,False,repo,repo,
88221,88221,2020,7,Fighting COVID-19 Misinformation on Social Med...,Psychology,doi.org,32,12,0.609591,0.0,https://doi.org/10.1177%2F0956797620939054,hl5a5z,False,repo,repo,
92983,92983,2018,9,Does Rejection Still Hurt? Examining the Effec...,Social Science,doi.org,1,1,0.543843,0.0,https://doi.org/10.1177%2F0894439318795128,9je2m0,False,repo,repo,
92984,92984,2018,9,Examining the Effects of Network Attention and...,Social Science,doi.org,10,4,0.415058,0.0,https://doi.org/10.1177%2F0894439318795128,9je3en,False,repo,repo,
116422,116422,2019,6,"Some patients feel shame, anxiety or fear imme...",Medicine,doi.org,24,18,0.593809,0.157895,https://doi.org/10.1177%2F0146167219855042,c4pf26,False,repo,repo,
120563,120563,2021,8,New study finds a high need for uniqueness amo...,Psychology,doi.org,34,26,0.61821,0.170732,https://doi.org/10.1177%2F19485506211031082,p6e8wi,False,repo,repo,


In [19]:
# how many are label_voting_manual == repo
print(out_pf['label_voting_manual'].value_counts())

# in the original file: 
print(inp_df['label_voting_manual'].value_counts())

label_voting_manual
repo          5909
scientific    3445
Name: count, dtype: int64
label_voting_manual
scientific    27132
news          22650
repo           6725
Name: count, dtype: int64


In [20]:
# filter the data so we only have label_voting_manual == repo and write to csv
out_pf[out_pf['label_voting_manual'] == 'repo'].to_csv('../data/filtered_data_with_identifiers_repo.csv', index=False)
