In [1]:
import json
import os
from tqdm import tqdm

In [2]:
pdfs = os.listdir('document_parses/pdf_json')
pmcs = os.listdir('document_parses/pmc_json')

len(pdfs), len(pmcs)

(401212, 315742)

In [36]:
paper_entries = {}
for pdf in tqdm(pdfs):
    with open(f'document_parses/pdf_json/{pdf}') as f:
        data = json.load(f)
        title = data['metadata']['title']
        refs = data['bib_entries']
        ref_titles = [refs[ref]['title'] for ref in refs]

        paper_entries[title] = ref_titles

for pmc in tqdm(pmcs):
    with open(f'document_parses/pmc_json/{pmc}') as f:
        data = json.load(f)
        title = data['metadata']['title']
        refs = data['bib_entries']
        ref_titles = [refs[ref]['title'] for ref in refs]

        paper_entries[title] = ref_titles
# write to file
with open('paper_entries.json', 'w') as f:
    json.dump(paper_entries, f)



100%|██████████| 401212/401212 [1:09:01<00:00, 96.88it/s] 
100%|██████████| 315742/315742 [50:27<00:00, 104.30it/s] 


In [2]:
# load the json file
with open('paper_entries.json') as f:
    paper_entries = json.load(f)

In [4]:
paper_entries = list(paper_entries.items())


In [20]:
# remove the spaces and make it lowercase of the 2nd element of the tuple
def clean_title(title):
    return title.replace(' ', '').lower().replace(':', '').replace('.', '').replace(',', '').replace(';', '').replace('(', '').replace(')', '').replace('?', '').replace('!', '').replace('-', '').replace('/', '').replace('\\', '').replace('\'', '').replace('\"', '').replace('*','')

paper_entries = [(clean_title(title), refs) for title, refs in paper_entries]


In [23]:
query = 'networkreconstructionandcommunitydetectionfromdynamics'

# search the query in the titles
def search(query):
    query = clean_title(query)
    results = []
    for title, refs in paper_entries:
        if query in title:
            results.append((title, refs))
    return results

results = search(query)
len(results), results[0][0]

(1, 'networkreconstructionandcommunitydetectionfromdynamics')

### Sentiment analysis

In [32]:
from transformers import pipeline

# Load a sentiment-analysis pipeline
classifier = pipeline('sentiment-analysis', model= 'distilbert-base-uncased-finetuned-sst-2-english')

# Example texts
texts = [
    "I love this product! It works wonderfully.",
    "This is the worst experience I've ever had."
]

# Classify sentiment
results = classifier(texts)

# Display results
for text, result in zip(texts, results):
    print(f"Text: {text}\nSentiment: {result['label']}, Confidence: {result['score']:.2f}\n")


Text: I love this product! It works wonderfully.
Sentiment: POSITIVE, Confidence: 1.00

Text: This is the worst experience I've ever had.
Sentiment: NEGATIVE, Confidence: 1.00



In [33]:
# text = ' this is a neural statement'
# results = classifier(text)
results

[{'label': 'POSITIVE', 'score': 0.9998823404312134},
 {'label': 'NEGATIVE', 'score': 0.9997679591178894}]

In [30]:
from transformers import pipeline

# Load the multilingual sentiment model
classifier = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")

# Example texts
texts = [
    "I absolutely love this!",
    "This is terrible.",
    "The product was okay, not great."
]

# Classify sentiment and normalize the scores to the [0, 1] range
results = classifier(texts)

# Normalize from 1-5 scale to 0-1 scale (subtract 1 and divide by 4)
def normalize_sentiment(score):
    return (score - 1) / 4

# Display results with normalized sentiment values
for text, result in zip(texts, results):
    label = result['label']  # e.g., '1 star', '5 stars'
    stars = int(label.split()[0])  # Extract star rating
    normalized_score = normalize_sentiment(stars)  # Convert to 0-1 scale
    print(f"Text: {text}\nSentiment (0-1): {normalized_score:.2f}, Original label: {label}\n")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Text: I absolutely love this!
Sentiment (0-1): 1.00, Original label: 5 stars

Text: This is terrible.
Sentiment (0-1): 0.00, Original label: 1 star

Text: The product was okay, not great.
Sentiment (0-1): 0.50, Original label: 3 stars



In [31]:
results

[{'label': '5 stars', 'score': 0.9676192998886108},
 {'label': '1 star', 'score': 0.930989146232605},
 {'label': '3 stars', 'score': 0.7260130643844604}]

In [34]:
from transformers import pipeline

# Load the multilingual sentiment model
classifier = pipeline('sentiment-analysis', model="lxyuan/distilbert-base-multilingual-cased-sentiments-student")

# Example texts
texts = [
    "I absolutely love this!",
    "This is terrible.",
    "The product was okay, not great."
]

# Classify sentiment and normalize the scores to the [0, 1] range
results = classifier(texts)



To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [37]:
text = 'Mental health and physical health should not be considered independently in competitive sports: mental health issues can influence performance, increase the risk of physical injuries and prolong rehabilitation. Injuries, in turn, may affect performance and constitute stress and risk for mental health [1] . Worry, anxiety and psychological stress are also risks to mental health and are associated with the occurrence of psychiatric disorders. Additionally, chronic stress plays an important role in the pathogenesis of anxiety and depression [18] . The COVID-19 pandemic is in itself a stressful situation (fear of getting infected or losing loved ones). Together with the impact of lockdown measures on everyday life it poses a risk to mental health [19] .'
results = classifier(text)
results

[{'label': 'negative', 'score': 0.6789236068725586}]

In [None]:
# get the first entry
queries = paper_entries[list(paper_entries.keys())[45]]
for query in queries:
    # search for the query in the titles
    for title in paper_entries.keys():
        if query in title:
            print(title)


In [7]:
# As this is a lot of data, we need to take it in chunks of 10000
# load the json file
KG = ''
with open('paper_entries.json') as f:
    paper_entries = json.load(f)
    # iterate over the keys
    with open('KG.txt', 'a') as f:
        for key in tqdm(paper_entries.keys()[:10000]):
            for i, query in enumerate(paper_entries[key]):
                f.write(f'"{key}","BIBREF{i}","{query}"\n')
        

# Done till: 0   <--- Update this number to the last number you did



  1%|          | 3196/504801 [30:06<78:46:12,  1.77it/s] 


KeyboardInterrupt: 

In [9]:
with open('KG.txt', 'w') as f:
    f.write(KG)

## Starting the Topics Knowledge graph.

In [3]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm

In [39]:
title_to_paper_file = './Data/Data/title_to_paper_map.json'
topic_nodes_file = './Data/Data/nodes/topic_nodes.csv'
paper_to_topic_file = './Data/Data/edges/paper_to_topic.csv'

# Load the title to paper map
with open(title_to_paper_file) as f:
    title_to_paper = json.load(f)

# convert the json to a pandas dataframe
title_to_paper = pd.DataFrame(title_to_paper.items(), columns=['title', 'paper_id'])

# Load the topic nodes
topic_nodes = pd.read_csv(topic_nodes_file)

# Load the paper to topic edges
paper_to_topic = pd.read_csv(paper_to_topic_file)

In [43]:
paper_to_topic.head()

Unnamed: 0,~id,~label,~from,~to,score:Float
0,c5a84fb7-7e2b-49f2-b744-ed841595b5dc,associated_topic,b2d29ec4-caa3-438a-897c-445f5b8c1314,ed0fbd02-ccff-4e13-8381-e44708494382,1.0
1,7bbf3f41-d087-48ee-b5bd-9ef748a8f5db,associated_topic,01c20e5d-f3e2-42bf-bd58-bbac101316d0,009368ed-d4d2-4b87-908d-42529c39dbf3,0.954
2,a1ff3f90-cbcc-4ef6-a5fa-3fe28365ece2,associated_topic,d42ae2ea-559d-4832-895a-9c46bb00f3ba,ddc3abdc-82ad-4c4b-b87c-07f54d8d5dc9,0.9927
3,109c624f-a48d-4eb1-bcae-e247c4f84646,associated_topic,d42ae2ea-559d-4832-895a-9c46bb00f3ba,d494d90a-b28d-4e93-a74a-fa35be21e5d3,0.8872
4,6921f83a-050d-44f1-b5ad-ec5cdb8fb7c2,associated_topic,d42ae2ea-559d-4832-895a-9c46bb00f3ba,91c0d427-82bd-433a-8859-7b3e2cfebf4c,0.6584


In [46]:
# Create a knowledge graph with triplets connecting the paper titles to the topics
knowledge_graph = []

# Iterate over the paper_to_topic dataframe to create triplets
for _, row in tqdm(paper_to_topic.iterrows()):
    paper_id = row['~from']
    topic_id = row['~to']
    score = row['score:Float']
    
    # Get the paper title
    # Check if there are any matching rows
    matching_rows = title_to_paper[title_to_paper['paper_id'] == paper_id]['title'].values

    if len(matching_rows) > 0:
        paper_title = matching_rows[0]
    else:
        continue
        
    # Get the topic name
    topic_name = topic_nodes[topic_nodes['~id'] == topic_id]['topic:String'].values[0]
    
    # Create a triplet
    triplet = (paper_title, f'associated_with score: {score}', topic_name)
    knowledge_graph.append(triplet)

# Display the first 10 triplets
for triplet in knowledge_graph[:10]:
    print(triplet)

131508it [08:39, 252.93it/s]

('atrans-splicedleadersequenceonactinmrnaincelegans', 'associated_with score: 1.0', 'genomics')
('chapter100angiotensin-convertingenzyme-2', 'associated_with score: 0.954', 'epidemiology')
('technoeconomicmodelingofplant-basedgriffithsinmanufacturing', 'associated_with score: 0.9927', 'public-health-policies')
('technoeconomicmodelingofplant-basedgriffithsinmanufacturing', 'associated_with score: 0.8872', 'lab-trials-human')
('technoeconomicmodelingofplant-basedgriffithsinmanufacturing', 'associated_with score: 0.6584', 'healthcare-industry')
('thecovid-19pandemicimplicationsforthecytologylaboratory', 'associated_with score: 0.9988', 'public-health-policies')
('thecovid-19pandemicimplicationsforthecytologylaboratory', 'associated_with score: 0.9399', 'epidemiology')
('extendedstorageofsars-cov2nasopharyngealswabsdoesnotnegativelyimpactresultsofmolecular-basedtesting', 'associated_with score: 1.0', 'lab-trials-human')
('extendedstorageofsars-cov2nasopharyngealswabsdoesnotnegativelyimpac




In [47]:
len(knowledge_graph)

129530

In [48]:
# Write the knowledge graph to a txt file called KG_topic
with open('KG_topic.txt', 'w') as f:
    for triplet in knowledge_graph:
        f.write(f'{triplet[0]},{triplet[1]},{triplet[2]}\n')

### Country and Institution Knowledge graph

In [37]:
title_to_paper_file = './Data/Data/title_to_paper_map.json'
institution_nodes_file = './Data/Data/nodes/institution_nodes.csv'
paper_to_author_file = './Data/Data/edges/paper_to_author.csv'
author_to_institution_file = './Data/Data/edges/author_to_institution.csv'

# Load the title to paper map
with open(title_to_paper_file) as f:
    title_to_paper = json.load(f)

# convert the json to a pandas dataframe
title_to_paper = pd.DataFrame(title_to_paper.items(), columns=['title', 'paper_id'])

# Load the institution nodes
institution_nodes = pd.read_csv(institution_nodes_file)

# Load the paper to author edges
paper_to_author = pd.read_csv(paper_to_author_file)

# Load the author to institution edges
author_to_institution = pd.read_csv(author_to_institution_file)

In [38]:
print(len(institution_nodes))
institution_nodes.head()

29932


Unnamed: 0,~id,~label,institution:String,address:String,country:String,post_code:String,settlement:String,publication_count:Int
0,184966b9-c04c-4df5-ae54-ffde72a56202,Institution,Mawlana Bhashani Science and Technology Univer...,,Bangladesh,,Tangail,8
1,d7c2de28-c171-4cd6-86d1-1fe957ebabb9,Institution,Khulna University,,Bangladesh,,Khulna,10
2,25d0debf-1456-47da-ad30-03fca5e53670,Institution,University of Dhaka,,Bangladesh,,Dhaka,22
3,b657c9b8-8405-4d6a-b09b-bd04e6e52763,Institution,,,Bangladesh,,Dhaka,1
4,a7c0830f-8dd6-42c1-88e0-53d6d6344f48,Institution,Rothman Orthopaedic Institute at Thomas Jeffer...,,,,Philadelphia,3


In [31]:
print(len(paper_to_author))
paper_to_author.head()

340789


Unnamed: 0,~id,~label,~from,~to
0,bf5dab20-5552-4305-afce-ca3cdc286840,authored_by,a7a37b88-3851-4af2-b624-ead921eb37da,f9ddd3b1-ce1d-4f09-a462-dfb4d0bc3d14
1,df36b559-d9c6-4d3f-ab40-ed6c640d9650,authored_by,1bad150b-32bd-46e4-8d56-b3097285819d,23061148-48ab-43e8-bf36-522ae98fc0e1
2,49ad106d-be04-4fc9-9f2e-4b0d195347bd,authored_by,520010ce-6cfe-4d1d-85c9-006ce29289ef,3f572202-06e9-42f5-b002-e899d36dad4e
3,91601a14-34d4-4e23-a664-a2ecad5c98db,authored_by,95267dc1-8d53-4fd5-bc2f-ac809f0cbcdd,4b35351a-41fd-416b-bd62-7846e6219564
4,0ba81299-2353-4048-b62d-b8399a50c1fa,authored_by,cc6cd651-d5eb-49f3-a497-22f0c85fb960,fdea87e2-42e3-43bb-bf3c-9ea35facfae5


In [32]:
print(len(author_to_institution))
author_to_institution.head()

164136


Unnamed: 0,~id,~label,~from,~to
0,8dd10a68-7b90-43b1-8644-d656d404fd97,affiliated_with,1d798c5d-dedf-405f-9246-eed1cf38ed6e,53f7b823-5bae-4dc2-bd71-f647309b191f
1,2ded134a-6365-4282-8343-25c995a88129,affiliated_with,1d304ff5-6f22-40e7-83c4-a40cc6b40af2,483b277e-323a-4ddc-bba7-ec3675b73d25
2,5a87b0c9-325e-44a3-932d-fc74b7f78d56,affiliated_with,dd4d756c-d58e-48ab-9e0a-4198f5b63f65,137b6494-457a-4ed8-bc24-3b5c7b6ce367
3,b566b461-5e07-4d05-b1f5-b20cc579b923,affiliated_with,ab05d4a7-c9f2-41cd-94fd-37630b7b159e,56dbc969-3abc-487a-ac8d-2cfc5a024a54
4,22a34a5d-2c82-437d-be06-fd7dbf27d2c4,affiliated_with,e4a37fbb-637e-407b-8589-98a6b8b8603d,811035d4-7799-408e-96fc-217abb7922b6


In [40]:
# give me all the rows where '~to' of paper_to_author is equal to '~from' of author_to_institution

paper_to_institution= pd.merge(paper_to_author, author_to_institution, left_on='~to', right_on='~from')
print(len(paper_to_institution))
paper_to_institution.head()

482734


Unnamed: 0,~id_x,~label_x,~from_x,~to_x,~id_y,~label_y,~from_y,~to_y
0,bf5dab20-5552-4305-afce-ca3cdc286840,authored_by,a7a37b88-3851-4af2-b624-ead921eb37da,f9ddd3b1-ce1d-4f09-a462-dfb4d0bc3d14,02fda3d8-5e86-4ba1-8782-8963847529d8,affiliated_with,f9ddd3b1-ce1d-4f09-a462-dfb4d0bc3d14,5f649689-473b-4e1b-93c3-5757cf0f75a1
1,df36b559-d9c6-4d3f-ab40-ed6c640d9650,authored_by,1bad150b-32bd-46e4-8d56-b3097285819d,23061148-48ab-43e8-bf36-522ae98fc0e1,b7ae4b77-f85b-4b66-b06b-8d66db5fa394,affiliated_with,23061148-48ab-43e8-bf36-522ae98fc0e1,71e38b36-b115-45be-8e02-ece8defec621
2,49ad106d-be04-4fc9-9f2e-4b0d195347bd,authored_by,520010ce-6cfe-4d1d-85c9-006ce29289ef,3f572202-06e9-42f5-b002-e899d36dad4e,a23f144e-5152-4f2c-ba00-7e7963c757d1,affiliated_with,3f572202-06e9-42f5-b002-e899d36dad4e,aa24f219-ca4f-44d6-987a-bb3d78da9732
3,18f5e393-95ad-4059-979d-bab07e730df8,authored_by,52efaa04-db11-49d4-b543-b8b29f7a9f7e,40267fbd-05ce-4760-b074-406e118f6e5e,fda0e3c1-f19a-41c1-8105-d076e19fc8fb,affiliated_with,40267fbd-05ce-4760-b074-406e118f6e5e,4d98477b-5ec2-4aef-b903-faa4d0e229f9
4,18f5e393-95ad-4059-979d-bab07e730df8,authored_by,52efaa04-db11-49d4-b543-b8b29f7a9f7e,40267fbd-05ce-4760-b074-406e118f6e5e,ca580ab4-9d9c-4ad3-b108-7f93de2b660a,affiliated_with,40267fbd-05ce-4760-b074-406e118f6e5e,c4ffabb7-f2a8-4015-ac28-d519d1fe39cf


In [41]:
# keep only the '~from_x' and '~to_y' columns. Chnage the names to 'paper_id' and 'institution_id'
paper_to_institution = paper_to_institution[['~from_x', '~to_y']]
paper_to_institution.columns = ['paper_id', 'institution_id']
paper_to_institution.head()

Unnamed: 0,paper_id,institution_id
0,a7a37b88-3851-4af2-b624-ead921eb37da,5f649689-473b-4e1b-93c3-5757cf0f75a1
1,1bad150b-32bd-46e4-8d56-b3097285819d,71e38b36-b115-45be-8e02-ece8defec621
2,520010ce-6cfe-4d1d-85c9-006ce29289ef,aa24f219-ca4f-44d6-987a-bb3d78da9732
3,52efaa04-db11-49d4-b543-b8b29f7a9f7e,4d98477b-5ec2-4aef-b903-faa4d0e229f9
4,52efaa04-db11-49d4-b543-b8b29f7a9f7e,c4ffabb7-f2a8-4015-ac28-d519d1fe39cf


In [44]:
# for each institution_id, get the name from institution_nodes and their country. Replace the institution_id with the name and add a country column
knowledge_graph_country = []
knowledge_graph_institution = []
for _, row in paper_to_institution.iterrows():
    paper_id = row['paper_id']
    institution_id = row['institution_id']
    
    # Get the insitution name
    institution_name = institution_nodes[institution_nodes['~id'] == institution_id]['institution:String'].values[0]

    # Get the institution country
    institution_country = institution_nodes[institution_nodes['~id'] == institution_id]['country:String'].values[0]

    # Get the paper title
    matching_rows = title_to_paper[title_to_paper['paper_id'] == paper_id]['title'].values

    if len(matching_rows) > 0:
        paper_title = matching_rows[0]
    else:
        continue

    # Create a triplet
    triplet = (paper_title, f'from country', institution_country)
    knowledge_graph_country.append(triplet)

    triplet = (paper_title, f'from institution', institution_name)
    knowledge_graph_institution.append(triplet)


print('length of knowledge graph country: ' + str(len(knowledge_graph_country)))
print('length of knowledge graph institution: ' + str(len(knowledge_graph_institution)))

# write the knowledge graph to a txt file
with open('KG_country.txt', 'w') as f:
    for triplet in knowledge_graph_country:
        f.write(f'{triplet[0]},{triplet[1]},{triplet[2]}\n')

with open('KG_institution.txt', 'w') as f:
    for triplet in knowledge_graph_institution:
        f.write(f'{triplet[0]},{triplet[1]},{triplet[2]}\n')





TypeError: can only concatenate str (not "int") to str

In [45]:
print('length of knowledge graph country: ' + str(len(knowledge_graph_country)))
print('length of knowledge graph institution: ' + str(len(knowledge_graph_institution)))

# write the knowledge graph to a txt file
with open('KG_country.txt', 'w') as f:
    for triplet in knowledge_graph_country:
        f.write(f'{triplet[0]},{triplet[1]},{triplet[2]}\n')

with open('KG_institution.txt', 'w') as f:
    for triplet in knowledge_graph_institution:
        f.write(f'{triplet[0]},{triplet[1]},{triplet[2]}\n')

length of knowledge graph country: 476865
length of knowledge graph institution: 476865


Author Knowledge graph

In [46]:
title_to_paper_file = './Data/Data/title_to_paper_map.json'
paper_to_author_file = './Data/Data/edges/paper_to_author.csv'
author_nodes_file = './Data/Data/nodes/paper_author_nodes.csv'

# Load the title to paper map
with open(title_to_paper_file) as f:
    title_to_paper = json.load(f)

# convert the json to a pandas dataframe
title_to_paper = pd.DataFrame(title_to_paper.items(), columns=['title', 'paper_id'])

# Load the author nodes
author_nodes = pd.read_csv(author_nodes_file)

# Load the paper to author edges
paper_to_author = pd.read_csv(paper_to_author_file)

In [47]:
print(len(author_nodes))
author_nodes.head()

234827


Unnamed: 0,~id,~label,first:String,last:String,email:String,suffix:String,full_name:String
0,cb49ec84-f706-4f00-bde9-906dcb4d4e33,Author,Rahman,Arafat,,,Rahman Arafat
1,ba40c47b-6088-4e90-887c-93e8a9683874,Author,,Oany,,,Oany
2,10630c66-de26-4bf1-afc1-c4c35782a235,Author,Tahmina,Pervin,,,Tahmina Pervin
3,b8bd4a32-5c68-4c7d-85b6-e31c9f49e705,Author,Mamun,Mia,,,Mamun Mia
4,22b3a4cf-b448-407f-95a3-8ce4d7ece014,Author,Motaher,Hossain,,,Motaher Hossain


In [48]:
print(len(paper_to_author))
paper_to_author.head()

340789


Unnamed: 0,~id,~label,~from,~to
0,bf5dab20-5552-4305-afce-ca3cdc286840,authored_by,a7a37b88-3851-4af2-b624-ead921eb37da,f9ddd3b1-ce1d-4f09-a462-dfb4d0bc3d14
1,df36b559-d9c6-4d3f-ab40-ed6c640d9650,authored_by,1bad150b-32bd-46e4-8d56-b3097285819d,23061148-48ab-43e8-bf36-522ae98fc0e1
2,49ad106d-be04-4fc9-9f2e-4b0d195347bd,authored_by,520010ce-6cfe-4d1d-85c9-006ce29289ef,3f572202-06e9-42f5-b002-e899d36dad4e
3,91601a14-34d4-4e23-a664-a2ecad5c98db,authored_by,95267dc1-8d53-4fd5-bc2f-ac809f0cbcdd,4b35351a-41fd-416b-bd62-7846e6219564
4,0ba81299-2353-4048-b62d-b8399a50c1fa,authored_by,cc6cd651-d5eb-49f3-a497-22f0c85fb960,fdea87e2-42e3-43bb-bf3c-9ea35facfae5


In [49]:
knowledge_graph_author = []

for _, row in paper_to_author.iterrows():
    paper_id = row['~from']
    author_id = row['~to']
    
    # Get the author name
    author_name = author_nodes[author_nodes['~id'] == author_id]['full_name:String'].values[0]

    # Get the paper title
    matching_rows = title_to_paper[title_to_paper['paper_id'] == paper_id]['title'].values

    if len(matching_rows) > 0:
        paper_title = matching_rows[0]
    else:
        continue

    # Create a triplet
    triplet = (paper_title, f'written by', author_name)
    knowledge_graph_author.append(triplet)

print('length of knowledge graph author: ' + str(len(knowledge_graph_author)))

# write the knowledge graph to a txt file
with open('KG_author.txt', 'w') as f:
    for triplet in tqdm(knowledge_graph_author):
        f.write(f'{triplet[0]},{triplet[1]},{triplet[2]}\n')

length of knowledge graph author: 335138


100%|██████████| 335138/335138 [00:00<00:00, 653062.50it/s]


Concept Knowledge graph

In [50]:
title_to_paper_file = './Data/Data/title_to_paper_map.json'
paper_to_concept_file = './Data/Data/edges/paper_to_concept.csv'
concept_nodes_file = './Data/Data/nodes/concept_nodes.csv'

# Load the title to paper map
with open(title_to_paper_file) as f:
    title_to_paper = json.load(f)

# convert the json to a pandas dataframe
title_to_paper = pd.DataFrame(title_to_paper.items(), columns=['title', 'paper_id'])

# Load the concept nodes
concept_nodes = pd.read_csv(concept_nodes_file)

# Load the paper to concept edges
paper_to_concept = pd.read_csv(paper_to_concept_file)

In [51]:
print(len(concept_nodes))
concept_nodes.head()

76804


Unnamed: 0,~id,~label,entity:String,concept:String
0,4a8cc024-1054-48a2-83fb-bc04dbe063da,Concept,dx name,SARS disease
1,ec39fbd1-445f-4638-9add-280c7176d474,Concept,system organ site,lung
2,194cc23f-9953-4051-943c-f6cfd7fd10b4,Concept,system organ site,s1 site
3,f8ac6eaa-0e8f-4e40-be73-e988a4a39f08,Concept,system organ site,hand
4,034db645-a28e-44a9-9881-c989d3647afb,Concept,system organ site,membrane


In [52]:
print(len(paper_to_concept))
paper_to_concept.head()

1836969


Unnamed: 0,~id,~label,~from,~to,score:Float
0,cd687ca0-ed29-4ea7-b853-2f099bf7d70d,associated_concept,387120fc-4292-4426-a406-f09a64513255,018d1317-06b0-4326-bf9b-9ac897c23250,0.943973
1,540ef8f6-dadb-41e8-97cd-43c13fd92a65,associated_concept,f2a54126-f4f8-496a-9933-456aec4bcd1b,ca014ee9-09d6-4af5-b498-0361b660a2a5,0.525614
2,df193c55-f7a7-4eb2-8c44-e77d1b4e4cf9,associated_concept,f3d0ffd4-1c16-402f-9d18-8333dc4fc45f,e12ea033-d5a8-4bf7-b6ca-f94abcaa23a6,0.911861
3,2bd93e96-e442-43d7-a41b-9e1125289041,associated_concept,665a4388-fba9-4520-a06a-3a528e3961c3,b3fd0907-dee9-4260-8ece-ac32b685767d,0.688432
4,f98467c1-dc79-401f-ba00-a7b7c68c07b9,associated_concept,7d18d7fd-532b-4d5d-80ec-d15090a4b751,bdbbfb9e-5a46-41e4-9457-e56f395597ba,0.620543


In [55]:
knowledge_graph_concept = []

for _, row in paper_to_concept.iterrows():
    paper_id = row['~from']
    concept_id = row['~to']
    score = row['score:Float']
    
    # Get the concept name
    matching_rows = concept_nodes[concept_nodes['~id'] == concept_id]['concept:String'].values

    if len(matching_rows) > 0:
        concept_name = matching_rows[0]
    else:
        continue

    # Get the paper title
    matching_rows = title_to_paper[title_to_paper['paper_id'] == paper_id]['title'].values

    if len(matching_rows) > 0:
        paper_title = matching_rows[0]
    else:
        continue

    # Create a triplet
    triplet = (paper_title, f'associated concept with score: {score}', concept_name)
    knowledge_graph_concept.append(triplet)

print('length of knowledge graph concept: ' + str(len(knowledge_graph_concept)))

# write the knowledge graph to a txt file
with open('KG_concept.txt', 'w') as f:
    for triplet in tqdm(knowledge_graph_concept):
        f.write(f'{triplet[0]},{triplet[1]},{triplet[2]}\n')

length of knowledge graph concept: 1783786


100%|██████████| 1783786/1783786 [00:02<00:00, 661769.07it/s]


In [54]:
concept_id

'142031e9-ff52-4ef9-8807-a1c2e4b1e501'

Sentiment analysis

In [3]:
import json
import os
from tqdm import tqdm

In [4]:
pdfs = os.listdir('document_parses/pdf_json')
pmcs = os.listdir('document_parses/pmc_json')

len(pdfs), len(pmcs)

(401212, 315742)

In [None]:
for pdf in pdfs:
    with open(f'document_parses/pdf_json/{pdf}') as f:
        data = json.load(f)
        title = data['metadata']['title']
