In [1]:
import os

import pandas as pd
from neo4j import GraphDatabase
from tqdm.auto import tqdm

import neo4j_utils as nu
import semantic_scholar_api as ss_api

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the seed paper titles (one per line) and look each one up on Semantic Scholar.
# `data` keeps the raw API records; `df` is the flattened tabular view of the same
# records, in the same order.
file_path = 'C:/Users/tjker/Desktop/Research/Projects/lit_review/data/paper_titles.txt'
with open(file_path, 'r') as file:
    # Strip surrounding whitespace and drop blank lines so that empty strings are
    # never sent to the API as search queries.
    paper_titles = [title for title in (line.strip() for line in file) if title]

data = []
for title in tqdm(paper_titles, desc="Searching titles"):
    # Only the API call sits inside the try, so a failed lookup is reported and
    # the loop moves on to the next title.
    try:
        paper_data = ss_api.exponential_backoff_retry(ss_api.search_paper_by_title, title)
    except ss_api.RateLimitExceededError:
        print("Exceeded rate limit. Please try again later.")
    except Exception as e:
        print(f"An error occurred: {e}")
    else:
        # A falsy result is treated as "no match" and skipped.
        if paper_data:
            data.append(paper_data)

df = pd.json_normalize(data)

100%|██████████| 24/24 [00:09<00:00,  2.63it/s]


In [3]:
# Build the citation graph in Neo4j from the records collected in `data`:
#   pass 1 - seed papers (level 1) plus their authors and AUTHORED relationships
#   pass 2 - the papers each seed paper references (level 2) plus CITES relationships
#
# Connection settings can be overridden with NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD.
# The fallbacks reproduce the values this notebook used before.
# NOTE(review): the fallback password is still stored in the notebook; remove it
# (and rely on the environment variable) before sharing or committing.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
driver = GraphDatabase.driver(
    uri,
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "neo4j_is_the_best"),
    ),
)
cited_papers = {}

# Loop-invariant values used by the reference pass.
reference_fields = ["title", "abstract", "citationCount", "publicationDate"]
find_paper_query = "MATCH (p:Paper {paperId: $paperId}) RETURN p"

with driver.session() as session:
    # Pass 1: one node per seed paper, one node per author, and the link between them.
    # Iterating `data` directly (instead of wrapping enumerate) lets tqdm show the total.
    for paper in tqdm(data, desc="Seed papers"):
        if paper is None:
            continue

        # Authors become their own nodes, so they are excluded from the paper's properties.
        paper_properties = {key: value for key, value in paper.items() if key != 'authors'}
        paper_properties['level'] = 1
        nu.get_or_create_paper_node(session, paper_properties)

        # NOTE(review): an author record without an authorId is passed through
        # unchanged here - confirm how neo4j_utils handles that case.
        for author in paper['authors']:
            nu.get_or_create_author_node(session, author)
            session.execute_write(
                nu.create_authored_rel,
                {"paperId": paper['paperId']},
                {"authorId": author['authorId']}
            )

    # Pass 2: fetch each seed paper's references and link them.
    for paper in tqdm(data, desc="References"):
        if paper is None:
            continue
        try:
            # Read the id from the record itself rather than from `df`, so this loop
            # does not depend on `df` still being row-aligned with `data`.
            citing_id = paper['paperId']
            citation_data = ss_api.exponential_backoff_retry(
                ss_api.get_paper_references,
                citing_id,
                fields=reference_fields,
            )
            if not citation_data:
                # No result for this paper; skip it, as the title lookup does.
                continue

            for reference in citation_data['data']:
                cited_paper = reference['citedPaper']
                # References that could not be resolved to a paper may come back
                # without a paperId. They cannot be matched or linked, and letting
                # one raise would abort every remaining reference of this paper.
                if not cited_paper or not cited_paper.get('paperId'):
                    continue
                cited_id = cited_paper['paperId']

                # Only papers that are not in the graph yet are created, tagged level 2;
                # existing nodes (e.g. seed papers) keep their current properties.
                if session.run(find_paper_query, paperId=cited_id).single() is None:
                    cited_paper['level'] = 2
                    nu.get_or_create_paper_node(session, cited_paper)

                session.execute_write(
                    nu.create_cites_rel,
                    {"paperId": citing_id},
                    {"paperId": cited_id}
                )
        except ss_api.RateLimitExceededError:
            print("Exceeded rate limit. Please try again later.")
        except Exception as e:
            print(f"An error occurred: {e}")

24it [00:03,  6.45it/s]
100%|██████████| 24/24 [00:36<00:00,  1.51s/it]


In [4]:
# Shut down the Neo4j driver created in the graph-building cell.
driver.close()

: 