## Scrape the information of the 20 most recent papers

In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json

# Maximum number of papers (rows of the publication list) to visit
NUM_PAPERS = 20

# Path to the ChromeDriver executable that matches the installed Chrome browser version
chromedriver_path = "E:\\wymApp\\flask-llm-pdf-analyzer\\driver\\chromedriver.exe"

# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run Chrome in headless mode

# Set up the ChromeDriver service
service = Service(chromedriver_path)

# Define the URL for Prof. Yang Jinglei's Google Scholar profile
url = "https://scholar.google.com/citations?view_op=list_works&hl=en&hl=en&user=7wCHYtgAAAAJ&sortby=pubdate"

# XPath locators used while scraping. {row} is the 1-based row index in the
# publication list.
TITLE_LINK_XPATH = '//*[@id="gsc_a_b"]/tr[{row}]/td[1]/a'
COAUTHORS_XPATH = '//*[@id="gsc_oci_table"]/div[1]/div[2]'
# NOTE(review): assumes the third row of the detail table is the journal
# field — confirm this holds for every publication type.
JOURNAL_XPATH = '//*[@id="gsc_oci_table"]/div[3]/div[2]'

# One record per (author, paper) pair; serialized to JSON at the end
dataset = []

# Launch the Chrome browser
driver = webdriver.Chrome(service=service, options=chrome_options)
try:
    # Load the Google Scholar profile page
    driver.get(url)

    # Wait for the page to load and find the list of papers
    wait = WebDriverWait(driver, 10)
    papers = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".gsc_a_tr")))

    # Never index past the rows that are actually on the page
    paper_count = min(NUM_PAPERS, len(papers))

    for row in range(1, paper_count + 1):
        # Re-locate the link on every iteration: elements found before a
        # navigation are stale once the list page has been left and re-entered.
        title_element = wait.until(
            EC.element_to_be_clickable((By.XPATH, TITLE_LINK_XPATH.format(row=row)))
        )
        title = title_element.text

        # Jump to the paper's detail view and wait until the list link is gone
        title_element.click()
        wait.until(EC.staleness_of(title_element))

        # Extract the coauthors and journal; wait for the first detail field
        # instead of assuming it is already present
        coauthors = wait.until(
            EC.presence_of_element_located((By.XPATH, COAUTHORS_XPATH))
        ).text
        journal = driver.find_element(By.XPATH, JOURNAL_XPATH).text

        # The author field is a single comma-separated string
        coauthor_list = coauthors.split(', ')

        # Output author, paper, journal as one record per author
        for author in coauthor_list:
            dataset.append({
                'author': author,
                'paper_title': title,
                'journal': journal
            })

        # Jump back to the publication list
        driver.back()
finally:
    # Quit the browser even when scraping fails, so that no headless Chrome
    # or chromedriver process is left running
    driver.quit()

# Pretty print the records to data/author_graph.json
with open('data/author_graph.json', 'w', encoding='utf-8') as outfile:
    json.dump(dataset, outfile, indent=4)

ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

## Construct Neo4j graph demo

In [19]:
from dotenv import load_dotenv
import os

# Pull the variables defined in the .env file into the process environment
load_dotenv()

# Database password, later used as the Neo4j credential; None when the
# variable is not set
db_password = os.environ.get("PAPER_MATCH_DB_PASSWORD")


In [20]:
import json

# Open the JSON file and load the data.
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, which is not necessarily UTF-8.
# NOTE(review): the scraping cell writes data/author_graph.json, while this
# cell reads data/data.json — confirm this is the intended input file.
with open('data/data.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# Print the data
print(dataset)

[{'author': 'Miracle Hope Adegun', 'paper_title': 'Anisotropic thermally superinsulating boron nitride composite aerogel for building thermal management', 'journal': 'Composites Part A: Applied Science and Manufacturing'}, {'author': 'Kit-Ying Chan', 'paper_title': 'Anisotropic thermally superinsulating boron nitride composite aerogel for building thermal management', 'journal': 'Composites Part A: Applied Science and Manufacturing'}, {'author': 'Jie Yang', 'paper_title': 'Anisotropic thermally superinsulating boron nitride composite aerogel for building thermal management', 'journal': 'Composites Part A: Applied Science and Manufacturing'}, {'author': 'Harun Venkatesan', 'paper_title': 'Anisotropic thermally superinsulating boron nitride composite aerogel for building thermal management', 'journal': 'Composites Part A: Applied Science and Manufacturing'}, {'author': 'Eunyoung Kim', 'paper_title': 'Anisotropic thermally superinsulating boron nitride composite aerogel for building therm

In [21]:
from neo4j import GraphDatabase

# Neo4j connection settings
# uri = "neo4j+s://0c5ddeaa.databases.neo4j.io"  # alternative: remote server URI
uri = "bolt://localhost:7687"  # local server reached over the Bolt protocol
username = "neo4j"
password = db_password  # value read from the environment in an earlier cell

# Driver object shared by the cells below
db_driver = GraphDatabase.driver(
    uri,
    auth=(username, password),
)

# Create the dataset in Neo4j
def create_dataset(tx, data):
    """Merge author, paper and journal nodes and their relationships.

    All records are sent in a single query and expanded server-side with
    UNWIND, so the number of round trips does not grow with the number of
    records. MERGE is used throughout, so re-running the function with the
    same data does not create duplicate nodes or relationships.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        data: Iterable of mappings, each with the keys ``"author"``,
            ``"paper_title"`` and ``"journal"``.

    Raises:
        KeyError: If a record lacks one of the three required keys.
    """
    # Keep only the fields the query uses, so extra keys in a record are
    # never shipped to the database.
    rows = [
        {
            "author": record["author"],
            "paper_title": record["paper_title"],
            "journal": record["journal"],
        }
        for record in data
    ]

    # Nothing to write: skip the query entirely.
    if not rows:
        return

    tx.run(
        """
        UNWIND $rows AS row
        MERGE (a:Author {name: row.author})
        MERGE (p:Paper {title: row.paper_title})
        MERGE (j:Journal {name: row.journal})
        MERGE (a)-[:IS_AUTHOR]->(p)
        MERGE (p)-[:PUBLISHED_IN]->(j)
        """,
        rows=rows,
    )

# Execute the dataset creation through the session's execute_write call.
# try/finally guarantees the driver is closed even when opening the session
# or running the write raises.
try:
    with db_driver.session() as session:
        session.execute_write(create_dataset, dataset)
finally:
    # Close the Neo4j driver
    db_driver.close()