## Scrape the information of the 20 most recent papers

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import ipywidgets as widgets

NUM_PAPERS = 20

# Path to the ChromeDriver executable that matches the installed Chrome browser version.
# Raw string: "\H" and "\c" are invalid escape sequences in an ordinary literal.
chromedriver_path = r"D:\Hackust\chromedriver.exe"

# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run Chrome in headless mode

# Set up the ChromeDriver service
service = Service(chromedriver_path)

# Launch the Chrome browser
driver = webdriver.Chrome(service=service, options=chrome_options)

# Define the URL for Prof. Yang Jinglei's Google Scholar profile
# (the query string carries sortby=pubdate)
url = "https://scholar.google.com/citations?view_op=list_works&hl=en&hl=en&user=7wCHYtgAAAAJ&sortby=pubdate"

# One record per (author, paper) pair
dataset = []

try:
    # Load the Google Scholar profile page
    driver.get(url)

    # Wait for the page to load and find the list of papers
    wait = WebDriverWait(driver, 10)
    papers = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".gsc_a_tr")))

    # The page may list fewer than NUM_PAPERS rows; never index past what is there.
    for row in range(1, min(NUM_PAPERS, len(papers)) + 1):
        # Look the link up again on every pass: the loop navigates away and back,
        # so element handles from a previous pass are no longer usable.
        title_element = wait.until(
            EC.presence_of_element_located((By.XPATH, f'//*[@id="gsc_a_b"]/tr[{row}]/td[1]/a'))
        )
        title = title_element.text

        # jump to the new page
        title_element.click()
        wait.until(EC.staleness_of(title_element))

        # extract the coauthors and journal
        # NOTE(review): fields are addressed by position (1st and 3rd entry of
        # gsc_oci_table); confirm the 3rd entry is the journal for every record type.
        coauthors = wait.until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="gsc_oci_table"]/div[1]/div[2]'))
        ).text
        journal = driver.find_element(By.XPATH, '//*[@id="gsc_oci_table"]/div[3]/div[2]').text

        # output author, papers, journal in json format
        dataset.extend(
            {
                'author': author,
                'paper_title': title,
                'journal': journal
            }
            for author in coauthors.split(', ')
        )

        # jump back to the previous page
        driver.back()
finally:
    # Quit the browser even when a lookup or wait fails
    driver.quit()

# pretty print the json object
print(json.dumps(dataset, indent = 4))

## construct neo4j graph demo

In [None]:
from dotenv import load_dotenv
import os

# Read the .env file into the process environment
load_dotenv()

# Database password; stays None when DB_PASSWORD is not set
db_password = os.environ.get("DB_PASSWORD")


In [None]:
import json

# Open the JSON file and load the data.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default code page.
with open('data.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# Print the data
print(dataset)

[{'author': 'Miracle Hope Adegun', 'paper_title': 'Anisotropic thermally superinsulating boron nitride composite aerogel for building thermal management', 'journal': 'Composites Part A: Applied Science and Manufacturing'}, {'author': 'Kit-Ying Chan', 'paper_title': 'Anisotropic thermally superinsulating boron nitride composite aerogel for building thermal management', 'journal': 'Composites Part A: Applied Science and Manufacturing'}, {'author': 'Jie Yang', 'paper_title': 'Anisotropic thermally superinsulating boron nitride composite aerogel for building thermal management', 'journal': 'Composites Part A: Applied Science and Manufacturing'}, {'author': 'Harun Venkatesan', 'paper_title': 'Anisotropic thermally superinsulating boron nitride composite aerogel for building thermal management', 'journal': 'Composites Part A: Applied Science and Manufacturing'}, {'author': 'Eunyoung Kim', 'paper_title': 'Anisotropic thermally superinsulating boron nitride composite aerogel for building therm

In [None]:
from neo4j import GraphDatabase

# Neo4j connection settings
# Update with your Neo4j server URI
uri = "neo4j+s://0c5ddeaa.databases.neo4j.io"
# Update with your Neo4j username
username = "neo4j"
# Password comes from the db_password variable set earlier in the file
password = db_password

# Open the driver with basic (user, password) credentials
credentials = (username, password)
db_driver = GraphDatabase.driver(uri, auth=credentials)

# Create the dataset in Neo4j
def create_dataset(tx, data):
    """Merge author/paper/journal records into the graph with one query.

    Args:
        tx: Object exposing ``run(query, **parameters)``; in this file it is
            the transaction passed in by ``session.execute_write``.
        data: Iterable of dicts holding the keys ``author``, ``paper_title``
            and ``journal``.

    Raises:
        KeyError: If a record lacks one of the three keys.
    """
    # Build plain rows up front so a malformed record fails before anything
    # is sent to the database.
    rows = [
        {
            "author": record["author"],
            "paper_title": record["paper_title"],
            "journal": record["journal"],
        }
        for record in data
    ]
    if not rows:
        return

    # UNWIND applies the MERGE clauses once per row inside a single statement,
    # instead of issuing one tx.run call per record.
    tx.run(
        """
        UNWIND $rows AS row
        MERGE (a:Author {name: row.author})
        MERGE (p:Paper {title: row.paper_title})
        MERGE (j:Journal {name: row.journal})
        MERGE (a)-[:IS_AUTHOR]->(p)
        MERGE (p)-[:PUBLISHED_IN]->(j)
        """,
        rows=rows,
    )

# Execute the dataset creation
try:
    with db_driver.session() as session:
        session.execute_write(create_dataset, dataset)
finally:
    # Close the Neo4j driver. This must be db_driver: the name `driver` is
    # bound to the Selenium browser elsewhere in this file.
    db_driver.close()


KeyboardInterrupt



In [None]:
from scholarly import scholarly, ProxyGenerator

# Route scholarly's requests through free proxies
pg = ProxyGenerator()
pg.FreeProxies(timeout=10, wait_time=1200)
scholarly.use_proxy(pg)


def _publication_year(publication):
    """Return the entry's ``bib['pub_year']`` as an int, or 0 if absent or non-numeric."""
    try:
        return int(publication.get("bib", {}).get("pub_year", 0))
    except (TypeError, ValueError):
        return 0


def get_recent_publications(person_name, limit=10):
    """Return the newest publications of the first author matching *person_name*.

    The author's publication list is ordered by ``bib['pub_year']``, newest
    first, before it is truncated. Taking a slice from the tail of the list
    as delivered does not guarantee recency and skipped the final entry.
    Entries with the same year keep their delivered order, and entries
    without a usable year sort last.

    Args:
        person_name: Author name handed to ``scholarly.search_author``.
        limit: Maximum number of publications to return (default 10).

    Returns:
        A list of at most *limit* publication dicts.

    Raises:
        LookupError: If the author search yields no result.
    """
    first_match = next(scholarly.search_author(person_name), None)
    if first_match is None:
        raise LookupError(f"no Google Scholar author found for {person_name!r}")

    author = scholarly.fill(first_match)
    publications = author["publications"]
    return sorted(publications, key=_publication_year, reverse=True)[:limit]


# Example usage
person_name = "Jinglei Yang"
recent_publications = get_recent_publications(person_name)

print(recent_publications)

[{'container_type': 'Publication', 'source': <PublicationSource.AUTHOR_PUBLICATION_ENTRY: 'AUTHOR_PUBLICATION_ENTRY'>, 'bib': {'title': 'Anisotropic thermally superinsulating boron nitride composite aerogel for building thermal management', 'pub_year': '2023', 'citation': 'Composites Part A: Applied Science and Manufacturing 169, 107522, 2023'}, 'filled': False, 'author_pub_id': '7wCHYtgAAAAJ:8dzOF9BpDQoC', 'num_citations': 0}, {'container_type': 'Publication', 'source': <PublicationSource.AUTHOR_PUBLICATION_ENTRY: 'AUTHOR_PUBLICATION_ENTRY'>, 'bib': {'title': 'Development of Sulfonamide‐Functionalized Charge‐Reversal AIE Photosensitizers for Precise Photodynamic Therapy in the Acidic Tumor Microenvironment', 'pub_year': '2023', 'citation': 'Advanced Functional Materials, 2300746, 2023'}, 'filled': False, 'author_pub_id': '7wCHYtgAAAAJ:hvmnpdAuIbkC', 'num_citations': 0}, {'container_type': 'Publication', 'source': <PublicationSource.AUTHOR_PUBLICATION_ENTRY: 'AUTHOR_PUBLICATION_ENTRY'>

In [None]:
import json
# Pretty-print the recent_publications list as JSON with 4-space indentation
print(json.dumps(recent_publications, indent=4))

In [None]:
import ipywidgets as widgets
# Create a text widget; later code reads the typed paper title from `text.value`
text = widgets.Text(description="Input title")
# Display widget: a bare expression on the last line of a notebook cell is rendered
text

Text(value='', description='Input title')

In [None]:
from scholarly import scholarly
import json

def get_cited_articles(publication_title):
    """Return the first Google Scholar search hit for *publication_title*.

    Despite its name, this function does not fetch citing articles; it
    returns the first publication record that ``scholarly.search_pubs``
    yields for the title.

    Args:
        publication_title: Title text to search for.

    Returns:
        The first record produced by the search.

    Raises:
        ValueError: If the title is empty or only whitespace (for example an
            unfilled input box).
        LookupError: If the search yields no result.
    """
    if not publication_title or not publication_title.strip():
        raise ValueError("publication_title must be a non-empty string")

    first_hit = next(scholarly.search_pubs(publication_title), None)
    if first_hit is None:
        raise LookupError(f"no Google Scholar result for {publication_title!r}")
    return first_hit

# Example usage: search for the title currently typed into the `text` widget
publication_title = text.value
cited_articles = get_cited_articles(publication_title)

# Pretty-print the returned record as JSON with 4-space indentation
print(json.dumps(cited_articles, indent=4))

{
    "container_type": "Publication",
    "source": "PUBLICATION_SEARCH_SNIPPET",
    "bib": {
        "title": "Inner blast response of fiber reinforced aluminum tubes",
        "author": [
            "X Li",
            "R Xu",
            "X Zhang",
            "H Zhang",
            "J Yang"
        ],
        "pub_year": "2023",
        "venue": "International Journal of Impact \u2026",
        "abstract": "In this work, the dynamic failure of Fiber Reinforced Metal Tubes (FRMTs) under inner blast load was experimentally investigated. The FRMTs was prepared by winding the basalt fiber or H-glass fiber onto the aluminum lining through filament winding process. The influence of explosive mass, winding angle and number of layers on the failure modes of FRMTs were obtained and discussed. The results showed that the fiber reinforced metal tubes performed better anti-blast performance compared to metallic tubes, in terms of deformation amount"
    },
    "filled": false,
    "gsrank":

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import ipywidgets as widgets
from time import sleep

NUM_PAPERS = 20

# Build the target URL before starting Chrome, so a record without a
# "citedby_url" entry raises KeyError without leaving a browser open.
# NOTE(review): presumably the key is absent for papers with no citations - confirm.
url = "https://scholar.google.com" + cited_articles["citedby_url"]

# Path to the ChromeDriver executable that matches the installed Chrome browser version.
# Raw string: "\H" and "\c" are invalid escape sequences in an ordinary literal.
chromedriver_path = r"D:\Hackust\chromedriver.exe"

# Configure Chrome options (headless mode is left off, so a visible window opens)
chrome_options = Options()

# Set up the ChromeDriver service
service = Service(chromedriver_path)

# Launch the Chrome browser
driver = webdriver.Chrome(service=service, options=chrome_options)

try:
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    # Keep the window open for ten seconds before closing it
    sleep(10)
finally:
    # Close the browser even if loading the page fails
    driver.quit()

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import ipywidgets as widgets


# Path to the ChromeDriver executable that matches the installed Chrome browser version.
# Raw string: "\H" and "\c" are invalid escape sequences in an ordinary literal.
chromedriver_path = r"D:\Hackust\chromedriver.exe"

# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run Chrome in headless mode

# Set up the ChromeDriver service
service = Service(chromedriver_path)

# Launch the Chrome browser
driver = webdriver.Chrome(service=service, options=chrome_options)

# Google Scholar landing page
url = "https://scholar.google.com/"

try:
    # Load the page
    driver.get(url)

    # Wait up to 10 s for the element with id "gs_hdr_tsi" to be present.
    # NOTE(review): presumably the search input; `papers` holds that match
    # list, not a list of papers - confirm and rename when this cell is finished.
    wait = WebDriverWait(driver, 10)
    papers = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="gs_hdr_tsi"]')))
except Exception:
    # Do not leave a headless Chrome process behind when loading fails.
    # On success the browser stays open, as before.
    driver.quit()
    raise




## constructing interactive demo using ipywidgets

In [6]:
import ipywidgets as widgets
# Create a text widget labelled "Input title"
text = widgets.Text(description="Input title")
# Display widget: a bare expression on the last line of a notebook cell is rendered
text

Text(value='', description='Input title')

In [None]:
from selenium import webdriver
from urllib import parse
from time import sleep

class GetBibs():
  """Look up a citation text for a paper title by driving Chrome through Google Scholar."""

  def __init__(self, driver_path, option_path, ie_search_url, gg_search_url) -> None:
    """Open a Chrome window and store the search URL prefixes.

    Args:
      driver_path: Accepted but not used; Chrome is started without an
        explicit driver path.
      option_path: Accepted but not used; no user-data directory is passed
        to Chrome.
      ie_search_url: Stored on the instance; no method of this class reads it.
      gg_search_url: Google Scholar search URL prefix; the URL-quoted paper
        title is appended to it.
    """
    self.ie_search_url = ie_search_url
    self.gg_search_url = gg_search_url

    option = webdriver.ChromeOptions()
    self.browser = webdriver.Chrome(options = option)   # open chrome
    self.browser.set_window_size(800,800)

  def _retry(self, action, attempts=100, delay=0.1):
    """Call *action* until it stops raising a WebDriver error.

    Returns:
      ``(True, result)`` on success, ``(False, None)`` once *attempts* calls
      have failed. *delay* seconds are slept after each failure.

    Raises:
      InvalidSelectorException: Immediately, because a malformed selector
        cannot succeed on a later attempt.
    """
    from selenium.common.exceptions import InvalidSelectorException, WebDriverException

    for _ in range(attempts):
      try:
        return True, action()
      except InvalidSelectorException:
        raise
      except WebDriverException:
        sleep(delay)
    return False, None

  def get_bib_from_google_scholar(self, paper_title):
    """Search Google Scholar for *paper_title* and return the text of the resulting ``<pre>`` element.

    Three page interactions run in order, each retried up to 100 times at
    0.1 s intervals. As soon as one of them never succeeds, the placeholder
    string is returned instead of waiting out the remaining steps.

    Args:
      paper_title: Title to search for; it is URL-quoted before use.

    Returns:
      The ``<pre>`` text, or ``"have not found bib"`` if a step failed.
    """
    from selenium.webdriver.common.by import By

    not_found = "have not found bib"
    self.browser.get(self.gg_search_url + parse.quote(paper_title))

    def open_cite_dialog():
      # NOTE(review): presumably the "Cite" link of the first result - confirm against the page.
      result = self.browser.find_element(By.CSS_SELECTOR, "[class='gs_r gs_or gs_scl']")
      # The opening quote was missing here, which made the selector invalid.
      footer = result.find_element(By.CSS_SELECTOR, "[class='gs_fl gs_flb']")
      footer.find_element(By.CSS_SELECTOR, "[class='gs_or_cit gs_nph']").click()

    def open_export_link():
      # NOTE(review): first link inside #gs_citi, presumably the BibTeX export - confirm.
      container = self.browser.find_element(By.ID, "gs_citi")
      container.find_element(By.CSS_SELECTOR, "[class='gs_citi']").click()

    def read_pre_text():
      return self.browser.find_element(By.TAG_NAME, 'pre').text

    for step in (open_cite_dialog, open_export_link):
      succeeded, _ = self._retry(step)
      if not succeeded:
        return not_found

    succeeded, bib = self._retry(read_pre_text)
    return bib if succeeded else not_found

  def get_bib(self, paper_title):
    """Return the citation text for *paper_title* (delegates to Google Scholar)."""
    return self.get_bib_from_google_scholar(paper_title)

# Location of the browser driver
driver_path = r'D:/Hackust/chromedriver.exe'
# Lets the browser use your own settings; otherwise the browser object Selenium
# creates has default settings and some extensions cannot be used
option_path = r"C:/Users/Administrator/AppData/Local/Google/Chrome/User Data/"
# Before running the code, open the search page of the IEEE site and copy a
# similar URL here; whatever follows the equals sign is the text to search for
ie_search_url = r'https://ieeexplore.ieee.org/search/searchresult.jsp?newsearch=true&queryText='
# Same for Google Scholar
gg_search_url = r'https://scholar.google.com/scholar?hl=zh-CN&as_sdt=0%2C5&inst=1597255436240989024&q='
get_bibs = GetBibs(driver_path, option_path, ie_search_url, gg_search_url)

# Papers to crawl: the key is a label, the value is the paper title. Samples below
paper_titles = {
  "1": "Inner blast response of fiber reinforced aluminum tubes",
}

# Print each label with its result, followed by one blank line
for k, title in paper_titles.items():
  bib = get_bibs.get_bib(title)
  print(k, bib, end="\n\n")


1 have not found bib



## Attempted selenium_stealth, but Google still shows a CAPTCHA

In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService

from selenium_stealth import stealth
import random

path = "D:\\wym_project\\flask-llm-pdf-analyzer\\chromedriver.exe"
# create a new Service instance and specify path to Chromedriver executable
service = ChromeService(executable_path=path)

# Step 2: Change browser properties
# create a ChromeOptions object
options = webdriver.ChromeOptions()

# run in headless mode
options.add_argument("--headless")

# disable the AutomationControlled feature of Blink rendering engine
options.add_argument('--disable-blink-features=AutomationControlled')

# disable pop-up blocking
options.add_argument('--disable-popup-blocking')

# start the browser window in maximized mode
options.add_argument('--start-maximized')

# disable extensions
options.add_argument('--disable-extensions')

# disable sandbox mode
options.add_argument('--no-sandbox')

# disable shared memory usage
options.add_argument('--disable-dev-shm-usage')

# Step 3: Rotate user agents
user_agents = [
    # Add your list of user agents here
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
]

# select random user agent
user_agent = random.choice(user_agents)

# Pass the selected user agent as an argument. This has to happen BEFORE the
# driver is created: arguments added to `options` afterwards never reach the
# already-running browser.
# NOTE(review): stealth() below is given platform="Win32" even when a macOS or
# Linux user agent is drawn - consider keeping the two consistent.
options.add_argument(f'user-agent={user_agent}')

# create a driver instance
driver = webdriver.Chrome(service=service, options=options)

try:
    # Set navigator.webdriver to undefined on every document the browser loads.
    # A plain execute_script call would only patch the page open at that moment
    # and be lost on the next navigation.
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
    )

    # Step 4: Scrape using Stealth
    # enable stealth mode
    stealth(driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
            )

    # navigate to the Google Scholar "cited by" results page
    driver.get("https://scholar.google.com/scholar?cites=5237000301060453724&as_sdt=2005&sciodt=0,5&hl=en")

    # Take screenshot (the file name is unchanged from the original cell)
    driver.save_screenshot("opensea.png")
finally:
    # Close browser, also when one of the steps above fails
    driver.quit()
