In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import os
import re
import csv
import pandas as pd


# utils

In [2]:

def extract_text_from_element(element):
    """Return the ``text`` attribute of ``element``.

    Args:
        element: Any object exposing a ``text`` attribute (a Selenium
            WebElement in this notebook).

    Returns:
        Whatever ``element.text`` evaluates to.
    """
    rendered_text = getattr(element, "text")
    return rendered_text

# JACS

In [3]:
def scrape_jacs_article(url):
    """Scrape the title, abstract and main text of a JACS article page.

    Args:
        url: Address of the article page to open in Chrome.

    Returns:
        ``(title, abstract, main_text)`` as strings, or ``(None, None, None)``
        when one of the three elements cannot be located.  A 3-tuple is
        always returned so callers can unpack the result unconditionally.
    """
    # Absolute XPaths tied to the current page layout of the publisher site.
    title_xpath = '//*[@id="pb-page-content"]/div/main/article/div[4]/div[1]/div[1]/div/div/h1/span'
    abstract_xpath = '//*[@id="pb-page-content"]/div/main/article/div[4]/div[1]/div[2]'
    main_content_xpath = '//*[@id="pb-page-content"]/div/main/article/div[4]/div[1]/div[3]/div[1]/div'

    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # Implicit wait: every find_element below polls for up to 10 seconds
        # before raising NoSuchElementException.
        driver.implicitly_wait(10)

        title_text = driver.find_element(By.XPATH, title_xpath).text
        abstract_text = driver.find_element(By.XPATH, abstract_xpath).text
        main_text = driver.find_element(By.XPATH, main_content_xpath).text

        return title_text, abstract_text, main_text
    except NoSuchElementException as e:
        print("Error: An element was not found on the page.", str(e))
        return None, None, None
    finally:
        # Always release the browser, even when a lookup fails; previously a
        # failed lookup left the Chrome process running.
        driver.quit()
    

# Nature Catalysis

In [4]:
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

def scrape_nature_catalysis_article(url):
    """Scrape the title, abstract and main text of an article on the Nature website.

    Each element is awaited with an explicit wait of up to 10 seconds.

    Args:
        url: Address of the article page to open in Chrome.

    Returns:
        ``(title, abstract, maintext)`` as strings on success, or
        ``(None, None, None)`` when an element is missing or does not appear
        in time.  A 3-tuple is always returned (instead of a bare ``None``)
        so that callers unpacking three values do not raise ``TypeError``.
    """
    title_xpath = '//h1[@class="c-article-title"]'
    abstract_xpath = '//div[@id="Abs1-content"]'
    main_content_xpath = '//*[@id="content"]/main/article/div[2]/div[2]'

    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # One wait object is reused; each until() call gets its own 10 s budget.
        wait = WebDriverWait(driver, 10)
        title = wait.until(
            EC.presence_of_element_located((By.XPATH, title_xpath))
        ).text
        abstract = wait.until(
            EC.presence_of_element_located((By.XPATH, abstract_xpath))
        ).text
        maintext = wait.until(
            EC.presence_of_element_located((By.XPATH, main_content_xpath))
        ).text

        return title, abstract, maintext
    except NoSuchElementException as e:
        print("Error: An element was not found on the page.", str(e))
        return None, None, None
    except TimeoutException as e:
        print("Error: An element was not found within the given time frame.", str(e))
        return None, None, None
    finally:
        # Close the browser whether or not scraping succeeded.
        driver.quit()


# Angewandte

In [5]:

# def scrape_angewandte_article(url):
#     """Scrape article from Angewandte website."""
#     # Initialize Chrome WebDriver
#     driver = webdriver.Chrome()
    
#     try:
#         # Navigate to the specified URL
#         driver.get(url)

#         # Locate the title using the specified XPath
#         title_element = driver.find_element(By.XPATH, '//*[@id="article__content"]/div[2]/div/h1')
#         title = title_element.text

#         # Clean the title: Replace spaces with underscores, remove special characters, and truncate to 20 characters
#         clean_title = re.sub(r'[^\w\s]', '', title).replace(' ', '_')[:20]
        
#         # Locate the abstract using its class-based XPath
#         abstract_element = driver.find_element(By.XPATH, '//h2[@class="article-section__header section__title main abstractlang_en main"]/following-sibling::div')
#         abstract = abstract_element.text
        
#         # Locate the full text using the specified XPath
#         full_text_element = driver.find_element(By.XPATH, '//*[@id="article__content"]')
#         maintext = full_text_element.text
        
#         return title, abstract, maintext
    
#     except NoSuchElementException:
#         print("Error: One of the elements was not found on the page.")
#         return None, None, None
#     finally:
#         # Close the WebDriver
#         driver.quit()



In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrape_angewandte_article(url):
    """Scrape the title, abstract and full text of an Angewandte Chemie article.

    Args:
        url: Address of the article page to open in Chrome.

    Returns:
        ``(title, abstract, full_text)`` as strings, or ``(None, None, None)``
        when the title does not appear within 10 seconds or another element
        is missing from the page.
    """
    title_xpath = '//*[@id="article__content"]/div[2]/div/h1'
    abstract_xpath = '//*[@id="section-1-en"]/div'
    full_text_xpath = '//*[@id="article__content"]'

    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # The wait returns the located element, so no second lookup is needed.
        title_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, title_xpath))
        )
        title = title_element.text

        abstract = driver.find_element(By.XPATH, abstract_xpath).text
        full_text = driver.find_element(By.XPATH, full_text_xpath).text

        return title, abstract, full_text
    except NoSuchElementException as e:
        print("Error: An element was not found on the page.", str(e))
        return None, None, None
    except TimeoutException as e:
        # WebDriverWait raises TimeoutException (not NoSuchElementException)
        # when the title never shows up; previously this escaped the function.
        print("Error: An element was not found within the given time frame.", str(e))
        return None, None, None
    finally:
        # Close the browser whether or not scraping succeeded.
        driver.quit()



# Science

In [7]:
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

def scrape_science_article(url):
    """Scrape the title, abstract and full text of an article on the Science website.

    Each element is awaited with an explicit wait of up to 10 seconds.

    Args:
        url: Address of the article page to open in Chrome.

    Returns:
        ``(title, abstract, fulltext)`` as strings on success, or
        ``(None, None, None)`` when an element is missing or does not appear
        in time.  A 3-tuple is always returned (instead of a bare ``None``)
        so that callers unpacking three values do not raise ``TypeError``.
    """
    title_xpath = '//*[@id="main"]/div[1]/article/header/div/h1'
    abstract_xpath = '//*[@id="abstract"]'
    fulltext_xpath = '//*[@id="bodymatter"]'

    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # One wait object is reused; each until() call gets its own 10 s budget.
        wait = WebDriverWait(driver, 10)
        title = wait.until(
            EC.presence_of_element_located((By.XPATH, title_xpath))
        ).text
        abstract = wait.until(
            EC.presence_of_element_located((By.XPATH, abstract_xpath))
        ).text
        fulltext = wait.until(
            EC.presence_of_element_located((By.XPATH, fulltext_xpath))
        ).text

        return title, abstract, fulltext
    except NoSuchElementException as e:
        print("Error: An element was not found on the page.", str(e))
        return None, None, None
    except TimeoutException as e:
        print("Error: An element was not found within the given time frame.", str(e))
        return None, None, None
    finally:
        # Close the browser whether or not scraping succeeded.
        driver.quit()



# Science Advances


In [8]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrape_science_advances_article(url):
    """Scrape the title, abstract and full text of a Science Advances article.

    Args:
        url: Address of the article page to open in Chrome.

    Returns:
        ``(title, abstract, full_text)`` as strings, or ``(None, None, None)``
        when the title does not appear within 10 seconds or another element
        is missing from the page.
    """
    title_xpath = '//*[@id="main"]/div[1]/article/header/div/h1'
    abstract_xpath = '//*[@id="abstract"]'
    full_text_xpath = '//*[@id="bodymatter"]/div'

    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # The wait returns the located element, so no second lookup is needed.
        title_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, title_xpath))
        )
        title = title_element.text

        abstract = driver.find_element(By.XPATH, abstract_xpath).text
        full_text = driver.find_element(By.XPATH, full_text_xpath).text

        return title, abstract, full_text
    except NoSuchElementException as e:
        print("Error: An element was not found on the page.", str(e))
        return None, None, None
    except TimeoutException as e:
        # WebDriverWait raises TimeoutException (not NoSuchElementException)
        # when the title never shows up; previously this escaped the function.
        print("Error: An element was not found within the given time frame.", str(e))
        return None, None, None
    finally:
        # Close the browser whether or not scraping succeeded.
        driver.quit()



# ACSNano


In [9]:
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

def scrape_acsnano_article(url):
    """Scrape the title, abstract and full text of an ACS Nano article.

    Args:
        url: Address of the article page to open in Chrome.

    Returns:
        ``(title, abstract, full_text)`` as strings, or ``(None, None, None)``
        when the title does not appear within 10 seconds or another element
        is missing from the page.
    """
    # Absolute XPaths tied to the current page layout of the publisher site.
    title_xpath = '//*[@id="pb-page-content"]/div/main/article/div[4]/div[1]/div[1]/div/div/h1/span'
    abstract_xpath = '//*[@id="pb-page-content"]/div/main/article/div[4]/div[1]/div[2]'
    full_text_xpath = '//*[@id="pb-page-content"]/div/main/article/div[4]/div[1]/div[3]'

    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # Wait for the title before reading anything: without a wait the
        # lookups fail immediately if the article body is rendered late.
        title_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, title_xpath))
        )
        title = title_element.text

        abstract = driver.find_element(By.XPATH, abstract_xpath).text
        full_text = driver.find_element(By.XPATH, full_text_xpath).text

        return title, abstract, full_text
    except NoSuchElementException as e:
        print("Error: An element was not found on the page.", str(e))
        return None, None, None
    except TimeoutException as e:
        print("Error: An element was not found within the given time frame.", str(e))
        return None, None, None
    finally:
        # Close the browser whether or not scraping succeeded.
        driver.quit()


# Advanced Materials

In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrape_advanced_materials_article(url):
    """Scrape the title, abstract and full text of an Advanced Materials article.

    Args:
        url: Address of the article page to open in Chrome.

    Returns:
        ``(title, abstract, full_text)`` as strings, or ``(None, None, None)``
        when the title does not appear within 10 seconds or another element
        is missing from the page.
    """
    title_xpath = '//*[@id="article__content"]/div[2]/div/h1'
    abstract_xpath = '//*[@id="article__content"]/div[5]/article/div[1]'
    full_text_xpath = '//*[@id="article__content"]/div[5]/article/section'

    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # The wait returns the located element, so no second lookup is needed.
        title_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, title_xpath))
        )
        title = title_element.text

        abstract = driver.find_element(By.XPATH, abstract_xpath).text
        full_text = driver.find_element(By.XPATH, full_text_xpath).text

        return title, abstract, full_text
    except NoSuchElementException as e:
        print("Error: An element was not found on the page.", str(e))
        return None, None, None
    except TimeoutException as e:
        # WebDriverWait raises TimeoutException (not NoSuchElementException)
        # when the title never shows up; previously this escaped the function.
        print("Error: An element was not found within the given time frame.", str(e))
        return None, None, None
    finally:
        # Close the browser whether or not scraping succeeded.
        driver.quit()

# Example usage
# url = 'https://onlinelibrary.wiley.com/doi/full/10.1002/adma.201606793'
# title, abstract, full_text = scrape_advanced_materials_article(url)
# print(f"Title: {title}")
# print(f"Abstract: {abstract}")
# print(f"Main Text: {full_text[:100]}...")


# JMCA


In [11]:


def scrape_jmca_article(url):
    """Scrape a Journal of Materials Chemistry A (JMCA) article page.

    Chrome is started headless with a fixed user-agent string.  The title is
    taken from the page's ``<title>`` (text before the first ``' - '``), and
    the body is collected as a mapping of each ``<h2>`` heading to the text of
    the p/h3/h4/ul/ol siblings that follow it up to the next ``<h2>``.

    Args:
        url: Address of the article page.

    Returns:
        ``(title, abstract, article_content)`` where ``abstract`` is the text
        of the 'Introduction' (or '1. Introduction') section, or ``None`` if
        neither heading exists, and ``article_content`` is ``str()`` of the
        heading-to-text dict.  ``(None, None, None)`` on any exception.
    """
    browser_options = Options()
    for flag in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    ):
        browser_options.add_argument(flag)

    # Service() with no arguments uses the default ChromeDriver.
    driver = webdriver.Chrome(service=Service(), options=browser_options)
    driver.set_page_load_timeout(30)

    try:
        print(f"Attempting to navigate to URL: {url}")
        driver.get(url)
        print(f"Successfully navigated to URL: {url}")

        # Block until the <body> element exists (up to 20 s).
        body_located = EC.presence_of_element_located((By.TAG_NAME, "body"))
        WebDriverWait(driver, 20).until(body_located)
        print("Body element found")

        # Title comes from the <title> tag, cut at the first ' - '.
        title = driver.title
        if not title:
            print("Could not extract title from <title> tag")
        else:
            title = title.split(' - ')[0].strip()
            print(f"Title extracted: {title}")

        # Map every <h2> heading to the text of the siblings that follow it.
        sections = {}
        headings = driver.find_elements(By.TAG_NAME, "h2")

        for position, heading in enumerate(headings, start=1):
            heading_text = heading.text
            collected = []
            try:
                sibling = heading.find_element(By.XPATH, "following-sibling::*")
                while sibling.tag_name != "h2":
                    if sibling.tag_name in ("p", "h3", "h4", "ul", "ol"):
                        collected.append(sibling.text + "\n")
                    try:
                        sibling = sibling.find_element(By.XPATH, "following-sibling::*")
                    except NoSuchElementException:
                        print(f"Reached end of content for section: {heading_text}")
                        break
            except NoSuchElementException:
                print(f"No content found for section: {heading_text}")

            sections[heading_text] = "".join(collected).strip()
            print(f"Finished processing section {position} of {len(headings)}")

        if not sections:
            print("Failed to extract article content")

        # The introduction section stands in for the abstract.
        abstract = None
        for intro_heading in ('Introduction', '1. Introduction'):
            if intro_heading in sections:
                abstract = sections[intro_heading]
                break

        # The section dict is handed back in its str() form.
        return title, abstract, str(sections)

    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        return None, None, None
    finally:
        driver.quit()

# Example usage
# url_jmca = 'https://pubs.rsc.org/en/content/articlelanding/2015/ta/c4ta07214f'
# title, abstract, article_content = scrape_jmca_article(url_jmca)
# print(f"\nFinal Results:")
# print(f"Title: {title}")
# print(f"Abstract: {abstract[:100]}...")  # Print first 100 characters of abstract
# print("Article Content:", article_content)


# Elsevier

In [12]:


def scrape_elsevier_article(url):
    """Scrape the title, abstract and paragraph text of an Elsevier article page.

    The title (class ``title-text``) and abstract (``div.abstract p``) are
    first searched inside each ``<iframe>`` of the page with 5-second waits;
    if no iframe yields both, they are searched in the top-level document
    with 15-second waits.  The full text is the newline-joined text of every
    non-blank ``<p>`` under ``<body>`` of the top-level document.

    Args:
        url: Address of the article page.

    Returns:
        ``(title, abstract, full_text)`` as strings, or ``(None, None, None)``
        if an element is missing, a wait times out, or any other exception
        is raised.
    """
    # The headless / user-agent arguments below are commented out, so Chrome
    # is started with default options.
    chrome_options = Options()
    # chrome_options.add_argument("--headless")
    # chrome_options.add_argument("--disable-gpu")
    # chrome_options.add_argument("--no-sandbox")
    # chrome_options.add_argument("--disable-dev-shm-usage")
    # chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    
    service = Service()  # Using default ChromeDriver
    
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.set_page_load_timeout(30)  # Set page load timeout

    try:
        driver.get(url)
        
        # # Debug: Print out the current page URL and HTML
        # print(f"Currently on URL: {driver.current_url}")
        # print("Page HTML:\n", driver.page_source[:2000])  # Print the first 2000 characters of the page's HTML
        
        # Iterate over iframes and try to find the content
        iframes = driver.find_elements(By.TAG_NAME, "iframe")
        print(f"Number of iframes found: {len(iframes)}")
        
        # Set to True only when BOTH title and abstract were found in one iframe
        found = False
        for i, iframe in enumerate(iframes):
            driver.switch_to.frame(iframe)
            try:
                # Try finding the title in this iframe
                title_element = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "title-text"))
                )
                title = title_element.text.strip()
                #print(f"Title found in iframe {i}: {title}")
                
                # Try finding the abstract in this iframe
                abstract_element = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div.abstract p"))
                )
                abstract = abstract_element.text.strip()
                #print(f"Abstract found in iframe {i}: {abstract[:100]}...")  # Print the first 100 characters of the abstract
                
                # If both title and abstract are found, consider the content as found
                found = True
                break
            except (NoSuchElementException, TimeoutException):
                print(f"Content not found in iframe {i}. Moving to the next iframe.")
            finally:
                # Runs on success (before the break takes effect) and on
                # failure, so the driver is always back on the top-level
                # document after each iteration.
                driver.switch_to.default_content()  # Switch back to the main content
                
        if not found:
            print("Content not found in any iframe. Checking main content.")

            # Retry in main content if no iframe worked.  A title found in an
            # iframe whose abstract lookup failed is overwritten here.
            title_element = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "title-text"))
            )
            title = title_element.text.strip()
            #print(f"Title found: {title}")
            
            abstract_element = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.abstract p"))
            )
            abstract = abstract_element.text.strip()
            #print(f"Abstract found: {abstract[:100]}...")  # Print the first 100 characters of the abstract

        # Extract all text from the article's main body by finding all <p> tags within the body.
        # The driver is on the top-level document here, so paragraphs inside
        # iframes are not included even when title/abstract came from one.
        body_element = driver.find_element(By.TAG_NAME, "body")
        paragraphs = body_element.find_elements(By.TAG_NAME, "p")
        full_text = "\n".join([p.text for p in paragraphs if p.text.strip()])  # Join all paragraph texts

        return title, abstract, full_text

    except (NoSuchElementException, TimeoutException) as e:
        print(f"An element was not found or took too long to load: {str(e)}")
        return None, None, None
    except Exception as e:
        # Broad catch: any other failure is reported and mapped to the same
        # all-None result instead of propagating.
        print(f"An unexpected error occurred: {str(e)}")
        return None, None, None
    finally:
        # Close the browser whether or not scraping succeeded.
        driver.quit()

# Example usage
# url_elsevier = 'https://www.sciencedirect.com/science/article/pii/S0360319920342749'
# title, abstract, full_text = scrape_elsevier_article(url_elsevier)
# print(f"\nFinal Results:")
# print(f"Title: {title}")
# print(f"Abstract: {abstract[:100]}...")  # Print first 100 characters of abstract
# print(f"Full Text: {full_text[:500]}...")  # Print first 500 characters of full text


# Determine Journals

In [13]:

def clean_title(title):
    """Drop every character that is neither a word character nor whitespace,
    then keep at most the first 20 characters of what remains."""
    return re.sub(r'[^\w\s]', '', title)[:20]


def save_to_file(filename, content, encoding='utf-8'):
    """Write ``content`` to ``filename``, replacing any existing file.

    Args:
        filename: Path of the file to create or overwrite.
        content: Text to write.
        encoding: Text encoding used for the file.  Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding, which
            matters for article text containing non-ASCII characters.
    """
    with open(filename, 'w', encoding=encoding) as file:
        file.write(content)

def determine_journal_from_url(url):
    """Guess the journal (or publisher) of an article from substrings of its URL.

    The checks are ordered; the first matching rule wins.

    Args:
        url: Article URL as a string.

    Returns:
        One of "ACS Catalysis", "Angewandte Chemie", "Advanced Materials",
        "Journal of the American Chemical Society", "ACS Nano",
        "Science Advances", "Science", "Journal of Materials Chemistry A",
        "Elsevier", or "Unknown" when no rule matches.
    """
    print(f"check url: {url}")
    if "acscatal" in url:
        return "ACS Catalysis"
    if "ange" in url:
        return "Angewandte Chemie"
    if "10.1002" in url or "adma" in url:
        return "Advanced Materials"
    if "jacs" in url:
        return "Journal of the American Chemical Society"
    # Each marker must be tested on its own: `("nn" or "acsnano") in url`
    # evaluates to `"nn" in url` and never looked for "acsnano".
    if "pubs.acs.org" in url and ("nn" in url or "acsnano" in url):
        return "ACS Nano"
    if "10.1021" in url:
        # Remaining URLs with the 10.1021 prefix are treated as JACS.
        return "Journal of the American Chemical Society"
    if '10.1126' in url:
        # The more specific Science Advances test has to come before the
        # generic 10.1126 fallback, otherwise it can never be reached.
        if 'sciadv' in url:
            return "Science Advances"
        return "Science"
    if '/ta/' in url and 'pubs.rsc.org' in url:
        return "Journal of Materials Chemistry A"
    if 'sciencedirect' in url and 'pii' in url:
        return "Elsevier"
    return "Unknown"
    
def search_scholar_and_scrape_related(query, num_related=10):
    """Search Google Scholar for ``query`` and collect links to related articles.

    The function submits the query, follows the "related articles" link found
    on the results page, and reads the result links of the page that opens.

    Args:
        query: Text typed into the Scholar search box.
        num_related: Maximum number of result links to read from the
            related-articles page.

    Returns:
        A list of URL strings (links without an ``href`` are dropped), with
        at most ``num_related`` entries.

    Raises:
        Selenium exceptions such as ``NoSuchElementException`` or
        ``TimeoutException`` are not caught here and propagate to the caller;
        the browser is closed in every case.
    """
    result_link_xpath = '//h3[@class="gs_rt"]/a'

    driver = webdriver.Chrome()
    try:
        # Open Google Scholar and submit the query.
        driver.get('https://scholar.google.com/')
        search_box = driver.find_element(By.NAME, 'q')
        search_box.send_keys(query)
        search_box.submit()

        # Wait until at least one search result is listed.
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, result_link_xpath))
        )

        # Follow the first "related articles" link.  The link text is matched
        # against the Chinese label, so this only works when Scholar renders
        # its interface in Chinese.
        related_articles_xpath = '//a[contains(@href, "related:") and contains(text(), "相关文章")]'
        related_articles_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, related_articles_xpath))
        )
        related_articles_element.click()

        # Fixed pause to let the related-articles page load.
        time.sleep(5)

        # Query the result links once; slicing already caps the count when
        # fewer than num_related links exist.
        related_links = driver.find_elements(By.XPATH, result_link_xpath)[:num_related]
        hrefs = (link.get_attribute('href') for link in related_links)
        # Skip anchors without an href so callers only ever receive strings.
        related_urls = [href for href in hrefs if href]

        print("Related URLs:")
        for url in related_urls:
            print(url)

        return related_urls

    finally:
        driver.quit()


def scrape_article(url, journal):
    """Scrape one article with the scraper matching ``journal`` and save it.

    On success the abstract and main text are written as UTF-8 text files
    under ``documents/abstract`` and ``documents/maintext`` (relative to the
    current working directory), and a ``title,url`` row is appended to
    ``documents/record.csv``.

    Args:
        url: Address of the article page.
        journal: Journal name; matched case-insensitively against the
            supported names and their space-free aliases.

    Returns:
        ``True`` if title, abstract and main text were all obtained and
        saved; ``False`` if the journal is unsupported or scraping failed.
    """
    journal_key = (journal or "").lower()

    if journal_key in ('naturecatalysis', 'nature catalysis'):
        result = scrape_nature_catalysis_article(url)
    elif journal_key in ('jacs', 'journal of the american chemical society'):
        result = scrape_jacs_article(url)
    elif journal_key in ('angewandte', 'angewandte chemie'):
        result = scrape_angewandte_article(url)
    elif journal_key == 'science':
        result = scrape_science_article(url)
    elif journal_key in ('acsnano', 'acs nano'):
        result = scrape_acsnano_article(url)
    elif journal_key in ('scienceadvances', 'science advances'):
        result = scrape_science_advances_article(url)
    elif journal_key in ('advancedmaterials', 'advanced materials'):
        result = scrape_advanced_materials_article(url)
    elif journal_key == 'journal of materials chemistry a':
        result = scrape_jmca_article(url)
    elif journal_key == 'elsevier':
        result = scrape_elsevier_article(url)
    else:
        print(f"Journal '{journal}' is not supported.")
        # Return an explicit boolean so the result type is always bool.
        return False

    print(f"scrapping {url} from {journal}")

    # A scraper may signal failure with a bare None rather than a 3-tuple;
    # normalise it so the unpacking below cannot raise TypeError.
    title, abstract, maintext = result if result else (None, None, None)

    if not (title and abstract and maintext):
        print(f"Failed to scrape {url} from {journal}")
        return False

    # File-name stem: punctuation removed, spaces -> underscores, max 50 chars.
    # (Named file_stem so it does not shadow the module-level clean_title().)
    file_stem = re.sub(r'[^\w\s]', '', title).replace(' ', '_')[:50]

    # Ensure the output directories exist.
    documents_dir = os.path.join(os.getcwd(), 'documents')
    abstract_dir = os.path.join(documents_dir, 'abstract')
    maintext_dir = os.path.join(documents_dir, 'maintext')
    os.makedirs(abstract_dir, exist_ok=True)
    os.makedirs(maintext_dir, exist_ok=True)

    # Save the abstract.
    abstract_filename = os.path.join(abstract_dir, f"{file_stem}_abs.txt")
    with open(abstract_filename, 'w', encoding='utf-8') as file:
        file.write(abstract)
    print(f"Abstract saved to {abstract_filename}")

    # Save the main text.
    maintext_filename = os.path.join(maintext_dir, f"{file_stem}_maintext.txt")
    with open(maintext_filename, 'w', encoding='utf-8') as file:
        file.write(maintext)
    print(f"Main text saved to {maintext_filename}")

    # Append the title and url to record.csv, writing the header only when
    # the file is being created.
    record_filename = os.path.join(documents_dir, 'record.csv')
    record_exists = os.path.isfile(record_filename)
    with open(record_filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['title', 'url'])
        if not record_exists:
            writer.writeheader()
        writer.writerow({'title': file_stem, 'url': url})

    return True

# Run

In [15]:
from collections import deque
import os
import pandas as pd

# Path of the CSV that records every successfully scraped article.
csv_path = "documents/record.csv"

if not os.path.exists(csv_path):
    # On a fresh run the documents/ directory does not exist yet, and
    # to_csv would fail, so create it before writing the header-only file.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    df = pd.DataFrame(columns=['title', 'url'])
    df.to_csv(csv_path, index=False)
else:
    # Resume from the existing record.
    df = pd.read_csv(csv_path)

# Seed URLs; each one is used as a Google Scholar query to find related articles.
initial_urls = [
    'https://www.sciencedirect.com/science/article/pii/S0022286017300625'
   #'https://www.sciencedirect.com/science/article/pii/S0008622324007619'
     #'https://www.science.org/doi/full/10.1126/sciadv.1501122',
     # 'https://pubs.acs.org/doi/full/10.1021/nn901850u',
    # 'https://www.science.org/doi/full/10.1126/science.1168049',
    # 'https://onlinelibrary.wiley.com/doi/full/10.1002/ange.201600687',
    # 'https://pubs.acs.org/doi/10.1021/jacs.9b05006',
    # 'https://pubs.acs.org/doi/10.1021/acsnano.0c10165',
    # 'https://onlinelibrary.wiley.com/doi/full/10.1002/adma.202004670'
]

# URLs already recorded are never scraped again.
visited = set(df['url'].tolist())
queue = deque(initial_urls)
total_processed = 0
max_total = 1000  # upper bound on successfully scraped articles per run

while queue and total_processed < max_total:
    initial_url = queue.popleft()

    try:
        # Queued URLs are deliberately NOT filtered against `visited`:
        # every URL appended below is already in `visited`, and it is queued
        # precisely so that its own related articles get expanded.
        related_urls = search_scholar_and_scrape_related(initial_url, 40)
        print(f"Related URLs: {related_urls}")
        for related_url in related_urls:
            # Process each related URL only if it hasn't been visited yet.
            if related_url not in visited and total_processed < max_total:
                journal = determine_journal_from_url(related_url)
                print(f"Journal: {journal}")

                success = scrape_article(related_url, journal)
                # Mark as visited even on failure so it is not retried.
                visited.add(related_url)
                if success:
                    # Only successfully scraped articles are expanded further.
                    queue.append(related_url)
                    total_processed += 1

                if total_processed >= max_total:
                    print(f"Reached the limit of {max_total} URLs. Exiting.")
                    break

    except Exception as e:
        # Best effort: report the failure and move on to the next queued URL.
        print(f"An error occurred while processing {initial_url}: {e}")
        continue

print(f"Total URLs processed: {total_processed}")


Related URLs:
https://www.sciencedirect.com/science/article/pii/S0022286017300625
https://www.sciencedirect.com/science/article/pii/S002223131400516X
https://www.sciencedirect.com/science/article/pii/S1010603015302380
https://www.sciencedirect.com/science/article/pii/S0022231314007121
https://europepmc.org/article/med/27209728
https://www.sciencedirect.com/science/article/pii/S1386142507002600
https://www.sciencedirect.com/science/article/pii/S1386142516300063
https://www.sciencedirect.com/science/article/pii/S0003267012015917
https://analyticalsciencejournals.onlinelibrary.wiley.com/doi/abs/10.1002/bio.2625
https://www.sciencedirect.com/science/article/pii/S0143720822008385
Related URLs: ['https://www.sciencedirect.com/science/article/pii/S0022286017300625', 'https://www.sciencedirect.com/science/article/pii/S002223131400516X', 'https://www.sciencedirect.com/science/article/pii/S1010603015302380', 'https://www.sciencedirect.com/science/article/pii/S0022231314007121', 'https://europepm

KeyboardInterrupt: 