In [None]:
import os
import glob
import re
from bs4 import BeautifulSoup
import pandas as pd

# Set HTML folder path
folder_path = './pages2/'
file_list = sorted(glob.glob(os.path.join(folder_path, '*.htm')))

# Column order of 1output.csv. Passing it to the DataFrame constructor keeps the
# columns present even when no article was parsed, so the date filter below
# cannot fail with a KeyError on an empty result.
OUTPUT_COLUMNS = [
    'filepath', 'subject', 'research_type', 'title', 'link',
    'peer_review_status', 'authors', 'funders', 'reviewers', 'publish_date',
]

all_data = []
subject_count = {}  # Count the number of articles extracted from each discipline


def _labelled_span_texts(article, label):
    """Return ``(label_span, texts)`` for a labelled group of <span>s.

    ``label_span`` is the first <span> inside *article* whose string contains
    *label* (``None`` when there is none). ``texts`` holds the stripped text of
    every <span> under the label's parent element except the first one.
    """
    label_span = article.find('span', string=lambda s: s and label in s)
    if label_span is None:
        return None, []
    container = label_span.find_parent()
    return label_span, [span.text.strip() for span in container.find_all('span')[1:]]


for filepath in file_list:
    filename = os.path.basename(filepath)

    # Extract discipline name from file name (drop the trailing number, if any)
    subject_match = re.match(r'^(.+?)(\d+)?\.htm$', filename)
    subject = subject_match.group(1).strip() if subject_match else 'Unknown'

    with open(filepath, 'r', encoding='utf-8') as file:
        html = file.read()

    soup = BeautifulSoup(html, 'html.parser')
    articles = soup.find_all('article', class_='MuiCard-root')

    # Record the number of articles extracted
    subject_count[subject] = subject_count.get(subject, 0) + len(articles)
    print(f"📄 {filename}（subject: {subject}）：extracted {len(articles)} articles")

    for article in articles:
        # Research type: text of the first <div> nested in the first direct
        # child <div> of the card content.
        research_type = 'N/A'
        card_content = article.find('div', class_='MuiCardContent-root')
        if card_content is not None:
            outer_divs = card_content.find_all('div', recursive=False)
            inner_divs = outer_divs[0].find_all('div') if outer_divs else []
            if inner_divs:
                research_type = inner_divs[0].text.strip()

        # Title: the last <span> inside the <h3>, or the <h3> text itself
        # when it contains no <span>.
        title = 'N/A'
        title_tag = article.find('h3')
        if title_tag is not None:
            spans = title_tag.find_all('span')
            title = (spans[-1] if spans else title_tag).text.strip()

        # Link: href of the closest enclosing <a>, if the card is wrapped in one.
        link = next(
            (
                ancestor['href']
                for ancestor in article.parents
                if ancestor.name == 'a' and ancestor.has_attr('href')
            ),
            'N/A',
        )

        review_info_tag = article.find('h4')
        peer_review_status = review_info_tag.text.strip() if review_info_tag else 'N/A'

        _, authors = _labelled_span_texts(article, 'Author')
        _, funders = _labelled_span_texts(article, 'Funder')

        reviewer_section, reviewers = _labelled_span_texts(article, 'Peer Reviewer')
        if reviewer_section is not None and not reviewers:
            # Fall back to a bare text node directly after the label.
            # find_next_sibling() only ever returns tags, so the text node has
            # to be read through .next_sibling.
            sibling = reviewer_section.next_sibling
            if isinstance(sibling, str) and sibling.strip():
                reviewers = [sibling.strip()]

        time_tag = article.find('time')
        publish_date = time_tag['datetime'] if time_tag and time_tag.has_attr('datetime') else 'N/A'

        all_data.append({
            'filepath': filename,
            'subject': subject,
            'research_type': research_type,
            'title': title,
            'link': link,
            'peer_review_status': peer_review_status,
            'authors': '; '.join(authors),
            'funders': '; '.join(funders),
            'reviewers': '; '.join(reviewers),
            'publish_date': publish_date
        })

# Show extraction statistics
print("\n Statistics on the number of articles extracted from each discipline：")
for subj, count in subject_count.items():
    print(f"{subj}: {count}")

# Save as CSV
df = pd.DataFrame(all_data, columns=OUTPUT_COLUMNS)
# Unparseable dates (including the 'N/A' placeholder) become NaT; NaT fails the
# comparison below, so those rows are dropped together with later publications.
df["publish_date"] = pd.to_datetime(df["publish_date"], errors="coerce")
df = df[df["publish_date"] <= pd.Timestamp("2025-03-31")]
df.to_csv('1output.csv', index=False, encoding='utf-8-sig')

print("\n results are recorded in 1output.csv")



In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# 1. Read article titles and links from the CSV written by the listing parser.
# The link column in that file is named "link". keep_default_na=False stops
# pandas from converting the "N/A" placeholders into NaN, so the "no link"
# check below can actually match them.
df = pd.read_csv("1output.csv", keep_default_na=False)
article_info = list(zip(df['title'], df['link']))

results = []
failures = []

# Substrings that mark a <p> as a contributor-role line rather than an affiliation
ROLE_KEYWORDS = (
    'writing', 'formal analysis', 'data curation', 'investigation',
    'methodology', 'validation', 'resources', 'supervision', 'review', 'roles'
)

driver = webdriver.Chrome()
# The cookie banner has to be dismissed by hand on the first page that is
# actually opened, which is not necessarily row 0 (row 0 may have no link).
cookie_prompt_pending = True

# 2. grab information
try:
    for idx, (title, link) in enumerate(article_info):
        print(f"\n[{idx+1}] article：{title}")
        if not isinstance(link, str) or link.strip() in ("", "N/A"):
            print(" skip:no link")
            continue

        try:
            driver.get(link)
            time.sleep(5)

            if cookie_prompt_pending:
                input(" click: Accept cookies，and enter...")
                cookie_prompt_pending = False

            # click Authors Tab (if present); best effort, the page may already show authors
            try:
                author_tab = driver.find_element(By.XPATH, '//button[.//span[text()="Authors"]]')
                driver.execute_script("arguments[0].scrollIntoView();", author_tab)
                time.sleep(1)
                driver.execute_script("arguments[0].click();", author_tab)
                time.sleep(4)
                print(" success click Authors Tab")
            except Exception:
                print(" we dont find Authors")

            # wait for the author cards to be present
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.MuiCard-root'))
                )
            except TimeoutException:
                print(" skip")
                failures.append({"title": title, "link": link, "reasons": "Time out Exception"})
                continue

            # grab authors information
            try:
                author_cards = driver.find_elements(By.CSS_SELECTOR, 'div.MuiCard-root')
                if not author_cards:
                    print(" do not find authors information")
                    failures.append({"title": title, "link": link, "reasons": "do not find authors information"})
                    continue

                for card in author_cards:
                    try:
                        author_name = card.find_element(By.TAG_NAME, 'h3').text.strip()
                    except Exception:
                        author_name = "N/A"

                    try:
                        affiliations = []
                        for child in card.find_elements(By.XPATH, "./*"):
                            tag = child.tag_name.lower()
                            text = child.text.strip()
                            lowered = text.lower()

                            # Everything from the roles heading onwards is not an affiliation
                            if tag == 'h5' and 'role' in lowered:
                                break

                            if tag != 'p':
                                continue
                            if any(role in lowered for role in ROLE_KEYWORDS):
                                continue
                            if ',' in text:
                                affiliations.append(text)

                        if affiliations:
                            # Last comma-containing paragraph; the text after its
                            # final comma is taken as the country.
                            institution, country = (
                                part.strip() for part in affiliations[-1].rsplit(',', 1)
                            )
                        else:
                            institution = "N/A"
                            country = "N/A"

                    except Exception:
                        institution = "N/A"
                        country = "N/A"

                    print(f" author_name: {author_name}")
                    print(f" institution: {institution}")
                    print(f" country: {country}")
                    print("----")

                    results.append({
                        "title": title,
                        "link": link,
                        "author_name": author_name,
                        "institution": institution,
                        "country": country
                    })

            except Exception as e:
                print(" Failed to capture author card：", e)
                failures.append({"title": title, "link": link, "reasons": "Failed to capture author card"})

        except Exception as outer_e:
            print(f" Fail to capture author card：{outer_e}")
            failures.append({"title": title, "link": link, "reasons": str(outer_e)})
finally:
    # Close the browser even if the loop is interrupted
    driver.quit()

# 3. store results
# keep success data
df_results = pd.DataFrame(results)
df_results.to_csv("2authors.csv", index=False, encoding="utf-8-sig")
print("\n Author information has been saved to 2authors.csv")

# store fail results
if failures:
    df_failures = pd.DataFrame(failures)
    df_failures.to_csv("failed_articles.csv", index=False, encoding="utf-8-sig")
    print(" The article that failed to capture has been saved to failed_articles.csv")
else:
    print(" All articles have been successfully crawled with no record of failures!")

In [None]:
import pandas as pd
import re
import csv
from datetime import datetime

# Read the CSV written by the listing parser (comma-separated). Its columns
# are lower-case snake_case: title, link, peer_review_status, publish_date.
# keep_default_na=False keeps the "N/A" placeholders as text instead of NaN.
df = pd.read_csv("1output.csv", sep=",", keep_default_na=False)

# Convert the date column (unparseable values become NaT)
df["publish_date"] = pd.to_datetime(df["publish_date"], errors="coerce")

# Filter: Keep only articles published on or before 2025-03-31
cutoff_date = pd.to_datetime("2025-03-31")
df = df[df["publish_date"] <= cutoff_date]

# Prepare output results
results = []

# Regular expressions
version_pattern = re.compile(r'version (\d+)')
# "approved" is listed last so the two longer labels are tried first
review_pattern = re.compile(
    r'(\d+) (approved with reservations|not approved|approved)',
    re.IGNORECASE
)
awaiting_pattern = re.compile(r'peer review: awaiting peer review', re.IGNORECASE)

# Iterate over each row
for idx, row in df.iterrows():
    title = row.get("title", "")
    link = row.get("link", "")
    status = row.get("peer_review_status", "")
    if not isinstance(status, str):
        # A missing or non-text status cannot be searched with a regex
        status = ""

    # 1. Maximum review version
    versions = version_pattern.findall(status)
    max_version = max(map(int, versions)) if versions else 0

    # 2. Count each review status
    tallies = {"approved": 0, "approved with reservations": 0, "not approved": 0}
    for count, label in review_pattern.findall(status):
        tallies[label.lower()] += int(count)
    total_reviewers = sum(tallies.values())

    # 3. Whether there are still "awaiting review" records
    has_awaiting = "Yes" if awaiting_pattern.search(status) else "No"

    results.append([
        title,
        link,
        max_version,
        total_reviewers,
        tallies["approved"],
        tallies["approved with reservations"],
        tallies["not approved"],
        has_awaiting
    ])

# Write the per-article summary (replaces any previous output file)
with open("3peer_review_summary_detailed.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["title", "link", "max_version", "total_reviewers", "approved", "reservations", "not_approved", "has_awaiting"])
    writer.writerows(results)

print(" Done! Only records before or on 2025-03-31 were kept, and results saved to the output file.")


In [None]:
import pandas as pd
import time
import csv
import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

# Chunk configuration: number of articles per progress banner in the main loop
chunk_size = 50

# Read CSV. 1output.csv is written with pandas' default comma separator, and its
# author/funder/reviewer fields contain "; " inside a single cell, so it must
# not be parsed with sep=";".
df = pd.read_csv("1output.csv")
titles = df["title"].tolist()
urls = df["link"].tolist()

# Filled in by extract_reviewer_info() / the main loop below
results = []
errors = []

def extract_reviewer_info(title, url):
    """Scrape the reviewers listed on one article page into the global ``results``.

    A fresh Chrome instance is started for the call and always closed again.
    After the "Open Peer Review" heading appears (10 s timeout) the rendered
    page is parsed with BeautifulSoup:

    * if a <p> mentions "awaiting peer review", one placeholder row with
      status "Awaiting Review" is appended and ``True`` is returned;
    * otherwise one row with status "Normal" is appended per list item that
      holds at least two body2 <span>s (name, then affiliation).

    Returns ``True`` when at least one row was appended, ``False`` when no
    reviewer was found or any exception occurred (the error is printed).
    """
    chrome_opts = Options()
    for flag in ("--no-sandbox", "--disable-dev-shm-usage"):
        chrome_opts.add_argument(flag)
    # Add "--headless" to the flags above to run without a visible window.
    browser = webdriver.Chrome(options=chrome_opts)

    try:
        browser.get(url)

        # Block until the "Open Peer Review" section has been rendered
        heading_locator = (By.XPATH, "//h3[contains(text(), 'Open Peer Review')]")
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(heading_locator)
        )

        time.sleep(random.uniform(1.2, 2.4))

        page = BeautifulSoup(browser.page_source, "html.parser")

        # Articles still waiting for review get a single placeholder row
        awaiting_notice = page.find(
            "p", string=lambda x: x and "awaiting peer review" in x.lower()
        )
        if awaiting_notice is not None:
            results.append({
                "title": title,
                "link": url,
                "name": "",
                "affiliation": "",
                "country": "",
                "status": "Awaiting Review"
            })
            return True

        # Regular case: one list item per reviewer
        found_any = False
        reviewer_items = page.find_all(
            "li",
            class_=lambda c: c and "MuiListItem-root" in c and "MuiListItem-gutters" in c,
        )

        for item in reviewer_items:
            body_spans = item.find_all(
                "span", class_=lambda c: c and "MuiTypography-body2" in c
            )
            if len(body_spans) >= 2:
                reviewer_name = body_spans[0].get_text(strip=True)
                affiliation_raw = body_spans[1].get_text(strip=True)
                if affiliation_raw.startswith(","):
                    affiliation_raw = affiliation_raw[1:].strip()

                # First comma-separated piece is kept as the institution,
                # the last one as the country.
                pieces = [piece.strip() for piece in affiliation_raw.split(",")]
                if len(pieces) >= 2:
                    institution, country = pieces[0], pieces[-1]
                else:
                    institution, country = affiliation_raw, ""

                results.append({
                    "title": title,
                    "link": url,
                    "name": reviewer_name,
                    "affiliation": institution,
                    "country": country,
                    "status": "Normal"
                })
                found_any = True

        return found_any

    except Exception as e:
        print(f" Error: {title} → {url} → {e}")
        return False

    finally:
        browser.quit()

# Main loop in chunks (the chunking only controls the progress banner)
total = len(titles)
for start in range(0, total, chunk_size):
    end = min(start + chunk_size, total)
    print(f"\n Processing articles {start + 1} to {end}...\n")

    chunk = zip(titles[start:end], urls[start:end])
    for idx, (title, url) in enumerate(chunk, start=start):
        print(f" Article {idx + 1}: {title}")

        # Rows without a link carry the "N/A" placeholder, which pandas reads
        # back as NaN (a float). Calling url.rstrip() on that would raise and
        # abort the run before anything is saved, so log the row and move on.
        if not isinstance(url, str) or url.strip() in ("", "N/A"):
            print(f" Final failure: {title}")
            errors.append({
                "title": title,
                "original link": url,
                "tried link": "",
                "error": "no link"
            })
            continue

        time.sleep(random.uniform(1.1, 2.2))

        if extract_reviewer_info(title, url):
            continue

        # Second attempt against the explicit first version of the article
        retry_url = url.rstrip("/") + "/v1"
        print(f" Trying fallback link /v1: {retry_url}")
        time.sleep(random.uniform(1.1, 2.2))

        if not extract_reviewer_info(title, retry_url):
            print(f" Final failure: {title}")
            errors.append({
                "title": title,
                "original link": url,
                "tried link": retry_url,
                "error": "both attempts failed"
            })

# Save successful data
pd.DataFrame(results).to_csv("4reviewer_info_full.csv", index=False)
print("\n Success data saved to 4reviewer_info_full.csv")

# Save error log
if errors:
    pd.DataFrame(errors).to_csv("4reviewer_info_errors.csv", index=False)
    print(" Error log saved to 4reviewer_info_errors.csv")
else:
    print(" No failed records!")


In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import csv

# Set up browser options
options = Options()
# options.add_argument('--headless')  # Uncomment for headless mode
options.add_argument('--disable-gpu')
options.add_argument('--window-size=1920,1080')
driver = webdriver.Chrome(options=options)

# (HTML marker, status) pairs checked in this order against each reviewer cell
REVIEW_STATUS_MARKERS = (
    ("<title>Approved</title>", "Approved"),
    ("<title>Approved with Reservations</title>", "Approved with Reservations"),
    ("<title>Not Approved</title>", "Not Approved"),
)

try:
    # Read input file
    df = pd.read_csv("1output.csv")
    titles = df["title"].tolist()
    urls = df["link"].tolist()

    # Output CSV file: one row per (article, version, reviewer column)
    with open("5reviewers_rounds.csv", "w", newline='', encoding="utf-8") as outfile:
        writer = csv.writer(outfile)
        writer.writerow(["title", "link", "version", "version date", "reviewer ID", "status"])

        for i, (title, url) in enumerate(zip(titles, urls)):
            print(f" Processing article {i+1}: {title}")
            try:
                driver.get(url)
                time.sleep(2)  # Wait for page to load

                soup = BeautifulSoup(driver.page_source, "html.parser")
                version_headers = soup.find_all("th", {"class": lambda x: x and "MuiTableCell-head" in x})

                # Page-level "awaiting peer review" notice. Matched as a
                # case-insensitive substring so that surrounding text or
                # capitalisation does not hide it.
                awaiting_notice = soup.find(
                    "p", string=lambda s: s and "awaiting peer review" in s.lower()
                ) is not None

                for th in version_headers:
                    # Only header cells whose button label mentions "version" are version rows
                    span = th.find("span", {"class": lambda x: x and "MuiButton-label" in x})
                    if span is None or "version" not in span.text:
                        continue
                    version_text = span.text.strip()

                    # Find date; a <time> without a datetime attribute yields ""
                    time_tag = th.find("time")
                    version_date = time_tag.get("datetime", "").strip() if time_tag else ""

                    # Get all <td> cells in this version row
                    row = th.find_parent("tr")
                    cell_html = [str(td) for td in row.find_all("td")]

                    # No status icon in any cell plus an "awaiting" notice on
                    # the page: record the version once as awaiting review.
                    has_any_review = any("<title>" in html for html in cell_html)
                    if not has_any_review and awaiting_notice:
                        writer.writerow([title, url, version_text, version_date, "", "awaiting"])
                        continue

                    # One row per reviewer column; cells without a status icon
                    # get an empty status.
                    for reviewer_id, html in enumerate(cell_html, start=1):
                        status = next(
                            (label for marker, label in REVIEW_STATUS_MARKERS if marker in html),
                            "",
                        )
                        writer.writerow([title, url, version_text, version_date, reviewer_id, status])

            except Exception as e:
                print(f" Error ({title}): {e}")
                writer.writerow([title, url, "ERROR", str(e), "", ""])
finally:
    # Close the browser even if reading the input or writing the output fails
    driver.quit()

print(" Output saved to: 5reviewers_rounds.csv")


In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import csv

# Load every article row; the whole table is processed (no slicing)
df = pd.read_csv("1output.csv")
df_subset = df

# Rows that raised while scraping: [title, url, error text]
failed_rows = []

# (marker looked for in the <li> title attribute, word stripped from its text)
STAT_RULES = (
    ("Page views", "views"),
    ("downloads", "downloads"),
    ("Crossref", "citations"),
)

# Output file
with open("6stats_all.csv", mode="w", newline='', encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["title", "url", "views", "downloads", "citations"])

    for idx, row in df_subset.iterrows():
        title, url = row["title"], row["link"]
        print(f" Processing article {idx+1}: {title}")

        # A separate Chrome instance is started for every article.
        # Add "--headless" to the flags to run without a visible window.
        chrome_options = Options()
        for flag in ("--disable-gpu", "--no-sandbox"):
            chrome_options.add_argument(flag)
        driver = webdriver.Chrome(options=chrome_options)

        try:
            driver.get(url)
            # Any <h3> being present is used as the "page rendered" signal
            h3_locator = (By.TAG_NAME, "h3")
            WebDriverWait(driver, 15).until(EC.presence_of_element_located(h3_locator))

            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Every metric defaults to "0"; a later matching <li> overwrites an earlier one
            metrics = dict.fromkeys(("views", "downloads", "citations"), "0")
            for li in soup.find_all("li"):
                tooltip = li.get("title", "")
                for marker, word in STAT_RULES:
                    if marker in tooltip:
                        metrics[word] = li.get_text(strip=True).replace(word, "")
                        break
            views = metrics["views"]
            downloads = metrics["downloads"]
            citations = metrics["citations"]

            writer.writerow([title, url, views, downloads, citations])
            print(f" Success: {views} views, {downloads} downloads, {citations} citations")

        except Exception as e:
            print(f" Error ({title}): {e}")
            writer.writerow([title, url, "ERROR", "ERROR", "ERROR"])
            failed_rows.append([title, url, str(e)])  # Log failure

        finally:
            driver.quit()

# Save failure log (only written when something failed)
if failed_rows:
    with open("6failed_stats.csv", mode="w", newline='', encoding="utf-8") as f_error:
        error_writer = csv.writer(f_error)
        error_writer.writerow(["title", "url", "error"])
        error_writer.writerows(failed_rows)

print("All processing completed!")


In [None]:
import pandas as pd

# Read the article table. 1output.csv is written with pandas' default comma
# separator; ";" only appears inside the funders/authors cells, so the file
# must not be parsed with sep=";".
df = pd.read_csv("1output.csv")

# Initialize two output lists
funders_rows = []  # one row per (article, funder)
count_rows = []    # one row per article

for _, row in df.iterrows():
    title = row["title"]
    link = row["link"]
    pub_time = row["publish_date"]
    funders_raw = str(row["funders"]).strip()

    # Split and clean funders. An empty cell is read back as NaN, which
    # str() turns into "nan"; treat it as "no funders".
    if funders_raw.lower() in ["nan", "", "none"]:
        funders_list = []
    else:
        funders_list = [f.strip() for f in funders_raw.split(";") if f.strip()]

    count = len(funders_list)

    # Record funder count per article
    count_rows.append([title, link, pub_time, count])

    # Record each funder entry; articles without funders keep one row with
    # an empty funder name.
    for funder in funders_list or [""]:
        funders_rows.append([title, link, pub_time, count, funder])

# Save funder details (one row per funder)
df_funders = pd.DataFrame(funders_rows, columns=["title", "link", "publish_date", "funder_count", "funder_name"])
df_funders.to_csv("7funders_extracted.csv", index=False)

# Save count of funders per article
df_count = pd.DataFrame(count_rows, columns=["title", "link", "publish_date", "funder_count"])
df_count.to_csv("7funders_count_per_article.csv", index=False)

