In [17]:
import os
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import csv
import pandas as pd

In [3]:
def extract_scopus_links(data, links=None):
    """
    Recursively extract all Scopus links from a JSON object, excluding those with 'citedbyresults'.
    """
    if links is None:
        links = set()

    if isinstance(data, dict):
        for key, value in data.items():
            extract_scopus_links(value, links)
    elif isinstance(data, list):
        for item in data:
            extract_scopus_links(item, links)
    elif isinstance(data, str) and "scopus.com" in data and "citedby" not in data:
        links.add(data)

    return links

def process_folders(base_folder, output_file):
    """
    Process all files in subfolders, extracting Scopus links from files containing JSON content.
    """
    collected_links = set()

    # Walk through each folder and file
    for root, dirs, files in os.walk(base_folder):
        for file in files:
            file_path = os.path.join(root, file)
            print(f"Processing folder: {root} | File: {file}")  # Print current folder and file being processed
            try:
                # Try to open and parse the file as JSON, even if it's not a .json file
                with open(file_path, 'r', encoding='utf-8') as f:
                    try:
                        # Try loading the content as JSON
                        json_data = json.load(f)

                        # Extract Scopus links
                        links = extract_scopus_links(json_data)
                        collected_links.update(links)
                    except json.JSONDecodeError:
                        # If the file content is not valid JSON, print a message
                        print(f"Skipping file (not valid JSON): {file_path}")
            except Exception as e:
                print(f"Error processing {file_path}: {e}")

    # Save all collected links to a single file
    with open(output_file, 'w') as outfile:
        outfile.write("\n".join(collected_links))

    print(f"Found {len(collected_links)} Scopus links (excluding 'citedby'). Saved to {output_file}.")

# Step 1: Specify the folder where the extracted files are located
base_folder = 'Project'  # Replace with the path to your extracted files

# Step 2: Define the output file path
output_file = 'scopus_links.txt'  # Output file path for saving the Scopus links

# Step 3: Process the folder and extract Scopus links
process_folders(base_folder, output_file)

# Step 4: Notify that the file is saved
print(f"Scopus links have been saved to {output_file}. You can open it locally.")


Processing folder: Project\2018 | File: 201800000
Processing folder: Project\2018 | File: 201800001
Processing folder: Project\2018 | File: 201800002
Processing folder: Project\2018 | File: 201800003
Processing folder: Project\2018 | File: 201800004
Processing folder: Project\2018 | File: 201800005
Processing folder: Project\2018 | File: 201800006
Processing folder: Project\2018 | File: 201800007
Processing folder: Project\2018 | File: 201800008
Processing folder: Project\2018 | File: 201800009
Processing folder: Project\2018 | File: 201800010
Processing folder: Project\2018 | File: 201800011
Processing folder: Project\2018 | File: 201800012
Processing folder: Project\2018 | File: 201800013
Processing folder: Project\2018 | File: 201800014
Processing folder: Project\2018 | File: 201800015
Processing folder: Project\2018 | File: 201800016
Processing folder: Project\2018 | File: 201800017
Processing folder: Project\2018 | File: 201800018
Processing folder: Project\2018 | File: 201800019


KeyboardInterrupt: 

'<!DOCTYPE html>\n<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->\n<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->\n<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->\n<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->\n<head>\n<title>Attention Required! | Cloudflare</title>\n<meta charset="UTF-8" />\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n<meta http-equiv="X-UA-Compatible" content="IE=Edge" />\n<meta name="robots" content="noindex, nofollow" />\n<meta name="viewport" content="width=device-width,initial-scale=1" />\n<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/cf.errors.css" />\n<!--[if lt IE 9]><link rel="stylesheet" id=\'cf_styles-ie-css\' href="/cdn-cgi/styles/cf.errors.ie.css" /><![endif]-->\n<style>body{margin:0;padding:0}</style>\n\n\n<!--[if gte IE 10]><!-->\n<script>\n  if (!navigator.cookieEnabled) {\n    window.addEventListene

In [33]:
def extract_data(url):
    chrome_options = Options()
    chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration (useful for some environments)

# Specify path to chromedriver if necessary, e.g., 'path/to/chromedriver'
    driver = webdriver.Chrome(executable_path=r"C:\Users\ASUS\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe", options=chrome_options)

    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    data = {}

    # Extraction logic (same as your provided code)
    h2_elements = soup.select('h2')
    for h2 in h2_elements:
        inner_text = h2.get_text(strip=True)
        data["title"] = inner_text if inner_text else ""

    author_section = soup.find('section', {'id': 'authorlist'})
    if author_section:
        author_tags = author_section.find_all('span', {'class': 'previewTxt'})
        authors = [author_tag.get_text(strip=True) for author_tag in author_tags if author_tag.get_text(strip=True)]
        data["authors"] = authors if authors else []
    else:
        data["authors"] = []

    journal_info_span = soup.find('span', {'id': 'journalInfo'})
    data["article_info"] = journal_info_span.get_text(strip=True) if journal_info_span else ""

    abstract_section = soup.find('section', {'id': 'abstractSection'})
    if abstract_section:
        p_tag = abstract_section.find('p')
        data["abstract"] = p_tag.get_text(strip=True) if p_tag else ""
    else:
        data["abstract"] = ""

    span_tag = soup.find('span', {'id': 'guestAccessSourceTitle'})
    data["categories/keyword"] = span_tag.get_text(strip=True) if span_tag else ""

    citation_ul = soup.find('ul', {'id': 'citationInfo'})
    citation_info = {}
    citation_fields = ["ISSN", "Source Type", "Original Language"]
    if citation_ul:
        li_tags = citation_ul.find_all('li')
        for i, li in enumerate(li_tags):
            strong_tag = li.find('strong')
            if strong_tag:
                strong_tag.extract()
            clean_text = li.get_text(strip=True)
            if i < len(citation_fields):
                citation_info[citation_fields[i]] = clean_text
    data["citation_info"] = citation_info

    document_ul = soup.find('ul', {'id': 'documentInfo'})
    document_info = {}
    document_fields = ["Document Type", "Publisher"]
    if document_ul:
        li_tags = document_ul.find_all('li')
        for i, li in enumerate(li_tags):
            strong_tag = li.find('strong')
            if strong_tag:
                strong_tag.extract()
            clean_text = li.get_text(strip=True)
            if i < len(document_fields):
                document_info[document_fields[i]] = clean_text
    data["document_info"] = document_info if document_info else {}

    author_tags = soup.find_all('span', {'class': 'badges'})
    authors = [author_tag.get_text(strip=True) for author_tag in author_tags if author_tag.get_text(strip=True)]
    data["author_tags"] = authors if authors else []

    affiliation_section = soup.find('section', {'id': 'affiliationlist'})
    if affiliation_section:
        affiliation_tags = affiliation_section.find_all('li')
        affiliations = [affiliation_tag.get_text(strip=True) for affiliation_tag in affiliation_tags if affiliation_tag.get_text(strip=True)]
        data["affiliations"] = affiliations if affiliations else []
    else:
        data["affiliations"] = []

    funding_rows = soup.find_all('tr', {'class': 'lightGreyBorderBottom'})
    funding_data = []
    for funding_row in funding_rows:
        td_tags = funding_row.find_all('td')
        funding_info = {"Funding Sponsor": "", "Funding Number": "", "Acronym": ""}
        if len(td_tags) >= 3:
            funding_info["Funding Sponsor"] = td_tags[0].get_text(strip=True) if td_tags[0] else ""
            funding_info["Funding Number"] = td_tags[1].get_text(strip=True) if td_tags[1] else ""
            funding_info["Acronym"] = td_tags[2].get_text(strip=True) if td_tags[2] else ""
        funding_data.append(funding_info)
    data["funding"] = funding_data if funding_data else []
    driver.quit()

    return data


In [35]:
# with open('scopus_links.txt', 'r') as file:
#     urls = [line.strip() for line in file if line.strip()]

# # Extract data from all URLs
# all_data = []
# for url in urls:
#     try:
#         extracted_data = extract_data(url)
#         all_data.append(extracted_data)
#     except Exception as e:
#         print(f"Error processing URL {url}: {e}")

# # Convert data to a pandas DataFrame
# df = pd.DataFrame(all_data)

# # Save to a CSV file
# df.to_csv('output.csv', index=False)

# # Close the driver

# print("Data extraction complete. Saved to output.csv.")

with open('scopus_links.txt', 'r') as file:
    urls = [line.strip() for line in file if line.strip()]

# Test only the first 5 links
test_urls = urls

# Extract data from the test URLs
all_data = []
for url in test_urls:
    try:
        extracted_data = extract_data(url)
        all_data.append(extracted_data)
    except Exception as e:
        print(f"Error processing URL {url}: {e}")

# Convert data to a pandas DataFrame
df = pd.DataFrame(all_data)

# Save to a CSV file
df.to_csv('test_output.csv', index=False)

print("Data extraction for all URLs complete. Saved to test_output.csv.")


Error processing URL Results from Scopus, 2017 September 17 Retrieved from: https://scopus.com.: Message: invalid argument
  (Session info: chrome=131.0.6778.85)

Error processing URL Durango-Cohen, E.J., Balasubramanian, S.K., Effective segmentation of university alumni: Mining contribution data with finite-mixture models. Research in Higher Education 56 (2014), 78–104 Retrieved from http://www.scopus.com/inward/record.url?eid=2-s2.0-84901580676&partnerID=40&md5=b271fa0bfc55ea84b325b4fab5f01157.: Message: invalid argument
  (Session info: chrome=131.0.6778.85)

Data extraction for all URLs complete. Saved to test_output.csv.
