In [4]:
import csv
import json
import os

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

### Extract links from the given JSON (no need to run this cell; it has already been run for you)
This part extracts every link that contains "scopus.com" but does not contain "citedby".

Example Link with <a href="https://www.scopus.com/inward/citedby.uri?partnerID=HzOxMe3b&scp=85170238281&origin=inward">"citedby"</a> : <br>
<div style="text-align: center;">
  <img src="../image/with_citeby.png" alt="image" width="1000" height="500">
</div>
<br>
Example Link without <a href="https://www.scopus.com/inward/record.uri?partnerID=HzOxMe3b&scp=85050336797&origin=inward">"citedby"</a> : <br>
<div style="text-align: center;">
  <img src="../image/without_citeby.png" alt="image" width="1000" height="500">
</div>

In [15]:
def extract_scopus_links(data, links=None):
    """
    Walk a decoded JSON value and gather every Scopus URL found in it.

    Dict values and list items are searched recursively; dict keys are ignored.
    A string is kept when it contains "scopus.com" and does not contain
    "citedby". Matches are added to `links` (a new set when none is passed),
    and that same set is returned.
    """
    found = set() if links is None else links

    # Leaf case: a string either qualifies as a Scopus link or is ignored.
    if isinstance(data, str):
        if "scopus.com" in data and "citedby" not in data:
            found.add(data)
        return found

    # Container case: pick the child values to descend into.
    if isinstance(data, dict):
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        children = ()  # numbers, booleans, None: nothing to search

    for child in children:
        extract_scopus_links(child, found)
    return found

def save_links_to_files(links, output_base, max_links_per_file):
    """
    Write `links` to numbered text files of at most `max_links_per_file` links each.

    Files are named "<output_base>_1.txt", "<output_base>_2.txt", ... and hold
    one link per line (UTF-8, no trailing newline). Nothing is written when
    `links` is empty.

    Args:
        links: Iterable of link strings. A set or frozenset is sorted first so
            repeated runs produce identical files; any other iterable keeps
            the order it was given in.
        output_base: Path prefix shared by all output files.
        max_links_per_file: Maximum number of links per file; must be >= 1.

    Raises:
        ValueError: If `max_links_per_file` is less than 1.
    """
    if max_links_per_file < 1:
        raise ValueError(
            f"max_links_per_file must be at least 1, got {max_links_per_file!r}"
        )

    # Sets iterate in arbitrary order (string hashing is randomised per process),
    # so sort them to keep the contents of each batch stable between runs.
    if isinstance(links, (set, frozenset)):
        ordered = sorted(links)
    else:
        ordered = list(links)

    # Stepping the range by the batch size yields exactly ceil(n / size) batches.
    batch_starts = range(0, len(ordered), max_links_per_file)
    for file_no, start in enumerate(batch_starts, start=1):
        batch = ordered[start:start + max_links_per_file]

        output_file = f"{output_base}_{file_no}.txt"
        # Use utf-8 encoding when writing to the file
        with open(output_file, 'w', encoding='utf-8') as outfile:
            outfile.write("\n".join(batch))
        print(f"Saved {len(batch)} links to {output_file}.")

def process_folders(base_folder, output_base, max_links_per_file=200):
    """
    Collect Scopus links from every file under `base_folder` and save them in batches.

    Each file in the directory tree is opened as UTF-8 text and parsed as JSON,
    whatever its extension. Files that are not valid JSON are reported and
    skipped; any other error is reported without stopping the walk. The
    de-duplicated links are then passed to `save_links_to_files`.
    """
    all_links = set()

    for folder, _subfolders, filenames in os.walk(base_folder):
        for filename in filenames:
            path = os.path.join(folder, filename)
            # Progress line for the file about to be read
            print(f"Processing folder: {folder} | File: {filename}")
            try:
                with open(path, 'r', encoding='utf-8') as handle:
                    parsed = json.load(handle)
                all_links |= extract_scopus_links(parsed)
            except json.JSONDecodeError:
                # Content is not JSON: report it and move on to the next file
                print(f"Skipping file (not valid JSON): {path}")
            except Exception as error:
                # Unreadable file, wrong encoding, etc.: report and keep walking
                print(f"Error processing {path}: {error}")

    # Write everything that was collected, split into numbered files
    save_links_to_files(all_links, output_base, max_links_per_file)

# Step 1: Folder that holds the extracted files
base_folder = '../Project'  # Replace with the path to your extracted files

# Step 2: Base name for the link files ("<output_base>_1.txt", "<output_base>_2.txt", ...)
output_base = 'scopus_links_2023'

# Step 3: Batch size, defined once so the call and the message below cannot drift apart
max_links_per_file = 200

# Step 4: Walk the folder, extract the Scopus links and write them out in batches
process_folders(base_folder, output_base, max_links_per_file=max_links_per_file)

# Step 5: Notify that the files are saved
print(f"Scopus links have been saved in batches with a maximum of {max_links_per_file} links per file.")

Processing folder: ../Project\2023 | File: 202300000
Processing folder: ../Project\2023 | File: 202300001
Processing folder: ../Project\2023 | File: 202300002
Processing folder: ../Project\2023 | File: 202300003
Processing folder: ../Project\2023 | File: 202300004
Processing folder: ../Project\2023 | File: 202300005
Processing folder: ../Project\2023 | File: 202300006
Processing folder: ../Project\2023 | File: 202300007
Processing folder: ../Project\2023 | File: 202300008
Processing folder: ../Project\2023 | File: 202300009
Processing folder: ../Project\2023 | File: 202300010
Processing folder: ../Project\2023 | File: 202300011
Processing folder: ../Project\2023 | File: 202300012
Processing folder: ../Project\2023 | File: 202300013
Processing folder: ../Project\2023 | File: 202300014
Processing folder: ../Project\2023 | File: 202300015
Processing folder: ../Project\2023 | File: 202300016
Processing folder: ../Project\2023 | File: 202300017
Processing folder: ../Project\2023 | File: 202

### Web scraping from the extracted links
This part scrapes the following data from each page:
- title
- authors
- article_info
- abstract
- categories/keyword
- citation_info
- document_info
- author_tags
- affiliations
- funding

<div style="text-align: center;">
  <img src="../image/label_1.png" alt="image" width="1000" height="500">
</div>
<div style="text-align: center;">
  <img src="../image/label_2.png" alt="image" width="1000" height="500">
</div>

In [22]:
def _text_or_empty(tag):
    """Return the whitespace-stripped text of `tag`, or "" when `tag` is None."""
    return tag.get_text(strip=True) if tag is not None else ""


def _non_empty_texts(tags):
    """Return the stripped text of each tag in `tags`, dropping tags whose text is empty."""
    texts = (tag.get_text(strip=True) for tag in tags)
    return [text for text in texts if text]


def _positional_fields(soup, list_id, field_names):
    """Map the <li> texts of <ul id=list_id> onto `field_names` by position."""
    fields = {}
    ul_tag = soup.find('ul', {'id': list_id})
    if ul_tag is None:
        return fields
    # zip stops at the shorter sequence, so surplus <li> items are ignored
    for name, li in zip(field_names, ul_tag.find_all('li')):
        label = li.find('strong')
        if label is not None:
            # Remove the <strong> element (presumably the field label) so that
            # only the remaining text of the item is read.
            label.extract()
        fields[name] = li.get_text(strip=True)
    return fields


def _funding_rows(soup):
    """Return one sponsor/number/acronym dict per <tr class="lightGreyBorderBottom">."""
    columns = ("Funding Sponsor", "Funding Number", "Acronym")
    rows = []
    for row in soup.find_all('tr', {'class': 'lightGreyBorderBottom'}):
        cells = row.find_all('td')
        record = dict.fromkeys(columns, "")
        # Rows with fewer than three cells are kept, but with blank values
        if len(cells) >= 3:
            for column, cell in zip(columns, cells):
                record[column] = cell.get_text(strip=True)
        rows.append(record)
    return rows


def extract_data(url, driver_path=None):
    """
    Open a record page in Chrome and scrape its metadata.

    A new Chrome session is started for the call and always closed afterwards.

    Args:
        url: Address of the page to scrape.
        driver_path: Optional path to a chromedriver executable. When omitted,
            the driver is resolved with `ChromeDriverManager().install()`
            instead of relying on a machine-specific hard-coded path.

    Returns:
        A dict with the keys "title", "authors", "article_info", "abstract",
        "categories/keyword", "citation_info", "document_info", "author_tags",
        "affiliations" and "funding". Anything not found on the page is left
        as "", [] or {}. An empty dict is returned when loading or parsing the
        page raises.

    Raises:
        Errors raised while starting the browser are not caught here, because
        the driver is created before the try block.
    """
    print(f"Opening page: {url}")  # Print progress as soon as the page starts loading
    chrome_options = Options()
    chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration (useful for some environments)

    # The driver location is passed through a Service object; the older
    # `executable_path=` keyword is no longer accepted by recent Selenium releases.
    service = Service(driver_path or ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, "lxml")
        data = {}

        # The text of the last <h2> on the page is used as the title; the key
        # is always present so every record has the same set of fields.
        # NOTE(review): confirm the last <h2> (not the first) is the title.
        headings = soup.select('h2')
        data["title"] = headings[-1].get_text(strip=True) if headings else ""

        author_section = soup.find('section', {'id': 'authorlist'})
        if author_section is not None:
            data["authors"] = _non_empty_texts(
                author_section.find_all('span', {'class': 'previewTxt'})
            )
        else:
            data["authors"] = []

        data["article_info"] = _text_or_empty(soup.find('span', {'id': 'journalInfo'}))

        abstract_section = soup.find('section', {'id': 'abstractSection'})
        first_paragraph = abstract_section.find('p') if abstract_section is not None else None
        data["abstract"] = _text_or_empty(first_paragraph)

        data["categories/keyword"] = _text_or_empty(
            soup.find('span', {'id': 'guestAccessSourceTitle'})
        )

        # Both lists carry their values by position, in the order named here
        data["citation_info"] = _positional_fields(
            soup, 'citationInfo', ["ISSN", "Source Type", "Original Language"]
        )
        data["document_info"] = _positional_fields(
            soup, 'documentInfo', ["Document Type", "Publisher"]
        )

        data["author_tags"] = _non_empty_texts(soup.find_all('span', {'class': 'badges'}))

        affiliation_section = soup.find('section', {'id': 'affiliationlist'})
        if affiliation_section is not None:
            data["affiliations"] = _non_empty_texts(affiliation_section.find_all('li'))
        else:
            data["affiliations"] = []

        data["funding"] = _funding_rows(soup)
    except Exception as e:
        print(f"Error scraping {url}: {e}")  # Log errors during scraping
        data = {}
    finally:
        # Close the browser whether or not scraping succeeded
        driver.quit()
    print(f"Finished scraping: {url}")  # Notify when scraping for this URL is complete
    return data


### Run This Command
1. Change the input file path to the selected `.txt` link file
2. Change the output file name

In [None]:
# Input: text file with one URL per line
links_file = 'data/extracted_2019/scopus_links_2019.txt_16.txt'
# Output: CSV file that receives one row per processed URL
output_csv = 'test_output.csv'

# Read as UTF-8, the same encoding the link files are written with
with open(links_file, 'r', encoding='utf-8') as file:
    urls = [line.strip() for line in file if line.strip()]

# All links are processed; slice here (e.g. urls[:5]) for a quick trial run
test_urls = urls
total_urls = len(test_urls)

# Extract data from the selected URLs
all_data = []

# The loop variable has its own name so the `test_urls` list is not overwritten
# by a single URL string, which would break a re-run of this cell.
for index, url in enumerate(test_urls, start=1):
    print(f"\nProcessing URL {index}/{total_urls}...")
    try:
        extracted_data = extract_data(url)  # Calls the function that now shows progress as pages open
        all_data.append(extracted_data)
    except Exception as e:
        print(f"Error processing URL {url}: {e}")
    print(f"Completed URL {index}/{total_urls}.")

# Convert data to a pandas DataFrame
df = pd.DataFrame(all_data)

# Save to a CSV file
df.to_csv(output_csv, index=False)

# Report the file that was actually written
print(f"Data extraction complete. Saved to {output_csv}.")



Processing URL 1/87...
Opening page: https://www.scopus.com/inward/record.uri?partnerID=HzOxMe3b&scp=85087225912&origin=inward
Finished scraping: https://www.scopus.com/inward/record.uri?partnerID=HzOxMe3b&scp=85087225912&origin=inward
Completed URL 1/87.

Processing URL 2/87...
Opening page: https://www.scopus.com/inward/record.uri?partnerID=HzOxMe3b&scp=85062345412&origin=inward
Finished scraping: https://www.scopus.com/inward/record.uri?partnerID=HzOxMe3b&scp=85062345412&origin=inward
Completed URL 2/87.

Processing URL 3/87...
Opening page: https://www.scopus.com/inward/record.uri?partnerID=HzOxMe3b&scp=85069180565&origin=inward
Finished scraping: https://www.scopus.com/inward/record.uri?partnerID=HzOxMe3b&scp=85069180565&origin=inward
Completed URL 3/87.

Processing URL 4/87...
Opening page: https://www.scopus.com/inward/record.uri?partnerID=HzOxMe3b&scp=85065545013&origin=inward
Finished scraping: https://www.scopus.com/inward/record.uri?partnerID=HzOxMe3b&scp=85065545013&origin