In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
from datetime import datetime
import time



In [5]:
# Function to parse JSON response
def parse_scopus_response(json_response):
    entries = []
    for entry in json_response.get('search-results', {}).get('entry', []):
        cover_date = entry.get('prism:coverDate')
        # Ensure cover_date is present and not in the future
        if cover_date and datetime.strptime(cover_date, "%Y-%m-%d").date() <= datetime.now().date():
            affiliations = entry.get('affiliation', [])
            affiliation_info = []
            for aff in affiliations:
                aff_name = aff.get('affilname', 'N/A')
                aff_city = aff.get('affiliation-city', 'N/A')
                aff_country = aff.get('affiliation-country', 'N/A')
                affiliation_info.append({
                    'name': aff_name,
                    'city': aff_city,
                    'country': aff_country
                })

            article = {
                'title': entry.get('dc:title'),
                'author': entry.get('dc:creator'),
                'publicationName': entry.get('prism:publicationName'),
                'cover_date': cover_date,
                'scopus_id': entry.get('dc:identifier'),
                'cited_by_count': entry.get('citedby-count'),
                'open_access': entry.get('openaccessFlag'),
                'eid' : entry.get('eid'),
                'aggregationType' : entry.get('prism:aggregationType', 'N/A'),
                'affiliations': affiliation_info,
                'link': entry.get('link')[2]['@href'] if 'link' in entry and len(entry['link']) > 0 else None
            }
            entries.append(article)
    return entries


In [6]:
# Function to fetch data from Scopus API with pagination
def fetch_scopus_data(api_key, query, max_records=1000, count=25):
    base_url = "https://api.elsevier.com/content/search/scopus"
    all_entries = []
    start_index = 0

    while len(all_entries) < max_records:
        params = {
            'query': query,
            'start': start_index,
            'count': count,
            'apiKey': api_key
        }
        headers = {'Accept': 'application/json'}
        response = requests.get(base_url, headers=headers, params=params)

        if response.status_code == 200:
            json_response = response.json()
            articles = parse_scopus_response(json_response)
            if not articles:
                print("No more articles found or all articles have future dates.")
                break  # Stop if no articles are found in the response
            all_entries.extend(articles)
            start_index += count
        else:
            print(f"Error: {response.status_code} - {response.text}")
            break

        # Avoid exceeding the total max_records
        if len(all_entries) >= max_records:
            all_entries = all_entries[:max_records]
            break

    return all_entries

In [24]:
# Function to extract abstracts for all links in a DataFrame
def extract_abstracts(df):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    }
    
    abstracts = []
    
    for index, row in df.iterrows():
        link = row['link']
        try:
            # Make a GET request to the article link
            response = requests.get(link, headers=headers)
            
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                
                # Locate the section by id "abstractSection"
                abstract_section = soup.find('section', id='abstractSection')
                
                # Find all <p> tags within this section
                if abstract_section:
                    paragraphs = abstract_section.find_all('p')
                    
                    # Combine all text from the <p> tags
                    abstract_text = " ".join(p.get_text(strip=True) for p in paragraphs)
                    abstracts.append(abstract_text if abstract_text else "Abstract not available")
                else:
                    abstracts.append("Abstract section not found")
            elif response.status_code == 403:
                abstracts.append("Access forbidden - Possible login required")
            else:
                abstracts.append(f"Failed to fetch page (status {response.status_code})")
        
        except requests.exceptions.RequestException as e:
            abstracts.append(f"Request failed: {str(e)}")
        except Exception as e:
            abstracts.append(f"Error: {str(e)}")
        
        # Optional delay to avoid rate limiting
        time.sleep(5)
    
    # Add abstracts back to the DataFrame
    df['abstract'] = abstracts
    return df
# Fetch the abstracts and add them to the DataFrame
df_with_abstracts = extract_abstracts(df)

# Display the updated DataFrame
print(df_with_abstracts)


KeyboardInterrupt: 

In [11]:
import re
from playwright.sync_api import Page, expect

def extract_abstact(page: Page):
    page.goto(df['link'][0])
    expect(page).to_have_title(df['title'])


In [52]:
# Function to randomly pick articles
def pick_random_articles(articles, total=1000):
    if len(articles) < total:
        print(f"Warning: Only {len(articles)} articles available. Returning all.")
        return articles
    return random.sample(articles, total)

In [8]:
# Main script
if __name__ == "__main__":
    API_KEY = "c8690de363626d560cf56cc17f9369d6"
    QUERY = 'TITLE("data science")'
    TOTAL_ARTICLES = 1000

    # Step 1: Fetch data from API
    articles = fetch_scopus_data(API_KEY, QUERY, max_records=TOTAL_ARTICLES)

    # Step 2: Extract abstracts using EIDs
    # articles_with_abstracts = extract_abstracts_via_eid(articles)

    # Step 3: Save data to DataFrame
    df = pd.DataFrame(articles)
    # df = pd.DataFrame(articles_with_abstracts)
    print(df.head())  # Print the first few rows for validation

                                               title        author  \
0  Analysing trends of computational urban scienc...      Kumar D.   
1  A landmark federal interagency collaboration t...  Justice A.C.   
2  Regional planning: A failed or flawed project ...    Chirisa I.   
3  Data Science and Model Predictive Control:: A ...   Morato M.M.   
4  Assessment of the relationship between central...    Akabane S.   

                        publicationName  cover_date              scopus_id  \
0           Computational Urban Science  2024-12-01  SCOPUS_ID:85209789532   
1                            JAMIA Open  2024-12-01  SCOPUS_ID:85208963031   
2  Regional Science Policy and Practice  2024-12-01  SCOPUS_ID:85208099811   
3            Journal of Process Control  2024-12-01  SCOPUS_ID:85207933325   
4                    Scientific Reports  2024-12-01  SCOPUS_ID:85207210995   

  cited_by_count  open_access                 eid aggregationType  \
0              0         True  2-s2.0-852

In [65]:
df.shape

(1000, 11)

In [23]:
article_names = []

for article in articles:
    # If the article has a valid link, try to extract the abstract
    if df['link']:  # Use the 'link' directly for the abstract
        abstract = extract_abstracts(df)  # Pass the link to fetch abstract
        article_names.append(abstract)  # Collect abstracts
    else:
        article_names.append("No link available")  # Add a placeholder if no link is present

# Print the first 10 abstracts as a preview
print(article_names[:10])


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [16]:
df['link'][0]

'https://www.scopus.com/inward/record.uri?partnerID=HzOxMe3b&scp=85209789532&origin=inward'

In [None]:
https://www.scopus.com/inward/record.uri?partn

In [None]:
import re
from playwright.sync_api import Page, expect

def extract_abstact(page: Page):
    page.goto(df['link'][0])
    expect(page).to_have_title(df['title'])


In [None]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Example text
text = "Data Science in Supporting Hotel Management: An Application of Deep Learning Techniques in 2024!"

# Define a regular expression pattern to extract words
words = re.findall(r'\b[a-zA-Z]+\b', text)

# Remove stopwords by checking if each word is not in the stopword list
filtered_words = [word for word in words if word.lower() not in ENGLISH_STOP_WORDS]

# Display the filtered words
print(filtered_words)

In [23]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [27]:
def clean_text(text):
    # Extract words (alphabetic characters only)
    words = re.findall(r'\b[a-zA-Z]+\b', text)
    # Remove stopwords
    filtered_words = [word for word in words if word.lower() not in ENGLISH_STOP_WORDS]
    return filtered_words # Join words back into a string

# Apply the clean_text function to the 'title' and 'name' columns
df['cleaned_title'] = df['title'].apply(lambda x: clean_text(str(x)))
df['cleaned_publicationName'] = df['publicationName'].apply(lambda x: clean_text(str(x)))

# Show the updated DataFrame
print(df[['cleaned_title', 'cleaned_publicationName']])

                                         cleaned_title  \
0    [Data, Science, Supporting, Hotel, Management,...   
1    [Data, science, Analysis, Using, Deep, Learnin...   
2    [Data, science, based, reconstruction, D, memb...   
3    [Data, science, sustainable, entrepreneurship,...   
4    [Applying, data, science, methodologies, artif...   
..                                                 ...   
995  [Data, Science, literacy, future, security, in...   
996  [Predictive, modeling, heat, formation, sulfur...   
997  [Data, Science, Model, Predictive, Control, su...   
998  [Machine, learning, data, science, techniques,...   
999  [Harnessing, Data, Science, Produced, Water, E...   

                               cleaned_publicationName  
0           [Smart, Innovation, Systems, Technologies]  
1    [International, Conference, Knowledge, Enginee...  
2                         [Journal, Membrane, Science]  
3         [Technological, Forecasting, Social, Change]  
4               [A

In [38]:
output_filename = "scopus_data.csv"
df.to_csv(output_filename, index=False)

In [None]:
output_filename = "scopus_data.csv"
df.to_csv(output_filename, index=False)