In [21]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
from datetime import datetime
import time

In [3]:
# Function to parse JSON response
def parse_scopus_response(json_response):
    """Convert one Scopus search JSON payload into a list of article dicts.

    Entries are kept only when they carry a ``prism:coverDate`` in
    ``YYYY-MM-DD`` form that is not later than today. Entries with a missing,
    malformed, or future cover date are skipped rather than aborting the
    whole page.

    Args:
        json_response: Decoded JSON body (a dict). The entries are read from
            ``json_response['search-results']['entry']``; either level may be
            absent or null.

    Returns:
        A list of dicts with the keys ``title``, ``author``,
        ``publicationName``, ``cover_date``, ``scopus_id``,
        ``cited_by_count``, ``open_access``, ``eid``, ``aggregationType``,
        ``affiliations`` (list of ``{'name', 'city', 'country'}`` dicts) and
        ``link`` (the ``@href`` of the third link, or ``None`` when the entry
        has fewer than three links).
    """
    # Evaluate "today" once so every entry in the page is compared against
    # the same date.
    today = datetime.now().date()
    raw_entries = (json_response.get('search-results') or {}).get('entry') or []

    entries = []
    for entry in raw_entries:
        cover_date = entry.get('prism:coverDate')
        if not cover_date:
            continue
        try:
            published = datetime.strptime(cover_date, "%Y-%m-%d").date()
        except (TypeError, ValueError):
            # A single malformed date must not discard the rest of the page.
            continue
        if published > today:
            continue

        affiliation_info = [
            {
                'name': aff.get('affilname', 'N/A'),
                'city': aff.get('affiliation-city', 'N/A'),
                'country': aff.get('affiliation-country', 'N/A'),
            }
            for aff in (entry.get('affiliation') or [])
        ]

        # The record URL is taken from the third link; guard the index so
        # entries with fewer links yield None instead of raising IndexError.
        links = entry.get('link') or []
        link = links[2].get('@href') if len(links) > 2 else None

        entries.append({
            'title': entry.get('dc:title'),
            'author': entry.get('dc:creator'),
            'publicationName': entry.get('prism:publicationName'),
            'cover_date': cover_date,
            'scopus_id': entry.get('dc:identifier'),
            'cited_by_count': entry.get('citedby-count'),
            'open_access': entry.get('openaccessFlag'),
            'eid': entry.get('eid'),
            'aggregationType': entry.get('prism:aggregationType', 'N/A'),
            'affiliations': affiliation_info,
            'link': link,
        })
    return entries


In [4]:
# Function to fetch data from Scopus API with pagination
def fetch_scopus_data(api_key, query, max_records=1000, count=25):
    """Page through the Scopus search endpoint and collect parsed articles.

    Each page is requested with ``start``/``count`` parameters and converted
    with ``parse_scopus_response``. Paging continues until ``max_records``
    articles have been collected, the API stops returning entries, the
    reported total has been consumed, or a request fails.

    A page whose entries are all filtered out by the parser (for example
    because every cover date lies in the future) no longer stops pagination;
    only a page with no raw entries does.

    Args:
        api_key: Key sent as the ``apiKey`` query parameter.
        query: Search expression sent as the ``query`` parameter.
        max_records: Upper bound on the number of articles returned.
        count: Page size requested per call.

    Returns:
        A list of at most ``max_records`` article dicts.
    """
    base_url = "https://api.elsevier.com/content/search/scopus"
    headers = {'Accept': 'application/json'}
    all_entries = []
    start_index = 0

    while len(all_entries) < max_records:
        params = {
            'query': query,
            'start': start_index,
            'count': count,
            'apiKey': api_key
        }
        try:
            # A timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(base_url, headers=headers, params=params, timeout=30)
        except requests.exceptions.RequestException as exc:
            print(f"Error: request failed - {exc}")
            break

        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.text}")
            break

        json_response = response.json()
        search_results = json_response.get('search-results') or {}
        raw_entries = search_results.get('entry') or []

        # NOTE(review): an empty result set appears to be reported as a single
        # entry carrying an 'error' key — confirm against the API docs. Such a
        # page is treated as the end of the results.
        if not raw_entries or any(
            isinstance(item, dict) and 'error' in item for item in raw_entries
        ):
            print("No more articles found.")
            break

        all_entries.extend(parse_scopus_response(json_response))
        start_index += count

        # Stop once the reported total has been consumed, when it is available.
        try:
            total_results = int(search_results.get('opensearch:totalResults'))
        except (TypeError, ValueError):
            total_results = None
        if total_results is not None and start_index >= total_results:
            break

    # Trim any overshoot from the final page.
    return all_entries[:max_records]

In [29]:
# Main script
if __name__ == "__main__":
    import os  # local import so this cell does not rely on a later cell

    # Read the credential from the environment instead of embedding it in the
    # notebook. Any key that was previously committed here should be treated
    # as compromised and rotated.
    API_KEY = os.environ.get("SCOPUS_API_KEY")
    if not API_KEY:
        raise RuntimeError(
            "Set the SCOPUS_API_KEY environment variable before running this cell."
        )
    QUERY = 'TITLE("data science")'
    TOTAL_ARTICLES = 1100

    # Step 1: Fetch data from API
    articles = fetch_scopus_data(API_KEY, QUERY, max_records=TOTAL_ARTICLES)

    # Step 2: Save data to DataFrame
    df = pd.DataFrame(articles)
    print(df.head())  # Print the first few rows for validation

                                               title        author  \
0  Analysing trends of computational urban scienc...      Kumar D.   
1  A landmark federal interagency collaboration t...  Justice A.C.   
2  Regional planning: A failed or flawed project ...    Chirisa I.   
3  Data Science and Model Predictive Control:: A ...   Morato M.M.   
4  Assessment of the relationship between central...    Akabane S.   

                        publicationName  cover_date              scopus_id  \
0           Computational Urban Science  2024-12-01  SCOPUS_ID:85209789532   
1                            JAMIA Open  2024-12-01  SCOPUS_ID:85208963031   
2  Regional Science Policy and Practice  2024-12-01  SCOPUS_ID:85208099811   
3            Journal of Process Control  2024-12-01  SCOPUS_ID:85207933325   
4                    Scientific Reports  2024-12-01  SCOPUS_ID:85207210995   

  cited_by_count  open_access                 eid aggregationType  \
0              0         True  2-s2.0-852

In [30]:
import os

# Build the CSV location relative to the current working directory and load
# it into ``df``. NOTE(review): the file is presumably produced by an export
# step that is not shown in this notebook — confirm it exists before running.
script_dir = os.getcwd()
print(script_dir)
path = os.path.join(script_dir, "../results/fetched_scopus.csv")
df = pd.read_csv(filepath_or_buffer=path)

/Users/dear/Data Science/Project/Data-Sci-project/src


In [31]:
# Sanity check: display the (rows, columns) shape of the loaded frame.
df.shape

(1100, 11)

In [32]:
# Function to fetch abstract from a link
def fetch_abstract(link, delay, max_retries=3):
    """Download a record page and return the text of its abstract section.

    The page is parsed for ``<section id="abstractSection">`` and the text of
    all ``<p>`` tags inside it is joined with single spaces.

    On HTTP 429 the request is retried: the function sleeps ``delay``
    seconds, doubles the delay, and tries again, up to ``max_retries``
    additional attempts.

    Args:
        link: URL of the page to fetch.
        delay: Initial number of seconds to wait after a 429 response.
        max_retries: Maximum number of retries after a 429 response.
            Negative values are treated as 0.

    Returns:
        The abstract text on success; a descriptive message string when the
        section is missing, access is forbidden, the status is unexpected, or
        an exception occurs; ``None`` when the rate limit persists after all
        retries.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    }
    retries_left = max(max_retries, 0)
    try:
        while True:
            # Request the page; the timeout prevents a stalled connection
            # from blocking the whole batch.
            response = requests.get(link, headers=headers, timeout=30)

            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')

                # Find the section with id="abstractSection"
                abstract_section = soup.find('section', id='abstractSection')
                if abstract_section:
                    # Combine all text from the <p> tags within this section
                    paragraphs = abstract_section.find_all('p')
                    return " ".join(p.get_text(strip=True) for p in paragraphs)
                return "Abstract section not found"

            if response.status_code == 403:
                return "Access forbidden - Possible login required"

            if response.status_code == 429:
                if retries_left == 0:
                    # Retries exhausted: keep returning None for a persistent
                    # rate limit, as before.
                    return None
                retries_left -= 1
                print("Rate limit hit. Retrying ")
                time.sleep(delay)
                delay *= 2  # exponential back-off between attempts
                continue

            return f"Failed to fetch page (status {response.status_code})"
    except requests.exceptions.RequestException as e:
        return f"Request failed: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"


In [38]:
# Module-level accumulator that extract_abstracts_from_df appends to.
# Re-running this cell discards every abstract collected so far.
batch_extract = []

In [88]:
def extract_abstracts_from_df(df, start=60, stop=80, delay=50):
    """Fetch abstracts for a batch of rows and append them to ``batch_extract``.

    Args:
        df: DataFrame with a ``link`` column holding the page URLs.
        start: First row label of the batch (inclusive).
        stop: End of the batch (exclusive).
        delay: Initial back-off in seconds handed to ``fetch_abstract``.

    Returns:
        The module-level ``batch_extract`` list, which accumulates results
        across calls; calling this twice for the same range appends the
        abstracts twice.
    """
    for i in range(start, stop):
        link = df['link'][i]
        if pd.isna(link):
            # No URL for this row: keep the position aligned without issuing
            # a request that cannot succeed.
            abstract = None
        else:
            abstract = fetch_abstract(link, delay)
        batch_extract.append(abstract)
        print(abstract)
    return batch_extract
extract_abstracts_from_df(df)

In [None]:
# Attach the collected abstracts as a new column; the list must have exactly
# one item per row of df.
# NOTE(review): presumably every batch has to be extracted first — confirm
# len(batch_extract) == len(df) before running this cell.
df['abstract'] = batch_extract

In [103]:
# Keep only rows with a usable abstract: drop the explicit placeholder text
# first, then any rows whose abstract is missing.
has_real_abstract = df['abstract'] != '[No abstract available]'
df_cleaned = df[has_real_abstract].dropna(subset='abstract')
df_cleaned

Unnamed: 0,title,author,publicationName,cover_date,scopus_id,cited_by_count,open_access,eid,aggregationType,affiliations,link,abstract
0,Analysing trends of computational urban scienc...,Kumar D.,Computational Urban Science,2024-12-01,SCOPUS_ID:85209789532,0,True,2-s2.0-85209789532,Journal,[{'name': 'State University of New York Albany...,https://www.scopus.com/inward/record.uri?partn...,Urban computing with a data science approaches...
1,A landmark federal interagency collaboration t...,Justice A.C.,JAMIA Open,2024-12-01,SCOPUS_ID:85208963031,0,True,2-s2.0-85208963031,Journal,"[{'name': 'VA Connecticut Healthcare System', ...",https://www.scopus.com/inward/record.uri?partn...,"Objectives: In 2016, the Department of Veteran..."
2,Regional planning: A failed or flawed project ...,Chirisa I.,Regional Science Policy and Practice,2024-12-01,SCOPUS_ID:85208099811,0,True,2-s2.0-85208099811,Journal,"[{'name': 'University of the Free State', 'cit...",https://www.scopus.com/inward/record.uri?partn...,This paper explores regional planning as an ap...
3,Data Science and Model Predictive Control:: A ...,Morato M.M.,Journal of Process Control,2024-12-01,SCOPUS_ID:85207933325,0,False,2-s2.0-85207933325,Journal,"[{'name': 'Université Grenoble Alpes', 'city':...",https://www.scopus.com/inward/record.uri?partn...,Model Predictive Control (MPC) is an establish...
4,Assessment of the relationship between central...,Akabane S.,Scientific Reports,2024-12-01,SCOPUS_ID:85207210995,0,True,2-s2.0-85207210995,Journal,"[{'name': 'The University of Tokyo Hospital', ...",https://www.scopus.com/inward/record.uri?partn...,Purpose The relationship between the height of...
...,...,...,...,...,...,...,...,...,...,...,...,...
1095,Proceedings - 2023 International Conference on...,,Proceedings - 2023 International Conference on...,2023-01-01,SCOPUS_ID:85190663820,0,False,2-s2.0-85190663820,Conference Proceeding,[],https://www.scopus.com/inward/record.uri?partn...,The proceedings contain 56 papers. The topics ...
1096,Early Diabetics Prognosis Prediction System Ad...,Gopi S.,Proceedings of the 2023 International Conferen...,2023-01-01,SCOPUS_ID:85190500840,0,False,2-s2.0-85190500840,Conference Proceeding,"[{'name': 'Panimalar Engineering College', 'ci...",https://www.scopus.com/inward/record.uri?partn...,Early diabetic prognosis is carried out for th...
1097,Data Science with Semantic Technologies: New T...,Patel A.,Data Science with Semantic Technologies: New T...,2023-01-01,SCOPUS_ID:85190492747,1,False,2-s2.0-85190492747,Book,[{'name': 'National Forensic Sciences Universi...,https://www.scopus.com/inward/record.uri?partn...,As data is an important asset for any organiza...
1098,Data Science with Semantic Technologies,Milli M.,Data Science with Semantic Technologies: New T...,2023-01-01,SCOPUS_ID:85190492612,0,False,2-s2.0-85190492612,Book,[{'name': 'Bolu Abant İzzet Baysal Üniversites...,https://www.scopus.com/inward/record.uri?partn...,"Today, with the development of digital technol..."


In [23]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [27]:
def clean_text(text):
    """Split ``text`` into alphabetic words and drop English stop words.

    Words are runs of ASCII letters delimited by word boundaries. The stop
    word comparison is case-insensitive, but surviving words keep their
    original casing.

    Returns:
        A list of the remaining words in their original order (not a joined
        string).
    """
    kept = []
    for token in re.findall(r'\b[a-zA-Z]+\b', text):
        if token.lower() in ENGLISH_STOP_WORDS:
            continue
        kept.append(token)
    return kept

# Tokenise the 'title' and 'publicationName' columns with clean_text, storing
# each result in a matching 'cleaned_*' column on df.
cleaned_columns = {
    'title': 'cleaned_title',
    'publicationName': 'cleaned_publicationName',
}
for source_col, target_col in cleaned_columns.items():
    df[target_col] = df[source_col].apply(lambda raw: clean_text(str(raw)))

# Show the newly added columns
print(df[list(cleaned_columns.values())])

                                         cleaned_title  \
0    [Data, Science, Supporting, Hotel, Management,...   
1    [Data, science, Analysis, Using, Deep, Learnin...   
2    [Data, science, based, reconstruction, D, memb...   
3    [Data, science, sustainable, entrepreneurship,...   
4    [Applying, data, science, methodologies, artif...   
..                                                 ...   
995  [Data, Science, literacy, future, security, in...   
996  [Predictive, modeling, heat, formation, sulfur...   
997  [Data, Science, Model, Predictive, Control, su...   
998  [Machine, learning, data, science, techniques,...   
999  [Harnessing, Data, Science, Produced, Water, E...   

                               cleaned_publicationName  
0           [Smart, Innovation, Systems, Technologies]  
1    [International, Conference, Knowledge, Enginee...  
2                         [Journal, Membrane, Science]  
3         [Technological, Forecasting, Social, Change]  
4               [A

In [86]:
# Persist the filtered frame to CSV in the working directory, without
# writing the index as a column.
output_filename = "abstract_draft1.csv"
df_cleaned.to_csv(path_or_buf=output_filename, index=False)