In [1]:
import re
import json
import requests

from sys import exit
from datetime import datetime
from bs4 import BeautifulSoup

In [2]:
# Browser-like request headers sent with every arXiv request.
# NOTE: "br" (Brotli) is deliberately NOT advertised in Accept-Encoding.
# requests/urllib3 only decode Brotli transparently when an optional Brotli
# package is installed; without it, a Brotli-encoded reply would be handed to
# the HTML parser as raw compressed bytes.
REQ_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1"
}


# Prefix used to turn the site-relative hrefs found on the page into absolute URLs.
BASE_URL = "https://arxiv.org"

# First catchup page to scrape: the cs.PL (Programming Languages) listing for
# 2024-11-01, requested with abs=True.
SCRAPE_START_URL = "https://www.arxiv.org/catchup/cs.PL/2024-11-01?abs=True"


In [3]:
# Seconds to wait for the server before giving up. Without a timeout,
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

# Fetch the first catchup page.
response = requests.get(SCRAPE_START_URL, headers=REQ_HEADERS, timeout=REQUEST_TIMEOUT_SECONDS)

# Anything other than HTTP 200 is treated as a failed fetch.
if response.status_code != 200:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")
    page_content = None
    # Stop here with a non-zero status: a bare exit() would report success
    # (status 0) even though nothing was downloaded.
    raise SystemExit(1)

# Keep the raw bytes and the parsed tree; later cells read `soup`.
page_content = response.content
soup = BeautifulSoup(page_content, 'html.parser')


In [4]:
# Print the parsed <head> element, pretty-printed, as a quick look at the fetched page.
print(soup.head.prettify())

<head>
 <title>
 </title>
 <meta content="width=device-width, initial-scale=1" name="viewport"/>
 <link href="/static/browse/0.3.4/images/icons/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
 <link href="/static/browse/0.3.4/images/icons/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
 <link href="/static/browse/0.3.4/images/icons/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
 <link href="/static/browse/0.3.4/images/icons/site.webmanifest" rel="manifest"/>
 <link color="#5bbad5" href="/static/browse/0.3.4/images/icons/safari-pinned-tab.svg" rel="mask-icon"/>
 <meta content="#da532c" name="msapplication-TileColor"/>
 <meta content="#ffffff" name="theme-color"/>
 <link href="/static/browse/0.3.4/css/arXiv.css?v=20241206" media="screen" rel="stylesheet" type="text/css">
  <link href="/static/browse/0.3.4/css/arXiv-print.css?v=20200611" media="print" rel="stylesheet" type="text/css">
   <link href="/static/browse/0.3.4/css/browse_search.css" 

In [5]:
# Locate the text node that contains "Total of ..." (the page summary line).
# soup.find(string=...) returns the matching string itself, not its parent tag.
div_with_entries = soup.find(string=re.compile(r'Total of'))

# Defaults used when the summary line (or part of it) is missing.
total_entries_on_this_page = 0
current_page_date = ""
current_page_formatted_date = ""

if div_with_entries:
    total_entries_text = div_with_entries.get_text()

    # Number of entries; accepts both "entries" and the singular "entry".
    entries_match = re.search(r'Total of (\d+) entr(?:y|ies)', total_entries_text)
    if entries_match:
        total_entries_on_this_page = int(entries_match.group(1))

    # Date that follows "entries for". Matched with a regex so that a summary
    # line without that phrase keeps the defaults instead of raising IndexError.
    date_match = re.search(r'entr(?:y|ies) for\s*(.+)', total_entries_text)
    if date_match:
        current_page_date = date_match.group(1).strip()

        # e.g. "Fri, 01 Nov 2024" -> "2024-11-01"; a malformed date still raises ValueError.
        date_obj = datetime.strptime(current_page_date, '%a, %d %b %Y')
        current_page_formatted_date = date_obj.strftime('%Y-%m-%d')


# Absolute URL of the next day's page, or None when the link is absent.
next_page_url = None
next_page_a_tag = soup.find('a', string='Continue to the next day')
if next_page_a_tag:
    next_page_url = BASE_URL + next_page_a_tag["href"]


# Page-level summary consumed by the later cells.
page_meta_data = {
    "total_articles": total_entries_on_this_page,
    "date_raw": current_page_date,
    "date_clean": current_page_formatted_date,
    "next_page_url": next_page_url
}

In [6]:
# Display the page-level metadata dictionary.
page_meta_data

{'total_articles': 9,
 'date_raw': 'Fri, 01 Nov 2024',
 'date_clean': '2024-11-01',
 'next_page_url': 'https://arxiv.org/catchup/cs.PL/2024-11-04?abs=True&page=1'}

In [7]:
def _strip_label(text, label):
    """Return ``text`` without its leading ``label``, trimmed of whitespace.

    Only a prefix is removed, so the same words appearing later in the value
    (e.g. a paper title that itself contains "Title:") are preserved.
    """
    text = text.strip()
    if text.startswith(label):
        text = text[len(label):]
    return text.strip()


# Fields read from each <dd>: (output key, tag name, class attribute, leading label).
# A label of None means the element text is used as-is (only stripped).
_DD_FIELDS = (
    ('title', 'div', 'list-title mathjax', 'Title:'),
    ('authors', 'div', 'list-authors', 'Authors:'),
    ('comments', 'div', 'list-comments mathjax', 'Comments:'),
    ('journal_refrence', 'div', 'list-journal-ref', 'Journal-ref:'),  # key spelling kept: it is part of the output schema
    ('description', 'p', 'mathjax', None),
    ('subjects', 'div', 'list-subjects', 'Subjects:'),
)

# Initialize a list to hold the extracted data
papers = []
# "categories" lists the category names in page order; each name is also a
# top-level key mapping to that category's entry count.
categories_meta_data = {
    "categories": []
}

# Find all <dl id="articles"> lists in the page; each is handled as one category.
articles = soup.find_all("dl", id='articles')

for entry in articles:

    dds = entry.find_all('dd')  # meta
    dts = entry.find_all('dt')  # links

    # Category name and its entry count come from the list's <h3> heading.
    category = ""
    category_tag = entry.find('h3')
    if category_tag:
        category = category_tag.text.split('(')[0].strip()
        entries_match = re.search(r'of (\d+) entries', category_tag.text)
        total_articles_of_category = int(entries_match.group(1)) if entries_match else 0
        categories_meta_data["categories"].append(category)
        categories_meta_data[category] = total_articles_of_category

    # <dt> (links) and <dd> (metadata) are paired by position. If the counts
    # differ, report it and pair what can be paired rather than raising IndexError.
    if len(dts) != len(dds):
        print(f"Warning: found {len(dts)} <dt> but {len(dds)} <dd> elements in category {category!r}; pairing the first {min(len(dts), len(dds))}.")

    for dt, dd in zip(dts, dds):
        paper = {'category': category}

        # Extract title, authors, comments, journal reference, description and subjects
        for key, tag_name, css_class, label in _DD_FIELDS:
            field_tag = dd.find(tag_name, class_=css_class)
            if field_tag:
                if label is None:
                    paper[key] = field_tag.text.strip()
                else:
                    paper[key] = _strip_label(field_tag.text, label)

        # Extract the abstract name and link
        abs_tag = dt.find('a', title='Abstract')
        if abs_tag:
            paper['abs_name'] = abs_tag.text.strip()
            paper['abs_link'] = BASE_URL + abs_tag['href']

        # Extract the PDF link
        pdf_tag = dt.find('a', title='Download PDF')
        if pdf_tag:
            paper['pdf_link'] = BASE_URL + pdf_tag['href']

        # Add the paper to the list
        papers.append(paper)


In [8]:
# Display the category names and per-category entry counts collected while parsing.
categories_meta_data

{'categories': ['New submissions',
  'Cross submissions',
  'Replacement submissions'],
 'New submissions': 1,
 'Cross submissions': 3,
 'Replacement submissions': 5}

In [9]:
# Number of paper records extracted from the page.
len(papers)

9

In [10]:
# Show the first extracted paper record.
papers[0]

{'category': 'New submissions',
 'title': 'Abstract Continuation Semantics for Multiparty Interactions in Process Calculi based on CCS',
 'authors': 'Eneia Nicolae Todoran (Dept. of Computer Science, Technical University of Cluj-Napoca), Gabriel Ciobanu (Academia Europaea)',
 'comments': 'In Proceedings FROM 2024, arXiv:2410.23020',
 'journal_refrence': 'EPTCS 410, 2024, pp. 18-37',
 'description': 'We develop denotational and operational semantics designed with continuations for process calculi based on CCS extended with mechanisms offering support for multiparty interactions. We investigate the abstractness of this continuation semantics. We show that our continuation-based denotational models are weakly abstract with respect to the corresponding operational models.',
 'subjects': 'Programming Languages (cs.PL); Distributed, Parallel, and Cluster Computing (cs.DC); Multiagent Systems (cs.MA)',
 'abs_name': 'arXiv:2410.23761',
 'abs_link': 'https://arxiv.org/abs/2410.23761',
 'pdf_lin

In [11]:
# Bundle the page summary, the per-category counts and the paper records
# into the single structure that gets written to disk.
full_page_data = dict(
    page_metadata=page_meta_data,
    page_categories_meta_data=categories_meta_data,
    page_articles=papers,
)


In [12]:
# Display the combined page payload.
full_page_data

{'page_metadata': {'total_articles': 9,
  'date_raw': 'Fri, 01 Nov 2024',
  'date_clean': '2024-11-01',
  'next_page_url': 'https://arxiv.org/catchup/cs.PL/2024-11-04?abs=True&page=1'},
 'page_categories_meta_data': {'categories': ['New submissions',
   'Cross submissions',
   'Replacement submissions'],
  'New submissions': 1,
  'Cross submissions': 3,
  'Replacement submissions': 5},
 'page_articles': [{'category': 'New submissions',
   'title': 'Abstract Continuation Semantics for Multiparty Interactions in Process Calculi based on CCS',
   'authors': 'Eneia Nicolae Todoran (Dept. of Computer Science, Technical University of Cluj-Napoca), Gabriel Ciobanu (Academia Europaea)',
   'comments': 'In Proceedings FROM 2024, arXiv:2410.23020',
   'journal_refrence': 'EPTCS 410, 2024, pp. 18-37',
   'description': 'We develop denotational and operational semantics designed with continuations for process calculi based on CCS extended with mechanisms offering support for multiparty interaction

In [13]:
# Save the page payload as indented JSON; the file name carries the page date.
output_filename = f'arxiv_programming_catchup_data_{current_page_formatted_date}.json'
with open(output_filename, 'w') as json_file:
    json.dump(full_page_data, json_file, indent=4)