In [1]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:
bigdata = requests.get('https://arxiv.org/html/2401.00009v1')
soup = BeautifulSoup(bigdata.text, "lxml")

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
response = requests.get('https://arxiv.org/html/2401.00009v1', headers=headers)

abstract = soup.findAll("div",{"class":"ltx_abstract"})
keyword = soup.findAll("div",{"class":"ltx_keywords"})
author = soup.findAll("span",{"class":"ltx_contact ltx_role_affiliation"})

data = []

abstract_text = " ".join([re.sub(r"\bAbstract[\.\n]*", "", item.get_text()).strip() for item in abstract])
keyword_text = " ".join([item.get_text().replace("Keywords: ", "").strip() for item in keyword])
keywords_array = re.split(r'(?<!^)\s(?=[A-Z])', keyword_text)
author_text = " ".join([item.get_text() for item in author])

countries = [item.get_text().strip().split()[-1] for item in author]

data.append({'abstract':abstract_text, 'keyword':keywords_array, 'country':countries})

if "reCAPTCHA" in soup.get_text():
    print("Blocked by CAPTCHA. Exiting...")

# soup.get_text()
data

[{'abstract': 'In the wake of large language models, there has been a resurgence of claims and questions about the Turing test and its value for AI, which are reminiscent of decades of practical “Turing” tests. If AI were quantum physics, by now several “Schrödinger’s” cats could have been killed. Better late than never, it is time for a historical reconstruction of Turing’s beautiful thought experiment. In this paper I present a wealth of evidence, including new archival sources, give original answers to several open questions about Turing’s 1950 paper, and address the core question of the value of Turing’s test.',
  'keyword': ['Alan',
   'Turing,',
   'Turing test,',
   'Thought experiment,',
   'Foundations of',
   'AI & computer science,',
   'Galileo',
   'Galilei,',
   'History of science,',
   'History of',
   'AI'],
  'country': ['PauloBrazil']}]

In [18]:
data = []

# Start at January 2024 (2401)
month = 1
year = 24
paper_id = 7033  # Global paper ID
not_found_count = 0

while len(data) < 2000:
    # Format YYMM (e.g., "2401" for Jan 2024)
    yymm = f"{year:02}{month:02}"

    # Build URL with global paper_id
    url = f'https://arxiv.org/html/{yymm}.{paper_id:05}v1'
    print(f"The current paper id is {yymm}.{paper_id:05}")
    try:
        bigdata = requests.get(url)
        soup = BeautifulSoup(bigdata.text, "lxml")
        
        if "reCAPTCHA" in soup.get_text():
            print("Blocked by CAPTCHA. Exiting...")
            break

        # Check for "not found" page
        if soup.find("h1", string=re.compile(r"Article .* not found")):
            not_found_count += 1
            if not_found_count == 2:  # If two consecutive "not found," no more papers exist
                break
            paper_id += 1  # Increment paper_id globally
            continue

        # Reset not_found_count when a valid page is found
        not_found_count = 0

        # Extract fields
        abstract = soup.findAll("div",{"class":"ltx_abstract"})
        keyword = soup.findAll("div",{"class":"ltx_keywords"})
        author = soup.findAll("span",{"class":"ltx_contact ltx_role_affiliation"})

        # Process data
        abstract_text = " ".join([re.sub(r"\bAbstract[\.\n]*", "", item.get_text()).strip() for item in abstract])
        keyword_text = " ".join([item.get_text().replace("Keywords: ", "").strip() for item in keyword]).strip()
        keywords_array = re.split(r'(?<!^)\s(?=[A-Z])', keyword_text)
        author_text = " ".join([item.get_text() for item in author]).strip()
        countries = [item.get_text().strip().split()[-1] for item in author]

        # Skip if any key field is empty
        # if abstract_text and keyword_text and author_text:
        data.append({'abstract': abstract_text, 'keyword': keywords_array, 'country': countries})

        # Break if we reach 1000 entries
        if len(data) >= 1000:
            break

        paper_id += 1  # Increment paper_id globally
    except Exception as e:
        print(f"Error on paper {yymm}.{paper_id:05}: {e}")
        paper_id += 1  # Ensure paper_id increments even on error

    # Move to the next month if two consecutive "not found" occur and paper_id crosses into a new month
    if not_found_count == 2:
        month += 1
        not_found_count = 0
        if month == 12:  # If December is exceeded, break from the loop (exceed this year)
            break
    
    print(f"Scraped {len(data)} entries so far...")

    time.sleep(1)

print(f"Data collected: {len(data)} entries")


The current paper id is 2401.07033
Scraped 1 entries so far...
The current paper id is 2401.07034
Scraped 2 entries so far...
The current paper id is 2401.07035
Scraped 3 entries so far...
The current paper id is 2401.07036
Scraped 4 entries so far...
The current paper id is 2401.07037
Scraped 5 entries so far...
The current paper id is 2401.07038
Scraped 6 entries so far...
The current paper id is 2401.07039
Scraped 7 entries so far...
The current paper id is 2401.07040
Scraped 8 entries so far...
The current paper id is 2401.07041
Scraped 9 entries so far...
The current paper id is 2401.07042
Scraped 10 entries so far...
The current paper id is 2401.07043
Scraped 11 entries so far...
The current paper id is 2401.07044
Scraped 12 entries so far...
The current paper id is 2401.07045
Scraped 13 entries so far...
The current paper id is 2401.07046
Scraped 14 entries so far...
The current paper id is 2401.07047
Scraped 15 entries so far...
The current paper id is 2401.07048
Scraped 16 ent

In [19]:
df = pd.DataFrame(data)
df.to_csv('out8.csv', index=False)
df

Unnamed: 0,abstract,keyword,country
0,,[],[]
1,,[],[]
2,,[],[]
3,,[],[]
4,,[],[]
...,...,...,...
995,,[],[]
996,,[],[]
997,,[],[]
998,,[],[]
