# LLMsecurity website AI-related urls scraping

Since this is a site listing links to large language model security content - research, papers, and news collected and posted by @llm_sec, so won't do any filtration, but just getting all the useful information

In [6]:
import requests
import re
import json
from collections import Counter
from bs4 import BeautifulSoup
from datetime import datetime

In [7]:
def get_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a', href=True)
    return [link['href'] for link in links]

From the above occurrence count, just implement code for arxiv and aclanthology website, and munaully go through the rest of them:

### scraping for ArXiv papers

In [8]:
def arxiv_scrape(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    paper_list = []
    
    # Extract id
    paper_id = soup.find('span', class_='arxivid').a.text.split(':')[1]
    #print(paper_id)
    
    # Extract title
    title = soup.find('h1', class_='title mathjax').text.replace('Title:', '').strip()
    #print(title)

    # Extract authors
    authors_tag = soup.find('div',class_='authors')
    authors = ', '.join([author.get_text() for author in authors_tag.find_all('a')])
    #print(authors)

    # Extract abstract
    abstract = soup.find('blockquote',class_='abstract mathjax').text.replace('Abstract:', '').strip()
    #print(abstract)
    
    pdf = url.replace('abs','pdf')+'.pdf'
    #print(pdf)
    
    # Extract date
    date_tag = soup.find('div', class_='dateline').text.replace(']','').strip().split(' ')[2:5]
    date = datetime.strptime(' '.join(date_tag), '%d %b %Y').strftime('%Y-%m-%d')
    #print(date)
    
    # Extract categories
    cate_tag = soup.find('td', class_='tablecell subjects').text.strip()
    cate = ', '.join(str(element.split('(')[1])[:-1] for element in cate_tag.split(';'))
    #print(cate)
    
    paper_list.append({
        "domain": 'arxiv',
        "id": paper_id,
        "title": title,
        "authors": authors,
        "abstract": abstract,
        'categories': cate,
        'url': pdf,
        "publish_date": date

    })
    return paper_list

### scraping for Aclanthology papers

In [9]:
def acl_scrape(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    paper_list = []
    
    # Extract id
    paper_id = url.split('/')[-2]
    #print(paper_id)
    
    # Extract title
    title = soup.find('h2', id='title').a.text.strip()
    #print(title)

    # Extract authors
    authors_tag = soup.find('p',class_='lead')
    authors = ', '.join([author.get_text() for author in authors_tag.find_all('a')])
    #print(authors)

    # Extract abstract
    abstract = soup.find('div',class_='card-body acl-abstract').span.text.strip()
    #print(abstract)
    
    pdf = url[:-1]+'.pdf'
    #print(pdf)
    
    # Extract date
    year = soup.find('dt', string='Year:').find_next('dd').text.strip()
    month = soup.find('dt', string='Month:').find_next('dd').text.strip()
    month_number = datetime.strptime(month, '%B').month
    date_object = datetime(int(year), month_number, 1)
    date = date_object.strftime('%Y-%m')
    #print(date)
    
    paper_list.append({
        "domain": 'aclanthology',
        "id": paper_id,
        "title": title,
        "authors": authors,
        "abstract": abstract,
        'url': pdf,
        "publish_date": date

    })
    return paper_list

### manual to do

In [10]:
def manual_scrape(url,domain):

    paper_list = []
    
    paper_list.append({
        "domain": domain,
        "title": '',
        "authors": '',
        "abstract":'',
        'url': url,
        "publish_date": ''
    })
    return paper_list

### Scrape all the urls from the site
then perform further extraction based on different websites

Extract the first word after "http://" to see the distribution of those links' domains, then decide to do code scraping or simply manual go through

In [11]:
domain=[]

pattern = r'https://([^\/]+)'
for link in links:
    match = re.search(pattern, link)
    if match:
        if match.group(1).split(".")[0] == 'www':
            domain.append(match.group(1).split(".")[1])
        else:
            domain.append(match.group(1).split(".")[0])
        
domain_count = Counter(domain)

# Display count distribution
for domain, count in domain_count.items():
    print(f"{domain}: {count} occurrences")

NameError: name 'links' is not defined

In [12]:
base_url = 'https://llmsecurity.net'
raw_links = get_links(base_url)
links = raw_links[7:-16]

paper = []
pattern = r'https://([^\/]+)'

for link in links:
    match = re.search(pattern, link)
    if match.group(1).split(".")[0] == 'www':
        domain = match.group(1).split(".")[1]
    else:
        domain = match.group(1).split(".")[0]
    
    print(f'scraping infomation on {link}')
    if link[-3:] == 'pdf':
        paper.append(manual_scrape(link,domain))
    elif domain == 'arxiv':
        paper.append(arxiv_scrape(link))
    elif domain == 'aclanthology':
        paper.append(acl_scrape(link))
    else:
        paper.append(manual_scrape(link,domain))
    print('='*25)

scraping infomation on https://arxiv.org/abs/2307.15008
scraping infomation on https://ceur-ws.org/Vol-3462/TADA4.pdf
scraping infomation on https://arxiv.org/abs/2305.14950
scraping infomation on https://arxiv.org/abs/1905.02175
scraping infomation on https://www.youtube.com/watch?v=uqOfC3KSZFc
scraping infomation on https://arxiv.org/abs/2106.09898
scraping infomation on https://arxiv.org/abs/2109.11308
scraping infomation on https://aclanthology.org/2023.trustnlp-1.24/
scraping infomation on https://arxiv.org/abs/2310.03693
scraping infomation on https://arxiv.org/abs/2104.13733
scraping infomation on https://arxiv.org/abs/2307.12507
scraping infomation on https://aclanthology.org/2023.trustnlp-1.9/
scraping infomation on https://arxiv.org/abs/2307.15043
scraping infomation on https://arxiv.org/abs/2210.10683
scraping infomation on https://arxiv.org/abs/1905.12457
scraping infomation on https://aclanthology.org/2023.acl-long.194/
scraping infomation on https://arxiv.org/abs/2305.100

scraping infomation on https://blog.trailofbits.com/2022/10/03/semgrep-maching-learning-static-analysis/
scraping infomation on https://openreview.net/forum?id=BxZgduuNDl5
scraping infomation on https://arxiv.org/abs/2307.16630
scraping infomation on https://www.ncsc.gov.uk/blog-post/thinking-about-security-ai-systems
scraping infomation on https://www.amazon.science/publications/towards-building-a-robust-toxicity-predictor
scraping infomation on https://aclanthology.org/2023.findings-acl.561/
scraping infomation on https://arxiv.org/abs/2309.01669
scraping infomation on https://aclanthology.org/2022.findings-naacl.137/
scraping infomation on https://aclanthology.org/2023.trustnlp-1.21/
scraping infomation on https://aclanthology.org/2023.trustnlp-1.25/
scraping infomation on https://aclanthology.org/2023.findings-acl.237/
scraping infomation on https://arxiv.org/abs/2011.10369
scraping infomation on https://arxiv.org/abs/2308.02122
scraping infomation on https://arxiv.org/abs/2309.162

### output as json file

In [10]:
json_data = json.dumps(paper, indent=2)

# Write JSON data to a file
with open('llmsecurity_scraped_metadata.json', 'w') as json_file:
    json_file.write(json_data)

print("Data has been stored in 'llmsecurity_scraped_metadata.json'.")

Data has been stored in 'llmsecurity_scraped_metadata.json'.


### output text url file

In [24]:
#print(paper)
ans= []
for item in paper:
    url = item[0]['url']
    if ('https' in url) and not ('youtube' in url or 'youtu.be' in url):
        #print(url)
        ans.append(url)
        
with open('llmsecurity_net.txt', 'w') as file:
    for link in ans:
        file.write(link+'\n')