# ArXiv Dataset Filtration for AI PDFs

In [1]:
import os
import json
import time
import re
from datetime import datetime

The arXiv metadata dataset was downloaded from Kaggle: https://www.kaggle.com/datasets/Cornell-University/arxiv

![image.png](attachment:image.png)<br>
Date of download: 21 January 2024

In [2]:
# Load the metadata snapshot. The file is parsed one line at a time, each
# non-empty line being decoded as one JSON object (one paper record).
data = []
# An explicit encoding keeps the decoding independent of the platform default.
with open('arxiv-metadata-oai-snapshot.json', 'r', encoding='utf-8') as file:
    for line in file:
        # A blank line (e.g. a trailing newline at end of file) is not a
        # record; json.loads would raise JSONDecodeError on it.
        if not line.strip():
            continue
        data.append(json.loads(line))

In [3]:
# Report how many records were read from the snapshot file.
num_loaded = len(data)
print('total number of papers in metadata: ', num_loaded)

total number of papers in metadata:  2403902


## Filter on math/cs/stat categorized papers
We only keep papers that list at least one category starting with `math`, `cs`, or `stat`.

In [52]:
# Keep a paper when at least one of its categories begins with "math", "cs"
# or "stat". The 'categories' value is split on whitespace; a record without
# that key is treated as having no categories and is dropped.
cate_data = [
    paper
    for paper in data
    if any(
        category.startswith(("math", "cs", "stat"))
        for category in paper.get('categories', '').split()
    )
]

In [53]:
# Report the size of the category-filtered subset.
# NOTE(review): the printed label still reads "in metadata" although the
# number is len(cate_data); the text is left unchanged here.
num_in_categories = len(cate_data)
print('total number of papers in metadata: ', num_in_categories)

total number of papers in metadata:  1138099


## Extract target text
We use the title and abstract, joined into one string, as the target text and perform keyword matching on it with regular expressions.

In [54]:
def extract_title_abstract(paper):
    """Build the single string that the keyword regexes are matched against.

    The title and the abstract each have their newlines replaced by spaces
    and are then joined with a single '.' (no surrounding spaces).

    Args:
        paper: dict-like record; its 'title' and 'abstract' values are read
            with ``.get()``.

    Returns:
        str: ``'<title>.<abstract>'``. A field that is missing or ``None``
        contributes an empty string instead of raising ``AttributeError``.
    """
    # `or ''` covers both an absent key and an explicit None value.
    title = (paper.get('title') or '').replace('\n', ' ')
    abstract = (paper.get('abstract') or '').replace('\n', ' ')
    return title + '.' + abstract

## First round of general filtration
The first, general round of filtration makes sure the remaining papers are all AI-related, or at least mention AI.
* The word boundary `\b` is used so that only the exact terms are matched, not substrings of longer words.

In [71]:
# Regex alternatives for the general "is this paper about AI?" filter.
# Acronyms are written in one fixed capitalisation; multi-word phrases accept
# an upper- or lower-case first letter per word via (X|x) groups.
AI_word = [
    '(A|a)rtificial (I|i)ntelligence',
    'AI',
    'LLMs?',                                # optional plural "s"
    '(L|l)arge (L|l)anguage (M|m)odels?',   # optional plural "s"
    '(c|C)hat(gpt|GPT)',
    # NOTE(review): requires "gpt"/"GPT" directly followed by "4", so the
    # hyphenated spelling "GPT-4" is not matched by this entry.
    '(gpt|GPT)4',
    'NLP',
    '(n|N)atural (L|l)anguage',
    '(d|D)eep (L|l)earning',
    '(m|M)achine (L|l)earning',
]

# Single alternation string built from all terms; printed for inspection.
temp = '|'.join(AI_word)
print(temp)

(A|a)rtificial (I|i)ntelligence|AI|LLMs?|(L|l)arge (L|l)anguage (M|m)odels?|(c|C)hat(gpt|GPT)|(gpt|GPT)4|NLP|(n|N)atural (L|l)anguage|(d|D)eep (L|l)earning|(m|M)achine (L|l)earning


In [72]:
# First-pass filter: keep every paper in `cate_data` whose title+abstract
# contains at least one of the AI terms in `temp`, delimited by word
# boundaries (\b) on both sides.
pattern_whole = r'\b(?:{})\b'.format(temp)
# Compiled once, outside the loop. No flags are passed, so matching is
# case-sensitive; the terms themselves spell out the capitalisations accepted.
ai_regex = re.compile(pattern_whole)
general_filter = []

start_time = time.time()
for paper in cate_data:
    try:
        title_abstract = extract_title_abstract(paper)

        # Only the presence of a match matters here, so search() (which stops
        # at the first hit) is used rather than collecting every match.
        if ai_regex.search(title_abstract):
            general_filter.append(paper)

    except Exception as e:
        # Best effort: report the problem and carry on with the next paper.
        print(f"An error occurred: {e}")

end_time = time.time()
time_spent = end_time - start_time
print(f'time spent: {time_spent:.1f} seconds')
print('number of papers filtered: ',len(general_filter))

time spent: 62.0 seconds
number of papers filtered:  111783


## Determine keywords for regex matching
The target keywords are selected from the MITRE ATLAS attack classes, plus some other commonly used terms.

In [80]:
# Attack-related keyword patterns. Each entry is one regex fragment and may
# itself contain several '|'-separated alternatives or an optional plural.
important_word = [
    'poison|poisoning|poisoned',
    'evade|evasion',
    'prompt injection',
    'backdoor|backdoored',
    'jailbreak',
    'hijacking',
    'extraction attacks?',
    'privacy leakage',
    'data leak|data leaking|data leakage',
    'denial of service|DDOS',
    'deepfake',
    'adversarial examples?|adversarial attacks?|adversarial samples?',
    'ransomwares?',
]

## Regex matching with detailed keywords
Run the regex for each keyword individually to see how many hits it produces, so the keyword list can be adjusted quickly. A paper is counted only under the first keyword that matches it.

In [81]:
# Per-keyword pass over `general_filter`: for every keyword pattern, report
# how many not-yet-collected papers it matches and how long the scan took.
offence_result = []
# Object identities of the papers already collected. Looking a paper up here
# is O(1), whereas testing `paper in offence_result` scans the whole list.
# The ids stay valid because `general_filter` keeps every paper alive.
seen_papers = set()

for tech in important_word:
    num_hits = 0
    print('matching keyword: ',tech)

    pattern_offence = r'\b({})\b'.format(tech)
    print(pattern_offence)
    # Compile once per keyword instead of once per paper.
    keyword_regex = re.compile(pattern_offence, flags=re.IGNORECASE)

    start_time = time.time()

    for paper in general_filter:
        # A paper collected under an earlier keyword is never counted again,
        # so there is no need to match it a second time.
        if id(paper) in seen_papers:
            continue
        try:
            title_abstract = extract_title_abstract(paper)
            matches_key = keyword_regex.findall(title_abstract)

            if matches_key:
                num_hits += 1
                # Lower-cased, de-duplicated matched terms for manual review.
                paper['matches_key'] = set([key.lower() for key in matches_key])
                seen_papers.add(id(paper))
                offence_result.append(paper)

        except Exception as e:
            # Best effort: report the problem and carry on with the next paper.
            print(f"An error occurred: {e}")

    end_time = time.time()
    time_spent = end_time - start_time

    # Counts only papers first found by this keyword (see the skip above).
    print('number of papers found: ',num_hits)
    print(f"time spent: {time_spent:.1f} seconds")
    print('='*50+'\n')
print('total number of papers found: ',len(offence_result))

matching keyword:  poison|poisoning|poisoned
\b(poison|poisoning|poisoned)\b
number of papers found:  519
time spent: 1.5 seconds

matching keyword:  evade|evasion
\b(evade|evasion)\b
number of papers found:  325
time spent: 1.4 seconds

matching keyword:  prompt injection
\b(prompt injection)\b
number of papers found:  19
time spent: 1.4 seconds

matching keyword:  backdoor|backdoored
\b(backdoor|backdoored)\b
number of papers found:  170
time spent: 1.4 seconds

matching keyword:  jailbreak
\b(jailbreak)\b
number of papers found:  43
time spent: 1.4 seconds

matching keyword:  hijacking
\b(hijacking)\b
number of papers found:  24
time spent: 1.4 seconds

matching keyword:  extraction attacks?
\b(extraction attacks?)\b
number of papers found:  57
time spent: 1.4 seconds

matching keyword:  privacy leakage
\b(privacy leakage)\b
number of papers found:  154
time spent: 1.5 seconds

matching keyword:  data leak|data leaking|data leakage
\b(data leak|data leaking|data leakage)\b
number of

Combine the keyword list into one long regular expression and run the filtration to match papers containing any of the keywords.

Add "matches_key" to the paper data for easy manual checking.

Add "url" to give a direct link to the PDF file.

Add "publish_date" to hold the latest update date in `YYYY-MM-DD` format.

In [82]:
# Final pass: match all keyword patterns at once and annotate every matching
# paper with the matched terms, a PDF link and the date of its latest version.
offence_result = []
tech_str = '|'.join(important_word)

pattern_offence = r'\b(?:{})\b'.format(tech_str)
# Compiled once, outside the loop; matching ignores case.
offence_regex = re.compile(pattern_offence, flags=re.IGNORECASE)

# Layout of the 'created' strings as parsed below.
version_date_format = "%a, %d %b %Y %H:%M:%S %Z"

start_time = time.time()

for paper in general_filter:
    try:
        title_abstract = extract_title_abstract(paper)

        matches_key = offence_regex.findall(title_abstract)

        if matches_key:
            # Parse every version timestamp BEFORE taking the maximum.
            # Comparing the raw strings would order them alphabetically,
            # starting with the weekday name, not chronologically.
            date_object = max(
                datetime.strptime(version['created'], version_date_format)
                for version in paper['versions']
            )
            datestamp_format = date_object.strftime('%Y-%m-%d')

            paper_id = paper.get('id','')
            # Lower-cased, de-duplicated matched terms for manual review.
            paper['matches_key'] = set([key.lower() for key in matches_key])
            paper['url'] = f'https://arxiv.org/pdf/{paper_id}'
            paper['publish_date'] = datestamp_format
            offence_result.append(paper)

    except Exception as e:
        # Best effort: report the problem and carry on with the next paper.
        print(f"An error occurred: {e}")

end_time = time.time()
time_spent = end_time - start_time

print(f"time spent: {time_spent:.1f} seconds")
print('total number of papers found: ',len(offence_result))

time spent: 7.1 seconds
total number of papers found:  3682


## Output the result

Save to an Excel file for manual checking.

In [84]:
import pandas as pd

# One column per exported field, one entry per matched paper. Newlines in
# the abstract are replaced by spaces; the other fields are copied as stored.
excel = {
    'id': [paper['id'] for paper in offence_result],
    'title': [paper['title'] for paper in offence_result],
    'abstract': [paper['abstract'].replace('\n',' ') for paper in offence_result],
    'categories': [paper['categories'] for paper in offence_result],
    'matched_keyword': [paper['matches_key'] for paper in offence_result],
}

# Tabular form of the columns above.
df = pd.DataFrame(excel)

# Destination workbook, written without the DataFrame index column.
excel_file_path = 'filtered_papers_eventual.xlsx'
df.to_excel(excel_file_path, index=False)

print(f'Data exported to {excel_file_path}')


Data exported to filtered_papers_eventual.xlsx


Save to a JSON file for uploading to OpenCTI ("matches_key" is dropped because a set cannot be serialized to JSON).

In [77]:
output_json = 'final_filter_results.json'

# 'matches_key' holds a set, which json cannot serialise. Build copies of the
# records without that key instead of popping it from the originals, so the
# papers in `offence_result` keep 'matches_key' and cells that read it can
# still be re-run after this one.
json_ready = [
    {key: value for key, value in paper.items() if key != 'matches_key'}
    for paper in offence_result
]

with open(output_json,'w',encoding='utf-8') as output_file:
    json.dump(json_ready,output_file,indent=2)

print(f'Data exported to {output_json}')

Data exported to final_filter_results.json
