# Retrieve papers from the ACL Anthology

## Install the package

In [2]:
# Uncomment the next line to install the package (in Jupyter, the `%pip`
# magic can be used instead of `! pip` to target the kernel's environment).
# ! pip install acl-anthology-py

## Build a dataframe of the papers that contain the keywords

In [3]:
from acl_anthology import Anthology
from datetime import datetime
from tqdm import tqdm
import re
import pandas as pd 

# --- Search settings ---------------------------------------------------------
# Number of publication years to look back over.
n_year = 4

# When True, workshop papers are discarded by the filtering cell.
bool_filter_ws = True

# Terms to search for. Add more entries to match any of several keywords,
# e.g. ["stance", "argument"].
keywords = ["stance"]

# A single case-insensitive regex that matches any keyword as a whole word.
# Each keyword is escaped so it is always treated as literal text.
escaped_keywords = [re.escape(keyword) for keyword in keywords]
pattern = re.compile(rf"\b(?:{'|'.join(escaped_keywords)})\b", flags=re.IGNORECASE)

# Build the Anthology object through the library's `from_repo()` constructor.
anthology = Anthology.from_repo()

# Publication years to keep, in ascending order: the current year and the
# (n_year - 1) years before it.
current_year = datetime.now().year
first_year = current_year - n_year + 1
last_n_years = list(range(first_year, current_year + 1))

In [32]:
# Collect one row per paper published in `last_n_years` whose title or abstract
# matches `pattern`. The first tuple is the header row: the DataFrame cell
# reads the column names from index 0 and the data from index 1 onwards.
matching_papers = [('id', 'title', 'year', 'abstract', 'venues', 'url')]

# Substrings searched for in the PDF file name to recognise main-track volumes.
# NOTE(review): 'cl-' is also a substring of names such as 'acl-long' or
# 'naacl-long', so it matches those as well — confirm this is the intent.
list_mains = ['cl-', 'emnlp', 'coling', 'lrec', 'ijcnlp', 'findings']

for paper in tqdm(anthology.papers()):
    # Cheap year test first, so the regex only runs on recent papers.
    if int(paper.year) not in last_n_years:
        continue

    # Title and abstract are stored lower-cased in the resulting rows.
    title = str(paper.title).lower()
    abstract = str(paper.abstract).lower()
    if not (pattern.search(title) or pattern.search(abstract)):
        continue

    year = str(paper.year)
    list_venues = [event.id for event in paper.get_events()]

    # Last path component of the PDF link (e.g. '2021.acl-long.126.pdf').
    # A paper without a PDF reference gets an empty name instead of raising
    # AttributeError; with the workshop filter enabled such a paper is dropped
    # because no main-track marker can be found in an empty name.
    url = paper.pdf.url.split('/')[-1] if paper.pdf is not None else ''

    # Keep the paper when filtering is disabled, or when it is not attached to
    # the generic workshop event of its year AND its file name contains one of
    # the main-track markers.
    not_ws = not bool_filter_ws or (
        f'ws-{year}' not in list_venues
        and any(event in url for event in list_mains)
    )

    if not_ws:
        matching_papers.append((paper.id, title, year, abstract, list_venues, url))

100470it [00:02, 47402.91it/s]


In [33]:
# The first tuple of `matching_papers` holds the column names; every
# following tuple is one paper.
header, *rows = matching_papers
df = pd.DataFrame(rows, columns=header)
df

Unnamed: 0,id,title,year,abstract,venues,url
0,126,syntopical graphs for computational argumentat...,2021,approaches to computational argumentation task...,"[ijcnlp-2021, acl-2021]",2021.acl-long.126.pdf
1,127,stance detection in covid-19 tweets,2021,the prevalence of the covid-19 pandemic in day...,"[ijcnlp-2021, acl-2021]",2021.acl-long.127.pdf
2,128,topic-aware evidence reasoning and stance-awar...,2021,fact verification is a challenging task that r...,"[ijcnlp-2021, acl-2021]",2021.acl-long.128.pdf
3,1,advances in debating technologies: building ai...,2021,"the tutorial focuses on debating technologies,...","[ijcnlp-2021, acl-2021]",2021.acl-tutorials.1.pdf
4,147,learning from revisions: quality assessment of...,2021,assessing the quality of arguments and of the ...,[eacl-2021],2021.eacl-main.147.pdf
...,...,...,...,...,...,...
136,1385,the impact of stance object type on the qualit...,2024,stance as an expression of an author’s standpo...,"[lrec-2024, coling-2024]",2024.lrec-main.1385.pdf
137,1396,the role of creaky voice in turn taking and th...,2024,recent advancements in spontaneous text-to-spe...,"[lrec-2024, coling-2024]",2024.lrec-main.1396.pdf
138,119,p3sum: preserving author’s perspective in news...,2024,"in this work, we take a first step towards des...",[naacl-2024],2024.naacl-long.119.pdf
139,293,emona: event-level moral opinions in news arti...,2024,most previous research on moral frames has foc...,[naacl-2024],2024.naacl-long.293.pdf


In [35]:
# Save the table as a comma-separated file whose name is built from the
# keywords and the number of years, e.g. 'stance-last4years.csv'.
csv_name = f"{'-'.join(keywords)}-last{n_year}years.csv"
df.to_csv(csv_name, sep=',')

## Others 

### Generate a BibTeX entry (useful for Mendeley)

You can adapt the code below to your needs.

In [36]:
# `paper` is the variable left over from the filtering loop, i.e. the last
# paper it iterated over — not necessarily one of the matching papers.
bibtex_entry = paper.to_bibtex(with_abstract=False)
bibtex_entry

'@inproceedings{chancharoen-etal-1999-pattern,\n    title = "Pattern-based Machine Translation for {E}nglish-{T}hai",\n    author = "Chancharoen, Kaewchai  and\n      Tannin, Nisanad  and\n      Sirinaovakul, Booncharoen",\n    editor = "Wang, Jhing-Fa  and\n      Wu, Chung-Hsien",\n    booktitle = "Proceedings of the 13th Pacific Asia Conference on Language, Information and Computation",\n    month = feb,\n    year = "1999",\n    address = "National Cheng Kung University, Taiwan, R.O.C.",\n    publisher = "National Cheng Kung University, Taiwan, R.O.C.",\n    url = "https://aclanthology.org/Y99-1036/",\n    doi = "http://hdl.handle.net/2065/12135",\n    pages = "329--336"\n}'