In [23]:
import numpy as np
import pandas as pd
import os
import bibtexparser

In [None]:
with open('./anthology+abstracts.bib', 'r') as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)

# Extracting entries

entries = bib_database.entries

In [11]:
bibtex_str = """
@comment{
    This is my example comment.
}

@ARTICLE{Cesar2013,
  author = {Jean César},
  title = {An amazing title},
  year = {2013},
  volume = {12},
  pages = {12--23},
  journal = {Nice Journal}
}
"""

In [None]:
import bibtexparser
file_path = os.path.join("./", "anthology+abstracts.bib")
library = bibtexparser.parse_file(file_path)
entries = library.entries

In [20]:
entries[1]
# 2023.wsc-csdh.1/
# https://aclanthology.org/2023.wsc-csdh.1/

Entry(entry_type=`inproceedings`, key=`krishna-etal-2023-neural`, fields=`[Field(key=`title`, value=`Neural Approaches for Data Driven Dependency Parsing in {S}anskrit`, start_line=11), Field(key=`author`, value=`Krishna, Amrith  and
      Gupta, Ashim  and
      Garasangi, Deepak  and
      Sandhan, Jeevnesh  and
      Satuluri, Pavankumar  and
      Goyal, Pawan`, start_line=12), Field(key=`booktitle`, value=`Proceedings of the Computational {S}anskrit {\&} Digital Humanities: Selected papers presented at the 18th World {S}anskrit Conference`, start_line=18), Field(key=`month`, value=`jan`, start_line=19), Field(key=`year`, value=`2023`, start_line=20), Field(key=`address`, value=`Canberra, Australia (Online mode)`, start_line=21), Field(key=`publisher`, value=`Association for Computational Linguistics`, start_line=22), Field(key=`url`, value=`https://aclanthology.org/2023.wsc-csdh.1`, start_line=23), Field(key=`pages`, value=`1--20`, start_line=24)]`, start_line=10)

In [39]:
lst = str(entries[5]).split("`url` = ")[-1].split("`")
[ele for ele in lst if ele][0]

'https://aclanthology.org/2023.wsc-csdh.5'

### PDF to text

In [150]:
from PyPDF2 import PdfReader

# open the PDF file
reader = PdfReader('/Users/quert/Documents/GitHub/acl-research/papers/2023.acl-long.2.pdf', 'rb')

texts = []
for idx in range(len(reader.pages)):
    page = reader.pages[idx]
    extracted = page.extract_text().split("\n")
    texts.extend(extracted)
    
# find the idx of "Related Work", "Datsets"
id_related_work = []
for idx in range(len(texts)):
    list_ele = None
    try:
        if ['Related', 'Work'] == texts[idx].split()[-2:]:
            id_related_work.append(idx)
    except: pass
# find the idx of "Abstract"

id_abstract = []
for idx in range(len(texts)):
    try:
        if ['Abstract'] == texts[idx].split()[-2:]:
            id_abstract.append(idx)
    except: pass

# find the idx of "Introduction"

id_intro = []
for idx in range(len(texts)):
    try:
        if ['Introduction'] == texts[idx].split()[-1:]:
            id_intro.append(idx)
    except: pass

# find the idx of "Conclusion"

id_conclusion = []
for idx in range(len(texts)):
    try:
        if ['Conclusion'] == texts[idx].split()[-1:] or "Conclusion" in texts[idx].split()[-4:]:
            id_conclusion.append(idx)
    except: pass

# find the idx of "Reference"
id_reference = []
for idx in range(len(texts)):
    try:
        if ['Reference'] == texts[idx].split()[-1]:
            id_reference.append(idx)
    except: pass


In [151]:
id_abstract, id_intro, id_related_work, id_conclusion, id_reference

([9], [43], [165], [754], [])

In [None]:
# Select the top-N articles from "Related Work"
from refextract import extract_references_from_file
references = extract_references_from_file('/Users/quert/Documents/GitHub/acl-research/papers/2023.acl-long.2.pdf')
print(references[0])


### Scrape texts from ACL site

In [141]:
import requests
from bs4 import BeautifulSoup

urls = [f"https://aclanthology.org/2023.acl-long.{idx}" for idx in range(1, 41)]

abstract_texts = []
error_ids = []


headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"}
r = requests.get(url=url, params={'param':'1'}, headers={'Connection':'close'})



for idx in range(len(urls)):
    response = requests.get(url=urls[idx], params={'param':'1'}, headers={'Connection':'close'})
    try:
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            abstract_tag = soup.find('div', {'class': 'acl-abstract'})
            abstract_text = str(abstract_tag).split("<span>")[1]
            abstract_texts.append(abstract_text)
    except:
        error_ids.append(idx)
abstract_texts = [abstract_texts[idx].rstrip("</span></div>") for idx in range(len(abstract_texts))]

### Analyze data from ACL OCL

In [4]:
import pandas as pd
dataset = pd.read_csv("./ACT2_dataset.tsv", sep="\t", index_col="unique_id")

In [48]:
dataset.columns

# citing abstract with id=7
# cited abstract with id=10

Index(['core_id', 'citation_offset', 'total_doc_length', 'section_info',
       'citing_title', 'citing_author', 'citing_publication_info',
       'citing_abstract', 'cited_title', 'cited_author', 'cited_abstract',
       'cited_doi', 'cited_publication_date', 'cited_publication_info',
       'citation_context', 'self_citation', 'direct_citations', 'co_mentions',
       'citation_class_label', 'citation_influence_label'],
      dtype='object')

In [77]:
cited_abstracts = dataset["cited_abstract"].to_list()
citing_abstracts = dataset["citing_abstract"].to_list()

cleaned_cited_abstracts, cleaned_citing_abstracts, cleaned_id = [], [], []
for i in range(4000):
    if str(cited_abstracts[i]) != "nan" and str(citing_abstracts[i]) != "nan":
        cleaned_cited_abstracts.append(cited_abstracts[i])
        cleaned_citing_abstracts.append(citing_abstracts[i])
        cleaned_id.append(i)
        
        
len(cleaned_cited_abstracts), len(cleaned_citing_abstracts)

(3032, 3032)

In [281]:
citation_contexts = dataset["citation_context"].tolist()
extracted_citation_contexts = []
for i in cleaned_id:
    citation_context = citation_contexts[i]
    extracted_citation_contexts.append(citation_context)

In [269]:
cleaned_citing_abstracts[0], cleaned_cited_abstracts[0]

("As part of an interdisciplinary project on the environmental history of the Viennese Danube, the past river landscape was reconstructed. This article describes the different types of historical sources used for the GIS-based reconstruction, the underlying methodological approach and its limitations regarding reliability and information value. The reconstruction was based on three cornerstones: (1) the available historical sources; (2) knowledge about morphological processes typical for the Austrian Danube prior to regulation; and (3) the interpretation of past hydraulic measures with respect to their effectiveness and their impact on the river's behaviour. We compiled ten historical states of the riverscape step-by-step going backwards in time to the early 16th century. After one historical situation had been completed, we evaluated its relevance for the temporally younger situations and whether corrections would have to be made. Such a regressive-iterative approach allows for perman

In [238]:
import openai
from transformers import GPT2TokenizerFast

def call_gpt(new_abstract, old_abstract):
    openai.api_key = "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"
    inputs_for_gpt = f"""
    I have two versions of abstracts from two papers, newer and older ones:
    ### Newer Version
    {new_abstract}

    ### Older Vesion
    {old_abstract}
    After reviewing the provided abstracts, please identify any improvements or advancements made in the newer paper compared to the older paper ,and highlight any new methods or techniques proposed in the newer paper that were not present in the older paper.
"""
    
    # tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    # length = len(tokenizer(inputs_for_gpt)["input_ids"])
    inputs_for_gpt = " ".join(inputs_for_gpt.split()[:3000])
        
    completion = openai.ChatCompletion.create(
         model = "gpt-3.5-turbo",
         messages = [
             {"role": "user", "content": inputs_for_gpt}
         ]
     )
    response = completion.choices[0].message.content
    # if "<"+response.split("<")[-1].strip() == "<"+paragraph.split("<")[-1].strip(): response = response 
    # else: response = response + " <"+paragraph.split("<")[-1].strip()
    return str(response)

In [None]:
import time
responses = []
count = 0
for idx in range(2950, 3000): # 0-199t
    response = call_gpt(cleaned_citing_abstracts[idx], cleaned_cited_abstracts[idx])
    responses.append(response)
    count += 1
    if count%5==0: time.sleep(60)
pd.DataFrame({"responses": responses}).to_csv("./gap_2950_2999.csv")


In [247]:
pd.DataFrame({"citing_abstracts": cleaned_citing_abstracts, "cited_abstracts": cleaned_cited_abstracts}).to_csv("./data/combined_abstracts_wo_res.csv")

In [260]:
len(dataset)

4000

In [278]:
len(cleaned_cited_abstracts), len(set(cleaned_cited_abstracts))

(3032, 2939)

### Add citation contexts to dataset

In [283]:
data_w_meta = pd.read_csv("/Users/quert/Documents/GitHub/acl-research/data/abstract_w_meta.csv")

In [287]:
citing_abstracts = data_w_meta["citing_abstracts"].tolist()
cited_abstracts = data_w_meta["cited_abstracts"].tolist()
meta_info = data_w_meta["meta"].tolist()

In [288]:
pd.DataFrame({"citing_abstracts": citing_abstracts, "cited_abstracts": cited_abstracts, "meta": meta_info, "citation_contexts": extracted_citation_contexts}).to_csv("/Users/quert/Documents/GitHub/acl-research/data/abstract_w_all.csv")

In [289]:
data_w_all = pd.read_csv("/Users/quert/Documents/GitHub/acl-research/data/abstract_w_all.csv")
len(data_w_all)

3032

In [290]:
data_w_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3032 entries, 0 to 3031
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         3032 non-null   int64 
 1   citing_abstracts   3032 non-null   object
 2   cited_abstracts    3032 non-null   object
 3   meta               3032 non-null   object
 4   citation_contexts  3032 non-null   object
dtypes: int64(1), object(4)
memory usage: 118.6+ KB
