In [1]:
import requests
import json
import pickle
import xml.etree.ElementTree as ET
import os

def write_facet_results(facet_results, file_name):
    """Dump annotation records to ``file_name``, one JSON object per line.

    If ``file_name`` already exists the user is asked on stdin whether to
    overwrite it; any answer other than ``'y'`` leaves the file untouched.

    Args:
        facet_results: iterable of JSON-serialisable records.
        file_name: destination path.
    """
    already_present = os.path.exists(file_name)
    if already_present:
        print("File already exists. Overwrite? (y/n)")
        answer = input()
        if answer != 'y':
            return
    with open(file_name, 'w') as sink:
        # One serialised record per line (JSON Lines layout).
        sink.writelines(json.dumps(record) + '\n' for record in facet_results)


def read_facet_results(file_name):
    """Load annotation records from a JSON Lines file.

    Each non-blank line of ``file_name`` is parsed as one JSON document.
    Blank or whitespace-only lines are skipped so that a stray empty line
    (e.g. from hand-editing the file) does not abort the whole load.

    Args:
        file_name: path of the file to read.

    Returns:
        list of the decoded records, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    facet_results = []
    with open(file_name, "r") as f:
        # Iterate the handle directly instead of materialising every line first.
        for line in f:
            if not line.strip():
                continue
            facet_results.append(json.loads(line))
    return facet_results


## Query Categories for All Papers

In [3]:

# Read shard `i` of the training split into memory, one raw line per entry
# (later cells parse each line with json.loads).
i = 5
inp_path = f'../arxiv-dataset/train{i}.txt'
with open(inp_path, "r") as f:
    lines = list(f)
len(lines)


3037

In [92]:
import time

def query_domain(arxiv_id, timeout=30):
    """Fetch the category terms the arXiv API reports for one paper.

    Args:
        arxiv_id: arXiv identifier, e.g. "1810.04805".
        timeout: seconds to wait for the HTTP response. Without a timeout
            ``requests`` may block indefinitely on a stalled connection.

    Returns:
        list of the ``term`` attributes of every Atom ``<category>`` element
        in the response, in document order.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
        xml.etree.ElementTree.ParseError: if the body is not well-formed XML.
    """
    url = f'http://export.arxiv.org/api/query?id_list={arxiv_id}'
    response = requests.get(url, timeout=timeout)
    # Fail loudly on throttling/server errors instead of parsing an error page.
    response.raise_for_status()
    root = ET.fromstring(response.text)
    return [cat.attrib['term'] for cat in root.iter('{http://www.w3.org/2005/Atom}category')]
# First line of `lines` handled by this batch; earlier lines are assumed to
# have been processed by a previous run whose result is in the pickle below.
BATCH_START = 2002
# arXiv asks API clients to wait roughly 3 seconds between consecutive calls.
REQUEST_DELAY_S = 3

# Query the category list of every paper in this batch.
only_cat = []
for line in lines[BATCH_START:]:
    paper = json.loads(line)
    time.sleep(REQUEST_DELAY_S)
    only_cat.append(query_domain(paper["article_id"]))

# Bucket papers by the archive prefix of their FIRST listed category
# (the joined string starts with the first term, so only that one is tested).
ids_for_cats = { 'cs': [], 'eco': [], 'eess': [], 'math': [], 'physics': [], 'q-bio': [], 'q-fin': [], 'stat': []}
# Positions in `only_cat` are relative to BATCH_START, but the stored ids are
# later used to index the full list of papers from the same file, so shift
# them back to absolute line numbers.
for i, cat in enumerate(only_cat, start=BATCH_START):
    joined = ' '.join(cat)
    for prefix, ids in ids_for_cats.items():
        if joined.startswith(prefix):
            ids.append(i)

# Merge with the ids accumulated by earlier batches.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
# NOTE: re-running this cell appends the same batch to the pickle again.
with open('ids_for_cats.pickle', 'rb') as handle:
    ids_for_cats2 = pickle.load(handle)
length1 = sum(len(lst) for lst in ids_for_cats2.values())
length2 = sum(len(lst) for lst in ids_for_cats.values())
for k in ids_for_cats2:
    ids_for_cats2[k] = ids_for_cats2[k] + ids_for_cats[k]
length3 = sum(len(lst) for lst in ids_for_cats2.values())
assert length3 == length1 + length2

with open('ids_for_cats.pickle', 'wb') as handle:
    pickle.dump(ids_for_cats2, handle, protocol=pickle.HIGHEST_PROTOCOL)

len(only_cat)

1035

In [93]:
# Report how many paper ids each domain bucket holds, then the grand total.
domain_sizes = [(domain, len(idx)) for domain, idx in ids_for_cats2.items()]
for domain, size in domain_sizes:
    print(domain, size)
total = sum(size for _, size in domain_sizes)
print(total)

cs 88
eco 0
eess 0
math 472
physics 124
q-bio 19
q-fin 7
stat 21
731


## Annotate: query gpt-3.5-turbo to categorize each sentence in the abstract into facets

In [4]:
# Re-read shard `i` and decode every line into a paper record.
i = 5
inp_path = f'../arxiv-dataset/train{i}.txt'
only_cat = []
with open(inp_path, "r") as f:
    lines = list(f)

papers = [json.loads(line) for line in lines]


In [5]:
# Build {article_id: abstract_text} for every paper index filed under 'cs'.
with open('ids_for_cats.pickle', 'rb') as handle:
    ids_for_cats2 = pickle.load(handle)

abstracts = {
    papers[idx]['article_id']: papers[idx]['abstract_text']
    for idx in ids_for_cats2['cs']
}

In [6]:
import openai
import os
# Take the API key from the environment; it is None when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")

## Prompts for Single-sentence Classification

In [None]:

# System prompt: ask the model for exactly one facet label per sentence.
prefix_prompt = """
You are a classifier to label the text between the tag <S> and </S> into one of the four classes: background, methods, results, values. If you are uncertain, output "others".  You can only output one word from labels or "others".
"""

facet_results = []

# Only the first 30 abstracts are annotated here; one API call per sentence.
for article_id, sentences in list(abstracts.items())[:30]:
    # Entries containing "* keywords" are not sent to the model.
    for sent in (s for s in sentences if "* keywords" not in s):
        chat_messages = [
            {"role": "system", "content": prefix_prompt},
            {"role": "user", "content": sent}
        ]
        output = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=chat_messages
        )
        facet_label = output['choices'][0]['message']['content']
        usage = output.get("usage")
        record = {
            "article_id": article_id,
            "sent": sent,
            'gpt-annotation': facet_label,
            "prompt_tokens": usage.get('prompt_tokens'),
            "completion_tokens": usage.get('completion_tokens'),
            "total_tokens": usage.get('total_tokens'),
        }
        facet_results.append(record)


## Prompts for Tagging

In [8]:
# Sanity check: number of distinct abstracts collected.
num_abstracts = len(abstracts)
print(num_abstracts)

87


In [38]:
# Instruction text sent ahead of every tagged abstract (kept verbatim).
prompt = """
Task: You are a tagger to label each subtext between the tags <NUM> and </NUM> with one of four labels, where NUM is a placeholder for the numeric index of a sentence:

The labels are:
1. Background: This sentence provides context and motivation for the research by discussing previous work and highlighting the need for further investigation or a statement of the research problem.
2. Method: The setence outlines the approach used to carry out the research, including the experimental design, data collection and analysis methods, and any algorithms or models used.
3. Result: The sentence presents the findings of the research, quantitative or qualitative analysis or the main outcomes.
4. Value: The sentence discusses the broader impact and societal implications of the paper.
5. Others: If you are uncertain which class the sentence belongs to, label it as "others". 

Output includes NUM-label pairs separated by a semicolon and give only one label to each NUM, e.g., "1 Others; 2 Method".

Here is the input for tagging:

"""
# Example Input: "<0> matrix data sets are common nowadays like in biomedical imaging </0> <1> we design a fast and easy - to - implement iterative algorithm to approximate arbitrarily finely these extremal matrices</1>"
# Example Output: "1 Others; 2 Method"

facet_results = []
total_tokens = 0
# One API call per abstract; only the first 14 abstracts are tagged here.
for article_id, sentences in list(abstracts.items())[:14]:
    # Drop the <S> / </S> sentence markers.
    stripped = [sent.replace('<S>', '').replace('</S>', '') for sent in sentences]

    # Wrap each kept sentence as <k> ... </k>, k being its 0-based position in
    # the abstract; "* keywords" entries are left out but still use up an index.
    tagged_sentences = ' '.join(
        f" <{i}> {sent.strip()} </{i}> "
        for i, sent in enumerate(stripped)
        if "* keywords" not in sent
    )

    specific_prompt = prompt + tagged_sentences

    output = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": specific_prompt}]
    )
    facet_label = output['choices'][0]['message']['content']
    usage = output.get("usage")

    record = {
        "article_id": article_id,
        "sent": tagged_sentences,
        'gpt_annotation': facet_label,
        "prompt_tokens": usage.get('prompt_tokens'),
        "completion_tokens": usage.get('completion_tokens'),
        "total_tokens": usage.get('total_tokens'),
    }
    facet_results.append(record)
    total_tokens += output.get("usage").get('total_tokens')
print("Total tokens used: ", total_tokens)

In [50]:
# Reload the saved tagging run and count, per facet, how many papers have at
# least one sentence carrying that label (case-insensitive substring match).
facet_results = read_facet_results('cs5_abstract-tag.json')
num_papers = len(facet_results)
facet_counts = {'background': 0, 'method': 0, 'value': 0, 'result': 0}
total_tokens = 0
for result in facet_results:
    total_tokens += result['total_tokens']
    all_facets = result['gpt_annotation'].lower()
    print(result['gpt_annotation'].split(';'))
    for facet in facet_counts:
        if facet in all_facets:
            facet_counts[facet] += 1
print("Total tokens used: ", total_tokens)
print(f"Number of papers: {num_papers}")
print(f"Number of papers with background: {facet_counts['background']}")
print(f"Number of papers with method: {facet_counts['method']}")
print(f"Number of papers with value: {facet_counts['value']}")
print(f"Number of papers with result: {facet_counts['result']}")

['0 Background', ' 1 Method', ' 2 Value']
['0 Background', ' 1 Background', ' 2 Method', ' 3 Others', ' 4 Result', ' 5 Others', ' 6 Others', ' 7 Method', ' 8 Background', ' 9 Method', ' 10 Background', ' 11 Method', ' 12 Method', ' 13 Result']
['0 Background', ' 1 Method', ' 2 Result', ' 3 Result', ' 4 Result', ' 5 Result', ' 6 Result', ' 7 Result', ' 8 Result', ' 9 Result', ' 10 Result', ' 11 Result.']
['0 Others', ' 1 Background', ' 2 Method', ' 3 Result', ' 4 Method', ' 5 Background', ' 6 Result', ' 7 Value.']
['0 Background', ' 1 Method', ' 2 Result', ' 3 Method', ' 4 Result']
['0 Background', ' 1 Background', ' 2 Background', ' 3 Method', ' 4 Method', ' 5 Value']
['0 Background', ' 1 Method', ' 2 Result', ' 3 Others']
['0 Method', ' 1 Result', ' 2 Background', ' 3 Result', ' 4 Result', ' 5 Result', ' 6 Result']
['0 Others', ' 1 Background', ' 2 Method', ' 3 Result', ' 4 Result', ' 5 Value', ' 6 Result.']
['0 Others', ' 1 Background', ' 2 Method', ' 3 Result', ' 4 Others']
['0 Othe

In [150]:
# Show the model's label for one record next to the full source abstract.
# NOTE(review): reads the 'gpt-annotation' key (hyphen), i.e. records produced
# by the single-sentence classification loop — confirm `facet_results` holds those.
i = 19
record = facet_results[i]
id = record['article_id']
print(record['gpt-annotation'])
print(abstracts[id])

others
['<S> signals sparse in a transformation domain can be recovered from a reduced set of randomly positioned samples by using compressive sensing algorithms . </S>', '<S> simple reconstruction algorithms are presented in the first part of the paper . </S>', '<S> the missing samples manifest themselves as a noise in this reconstruction . </S>', '<S> once the reconstruction conditions for a sparse signal are met and the reconstruction is achieved , the noise due to missing samples does not influence the results in a direct way . </S>', '<S> it influences the possibility to recover a signal only . </S>', '<S> additive input noise will remain in the resulting reconstructed signal . </S>', '<S> the accuracy of the recovery results is related to the additive input noise . </S>', '<S> simple derivation of this relation is presented . </S>', '<S> if a reconstruction algorithm for a sparse signal is used in the reconstruction of a nonsparse signal then the noise due to missing samples will