In [1]:
import requests
import json
import pickle
import xml.etree.ElementTree as ET
import re
import openai
import os
import pandas as pd
import time

In [32]:
def write_facet_results(facet_results, file_name):
    """Save facet results to ``file_name`` in JSON Lines form.

    Each element of ``facet_results`` is serialised with ``json.dumps`` and
    written on its own line.  When ``file_name`` already exists the user is
    asked on stdin whether to overwrite it; any answer other than exactly
    ``'y'`` leaves the existing file untouched and returns immediately.
    """
    already_exists = os.path.exists(file_name)
    if already_exists:
        print("File already exists. Overwrite? (y/n)")
        answer = input()
        if answer != 'y':
            return

    with open(file_name, 'w') as output_file:
        # one JSON document per line
        output_file.writelines(json.dumps(record) + '\n' for record in facet_results)

def read_facet_results(file_name):
    """Load facet results from a JSON Lines file.

    Args:
        file_name: path of a text file holding one JSON document per line
            (the format produced by ``write_facet_results``).

    Returns:
        A list with the decoded object of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    facet_results = []
    with open(file_name, "r") as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line left by manual editing)
            # carry no record and would make json.loads fail, so skip them.
            if not line.strip():
                continue
            facet_results.append(json.loads(line))
    return facet_results

def annotate_abstracts_by_ssc(abstracts, max_examples=30):
    """Label abstract sentences one at a time with gpt-3.5-turbo.

    For the first ``max_examples`` entries of ``abstracts`` (a mapping of
    article id -> list of sentences), every sentence that does not contain
    "* keywords" is sent as the user message of its own chat completion, with
    the classification instructions as the system message.

    Returns a list with one dict per classified sentence holding the article
    id, the sentence, the model's reply ('gpt-annotation') and the token
    counts reported in the response's "usage" field.
    """
    print("Annotating abstracts by single sentence classification...")
    print('Total number of examples: ', len(abstracts))

    prefix_prompt = """
    You are a classifier to label the text between the tag <S> and </S> into one of the four classes: background, methods, results, values. If you are uncertain, output "others".  You can only output one word from labels or "others".
    """

    selected = list(abstracts.items())[:max_examples]

    facet_results = []
    for article_id, sentences in selected:
        # keyword lines are not sentences of the abstract, so they are not classified
        to_classify = (sent for sent in sentences if "* keywords" not in sent)
        for sent in to_classify:
            chat_messages = [
                {"role": "system", "content": prefix_prompt},
                {"role": "user", "content": sent},
            ]
            output = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=chat_messages,
            )
            record = {
                "article_id": article_id,
                "sent": sent,
                'gpt-annotation': output['choices'][0]['message']['content'],
            }
            for counter in ("prompt_tokens", "completion_tokens", "total_tokens"):
                record[counter] = output.get("usage").get(counter)
            facet_results.append(record)
    return facet_results


def annotate_abstracts_by_tagging(abstracts, max_examples=14):
    """Label all sentences of an abstract with a single gpt-3.5-turbo request.

    Each abstract is rewritten as ``<0> ... </0> <1> ... </1> ...`` (0-based
    sentence indices), appended to the tagging instructions and sent as one
    user message.

    Args:
        abstracts: mapping of article id -> list of sentence strings.  The
            literal markers ``<S>`` / ``</S>`` are removed from each sentence
            and sentences containing "* keywords" are dropped.
        max_examples: how many abstracts (in mapping order) to annotate.
            Defaults to 14, the previously hard-coded limit; ``None``
            annotates every abstract.

    Returns:
        A list with one dict per abstract: the article id, the cleaned
        sentences ('sent'), the raw model reply ('gpt_annotation') and the
        token counts reported in the response's "usage" field.
    """
    prompt = """
        Task: You are a tagger to label each subtext between the tags <NUM> and </NUM> with one of four labels, where NUM is a placeholder for the numeric index of a sentence:

        The labels are:
        1. Background: This sentence provides context and motivation for the research by discussing previous work and highlighting the need for further investigation or a statement of the research problem.
        2. Method: The setence outlines the approach used to carry out the research, including the experimental design, data collection and analysis methods, and any algorithms or models used.
        3. Result: The sentence presents the findings of the research, quantitative or qualitative analysis or the main outcomes.
        4. Value: The sentence discusses the broader impact and societal implications of the paper.
        5. Others: If you are uncertain which class the sentence belongs to, label it as "others". 

        Output includes NUM-label pairs separated by a semicolon and give only one label to each NUM, e.g., "1 Others; 2 Method".

        Here is the input for tagging:

    """
    # Example Input: "<0> matrix data sets are common nowadays like in biomedical imaging </0> <1> we design a fast and easy - to - implement iterative algorithm to approximate arbitrarily finely these extremal matrices</1>"
    # Example Output: "1 Others; 2 Method"

    facet_results = []
    total_tokens = 0
    # `article_id` rather than `id`, so the builtin is not shadowed
    for article_id, sentences in list(abstracts.items())[:max_examples]:
        # remove <S> and </S> tags, and skip the keyword lines
        plain_sentences = [
            sent.replace('<S>', '').replace('</S>', '')
            for sent in sentences
            if "* keywords" not in sent
        ]

        # wrap each sentence in <NUM> ... </NUM>, where NUM is 0, 1, 2, ...
        tagged_sentences = ' '.join(
            f" <{i}> {sent.strip()} </{i}> " for i, sent in enumerate(plain_sentences)
        )

        specific_prompt = prompt + tagged_sentences

        output = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": specific_prompt}
            ]
        )
        gpt_annotation = output['choices'][0]['message']['content']
        usage = output.get("usage")

        facet_results.append({
            "article_id": article_id,
            "sent": plain_sentences,
            'gpt_annotation': gpt_annotation,
            "prompt_tokens": usage.get('prompt_tokens'),
            "completion_tokens": usage.get('completion_tokens'),
            "total_tokens": usage.get('total_tokens'),
        })
        total_tokens += usage.get('total_tokens')
    print("Total tokens used: ", total_tokens)
    return facet_results

## GPT Annotation

In [11]:
def load_abstracts_from_arxiv(category='cs', split=5, data_dir='../arxiv-dataset', ids_path='ids_for_cats.pickle'):
    """Collect the abstracts of all papers of one category from an arXiv split.

    Args:
        category: key into the category index loaded from ``ids_path``.
        split: number of the training shard; the file read is
            ``{data_dir}/train{split}.txt`` (one JSON paper per line).
        data_dir: directory holding the shard files.
        ids_path: pickle file containing a mapping of category -> list of
            line indices into the shard.

    Returns:
        dict mapping each selected paper's 'article_id' to its
        'abstract_text'.

    Raises:
        KeyError: if ``category`` is not in the index, or a paper lacks one of
            the two fields.
        IndexError: if an index points past the end of the shard.
    """
    inp_path = f'{data_dir}/train{split}.txt'
    with open(inp_path, "r") as f:
        # Every line is parsed (no skipping) because the category index
        # refers to papers by their line position in this file.
        papers = [json.loads(line) for line in f]

    # NOTE(security): pickle.load can execute arbitrary code while loading.
    # Only point `ids_path` at an index file you generated yourself.
    with open(ids_path, 'rb') as handle:
        ids_for_cats = pickle.load(handle)

    # aggregate abstracts for papers in the category
    abstracts = {}
    for paper_index in ids_for_cats[category]:
        paper = papers[paper_index]
        abstracts[paper['article_id']] = paper['abstract_text']
    return abstracts


In [39]:
openai.api_key = os.getenv("OPENAI_API_KEY")


# load abstract
abstracts = load_abstracts_from_arxiv(category='cs')
print(f"The number of abstracts in the category: {len(abstracts)}")

# annotate abstracts by tagging
facet_results = annotate_abstracts_by_tagging(abstracts)


# post processing: extract the facets from the GPT textual outputs
for result in facet_results:
    # Remove echoed closing tags </0>, </1>, ..., </12>.  `\d+` (not `\d`) so that
    # indices >= 10 are removed as a whole instead of leaving "/10" behind.
    # Opening tags only lose their angle brackets, so "<3> Method" becomes
    # "3Method" and still carries its sentence index.
    result['gpt_annotation'] = re.sub(r'</\d+>', '', result['gpt_annotation']).replace('<', '').replace('>', '').replace(' ', '').replace('.', '')

    # split facets by ; and drop the empty strings left by stray semicolons
    facets = [facet for facet in result['gpt_annotation'].split(';') if facet != '']

    new_facets = []
    for i, facet in enumerate(facets):
        # every item is expected to start with its 0-based sentence index
        assert str(i) in facet, f"expected sentence index {i} in {facet!r}"
        new_facets.append(facet.replace(str(i), '').strip())

    # Report, but tolerate, a label count that differs from the sentence count.
    # An explicit check replaces the former assert wrapped in a bare `except:`,
    # which also swallowed unrelated errors such as KeyboardInterrupt.
    if len(new_facets) != len(result['sent']):
        print(len(new_facets), len(result['sent']))
        print(result['sent'])
        print(new_facets)
    print(new_facets)

    result['facets'] = new_facets

['Background', 'Method', 'Result']
['Others', 'Background', 'Background', 'Result', 'Method', 'Result', 'Others']
['Background', 'Background', 'Background', 'Method', 'Method', 'Result', 'Result']
['Background', 'Method', 'Method', 'Result', 'Result', 'Result', 'Value']
['Background', 'Background', 'Result', 'Method', 'Method', 'Method', 'Result']
['Background', 'Method', 'Method', 'Method', 'Background', 'Result', 'Result', 'Result']
['Others', 'Result']
['Background', 'Background', 'Method']
['Background', 'Result', 'Result', 'Method']
['Background', 'Background', 'Method', 'Result', 'Result', 'Result', 'Result', 'Value']


In [None]:

faceted_results = read_facet_results("cs5_abstract-tag.json")

# this is used temporarily for old versions of the facet results
for result in faceted_results:
    # old result files still contain the keyword lines; drop them
    result['sent'] = [sent for sent in result['sent'] if "* keywords" not in sent]

    # split the raw reply into "<index> <label>" items, ignoring empty pieces
    # produced by a trailing semicolon
    facets = [
        facet
        for facet in result['gpt_annotation'].replace('.', '').split(';')
        if facet.strip() != ''
    ]

    new_facets = []
    for i, facet in enumerate(facets):
        # every item is expected to carry its 0-based sentence index
        assert str(i) in facet, f"expected sentence index {i} in {facet!r}"
        new_facets.append(facet.replace(str(i), '').strip())

    # Report, but tolerate, a label count that differs from the sentence count
    # (explicit check instead of an assert inside a bare `except:`).
    if len(new_facets) != len(result['sent']):
        print(len(new_facets), len(result['sent']))
        print(result['sent'])
        print(new_facets)

    result['facets'] = new_facets


## Human Annotation

In [None]:
# load abstracts
file_dir = "annotation/human_annotation"
# Join directory and file name with a path separator; plain concatenation
# produced "annotation/human_annotationtest_abstracts.json".
with open(os.path.join(file_dir, 'test_abstracts.json'), 'r') as f:
    abstracts = json.load(f)

In [None]:
def read_abstracts_and_human_annotation(FILE_NAME):
    """Read a human-annotation spreadsheet and return sentences with facets.

    The sheet is loaded with ``pd.read_excel``.  Sheets with exactly 9 or 14
    columns get their columns renamed positionally to the canonical names
    used below; other sheets are used with their existing column names.

    The 'abstract' cell is expected to contain sentences wrapped as
    ``<S0> ... </S0> <S1> ... </S1>``.  Each facet column lists the ids
    (``S0``, ``S1``, ...) of the sentences carrying that facet.

    Args:
        FILE_NAME: path of the Excel file.

    Returns:
        A tuple ``(abstracts, facets_for_abstracts, orig_abstracts)`` of
        dicts keyed by ``str(article_id)``:
        - abstracts: list of sentence strings with the tags removed;
        - facets_for_abstracts: for every sentence, the list of facet names
          whose column mentions it, or ``["Others"]`` when none does;
        - orig_abstracts: the untouched 'abstract' cell.
    """
    # read the merged dataframe
    df = pd.read_excel(FILE_NAME)
    # print the number of rows and columns in the merged dataframe
    print(len(df))
    print(len(df.columns))

    orig_columns = df.columns
    if len(df.columns) == 9:
        df.columns = ['article_id', 'abstract', 'Background', 'Purpose', 'Method', 'Result', 'Conclusions', 'Limitation', 'Others']
    if len(df.columns) == 14:
        df.columns = ['dataset_type', 'row_id', 'article_id', 'category', 'abstract', 'Background', 'Purpose', 'Method', 'Result', 'Conclusions', 'Limitation', 'Others',  'Errors', 'TooLong']
    for orig_column, column in zip(orig_columns, df.columns):
        print(orig_column, '->', column)

    facet_tags = ['Background', 'Purpose', 'Method', 'Result', 'Conclusions', 'Limitation', 'Others']

    abstracts = {}
    orig_abstracts = {}
    facets_for_abstracts = {}
    for _, row in df.iterrows():
        # split the abstract into sentences according to the tags <S0>, <S1>, <S2>, ...
        sentences = []
        for chunk in row['abstract'].split("<S")[1:]:
            content = chunk[chunk.find(">") + 1:]  # text after the opening tag
            end = content.find("</S")
            # without a closing tag keep the whole text; slicing with -1 would
            # silently drop its last character
            sentences.append(content if end == -1 else content[:end])

        # Parse every facet cell once into the exact set of ids it lists.
        # A plain substring test would treat "S1" as present in "S10".
        ids_by_tag = {}
        for tag in facet_tags:
            cell = row[tag]
            # empty spreadsheet cells are NaN (not a string) and list no sentence
            if isinstance(cell, str):
                ids_by_tag[tag] = set(re.findall(r'S\d+', cell))

        # extract the facets of each sentence, in the fixed order of `facet_tags`
        facets = []
        for i in range(len(sentences)):
            identifier = 'S' + str(i)
            tags = [tag for tag in facet_tags if identifier in ids_by_tag.get(tag, ())]
            facets.append(tags or ["Others"])

        article_id = str(row['article_id'])
        abstracts[article_id] = sentences
        orig_abstracts[article_id] = row['abstract']
        facets_for_abstracts[article_id] = facets
    return abstracts, facets_for_abstracts, orig_abstracts

# file_name = 'annotation/gpt_vs_human_annotation/test'
# abstracts, facets_for_abstracts, orig_abstracts = read_abstracts_and_human_annotation(file_name+".xlsx")
# with open(f'{file_name}_abstracts.json', 'w') as fp:
#     json.dump(abstracts, fp)

# with open(f'{file_name}_facets.json', 'w') as fp:
#     json.dump(facets_for_abstracts, fp)


In [None]:

# Generate files for Error Analysis

# generating a random number from 0 to 499
# import random
# random.seed(42)

# file_name = "error_analysis"
# with open(file_name+".txt", "w") as f:
#     for i in range(10):
#         random_number = random.randint(0, 499)
#         f.write('Example '+str(random_number))
#         f.write("\n")
#         f.write(list(orig_abstracts.values())[random_number])
#         f.write("\n")
#         f.write("  ".join([str(i) +' ' +' '.join(sublst) for i, sublst in enumerate(list(facets_for_abstracts.values())[random_number])]))
#         f.write("\n")
#         f.write("\n")

# NOTE: `orig_abstracts` and `facets_for_abstracts` must already be in scope; they are
# the third and second values returned by read_abstracts_and_human_annotation().
# (The unfinished statement `facets_for_abstracts = ` that used to open this cell was a
# syntax error and has been removed.)
file_name = "error_analysis_test"
example_abstracts = list(orig_abstracts.values())
example_facets = list(facets_for_abstracts.values())
with open(file_name+".txt", "w") as f:
    # dump the first 10 examples (fewer if there are not that many)
    for example_index in range(min(10, len(example_abstracts))):
        f.write('Example '+str(example_index))
        f.write("\n")
        f.write(example_abstracts[example_index])
        f.write("\n")
        # one "<sentence index> <facet> <facet> ..." group per sentence
        f.write("  ".join([str(i) +' ' +' '.join(sublst) for i, sublst in enumerate(example_facets[example_index])]))
        f.write("\n")
        f.write("\n")
    



In [58]:
# Load the human labels; values are kept in file order so they can be matched
# to `facet_results` by position.
with open('abstract_tagging/test_facets.json', 'r') as f:
    ground_truth = json.load(f)

ground_truth = list(ground_truth.values())

# compare the facets
gpt_facets = [result['facets'] for result in facet_results]
total = 0
correct = 0
error_analysis = []
for i, abstract_facets in enumerate(gpt_facets):
    for j, facet in enumerate(abstract_facets):
        # remove Purpose, Conclusions, Limitation; a sentence left without any
        # label counts as "Others"
        kept = [tag for tag in ground_truth[i][j] if tag not in ['Purpose', 'Conclusions', 'Limitation']]
        ground_truth[i][j] = kept or ["Others"]

        if facet not in ground_truth[i][j]:
            # keep (sentence, GPT label, human labels) for manual inspection
            error_analysis.append((list(abstracts.values())[i][j], facet, ground_truth[i][j]))
        else:
            correct += 1

        total += 1
print(correct, total, correct/total)

31 56 0.5535714285714286


In [59]:
error_analysis

[('For the pedestrian observer, financial markets look completely random with erratic and uncontrollable behavior.',
  'Others',
  ['Background']),
 ('At first approximation the difference between real price changes and the random walk model is too small to be detected using traditional time series analysis.',
  'Background',
  ['Others']),
 (' However, we show in the following that this difference between real financial time series and random walks, as small as it is, is detectable using modern statistical multivariate analysis, with several triggers encoded in trading systems.',
  'Result',
  ['Method']),
 (' We develop Demand Response Management (DRM) plans that clearly spell out the maximum duration as well as maximum severity of inconvenience.',
  'Method',
  ['Others']),
 (' However, the peak load that can be reduced is diminishing with the increas e in number of states.',
  'Result',
  ['Others']),
 (' These observations can serve as useful guidelines for developing appropriate 

In [46]:
# write_facet_results(facet_results, "abstract_tagging/test_facets_from_gpt.json")


In [55]:
# Corpus-level statistics: token usage and the share of papers whose raw GPT
# reply mentions each facet at least once (case-insensitive substring match).
facet_results = read_facet_results('cs5_abstract-tag.json')
num_papers = len(facet_results)

total_tokens = sum(result['total_tokens'] for result in facet_results)
annotations = [result['gpt_annotation'].lower() for result in facet_results]

with_background = sum('background' in annotation for annotation in annotations)
with_method = sum('method' in annotation for annotation in annotations)
with_value = sum('value' in annotation for annotation in annotations)
with_result = sum('result' in annotation for annotation in annotations)

print("Total tokens used: ", total_tokens)
print(f"Number of papers: {num_papers}")
print(f"Number of papers with background: {with_background/num_papers}")
print(f"Number of papers with method: {with_method/num_papers}")
print(f"Number of papers with value: {with_value/num_papers}")
print(f"Number of papers with result: {with_result/num_papers}")
# {result[0]: result[1:].strip() for result in results}

Total tokens used:  7429
Number of papers: 14
Number of papers with background: 0.9285714285714286
Number of papers with method: 0.9285714285714286
Number of papers with value: 0.35714285714285715
Number of papers with result: 0.7857142857142857


In [150]:
# Show one single-sentence-classification result next to its source abstract.
i = 19
# named `article_id` so the builtin `id` is not shadowed for the rest of the notebook
article_id = facet_results[i]['article_id']
print(facet_results[i]['gpt-annotation'])
print(abstracts[article_id])

others
['<S> signals sparse in a transformation domain can be recovered from a reduced set of randomly positioned samples by using compressive sensing algorithms . </S>', '<S> simple reconstruction algorithms are presented in the first part of the paper . </S>', '<S> the missing samples manifest themselves as a noise in this reconstruction . </S>', '<S> once the reconstruction conditions for a sparse signal are met and the reconstruction is achieved , the noise due to missing samples does not influence the results in a direct way . </S>', '<S> it influences the possibility to recover a signal only . </S>', '<S> additive input noise will remain in the resulting reconstructed signal . </S>', '<S> the accuracy of the recovery results is related to the additive input noise . </S>', '<S> simple derivation of this relation is presented . </S>', '<S> if a reconstruction algorithm for a sparse signal is used in the reconstruction of a nonsparse signal then the noise due to missing samples will