In [1]:
import openai
from bs4 import BeautifulSoup
import feedparser
import json

In [76]:
import gzip
import pandas as pd

In [68]:
with open("../config.json") as f:
    config = json.load(f)

In [26]:
class ArxivRSS:
    def __init__(self, url):
        self.url = url
        self.papers = dict()

    def fetch_paper_list(self):
        feed = self._fetch_n_parse_rss()

        for rss_entry in feed["entries"]:
            paper_information = self._extract_paper_information(rss_entry)
            self.papers[paper_information["id"]] = paper_information

    def _fetch_n_parse_rss(self):
        feed = feedparser.parse(self.url)
        return feed

    def _parse_html_element(self, raw_string):
        soup = BeautifulSoup(raw_string, "html.parser")
        return soup.text

    def _extract_paper_information(self, rss_entry):
        paper_id = rss_entry["id"]
        paper_title = rss_entry["title"]
        paper_abstract = self._parse_html_element(rss_entry["summary"])
        paper_url = rss_entry["link"]
        paper_authors = []
        for author_info in rss_entry["authors"]:
            author_name = self._parse_html_element(author_info["name"])
            paper_authors.append(author_name)
        return {
            "id": paper_id,
            "title": paper_title,
            "abstract": paper_abstract.replace("\n", " "),
            "url": paper_url,
            "authors": paper_authors,
        }


In [27]:
rss_url = config['arxiv_rss_base_url'] + config['arxiv_subjects'][0]

In [28]:
rss = ArxivRSS(rss_url)

In [29]:
paper_list = rss.fetch_paper_list()

In [30]:
papers = list(rss.papers.values())

In [35]:
print(papers[4]["abstract"])

This document serves as an overview of the different mechanisms and areas of governance in the BigCode project. It aims to support transparency by providing relevant information about choices that were made during the project to the broader public, and to serve as an example of intentional governance of an open research project that future endeavors can leverage to shape their own approach. The first section, Project Structure, covers the project organization, its stated goals and values, its internal decision processes, and its funding and resources. The second section, Data and Model Governance, covers decisions relating to the questions of data subject consent, privacy, and model release. 


In [9]:
rss_url

'http://arxiv.org/rss/cs.CY'

In [10]:
feed = feedparser.parse(rss_url)

In [11]:
feed.feed.keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'subtitle', 'subtitle_detail', 'language', 'updated', 'updated_parsed', 'publisher', 'publisher_detail', 'tags', 'sy_updatebase', 'sy_updatefrequency', 'sy_updateperiod', 'rdf_li', 'rdf_seq', 'entries', 'image'])

In [12]:
feed.keys()

dict_keys(['bozo', 'entries', 'feed', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

In [13]:
paper = feed.entries[0]

In [14]:
paper['id']

'http://arxiv.org/abs/2312.03749'

In [15]:
paper

{'id': 'http://arxiv.org/abs/2312.03749',
 'title': 'Conceptual Engineering Using Large Language Models. (arXiv:2312.03749v1 [cs.CL])',
 'title_detail': {'type': 'text/plain',
  'language': None,
  'base': 'http://export.arxiv.org/rss/cs.CY',
  'value': 'Conceptual Engineering Using Large Language Models. (arXiv:2312.03749v1 [cs.CL])'},
 'links': [{'rel': 'alternate',
   'type': 'text/html',
   'href': 'http://arxiv.org/abs/2312.03749'}],
 'link': 'http://arxiv.org/abs/2312.03749',
 'summary': "<p>We describe a method, based on Jennifer Nado's definition of classification\nprocedures as targets of conceptual engineering, that implements such\nprocedures using a large language model. We then apply this method using data\nfrom the Wikidata knowledge graph to evaluate concept definitions from two\nparadigmatic conceptual engineering projects: the International Astronomical\nUnion's redefinition of PLANET and Haslanger's ameliorative analysis of WOMAN.\nWe discuss implications of this work

In [16]:
paper['title']

'Conceptual Engineering Using Large Language Models. (arXiv:2312.03749v1 [cs.CL])'

In [17]:
paper['summary']

"<p>We describe a method, based on Jennifer Nado's definition of classification\nprocedures as targets of conceptual engineering, that implements such\nprocedures using a large language model. We then apply this method using data\nfrom the Wikidata knowledge graph to evaluate concept definitions from two\nparadigmatic conceptual engineering projects: the International Astronomical\nUnion's redefinition of PLANET and Haslanger's ameliorative analysis of WOMAN.\nWe discuss implications of this work for the theory and practice of conceptual\nengineering. The code and data can be found on GitHub.\n</p>"

In [18]:
paper['link']

'http://arxiv.org/abs/2312.03749'

In [20]:
soup = BeautifulSoup(paper['authors'][0]['name'])

In [24]:
parse_html_element(paper['authors'][0]['name'])

'Bradley P. Allen'

In [28]:
paper_info = extract_paper_information(feed.entries[1])

In [29]:
paper_info

{'id': 'http://arxiv.org/abs/2312.03755',
 'title': 'Near-real-time Earthquake-induced Fatality Estimation using Crowdsourced Data and Large-Language Models. (arXiv:2312.03755v1 [cs.CL])',
 'abstract': "When a damaging earthquake occurs, immediate information about casualties is\ncritical for time-sensitive decision-making by emergency response and aid\nagencies in the first hours and days. Systems such as Prompt Assessment of\nGlobal Earthquakes for Response (PAGER) by the U.S. Geological Survey (USGS)\nwere developed to provide a forecast within about 30 minutes of any significant\nearthquake globally. Traditional systems for estimating human loss in disasters\noften depend on manually collected early casualty reports from global media, a\nprocess that's labor-intensive and slow with notable time delays. Recently,\nsome systems have employed keyword matching and topic modeling to extract\nrelevant information from social media. However, these methods struggle with\nthe complex semant

In [36]:
def merge_dicts(list_of_dicts):
    merged = {}
    for d in list_of_dicts:
        merged.update(d)
    return merged

In [37]:
dict1 = {'a': 1, 'b': 2}
dict2 = {'b': 3, 'c': 4}
dict3 = {'c': 5, 'd': 6}

In [38]:
merge_dicts([dict1, dict2, dict3])

{'a': 1, 'b': 3, 'c': 5, 'd': 6}

In [39]:
with open("../data.json") as f:
    data = json.load(f)

In [40]:
len(data)

124

In [53]:
data[list(data.keys())[11]]

{'id': 'http://arxiv.org/abs/2312.04275',
 'title': 'Estimating Countries with Similar Maternal Mortality Rate using Cluster Analysis and Pairing Countries with Identical MMR. (arXiv:2312.04275v1 [cs.LG])',
 'abstract': "In the evolving world, we require more additionally the young era to flourish\nand evolve into developed land. Most of the population all around the world are\nunaware of the complications involved in the routine they follow while they are\npregnant and how hospital facilities affect maternal health. Maternal Mortality\nis the death of a pregnant woman due to intricacies correlated to pregnancy,\nunderlying circumstances exacerbated by the pregnancy or management of these\nsituations. It is crucial to consider the Maternal Mortality Rate (MMR) in\ndiverse locations and determine which human routines and hospital facilities\ndiminish the Maternal Mortality Rate (MMR). This research aims to examine and\ndiscover the countries which are keeping more lavish threats of MMR 

In [57]:
with gzip.open("../data/2023-12-08.json.gz") as f:
    data = json.load(f)

In [58]:
len(data)

124

In [81]:
    user_message = """
        Please read the following paper title and abstract:
        --------------
        Title: {title}
        Abstract: {abstract}
        --------------
        Based on the title and abstract, please decide if the paper pertains to one or multiple topics below:
        --------------
        {topics}
        --------------
        If the paper is VERY relevant to a topic, provide a short explanation of why. If the the paper is not relevant to any of the topics, reply False and leave the reason empty. The output should be in JSON format and follow the following schema:
        --------------
        ```json
        {{
            'topic 1': {{
                'relevance': 0,
                'reason': ''
            }},
            'topic 2': {{
                'relevance': 0.9,
                'reason': 'The paper ....'
            }}
        }}
         ```
    """

In [62]:
with gzip.open("../data/2023-12-08.json.gz") as f:
    merged_paper_list = json.loads(f.read().decode("utf8"))

In [77]:
temp_df = pd.DataFrame.from_dict(merged_paper_list.values())

In [64]:
paper = list(merged_paper_list.values())[0]

In [82]:
print(user_message.format(
    title=paper['title'],
    abstract=paper['abstract'],
    topics=config['topics']
))


    Please read the following paper title and abstract:
    --------------
    Title: Conceptual Engineering Using Large Language Models. (arXiv:2312.03749v1 [cs.CL])
    Abstract: We describe a method, based on Jennifer Nado's definition of classification
procedures as targets of conceptual engineering, that implements such
procedures using a large language model. We then apply this method using data
from the Wikidata knowledge graph to evaluate concept definitions from two
paradigmatic conceptual engineering projects: the International Astronomical
Union's redefinition of PLANET and Haslanger's ameliorative analysis of WOMAN.
We discuss implications of this work for the theory and practice of conceptual
engineering. The code and data can be found on GitHub.

    --------------
    Based on the title and abstract, please decide if the paper pertains to one or multiple topics below:
    --------------
    ['Review of existing research', 'Security of AI and langauge models']
    ------

In [70]:
config['topics']

['Review of existing research', 'Security of AI and langauge models']

# Output

In [144]:

juedement_list = []
with open("../data/2023-12-08.resp.json") as f:
    for line in f:
        paper = json.loads(line)
        juedement_list.append(paper)

In [145]:
len(juedement_list)

128

In [146]:
juedement_list[0]

{'id': 'http://arxiv.org/abs/2312.03749',
 'judgement': {'Security of AI and language models': {'relevance': 0,
   'reason': ''},
  'Applications of AI and language models in social science research': {'relevance': 0.8,
   'reason': 'The paper discusses the application of large language models in evaluating concept definitions from social science projects.'},
  'Using AI to simulate humans in various contexts': {'relevance': 0.2,
   'reason': ''},
  'Methods to increase the factuality of language model response': {'relevance': 0.1,
   'reason': ''},
  'AI and language models for generating misinformation or fact-checking': {'relevance': 0.1,
   'reason': ''}}}

In [137]:
paper_worth_reading = []
for judgement in juedement_list:
    score_reason = []
    for topic, value in judgement.items():
        if topic == "id":
            continue
        if value['relevance'] > 0.8:
            score_reason.append(
                f"<{value['relevance']}> - <{topic}>: {value['reason']}"
            )
    if score_reason:
        paper = {
            "id": judgement['id'],
            "reason": " || ".join(score_reason)
        }
        paper_worth_reading.append(paper)

In [138]:
juedement_list[0].keys()

dict_keys(['Security of AI and language models', 'Applications of AI and language models in social science research', 'Using AI to simulate humans in various contexts', 'Methods to ground the response of language models', 'AI and language models for generating misinformation or fact-checking', 'id'])

In [139]:
len(paper_worth_reading)

6

In [140]:
len(paper_worth_reading)

6

In [142]:
paper_worth_reading_df = pd.DataFrame.from_dict(paper_worth_reading)

In [143]:
paper_worth_reading_df

Unnamed: 0,id,reason
0,http://arxiv.org/abs/2312.03755,<0.9> - <Applications of AI and language model...
1,http://arxiv.org/abs/2312.03901,<0.9> - <Applications of AI and language model...
2,http://arxiv.org/abs/2312.03936,<0.9> - <AI and language models for generating...
3,http://arxiv.org/abs/2303.16343,<1> - <Applications of AI and language models ...
4,http://arxiv.org/abs/2309.08967,<0.9> - <Applications of AI and language model...
5,http://arxiv.org/abs/2312.03707,<0.9> - <Methods to ground the response of lan...
