# ArXiv dataset with Weaviate

In [41]:
import weaviate
import json
import tqdm
import re
import time

# Regex capturing a four-digit number whose first digit is 1 or 2 (e.g. a
# year such as 1999 or 2007). Not referenced by any other cell shown here.
year_pattern = r'([1-2][0-9]{3})'

In [42]:
def get_client():
    """Open a connection to the local Weaviate instance and return the client.

    The meta information reported by the server is printed so the notebook
    shows which instance and version it is talking to.
    """
    weaviate_client = weaviate.Client("http://localhost:8080")
    print(weaviate_client.get_meta())
    return weaviate_client

In [43]:
# Connect once; the module-level `client` is the handle every query and
# import helper below uses.
client = get_client()

{'contextionaryVersion': 'en0.16.0-v0.4.19', 'contextionaryWordCount': 818107, 'hostname': 'http://[::]:8080', 'version': '0.22.15'}


In [44]:
# get ids of categories
def get_ids_of_categories():
    """Return a dict mapping each Category's ``id`` to its ``uuid``.

    Queries at most 2000 ``Category`` things through the module-level
    ``client``. If two categories share an ``id``, the later one wins.
    """
    response = (
        client.query.get.things("Category", ["id", "uuid"])
        .with_limit(2000)
        .do()
    )
    return {
        entry['id']: entry['uuid']
        for entry in response['data']['Get']['Things']['Category']
    }

In [45]:
# Category id -> uuid lookup, fetched once; add_papers() reads it to build
# the category beacons of every paper.
categories_with_uuids_dict = get_ids_of_categories()

In [46]:
# {title, doi, year, journalReference, arXivId, submitter, abstract, comments, hasCategories, versionHistory, lastestVersionCreated, lastestVersion, pdfLink, link, licence, reportNumber, hasAuthors, inJournal}
def get_metadata(path='../data/arxiv-metadata-oai.json'):
    """Lazily yield the raw JSON text of each record in the metadata dump.

    The dump is read as JSON Lines (one JSON document per line), so the file
    is streamed rather than loaded into memory at once.

    Args:
        path: Location of the dump. Defaults to the path the notebook has
            always used, so existing ``get_metadata()`` calls are unaffected.

    Yields:
        One non-blank line of the file at a time, trailing newline included;
        callers are expected to pass it to ``json.loads``.
    """
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at EOF) are not records and
            # would make json.loads raise in the callers.
            if line.strip():
                yield line

In [47]:
# Smoke test: print every field of the first record in the metadata dump.
metadata = get_metadata()
paper = next(metadata, None)
if paper is not None:
    for k, v in json.loads(paper).items():
        print(f'{k}: {v}')

id: 0704.0001
submitter: Pavel Nadolsky
authors: C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan
title: Calculation of prompt diphoton production cross sections at Tevatron and
  LHC energies
comments: 37 pages, 15 figures; published version
journal-ref: Phys.Rev.D76:013009,2007
doi: 10.1103/PhysRevD.76.013009
abstract:   A fully differential calculation in perturbative quantum chromodynamics is
presented for the production of massive photon pairs at hadron colliders. All
next-to-leading order perturbative contributions from quark-antiquark,
gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as
all-orders resummation of initial-state gluon radiation valid at
next-to-next-to-leading logarithmic accuracy. The region of phase space is
specified in which the calculation is most reliable. Good agreement is
demonstrated with data from the Fermilab Tevatron, and predictions are made for
more detailed tests with CDF and DO data. Predictions are shown for
distributions 

In [48]:
def get_journal_name(a_string):
    """Return the part of a journal reference that precedes its first digit.

    The whole string is returned when it contains no digit, and an empty
    string when it starts with one. Nothing is stripped, so trailing
    whitespace or punctuation before the first digit is kept.
    """
    first_digit = re.search('[0-9]', a_string)
    if first_digit is None:
        return a_string
    return a_string[:first_digit.start()]

In [49]:
def get_journal_uuid(name):
    """Return the uuid of the Journal called ``name``, creating it if needed.

    Looks for ``Journal`` things whose ``name`` equals the argument and
    returns the uuid of the first hit. When there is none, a new Journal is
    created and the value returned by ``client.data_object.create`` is handed
    back (callers use it as the new object's uuid).
    """
    name_equals = {
        "path": ["name"],
        "operator": "Equal",
        "valueString": name,
    }
    response = (
        client.query.get.things("Journal", ["uuid"])
        .with_where(name_equals)
        .with_limit(10000)
        .do()
    )

    matches = response['data']['Get']['Things']['Journal']
    if len(matches) > 0:
        return matches[0]["uuid"]

    # No such journal yet: create it.
    new_uuid = client.data_object.create({"name": name}, "Journal")
    # Presumably gives Weaviate time to make the new object queryable before
    # the next lookup -- TODO confirm.
    time.sleep(1)
    return new_uuid

In [50]:
def format_author_name(author):
    """Return ``author`` with line breaks, tabs, quotes and backslashes removed.

    The characters dropped are newline, carriage return, tab, single quote,
    backslash, double quote and backtick; everything else is kept unchanged.
    """
    unwanted = "\n\r\t'\\\"`"
    return author.translate(str.maketrans('', '', unwanted))

In [51]:
def get_author_uuid(name):
    """Return the uuid of the Author called ``name``, creating it if needed.

    Looks for ``Author`` things whose ``name`` equals the argument and
    returns the uuid of the first hit. When there is none, a new Author is
    created and the value returned by ``client.data_object.create`` is handed
    back (callers use it as the new object's uuid).
    """
    name_equals = {
        "path": ["name"],
        "operator": "Equal",
        "valueString": name,
    }
    response = (
        client.query.get.things("Author", ["uuid"])
        .with_where(name_equals)
        .with_limit(10000)
        .do()
    )

    matches = response['data']['Get']['Things']['Author']
    if len(matches) > 0:
        return matches[0]["uuid"]

    # No such author yet: create it.
    new_uuid = client.data_object.create({"name": name}, "Author")
    # Presumably gives Weaviate time to make the new object queryable before
    # the next lookup -- TODO confirm.
    time.sleep(1)
    return new_uuid

In [52]:
def extract_year(paper_id):
    """Return the four-digit submission year encoded in an arXiv identifier.

    Both identifier schemes are supported:

    * new style ``YYMM.NNNN`` / ``YYMM.NNNNN`` (e.g. ``0704.0001``)
    * old style ``archive/YYMMNNN`` (e.g. ``hep-th/9901001``)

    Args:
        paper_id: The arXiv identifier as a string.

    Returns:
        The year as an int, e.g. ``2007``.

    Raises:
        ValueError: If the two characters where the year is expected are not
            a number.
    """
    # Old-style ids carry the YYMM part after the archive prefix and slash.
    number_part = paper_id.rsplit('/', 1)[-1]
    two_digit_year = int(number_part[:2])
    # arXiv started in 1991, so 91-99 can only mean the 1990s; everything
    # else is taken to be 20YY.
    century = 1900 if two_digit_year >= 91 else 2000
    return century + two_digit_year

In [53]:
def _split_author_names(raw_authors):
    """Turn an arXiv ``authors`` string into a list of cleaned author names.

    Parenthesised notes are removed, the string is split on ``', '``, inner
    whitespace is collapsed and empty entries are dropped. Names joined only
    by "and" are not separated.
    """
    text = format_author_name(raw_authors)

    # Remove parenthesised notes innermost-first until none are left. A single
    # greedy `\(.*\)` would also delete every author standing between the
    # first and the last parenthesis of the string.
    stripped = re.sub(r'\([^()]*\)', '', text)
    while stripped != text:
        text = stripped
        stripped = re.sub(r'\([^()]*\)', '', text)

    # Remove what is left of bracketed / braced (or unbalanced) groups.
    text = re.sub(r'[\(\[\{].*?[\)\]\}]', '', text)

    names = (' '.join(part.split()) for part in text.split(', '))
    return [name for name in names if name]


def _build_paper_object(paper):
    """Map one parsed arXiv metadata record onto the properties of a ``Paper``.

    Journals and authors that do not exist yet are created as a side effect
    of ``get_journal_uuid`` / ``get_author_uuid``.
    """
    beacon_prefix = "weaviate://localhost/things/"
    paper_object = {}

    # Fields whose hard line breaks are flattened into spaces.
    for source_key in ("title", "abstract"):
        value = paper.get(source_key)
        if value is not None:
            paper_object[source_key] = value.replace('\n', ' ')

    # Fields copied verbatim under their schema property name.
    for source_key, property_name in (
        ("doi", "doi"),
        ("journal-ref", "journalReference"),
        ("id", "arxivId"),
        ("submitter", "submitter"),
        ("comments", "comments"),
        ("report-no", "reportNumber"),
    ):
        value = paper.get(source_key)
        if value is not None:
            paper_object[property_name] = value

    versions = paper.get("versions")
    if versions is not None:
        paper_object["versionHistory"] = str(versions).strip('[]')
        if versions:  # an empty history has no latest entry
            paper_object["lastestVersion"] = versions[-1]

    paper_id = paper.get("id")
    if paper_id is not None:
        paper_object["year"] = extract_year(paper_id)

    # The record is expected to hold a list of space-separated category ids;
    # a plain string is accepted as well (NOTE(review): confirm which shape
    # the dump in use actually has).
    categories = paper["categories"]
    if not isinstance(categories, str):
        categories = ' '.join(categories)
    paper_object["hasCategories"] = [
        {"beacon": beacon_prefix + categories_with_uuids_dict[category]}
        for category in categories.split()
    ]

    journal_ref = paper.get("journal-ref")
    if journal_ref is not None:
        journal_name = get_journal_name(journal_ref).replace('\n', ' ')
        paper_object['inJournal'] = [
            {"beacon": beacon_prefix + get_journal_uuid(journal_name)}
        ]

    raw_authors = paper.get("authors")
    if raw_authors is not None:
        author_beacons = [
            {'beacon': beacon_prefix + get_author_uuid(author_name)}
            for author_name in _split_author_names(raw_authors)
        ]
        if author_beacons:
            paper_object['hasAuthors'] = author_beacons

    return paper_object


def add_papers(batch_size=10, max_batches=1):
    """Import papers from the metadata dump into Weaviate in batches.

    Every record yielded by ``get_metadata()`` is converted into a ``Paper``
    object (with beacons to its categories, journal and authors) and queued;
    a batch is submitted whenever ``batch_size`` papers have been queued.

    Args:
        batch_size: Number of papers per batch request. Defaults to 10.
        max_batches: Stop after this many full batches. The default of 1
            keeps the original trial-run behaviour (first 10 papers only);
            pass ``None`` to import the whole dump.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If a paper references a category id that is not in
            ``categories_with_uuids_dict``.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    batch = weaviate.ThingsBatchRequest()
    papers_in_batch = 0
    batches_sent = 0

    for line in get_metadata():
        if max_batches is not None and batches_sent >= max_batches:
            break

        batch.add_thing(_build_paper_object(json.loads(line)), "Paper")
        papers_in_batch += 1

        if papers_in_batch >= batch_size:
            client.batch.create_things(batch)
            batch = weaviate.ThingsBatchRequest()
            papers_in_batch = 0
            batches_sent += 1

    # Submit the papers of a final, partially filled batch; previously they
    # were silently dropped when the dump ended before the batch was full.
    if papers_in_batch > 0:
        client.batch.create_things(batch)

    # TODO: populate lastestVersionCreated, pdfLink, link, licence

In [54]:
def add_articles_to_authors():
    """Add a ``wrotePapers`` reference from every author to each of their papers.

    Fetches the papers together with their ``HasAuthors`` references through
    the module-level ``client`` and, for each (paper, author) pair, adds the
    paper to the author's ``wrotePapers`` property.

    NOTE(review): the query sets no explicit limit, so presumably only as
    many papers as the server returns by default are processed -- confirm.
    """
    query = "{Get {Things {Paper {uuid HasAuthors {... on Author {name uuid}}}}}}"
    result = client.query.raw(query)

    for paper in result['data']['Get']['Things']['Paper']:
        paper_uuid = paper["uuid"]
        # Papers stored without any author reference have no usable
        # "HasAuthors" list; treat a missing or null value as "no authors".
        for author in paper.get("HasAuthors") or []:
            client.data_object.reference.add(author["uuid"], "wrotePapers", paper_uuid)

In [55]:
# Import the papers first, then link every author back to their papers.
add_papers()
# Pause before querying the papers again -- presumably to let Weaviate finish
# processing the batch (TODO confirm).
time.sleep(2)
add_articles_to_authors()