In [17]:
import weaviate
import json
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import time
import copy

In [18]:
def get_client(url="http://localhost:8080"):
    """Create a Weaviate client and print the server's meta information.

    Args:
        url: Base URL of the Weaviate server. Defaults to the local
            instance this notebook was written against, so existing
            ``get_client()`` calls behave exactly as before.

    Returns:
        The ``weaviate.Client`` created for ``url``.
    """
    client = weaviate.Client(url)
    # Printing the meta info makes the server/contextionary versions visible
    # in the cell output.
    print(client.get_meta())
    return client

In [19]:
# Module-level Weaviate client; the add_* and get_ids_* helpers below read
# this global rather than taking a client argument.
client = get_client()

{'contextionaryVersion': 'en0.16.0-v0.4.19', 'contextionaryWordCount': 818107, 'hostname': 'http://[::]:8080', 'version': '0.22.15'}


In [20]:
def load_taxanomy(url='https://arxiv.org/category_taxonomy', timeout=30):
    """Scrape the arXiv category taxonomy into group/archive/category records.

    The page is walked in document order. Each ``<h2>`` starts a group (and
    doubles as the archive until an ``<h3>`` overrides it), each ``<h3>``
    sets the current archive, each ``<h4>`` sets the current category, and
    each ``<p>`` is taken as the description of the current category and
    emits one taxonomy row.

    Args:
        url: Page to scrape. Defaults to the official arXiv taxonomy page.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(groups, archives, categories)`` tuple of lists of dicts:
        ``groups`` has ``{"name"}``, ``archives`` has
        ``{"inGroup", "name", "id"}`` and ``categories`` has
        ``{"inArchive", "name", "id", "description"}``. Groups are listed in
        the order they first appear on the page; archives and categories are
        de-duplicated.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        ValueError: If the page has no ``category_taxonomy_list`` element.
    """
    # Without a timeout a stalled connection would hang the cell forever;
    # raise_for_status() keeps us from silently parsing an error page.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    root = soup.find('div', {'id': 'category_taxonomy_list'})
    if root is None:
        raise ValueError(
            "No element with id 'category_taxonomy_list' found at " + url
        )

    tags = root.find_all(["h2", "h3", "h4", "p"], recursive=True)

    level_1_name = ""
    level_2_code = ""
    level_2_name = ""
    # None means "no <h4> seen yet"; a <p> in that state has no category to
    # describe and is skipped instead of raising UnboundLocalError.
    level_3_code = None
    level_3_name = None

    rows = []
    for t in tags:
        if t.name == "h2":
            # Groups without sub-archives use the group name as archive too.
            level_1_name = t.text
            level_2_code = t.text
            level_2_name = t.text
        elif t.name == "h3":
            raw = t.text
            level_2_code = re.sub(r"(.*)\((.*)\)", r"\2", raw)
            level_2_name = re.sub(r"(.*)\((.*)\)", r"\1", raw)
        elif t.name == "h4":
            raw = t.text
            level_3_code = re.sub(r"(.*) \((.*)\)", r"\1", raw)
            level_3_name = re.sub(r"(.*) \((.*)\)", r"\2", raw)
        elif t.name == "p":
            if level_3_code is None:
                continue
            rows.append({
                'group_name': level_1_name,
                'archive_name': level_2_name,
                'archive_id': level_2_code,
                'category_name': level_3_name,
                'category_id': level_3_code,
                'category_description': t.text,
            })

    taxonomy = pd.DataFrame(rows, columns=[
        'group_name', 'archive_name', 'archive_id',
        'category_name', 'category_id', 'category_description',
    ])

    # dict.fromkeys de-duplicates while keeping page order, so the result is
    # the same on every run (a set would reorder it arbitrarily).
    groups = [{"name": name} for name in dict.fromkeys(row['group_name'] for row in rows)]

    archives = (
        taxonomy[['group_name', 'archive_name', 'archive_id']]
        .rename(columns={
            'group_name': 'inGroup',
            'archive_name': 'name',
            'archive_id': 'id',
        })
        .drop_duplicates(ignore_index=True)
        .to_dict(orient="records")
    )

    categories = (
        taxonomy[['archive_name', 'category_name', 'category_id', 'category_description']]
        .rename(columns={
            'archive_name': 'inArchive',
            'category_name': 'name',
            'category_id': 'id',
            'category_description': 'description',
        })
        .drop_duplicates(ignore_index=True)
        .to_dict(orient="records")
    )

    return groups, archives, categories

In [21]:
# Scrape the taxonomy page; each result is a list of plain dicts ready to be
# sent to Weaviate by the add_* helpers below.
groups, archives, categories = load_taxanomy()

In [22]:
# Inspect the scraped group records (one {'name': ...} dict per group).
print(groups)

[{'name': 'Electrical Engineering and Systems Science'}, {'name': 'Statistics'}, {'name': 'Mathematics'}, {'name': 'Computer Science'}, {'name': 'Quantitative Finance'}, {'name': 'Economics'}, {'name': 'Physics'}, {'name': 'Quantitative Biology'}]


In [23]:
def add_groups(groups):
    """Send every entry of ``groups`` to Weaviate as a "Group" thing.

    All entries go out in one batch through the module-level ``client``;
    the function then sleeps for two seconds before returning.

    Args:
        groups: Iterable of group property dicts (e.g. ``{"name": ...}``).
    """
    group_batch = weaviate.ThingsBatchRequest()
    for entry in groups:
        group_batch.add_thing(entry, "Group")

    client.batch.create_things(group_batch)
    # NOTE(review): presumably gives Weaviate time to make the new things
    # queryable before the next cell looks them up — confirm.
    time.sleep(2)

In [24]:
# Send the scraped groups to Weaviate as "Group" things.
# NOTE(review): every run submits a fresh batch, so re-running this cell
# presumably creates duplicate groups — confirm.
add_groups(groups)

In [25]:
def get_ids_of_groups():
    """Query Weaviate for all "Group" things and map their names to UUIDs.

    Returns:
        dict: ``{group name: uuid}``. If several groups share a name, the
        last one returned by the query wins.
    """
    response = client.query.get.things("Group", ["name", "uuid"]).do()
    group_records = response['data']['Get']['Things']['Group']
    return {record['name']: record['uuid'] for record in group_records}

In [26]:
# Fetch the name -> uuid mapping of the groups stored in Weaviate and show it.
groups_with_uuids_dict = get_ids_of_groups()
print(groups_with_uuids_dict)

{'Statistics': '2d2182cd-b0e7-4683-bddc-48e1d8d5e2c8', 'Mathematics': '5da91aea-ad85-4aa2-bb93-32736144eda5', 'Quantitative Finance': '4e66509a-bb8f-4b54-a774-cfa2267cddd4', 'Electrical Engineering and Systems Science': '02335df5-e332-486b-8206-1ac3cdd347a2', 'Computer Science': 'bec10552-ad8a-4b10-a4e9-4968683be506', 'Physics': 'e4487feb-d0a9-4191-858c-2b17c24c3ddd', 'Economics': '0c40d4d2-55ed-4183-874e-9ff7875c3545', 'Quantitative Biology': 'b08e86e6-c838-4771-ba9b-c3bf234814ac'}


In [27]:
def add_archives(archives):
    """Send every entry of ``archives`` to Weaviate as an "Archive" thing.

    Each archive's ``inGroup`` group name is replaced by a beacon reference
    to the matching "Group" thing, looked up via ``get_ids_of_groups()``.
    The replacement is done on a per-archive copy, so the caller's dicts are
    left untouched and the function can be called again with the same list.

    Args:
        archives: Iterable of dicts with at least an ``inGroup`` key holding
            a group name, plus the archive's own properties.

    Raises:
        KeyError: If an archive names a group that is not in Weaviate.
    """
    groups_with_uuids_dict = get_ids_of_groups()
    batch = weaviate.ThingsBatchRequest()

    for archive in archives:
        group_beacon = "weaviate://localhost/things/" + groups_with_uuids_dict[archive['inGroup']]
        # Build a new dict rather than assigning into `archive`: mutating the
        # input would turn 'inGroup' into a list and break any later lookup
        # by group name (e.g. when this cell is re-run).
        archive_thing = {**archive, 'inGroup': [{"beacon": group_beacon}]}
        batch.add_thing(archive_thing, "Archive")

    client.batch.create_things(batch)
    time.sleep(2)

In [28]:
# Send the scraped archives to Weaviate as "Archive" things, each referencing
# its group through a beacon.
add_archives(archives)

In [29]:
def get_ids_of_archives():
    """Query Weaviate for all "Archive" things and map their names to UUIDs.

    Returns:
        dict: ``{archive name: uuid}``. If several archives share a name, the
        last one returned by the query wins.
    """
    response = client.query.get.things("Archive", ["name", "uuid"]).do()
    archive_records = response['data']['Get']['Things']['Archive']
    return {record['name']: record['uuid'] for record in archive_records}

In [30]:
# Fetch the name -> uuid mapping of the archives stored in Weaviate and show it.
archives_with_uuids_dict = get_ids_of_archives()
print(archives_with_uuids_dict)

{'Astrophysics': '4601a64f-70bd-4c3b-a233-a8d361f0ca16', 'Condensed Matter': 'dae7dc58-ad3c-4088-8599-dff2743103e7', 'High Energy Physics - Experiment': '97395fa3-ba03-442e-b5bb-4085a6c9aef2', 'High Energy Physics - Lattice': 'baff8c08-2697-4746-806a-c4ebda31c28c', 'Nonlinear Sciences': 'f50513e3-aaf3-4eba-b3e7-244bf837e0d2', 'Nuclear Theory': '3e961d27-97e0-4040-ac02-98eb63f30097', 'Computer Science': '3132ccf1-f42b-48d1-890c-46a1b8d8b42c', 'Economics': '1d888fff-0838-43b1-9012-9b13d8664e33', 'Mathematics': 'a96afcaf-2d7c-4030-83b2-ff66f267adbd', 'Nuclear Experiment': 'd332032c-701d-4496-b72e-5ba13fde4c70', 'Electrical Engineering and Systems Science': '0525119c-8de5-485f-bc94-bf7c7398b99f', 'General Relativity and Quantum Cosmology': '2ebefcbd-a7a3-457d-bfd4-18711348afc8', 'High Energy Physics - Phenomenology': '2ecfcf64-577e-43aa-ac77-175e2626f526', 'High Energy Physics - Theory': '68845139-0599-49db-bb19-861de7e20084', 'Mathematical Physics': '0e598c6c-3e6e-415d-b89e-4b9d341a1c41',

In [31]:
def add_categories(categories):
    """Send every entry of ``categories`` to Weaviate as a "Category" thing.

    Each category is deep-copied and its ``inArchive`` archive name is
    replaced by a beacon reference to the matching "Archive" thing.

    In addition, the first time a dotted id prefix is seen (e.g. "cs" for
    the category id "cs.AI"), an extra category with that prefix as its id
    and the archive name as its name is added, because some items in the
    dataset are labeled with the bare prefix.

    Args:
        categories: Iterable of dicts with ``id`` and ``inArchive`` keys plus
            the category's other properties.
    """
    archive_uuid_by_name = get_ids_of_archives()
    category_batch = weaviate.ThingsBatchRequest()
    seen_prefixes = set()

    for category in categories:
        thing = copy.deepcopy(category)
        beacon = "weaviate://localhost/things/" + archive_uuid_by_name[category['inArchive']]
        thing['inArchive'] = [{"beacon": beacon}]
        category_batch.add_thing(thing, "Category")

        # Ids without a dot are their own prefix and need no extra entry;
        # prefixes already handled are not added a second time.
        prefix = category['id'].split('.')[0]
        if prefix in seen_prefixes or prefix == category['id']:
            continue
        seen_prefixes.add(prefix)

        prefix_category = {
            "name": category['inArchive'],
            "id": prefix,
            "inArchive": [{"beacon": beacon}],
        }
        category_batch.add_thing(prefix_category, "Category")

    client.batch.create_things(category_batch)
    time.sleep(2)

In [32]:
# Send the scraped categories to Weaviate as "Category" things, plus one
# extra category per dotted-id prefix (e.g. "cs" for "cs.AI").
add_categories(categories)