In [8]:
import bz2
import json
from tqdm import tqdm
import traceback
import os
import sys
from pymongo import MongoClient
from json.decoder import JSONDecodeError
from requests import get

In [9]:
# MongoDB connection setup
MONGO_ENDPOINT, MONGO_ENDPOINT_PORT = os.environ["MONGO_ENDPOINT"].split(":")
MONGO_ENDPOINT_PORT = int(MONGO_ENDPOINT_PORT)
MONGO_ENDPOINT_USERNAME = os.environ["MONGO_INITDB_ROOT_USERNAME"]
MONGO_ENDPOINT_PASSWORD = os.environ["MONGO_INITDB_ROOT_PASSWORD"]
DB_NAME = f"wikidata"

client = MongoClient(MONGO_ENDPOINT, MONGO_ENDPOINT_PORT, username=MONGO_ENDPOINT_USERNAME, password=MONGO_ENDPOINT_PASSWORD)
print(client)

log_c = client.wikidata.log
items_c = client[DB_NAME].items
objects_c = client[DB_NAME].objects
literals_c = client[DB_NAME].literals
types_c = client[DB_NAME].types

c_ref = {
    "items": items_c,
    "objects":objects_c, 
    "literals":literals_c, 
    "types":types_c
}

MongoClient(host=['mongo:27017'], document_class=dict, tz_aware=False, connect=True)


In [10]:
def flush_buffer(buffer):
    for key in buffer:
        if len(buffer[key]) > 0:
            c_ref[key].insert_many(buffer[key])
            buffer[key] = []

def get_wikidata_item_tree_item_idsSPARQL(root_items, forward_properties=None, backward_properties=None):
    """Return ids of WikiData items, which are in the tree spanned by the given root items and claims relating them
        to other items.

    :param root_items: iterable[int] One or multiple item entities that are the root elements of the tree
    :param forward_properties: iterable[int] | None property-claims to follow forward; that is, if root item R has
        a claim P:I, and P is in the list, the search will branch recursively to item I as well.
    :param backward_properties: iterable[int] | None property-claims to follow in reverse; that is, if (for a root
        item R) an item I has a claim P:R, and P is in the list, the search will branch recursively to item I as well.
    :return: iterable[int]: List with ids of WikiData items in the tree
    """

    query = '''PREFIX wikibase: <http://wikiba.se/ontology#>
            PREFIX wd: <http://www.wikidata.org/entity/>
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>'''
    if forward_properties:
        query +='''SELECT ?WD_id WHERE {
                  ?tree0 (wdt:P%s)* ?WD_id .
                  BIND (wd:%s AS ?tree0)
                  }'''%( ','.join(map(str, forward_properties)),','.join(map(str, root_items)))
    elif backward_properties:
        query+='''SELECT ?WD_id WHERE {
                    ?WD_id (wdt:P%s)* wd:Q%s .
                    }'''%(','.join(map(str, backward_properties)), ','.join(map(str, root_items)))
    #print(query)

    url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
    data = get(url, params={'query': query, 'format': 'json'}).json()
    
    ids = []
    for item in data['results']['bindings']:
        this_id=item["WD_id"]["value"].split("/")[-1].lstrip("Q")
        #print(item)
        try:
            this_id = int(this_id)
            ids.append(this_id)
            #print(this_id)
        except ValueError:
            #print("exception")
            continue
    return ids

In [11]:
json_file_path = "./data/def_mapping.json"

try:
    # Open the JSON file for reading
    with open(json_file_path, 'r') as json_file:
        mapping = json.load(json_file)
        
except FileNotFoundError:
    print(f"Error: File '{json_file_path}' not found.")
except json.JSONDecodeError as e:
    print(f"Error decoding JSON data: {e}")
except Exception as e:
    print(f"Error loading data from JSON file: {e}")

In [12]:
total_size_processed = 0
num_entities_processed = 0

def update_average_size(new_size):
    global total_size_processed, num_entities_processed
    total_size_processed += new_size
    num_entities_processed += 1
    return total_size_processed / num_entities_processed



In [32]:
################################
###  WITH MULTIPLE CLUSTERING
################################

wikidata_dump_path = './data/latest-all.json.bz2'
initial_estimated_average_size = 800
BATCH_SIZE = 100 # Number of entities to insert in a single batch
compressed_file_size = os.path.getsize(wikidata_dump_path)
initial_total_lines_estimate = compressed_file_size / initial_estimated_average_size

DATATYPES_MAPPINGS = {
    'external-id': 'STRING',
    'quantity': 'NUMBER',
    'globe-coordinate': 'STRING',
    'string': 'STRING',
    'monolingualtext': 'STRING',
    'commonsMedia': 'STRING',
    'time': 'DATETIME',
    'url': 'STRING',
    'geo-shape': 'GEOSHAPE',
    'math': 'MATH',
    'musical-notation': 'MUSICAL_NOTATION',
    'tabular-data': 'TABULAR_DATA'
}
DATATYPES = list(set(DATATYPES_MAPPINGS.values()))

buffer = {
    "items": [],
    "objects": [], 
    "literals": [], 
    "types": []
}

def check_skip(obj, datatype):
    temp = obj.get("mainsnak", obj)
    if "datavalue" not in temp:
        return True

    skip = {
        "wikibase-lexeme",
        "wikibase-form",
        "wikibase-sense"
    }

    return datatype in skip



def get_value(obj, datatype):
    temp = obj.get("mainsnak", obj)
    if datatype == "globe-coordinate":
        latitude = temp["datavalue"]["value"]["latitude"]
        longitude = temp["datavalue"]["value"]["longitude"]
        value = f"{latitude},{longitude}"
    else:
        keys = {
            "quantity": "amount",
            "monolingualtext": "text",
            "time": "time",
        }
        if datatype in keys:
            key = keys[datatype]
            value = temp["datavalue"]["value"][key]
        else:
            value = temp["datavalue"]["value"]
    return value

global initial_total_lines_estimate


try:
    geolocation_subclass = get_wikidata_item_tree_item_idsSPARQL([2221906], backward_properties=[279])
    food_subclass =  get_wikidata_item_tree_item_idsSPARQL([2095], backward_properties=[279])
    edInst_subclass =  get_wikidata_item_tree_item_idsSPARQL([2385804], backward_properties=[279])
    govAgency_subclass =  get_wikidata_item_tree_item_idsSPARQL([327333], backward_properties=[279])
    intOrg_subclass =  get_wikidata_item_tree_item_idsSPARQL([484652], backward_properties=[279])
    timeZone_subclass =  get_wikidata_item_tree_item_idsSPARQL([12143], backward_properties=[279])    
    geolocation_subclass = list(set(geolocation_subclass)-set(food_subclass))
    
    organization_subclass=get_wikidata_item_tree_item_idsSPARQL([43229], backward_properties=[279])    
    country_subclass =  get_wikidata_item_tree_item_idsSPARQL([6256], backward_properties=[279])    
    city_subclass =  get_wikidata_item_tree_item_idsSPARQL([515], backward_properties=[279])    
    capitals_subclass =  get_wikidata_item_tree_item_idsSPARQL([5119], backward_properties=[279])

    admTerr_subclass =  get_wikidata_item_tree_item_idsSPARQL([15916867], backward_properties=[279])

    family_subclass =  get_wikidata_item_tree_item_idsSPARQL([17350442], backward_properties=[279])
    sportLeague_subclass =  get_wikidata_item_tree_item_idsSPARQL([623109], backward_properties=[279])
    venue_subclass =  get_wikidata_item_tree_item_idsSPARQL([8436], backward_properties=[279])
    organization_subclass = list(set(organization_subclass)-set(country_subclass)-set(city_subclass)-
                             set(capitals_subclass))
except json.decoder.JSONDecodeError:
    pass


with bz2.open(wikidata_dump_path, 'rt', encoding='utf-8') as f:
    count = 1000
    
    ORG = []
    PERS = []
    LOC = []
    OTHERS = []

    pbar = tqdm(total=initial_total_lines_estimate)
    for i, line in enumerate(f):
        try:
            # Parse JSON data from each line
            item = json.loads(line[:-2])

            entity = item['id']
            labels = item.get("labels", {})
            english_label = labels.get("en", {}).get("value", "")
            aliases = item.get("aliases", {})
            description = item.get('descriptions', {}).get('en', {})
            category = "entity"
            sitelinks = item.get("sitelinks", {})
            popularity = len(sitelinks) if len(sitelinks) > 0 else 1

            
            if entity in list(mapping.values()):
                all_labels = {}
                for lang in labels:
                    all_labels[lang] = labels[lang]["value"]
            
                all_aliases = {}
                for lang in aliases:
                    all_aliases[lang] = []
                    for alias in aliases[lang]:
                        all_aliases[lang].append(alias["value"])
                    all_aliases[lang] = list(set(all_aliases[lang]))
            
                found = False
                for predicate in item["claims"]:
                    if predicate == "P279":
                        found = True
            
                if found:
                    category = "type"
                if entity[0] == "P":
                    category = "predicate"
        
                line_size = len(line)
                current_average_size = update_average_size(line_size)
                pbar.total = round(compressed_file_size / current_average_size)
                pbar.update(1)
    
                ###############################################################
                # ORGANIZATION EXTRACTION
                # All items with the root class Organization (Q43229) excluding country (Q6256), city (Q515), capitals (Q5119), 
                # administrative territorial entity of a single country (Q15916867), venue (Q17350442), sports league (Q623109) 
                # and family (Q8436)
                
                # LOCATION EXTRACTION
                # All items with the root class Geographic Location (Q2221906) excluding: food (Q2095), educational institution (Q2385804), 
                # government agency (Q327333), international organization (Q484652) and time zone (Q12143)
                
                # PERSON EXTRACTION
                # All items with the statement is instance of (P31) human (Q5) are classiﬁed as person.
    
                NERtype = None
    
                if item.get("type") == "item" and "claims" in item:
                    p31_claims = item["claims"].get("P31", [])
                               
                    for claim in p31_claims:
                        mainsnak = claim.get("mainsnak", {})
                        datavalue = mainsnak.get("datavalue", {})
                        numeric_id = datavalue.get("value", {}).get("numeric-id")
                        
                        if numeric_id == 5:
                            NERtype = "PERS" 
                        elif numeric_id in organization_subclass and numeric_id in geolocation_subclass:
                            NERtype = "LOC/ORG"
                        elif numeric_id in geolocation_subclass or any(k.lower() in description.get('value', '').lower() for k in ["district", "city", "country", "capital"]):
                            NERtype = "LOC"
                        elif numeric_id in organization_subclass:
                            NERtype = "ORG"  
                        else:
                            NERtype = "OTHERS"
                    print(f"{english_label} --> {NERtype}")
                    print("___________________")
                     
                ################################################################   
                ################################################################   
                # URL EXTRACTION
            
                try:
                    lang = labels.get("en", {}).get("language", "")
                    tmp={}
                    tmp["WD_id"] = item['id']
                    tmp["WP_id"] = labels.get("en", {}).get("value", "")
            
                    url_dict={}
                    url_dict["wikidata"] = "http://www.wikidata.org/wiki/"+tmp["WD_id"]
                    url_dict["wikipedia"] = "http://"+lang+".wikipedia.org/wiki/"+tmp["WP_id"].replace(" ","_")
                    url_dict["dbpedia"] = "http://dbpedia.org/resource/"+tmp["WP_id"].capitalize().replace(" ","_")
                    
            
                except json.decoder.JSONDecodeError:
                   pass
                
                ################################################################    
        
                objects = {}
                literals = {datatype: {} for datatype in DATATYPES}
                types = {"P31": []}
                join = {
                    "items": {
                        "id_entity": i,
                        "entity": entity,
                        "description": description,
                        "labels": all_labels,
                        "aliases": all_aliases,
                        "types": types,
                        "popularity": popularity,
                        "kind": category,   # kind (entity, type or predicate, disambiguation or category)
                        ######################
                        # new updates
                        "NERtype": NERtype, # (ORG, LOC, PER or OTHERS)
                        "URLs" : url_dict
                        ######################
                    },
                    "objects": { 
                        "id_entity": i,
                        "entity": entity,
                        "objects":objects
                    },
                    "literals": { 
                        "id_entity": i,
                        "entity": entity,
                        "literals": literals
                    },
                    "types": { 
                        "id_entity": i,
                        "entity": entity,
                        "types": types
                    },
                }
            
                predicates = item["claims"]
                for predicate in predicates:
                    for obj in predicates[predicate]:
                        datatype = obj["mainsnak"]["datatype"]
            
                        if check_skip(obj, datatype):
                            continue
            
                        if datatype == "wikibase-item" or datatype == "wikibase-property":
                            value = obj["mainsnak"]["datavalue"]["value"]["id"]
            
                            if predicate == "P31" or predicate == "P106":
                                types["P31"].append(value)
            
                            if value not in objects:
                                objects[value] = []
                            objects[value].append(predicate)    
                        else:
                            value = get_value(obj, datatype)                
                            lit = literals[DATATYPES_MAPPINGS[datatype]]
            
                            if predicate not in lit:
                                lit[predicate] = []
                            lit[predicate].append(value)   
            
                 
            
                for key in buffer:
                    buffer[key].append(join[key])            
            
                if len(buffer["items"]) == BATCH_SIZE:
                    flush_buffer(buffer)
    
        except json.decoder.JSONDecodeError:
            continue
    pbar.close()

  0%|          | 136/433964 [01:15<66:47:03,  1.80it/s]
  0%|          | 2/432781 [00:00<23:18:07,  5.16it/s]

Belgium --> LOC/ORG
___________________
happiness --> OTHERS
___________________


  0%|          | 4/433065 [00:00<21:09:30,  5.69it/s]

George Washington --> PERS
___________________
Jack Bauer --> OTHERS
___________________


  0%|          | 6/433271 [00:01<19:45:32,  6.09it/s]

Douglas Adams --> PERS
___________________
Paul Otlet --> PERS
___________________


  0%|          | 7/433234 [00:01<21:07:28,  5.70it/s]

Wikidata --> OTHERS
___________________


  0%|          | 9/432522 [00:01<21:36:18,  5.56it/s]

Portugal --> LOC/ORG
___________________
Antarctica --> LOC
___________________


  0%|          | 10/432799 [00:02<35:38:29,  3.37it/s]

Internet --> OTHERS
___________________


  0%|          | 11/433417 [00:02<39:38:33,  3.04it/s]

Supercalifragilisticexpialidocious --> OTHERS
___________________


  0%|          | 12/433520 [00:02<41:45:22,  2.88it/s]

lion --> OTHERS
___________________


  0%|          | 14/429711 [00:03<45:11:11,  2.64it/s]

People's Republic of China --> LOC/ORG
___________________
Brazil --> LOC/ORG
___________________


  0%|          | 16/430691 [00:04<32:00:52,  3.74it/s]

Yorkshire --> LOC
___________________
pizza --> OTHERS
___________________


  0%|          | 18/425393 [00:04<33:20:34,  3.54it/s]

Germany --> LOC/ORG
___________________
George W. Bush --> PERS
___________________


  0%|          | 20/425355 [00:05<33:04:41,  3.57it/s]

Malta --> OTHERS
___________________
Talisker distillery --> OTHERS
___________________


  0%|          | 22/425339 [00:05<26:05:08,  4.53it/s]

Tours --> LOC
___________________
Diego Velázquez --> PERS
___________________


  0%|          | 24/425043 [00:06<23:34:07,  5.01it/s]

Chile --> LOC/ORG
___________________
dictatorship --> OTHERS
___________________


  0%|          | 25/425360 [00:06<23:17:24,  5.07it/s]

English Wikipedia --> OTHERS
___________________


  0%|          | 26/425655 [00:06<29:39:29,  3.99it/s]

Augusto Pinochet --> PERS
___________________


  0%|          | 28/425592 [00:07<28:10:05,  4.20it/s]

Bahrain --> LOC
___________________


  0%|          | 29/426168 [00:07<27:00:45,  4.38it/s]

astrobiology --> OTHERS
___________________
Pioneer plaque --> OTHERS
___________________


  0%|          | 31/427211 [00:07<23:17:45,  5.09it/s]

zoology --> OTHERS
___________________
Gmina Kurów --> LOC/ORG
___________________


  0%|          | 33/428192 [00:08<27:28:39,  4.33it/s]

Encyclopédie --> OTHERS
___________________
Rhône-Alpes --> LOC/ORG
___________________


  0%|          | 34/428242 [00:08<34:47:10,  3.42it/s]

Charles Baudelaire --> PERS
___________________


  0%|          | 36/429077 [00:09<33:22:48,  3.57it/s]

Museum of Fine Arts of Lyon --> LOC/ORG
___________________
anatomy --> OTHERS
___________________


  0%|          | 37/429528 [00:09<28:53:22,  4.13it/s]

Mount Vesuvius --> LOC
___________________


  0%|          | 39/430180 [00:10<30:30:45,  3.92it/s]

avenue des Champs-Élysées --> LOC
___________________
hydrogen --> OTHERS
___________________


  0%|          | 41/430286 [00:10<24:14:44,  4.93it/s]

Oslo --> LOC
___________________
Bonn --> LOC/ORG
___________________


  0%|          | 43/431061 [00:10<22:52:41,  5.23it/s]

All Saints' Day --> OTHERS
___________________
The Intouchables --> OTHERS
___________________


  0%|          | 45/431754 [00:10<19:46:42,  6.06it/s]

Lisbon --> LOC
___________________
Beaujolais wine --> OTHERS
___________________


  0%|          | 47/431687 [00:11<19:41:04,  6.09it/s]

Nicolaus Copernicus --> PERS
___________________
Neil Young --> PERS
___________________


  0%|          | 48/432024 [00:11<20:40:40,  5.80it/s]

planet --> OTHERS
___________________


  0%|          | 50/432584 [00:12<25:57:19,  4.63it/s]

Po --> LOC
___________________


  0%|          | 51/432726 [00:12<25:28:20,  4.72it/s]

Rennes --> LOC
___________________
Lille --> LOC
___________________


  0%|          | 53/432653 [00:13<38:08:49,  3.15it/s]

Vanuatu --> LOC/ORG
___________________
chlorine --> OTHERS
___________________


  0%|          | 55/433183 [00:13<31:05:23,  3.87it/s]

South Holland --> LOC/ORG
___________________
titanium --> OTHERS
___________________


  0%|          | 56/433507 [00:13<30:36:37,  3.93it/s]

vanadium --> OTHERS
___________________


  0%|          | 58/434068 [00:14<34:17:49,  3.52it/s]

Pierre Corneille --> PERS
___________________
Groningen --> LOC
___________________


  0%|          | 60/434463 [00:14<28:10:12,  4.28it/s]

fungus --> OTHERS
___________________
Massachusetts --> LOC/ORG
___________________


  0%|          | 62/433256 [00:15<23:28:32,  5.13it/s]

Israel --> LOC
___________________
Lausanne --> LOC
___________________


  0%|          | 64/433128 [00:15<20:58:56,  5.73it/s]

Gabriel Gonzáles Videla --> PERS
___________________
Laos --> LOC
___________________


  0%|          | 66/433797 [00:15<20:58:31,  5.74it/s]

James Bond --> OTHERS
___________________
Andrei Tarkovsky --> PERS
___________________


  0%|          | 68/433279 [00:16<19:30:39,  6.17it/s]

Plato --> PERS
___________________
Tajikistan --> LOC
___________________


  0%|          | 70/432291 [00:16<23:23:09,  5.13it/s]

Thailand --> LOC
___________________
Meryl Streep --> PERS
___________________


  0%|          | 72/431989 [00:17<23:29:09,  5.11it/s]

United Arab Emirates --> LOC
___________________
platinum --> OTHERS
___________________


  0%|          | 74/432231 [00:17<23:22:58,  5.13it/s]

Novosibirsk --> LOC
___________________
Nizhny Novgorod --> LOC/ORG
___________________


  0%|          | 76/432839 [00:17<22:56:01,  5.24it/s]

Omsk --> LOC
___________________
Suez Canal --> LOC
___________________


  0%|          | 77/433414 [00:18<29:57:40,  4.02it/s]

Erta Ale --> LOC
___________________


  0%|          | 79/432196 [00:18<31:39:45,  3.79it/s]

Mali --> LOC
___________________
Angola --> LOC
___________________


  0%|          | 81/433174 [00:19<27:41:11,  4.35it/s]

Brač --> LOC
___________________


  0%|          | 82/433346 [00:19<26:23:09,  4.56it/s]

yellow --> OTHERS
___________________
Donald Tusk --> PERS
___________________


  0%|          | 84/434432 [00:19<24:35:48,  4.91it/s]

toilet paper orientation --> OTHERS
___________________
Reggiolo --> LOC/ORG
___________________


  0%|          | 86/435454 [00:20<28:39:17,  4.22it/s]

More Than Life at Stake --> OTHERS
___________________
Hermann Brunner --> OTHERS
___________________


  0%|          | 88/436196 [00:20<23:03:28,  5.25it/s]

Warburg --> LOC/ORG
___________________
Ob --> LOC
___________________


  0%|          | 90/437186 [00:21<28:04:30,  4.32it/s]

FIS Alpine Ski World Cup --> OTHERS
___________________
Czterej pancerni i pies --> OTHERS
___________________


  0%|          | 92/436790 [00:21<24:26:05,  4.96it/s]

Andrei Sakharov --> PERS
___________________
Sierra Leone --> LOC
___________________


  0%|          | 94/436518 [00:21<22:00:09,  5.51it/s]

Sudan --> LOC/ORG
___________________
Italo Balbo --> PERS
___________________


  0%|          | 96/436638 [00:22<27:03:08,  4.48it/s]

Narendra Modi --> PERS
___________________
geography --> OTHERS
___________________


  0%|          | 98/437275 [00:22<24:15:58,  5.00it/s]

Star Trek --> OTHERS
___________________
Limburg --> LOC/ORG
___________________


  0%|          | 100/438069 [00:23<23:09:55,  5.25it/s]

antimony --> OTHERS
___________________
unbinilium --> OTHERS
___________________


  0%|          | 101/438199 [00:23<25:18:20,  4.81it/s]

Indira Gandhi --> PERS
___________________


  0%|          | 102/438234 [00:23<25:26:29,  4.78it/s]

Hector Berlioz --> PERS
___________________


  0%|          | 103/438800 [00:23<31:55:55,  3.82it/s]

Groß Borstel --> LOC/ORG
___________________


  0%|          | 105/438842 [00:24<32:50:18,  3.71it/s]

Puerto Rico --> LOC
___________________
Saarland --> LOC/ORG
___________________


  0%|          | 107/439063 [00:24<27:48:24,  4.38it/s]

Illinois --> LOC/ORG
___________________
Oscar Luigi Scalfaro --> PERS
___________________


  0%|          | 109/440067 [00:25<25:39:35,  4.76it/s]

dubnium --> OTHERS
___________________
Cottian Alps --> LOC
___________________


  0%|          | 110/440291 [00:25<24:42:12,  4.95it/s]

Ban Ki-moon --> PERS
___________________


  0%|          | 112/440786 [00:26<26:43:46,  4.58it/s]

Kofi Annan --> PERS
___________________
meitnerium --> OTHERS
___________________


  0%|          | 114/441123 [00:26<30:41:55,  3.99it/s]

Pennine Alps --> LOC
___________________
Leonard Cohen --> PERS
___________________


  0%|          | 115/441674 [00:26<28:52:49,  4.25it/s]

Rhaetian Alps --> LOC
___________________


  0%|          | 117/441816 [00:27<30:02:28,  4.08it/s]

Chicago --> LOC
___________________
nihonium --> OTHERS
___________________


  0%|          | 119/442040 [00:27<25:11:24,  4.87it/s]

Osama bin Laden --> PERS
___________________
Friedrich Hayek --> PERS
___________________


  0%|          | 121/442892 [00:28<23:08:23,  5.32it/s]

José Joaquín Prieto --> PERS
___________________
Arachnida --> OTHERS
___________________


  0%|          | 123/443803 [00:28<21:22:04,  5.77it/s]

Tripura --> LOC/ORG
___________________
Dave Arneson --> PERS
___________________


  0%|          | 125/444582 [00:28<23:23:46,  5.28it/s]

Uetersen --> LOC/ORG
___________________
Otho --> PERS
___________________


  0%|          | 127/445362 [00:29<24:02:43,  5.14it/s]

Titus --> PERS
___________________


  0%|          | 128/445792 [00:29<22:32:05,  5.49it/s]

field hockey --> OTHERS
___________________
Eschwege --> LOC/ORG
___________________


  0%|          | 130/446625 [00:29<24:06:33,  5.14it/s]

Loire --> LOC
___________________
hacker --> OTHERS
___________________


  0%|          | 131/446650 [00:29<24:47:10,  5.00it/s]

Mexico City --> LOC
___________________


  0%|          | 133/447063 [00:30<24:25:34,  5.08it/s]

Mississippi --> LOC/ORG
___________________
Uttarakhand --> LOC/ORG
___________________


  0%|          | 134/447530 [00:30<25:15:21,  4.92it/s]

Cublize --> LOC/ORG
___________________


  0%|          | 136/447863 [00:30<24:53:34,  5.00it/s]

Richard Wagner --> PERS
___________________
Nagpur --> LOC
___________________


  0%|          | 137/448368 [00:31<22:53:01,  5.44it/s]

Lyon Cathedral --> LOC
___________________


  0%|          | 138/448409 [00:31<25:33:48,  4.87it/s]

New Mexico --> LOC/ORG
___________________


  0%|          | 140/449168 [00:31<28:42:15,  4.35it/s]

Ramesses I --> PERS
___________________
Caracas --> LOC
___________________


  0%|          | 142/450214 [00:32<26:00:58,  4.81it/s]

Finkenwerder --> LOC/ORG
___________________
Billstedt --> LOC/ORG
___________________


  0%|          | 144/451238 [00:32<24:58:30,  5.02it/s]

Avast Antivirus --> OTHERS
___________________
Blankenese --> LOC/ORG
___________________


  0%|          | 146/452025 [00:33<23:25:21,  5.36it/s]

Eimsbüttel --> LOC/ORG
___________________
Valenzuela --> LOC
___________________


  0%|          | 148/452801 [00:33<23:01:45,  5.46it/s]

Antipolo --> LOC
___________________
Primo Nebiolo --> PERS
___________________


  0%|          | 149/453326 [00:33<23:30:39,  5.35it/s]

Bergstedt --> LOC/ORG
___________________


  0%|          | 151/453639 [00:33<24:05:59,  5.23it/s]

Elbe --> LOC
___________________
Dejen Gebremeskel --> PERS
___________________


  0%|          | 153/454599 [00:34<22:17:51,  5.66it/s]

Neckar --> LOC
___________________
Abel Mutai --> PERS
___________________


  0%|          | 154/454831 [00:34<21:46:05,  5.80it/s]

North Sea --> LOC
___________________


  0%|          | 156/455352 [00:34<22:43:27,  5.56it/s]

Tabaco --> LOC
___________________
Kiel --> LOC
___________________


  0%|          | 157/455480 [00:35<24:22:27,  5.19it/s]

Potsdam --> LOC
___________________


  0%|          | 158/455990 [00:35<26:16:44,  4.82it/s]

Erick Barrondo --> PERS
___________________


  0%|          | 160/456063 [00:35<25:17:16,  5.01it/s]

Düsseldorf --> LOC
___________________
Wiesbaden --> LOC/ORG
___________________


  0%|          | 162/456770 [00:36<23:43:29,  5.35it/s]

Saarbrücken --> LOC
___________________
Robert Grabarz --> PERS
___________________


  0%|          | 163/457262 [00:36<22:55:32,  5.54it/s]

Samara Governorate --> LOC
___________________


  0%|          | 164/457547 [00:36<26:14:32,  4.84it/s]

Kaliningrad Oblast --> LOC
___________________


  0%|          | 166/457872 [00:36<25:53:19,  4.91it/s]

Tomasz Majewski --> PERS
___________________
Helsinki --> LOC
___________________


  0%|          | 168/458684 [00:37<26:39:54,  4.78it/s]

Murmansk Oblast --> LOC
___________________
Ashton Eaton --> PERS
___________________


  0%|          | 170/459616 [00:37<26:06:49,  4.89it/s]

Eilbek --> LOC/ORG
___________________
Gordian III --> PERS
___________________


  0%|          | 172/460659 [00:38<25:39:11,  4.99it/s]

Poppenbüttel --> LOC/ORG
___________________
Rahlstedt --> LOC/ORG
___________________


  0%|          | 173/460838 [00:38<25:48:24,  4.96it/s]

gadolinium --> OTHERS
___________________


  0%|          | 175/461011 [00:38<26:34:13,  4.82it/s]

Wohldorf-Ohlstedt --> LOC/ORG
___________________
English --> OTHERS
___________________


  0%|          | 176/461428 [00:38<27:55:35,  4.59it/s]

Bangka Belitung Islands --> LOC
___________________


  0%|          | 178/462389 [00:39<27:02:49,  4.75it/s]

Diplomacy --> OTHERS
___________________
Bengkulu --> LOC
___________________


  0%|          | 179/462720 [00:39<26:29:39,  4.85it/s]

Olivier Giroud --> PERS
___________________


  0%|          | 181/463590 [00:40<28:20:12,  4.54it/s]

Yann M'Vila --> PERS
___________________
Sidney Govou --> PERS
___________________


  0%|          | 183/464316 [00:40<26:21:33,  4.89it/s]

Djibril Cissé --> PERS
___________________
Bacary Sagna --> PERS
___________________


  0%|          | 184/464775 [00:40<35:34:16,  3.63it/s]

Hermann Maier --> PERS
___________________


  0%|          | 185/465174 [00:41<35:26:21,  3.64it/s]

Givors --> LOC/ORG
___________________


  0%|          | 186/465624 [00:41<33:22:01,  3.87it/s]

Rivolet --> LOC/ORG
___________________


  0%|          | 187/466077 [00:41<31:58:36,  4.05it/s]

2002 --> OTHERS
___________________


  0%|          | 189/466683 [00:42<28:13:50,  4.59it/s]

2009 --> OTHERS
___________________
Newfoundland and Labrador --> LOC/ORG
___________________


  0%|          | 191/467642 [00:42<25:12:12,  5.15it/s]

Gare de Lyon-Vaise --> OTHERS
___________________
2005 --> OTHERS
___________________


  0%|          | 193/468646 [00:42<26:27:32,  4.92it/s]

Chambost-Allières --> LOC/ORG
___________________
history of Lyon --> OTHERS
___________________


  0%|          | 194/469107 [00:43<42:10:27,  3.09it/s]

1907 --> OTHERS
___________________


  0%|          | 196/469900 [00:43<33:29:13,  3.90it/s]

1906 --> OTHERS
___________________
ACF Fiorentina --> ORG
___________________


  0%|          | 198/470885 [00:44<27:17:38,  4.79it/s]

1909 --> OTHERS
___________________
Gare de Lyon-Gorge-de-Loup --> OTHERS
___________________


  0%|          | 200/471784 [00:44<25:43:19,  5.09it/s]

1993 --> OTHERS
___________________
1996 --> OTHERS
___________________


  0%|          | 202/472612 [00:44<27:14:59,  4.82it/s]

Komi Republic --> LOC/ORG
___________________
1912 --> OTHERS
___________________


  0%|          | 204/473163 [00:45<26:19:33,  4.99it/s]

1999 --> OTHERS
___________________
Edmonton --> LOC
___________________


  0%|          | 206/473200 [00:45<25:59:19,  5.06it/s]

Duisburg --> LOC/ORG
___________________
Jacques Chirac --> PERS
___________________


  0%|          | 207/473652 [00:45<27:05:43,  4.85it/s]

Arica y Parinacota Region --> LOC
___________________


  0%|          | 208/474096 [00:46<29:10:29,  4.51it/s]

Tarapacá Region --> LOC
___________________


  0%|          | 209/474037 [00:46<35:37:48,  3.69it/s]

Mannheim --> LOC/ORG
___________________


  0%|          | 210/474325 [00:46<38:28:45,  3.42it/s]

Victoria --> LOC
___________________


  0%|          | 211/474777 [00:47<43:24:15,  3.04it/s]

Totma --> LOC
___________________


  0%|          | 212/475147 [00:47<44:36:53,  2.96it/s]

Goku --> OTHERS
___________________


  0%|          | 213/475685 [00:48<47:08:09,  2.80it/s]

Government of the Soviet Union --> ORG
___________________


  0%|          | 214/476086 [00:48<58:41:56,  2.25it/s]

January 2 --> OTHERS
___________________


  0%|          | 215/476538 [00:48<51:27:24,  2.57it/s]

1922 --> OTHERS
___________________


  0%|          | 217/477095 [00:49<38:19:04,  3.46it/s]

Naryan-Mar --> LOC
___________________
Georges Pompidou --> PERS
___________________


  0%|          | 218/477416 [00:49<33:27:53,  3.96it/s]

Coluche --> PERS
___________________


  0%|          | 219/477864 [00:49<32:42:32,  4.06it/s]

GIF --> OTHERS
___________________


  0%|          | 220/478167 [00:50<31:21:17,  4.23it/s]

Iriga --> LOC
___________________


  0%|          | 222/478970 [00:50<44:43:48,  2.97it/s]

Naga --> LOC
___________________
Alto Hospicio --> ORG
___________________


  0%|          | 223/479265 [00:51<46:16:43,  2.88it/s]

Nissedal --> LOC/ORG
___________________


  0%|          | 224/479660 [00:51<56:02:47,  2.38it/s]

Langenzenn --> LOC/ORG
___________________


  0%|          | 225/479942 [00:52<56:27:43,  2.36it/s]

Seljord --> LOC/ORG
___________________


  0%|          | 226/480351 [00:53<75:11:28,  1.77it/s]

January 9 --> OTHERS
___________________


  0%|          | 227/480868 [00:53<74:05:38,  1.80it/s]

Kongens Lyngby --> LOC
___________________


  0%|          | 228/481163 [00:53<66:16:58,  2.02it/s]

Tokke Municipality --> LOC/ORG
___________________


  0%|          | 229/481568 [00:54<61:11:50,  2.18it/s]

January 10 --> OTHERS
___________________


  0%|          | 230/481976 [00:54<58:57:30,  2.27it/s]

January 13 --> OTHERS
___________________


  0%|          | 231/482386 [00:55<58:21:14,  2.30it/s]

January 17 --> OTHERS
___________________


  0%|          | 232/482586 [00:55<52:42:07,  2.54it/s]

Skien --> LOC/ORG
___________________


  0%|          | 233/482995 [00:56<61:52:36,  2.17it/s]

January 23 --> OTHERS
___________________


  0%|          | 234/483420 [00:56<62:11:01,  2.16it/s]

Sauron --> OTHERS
___________________


  0%|          | 235/483553 [00:57<61:09:31,  2.20it/s]

proton --> OTHERS
___________________


  0%|          | 236/483971 [00:57<61:13:03,  2.19it/s]

December 8 --> OTHERS
___________________


  0%|          | 238/484784 [00:58<52:19:42,  2.57it/s]

February 5 --> OTHERS
___________________
February 12 --> OTHERS
___________________


  0%|          | 239/484901 [00:58<51:26:10,  2.62it/s]

12 Angry Men --> OTHERS
___________________


  0%|          | 240/485306 [00:58<53:17:56,  2.53it/s]

February 28 --> OTHERS
___________________


  0%|          | 241/485823 [00:59<60:54:20,  2.21it/s]

Altengamme --> LOC/ORG
___________________


  0%|          | 242/486243 [00:59<58:00:12,  2.33it/s]

March 4 --> OTHERS
___________________


  0%|          | 243/486660 [01:00<52:30:12,  2.57it/s]

March 10 --> OTHERS
___________________


  0%|          | 244/487194 [01:00<46:09:25,  2.93it/s]

¿Dónde Están Corazón? --> OTHERS
___________________


  0%|          | 245/487718 [01:00<41:22:29,  3.27it/s]

Reitbrook --> LOC/ORG
___________________


  0%|          | 246/487982 [01:00<38:25:42,  3.53it/s]

Yaroslavl --> LOC
___________________


  0%|          | 247/488397 [01:01<38:57:00,  3.48it/s]

March 19 --> OTHERS
___________________


  0%|          | 248/488810 [01:01<39:43:03,  3.42it/s]

March 22 --> OTHERS
___________________


  0%|          | 249/489283 [01:01<39:41:20,  3.42it/s]

Heroes of Might and Magic V: Hammers of Fate --> OTHERS
___________________


  0%|          | 250/489594 [01:01<37:08:36,  3.66it/s]

Sanaa --> LOC
___________________


  0%|          | 251/490027 [01:02<37:03:54,  3.67it/s]

1970 --> OTHERS
___________________


  0%|          | 252/490457 [01:02<36:52:54,  3.69it/s]

1971 --> OTHERS
___________________


  0%|          | 253/490892 [01:02<36:38:11,  3.72it/s]

1973 --> OTHERS
___________________


  0%|          | 254/491240 [01:03<39:23:05,  3.46it/s]

Thirty Years' War --> OTHERS
___________________


  0%|          | 255/491354 [01:03<38:45:18,  3.52it/s]

Ludwig Erhard --> PERS
___________________


  0%|          | 256/491768 [01:03<39:16:32,  3.48it/s]

April 13 --> OTHERS
___________________


  0%|          | 257/492298 [01:04<45:03:12,  3.03it/s]

Kick-Ass --> OTHERS
___________________


KeyboardInterrupt: 

In [None]:
################################
###  WITH SINGLE CLUSTERING
################################

wikidata_dump_path = './my-data/latest-all.json.bz2'
initial_estimated_average_size = 800
BATCH_SIZE = 100 # Number of entities to insert in a single batch
compressed_file_size = os.path.getsize(wikidata_dump_path)
initial_total_lines_estimate = compressed_file_size / initial_estimated_average_size

DATATYPES_MAPPINGS = {
    'external-id': 'STRING',
    'quantity': 'NUMBER',
    'globe-coordinate': 'STRING',
    'string': 'STRING',
    'monolingualtext': 'STRING',
    'commonsMedia': 'STRING',
    'time': 'DATETIME',
    'url': 'STRING',
    'geo-shape': 'GEOSHAPE',
    'math': 'MATH',
    'musical-notation': 'MUSICAL_NOTATION',
    'tabular-data': 'TABULAR_DATA'
}
DATATYPES = list(set(DATATYPES_MAPPINGS.values()))

buffer = {
    "items": [],
    "objects": [], 
    "literals": [], 
    "types": []
}

def check_skip(obj, datatype):
    temp = obj.get("mainsnak", obj)
    if "datavalue" not in temp:
        return True

    skip = {
        "wikibase-lexeme",
        "wikibase-form",
        "wikibase-sense"
    }

    return datatype in skip


def get_value(obj, datatype):
    temp = obj.get("mainsnak", obj)
    if datatype == "globe-coordinate":
        latitude = temp["datavalue"]["value"]["latitude"]
        longitude = temp["datavalue"]["value"]["longitude"]
        value = f"{latitude},{longitude}"
    else:
        keys = {
            "quantity": "amount",
            "monolingualtext": "text",
            "time": "time",
        }
        if datatype in keys:
            key = keys[datatype]
            value = temp["datavalue"]["value"][key]
        else:
            value = temp["datavalue"]["value"]
    return value

global initial_total_lines_estimate

try:
    organization_subclass = get_wikidata_item_tree_item_idsSPARQL([43229], backward_properties=[279])
    #print(len(organization_subclass))
except json.decoder.JSONDecodeError:
    pass

try:
    country_subclass = get_wikidata_item_tree_item_idsSPARQL([6256], backward_properties=[279])
except json.decoder.JSONDecodeError:
    country_subclass = set()
    pass

try:
    city_subclass = get_wikidata_item_tree_item_idsSPARQL([515], backward_properties=[279])
except json.decoder.JSONDecodeError:
    city_subclass = set()
    pass

try:
    capitals_subclass = get_wikidata_item_tree_item_idsSPARQL([5119], backward_properties=[279])
except json.decoder.JSONDecodeError:
    capitals_subclass = set()
    pass

try:
    admTerr_subclass = get_wikidata_item_tree_item_idsSPARQL([15916867], backward_properties=[279])
except json.decoder.JSONDecodeError:
    admTerr_subclass = set()
    pass

try:
    family_subclass = get_wikidata_item_tree_item_idsSPARQL([17350442], backward_properties=[279])
except json.decoder.JSONDecodeError:
    family_subclass = set()
    pass

try:
    sportLeague_subclass = get_wikidata_item_tree_item_idsSPARQL([623109], backward_properties=[279])
except json.decoder.JSONDecodeError:
    sportLeague_subclass = set()
    pass

try:
    venue_subclass = get_wikidata_item_tree_item_idsSPARQL([8436], backward_properties=[279])
except json.decoder.JSONDecodeError:
    venue_subclass = set()
    pass
    
try:
    organization_subclass = list(set(organization_subclass) - set(country_subclass) - set(city_subclass) - set(capitals_subclass) - set(admTerr_subclass) - set(family_subclass) - set(sportLeague_subclass) - set(venue_subclass))
    #print(len(organization_subclass))
except json.decoder.JSONDecodeError:
    pass


try:
    geolocation_subclass = get_wikidata_item_tree_item_idsSPARQL([2221906], backward_properties=[279])
    #print(len(geolocation_subclass))
except json.decoder.JSONDecodeError:
    pass

try:
    food_subclass = get_wikidata_item_tree_item_idsSPARQL([2095], backward_properties=[279])
except json.decoder.JSONDecodeError:
    food_subclass = set()
    pass

try:
    edInst_subclass = get_wikidata_item_tree_item_idsSPARQL([2385804], backward_properties=[279])
except json.decoder.JSONDecodeError:
    edInst_subclass = set()
    pass

try:
    govAgency_subclass = get_wikidata_item_tree_item_idsSPARQL([327333], backward_properties=[279])
except json.decoder.JSONDecodeError:
    govAgency_subclass = set()
    pass

try:
    intOrg_subclass = get_wikidata_item_tree_item_idsSPARQL([484652], backward_properties=[279])
except json.decoder.JSONDecodeError:
    intOrg_subclass = set()
    pass

try:
    timeZone_subclass = get_wikidata_item_tree_item_idsSPARQL([12143], backward_properties=[279])
except json.decoder.JSONDecodeError:
    timeZone_subclass = set()
    pass
   
try:
    geolocation_subclass = list(set(geolocation_subclass) - set(food_subclass) - set(edInst_subclass) - set(govAgency_subclass) - set(intOrg_subclass) - set(timeZone_subclass))
    #print(len(geolocation_subclass))
except json.decoder.JSONDecodeError:
    pass

with bz2.open(wikidata_dump_path, 'rt', encoding='utf-8') as f:
    count = 1000
    
    ORG = []
    PERS = []
    LOC = []
    OTHERS = []

    pbar = tqdm(total=initial_total_lines_estimate)
    for i, line in enumerate(f):
        try:
            # Parse JSON data from each line
            item = json.loads(line[:-2])

            entity = item['id']
            labels = item.get("labels", {})
            english_label = labels.get("en", {}).get("value", "")
            aliases = item.get("aliases", {})
            description = item.get('descriptions', {}).get('en', {})
            category = "entity"
            sitelinks = item.get("sitelinks", {})
            popularity = len(sitelinks) if len(sitelinks) > 0 else 1

            
            if entity in list(mapping.values()):
                all_labels = {}
                for lang in labels:
                    all_labels[lang] = labels[lang]["value"]
            
                all_aliases = {}
                for lang in aliases:
                    all_aliases[lang] = []
                    for alias in aliases[lang]:
                        all_aliases[lang].append(alias["value"])
                    all_aliases[lang] = list(set(all_aliases[lang]))
            
                found = False
                for predicate in item["claims"]:
                    if predicate == "P279":
                        found = True
            
                if found:
                    category = "type"
                if entity[0] == "P":
                    category = "predicate"
        
                line_size = len(line)
                current_average_size = update_average_size(line_size)
                pbar.total = round(compressed_file_size / current_average_size)
                pbar.update(1)
    
                ###############################################################
                # ORGANIZATION EXTRACTION
                # All items with the root class Organization (Q43229) excluding country (Q6256), city (Q515), capitals (Q5119), 
                # administrative territorial entity of a single country (Q15916867), venue (Q17350442), sports league (Q623109) 
                # and family (Q8436)
                
                # LOCATION EXTRACTION
                # All items with the root class Geographic Location (Q2221906) excluding: food (Q2095), educational institution (Q2385804), 
                # government agency (Q327333), international organization (Q484652) and time zone (Q12143)
                
                # PERSON EXTRACTION
                # All items with the statement is instance of (P31) human (Q5) are classiﬁed as person.
    
                NERtype = None
    
                if item.get("type") == "item" and "claims" in item:
                    p31_claims = item["claims"].get("P31", [])
                    
                    if len(p31_claims) != 0:           
                        for claim in p31_claims:
                            mainsnak = claim.get("mainsnak", {})
                            datavalue = mainsnak.get("datavalue", {})
                            numeric_id = datavalue.get("value", {}).get("numeric-id")
                            
                            if numeric_id == 5:
                                NERtype = "PERS" 
                            elif numeric_id in geolocation_subclass or any(k.lower() in description.get('value', '').lower() for k in ["district", "city", "country", "capital"]):
                                NERtype = "LOC"
                            elif numeric_id in organization_subclass:
                                NERtype = "ORG"  
                            else:
                                NERtype = "OTHERS"
                    else:
                        NERtype = "OTHERS" 
                        
                ################################################################   
                ################################################################   
                # URL EXTRACTION
            
                try:
                    lang = labels.get("en", {}).get("language", "")
                    tmp={}
                    tmp["WD_id"] = item['id']
                    tmp["WP_id"] = labels.get("en", {}).get("value", "")
            
                    url_dict={}
                    url_dict["wikidata"] = "http://www.wikidata.org/wiki/"+tmp["WD_id"]
                    url_dict["wikipedia"] = "http://"+lang+".wikipedia.org/wiki/"+tmp["WP_id"].replace(" ","_")
                    url_dict["dbpedia"] = "http://dbpedia.org/resource/"+tmp["WP_id"].capitalize().replace(" ","_")
                    
            
                except json.decoder.JSONDecodeError:
                   pass
                
                ################################################################    
        
                objects = {}
                literals = {datatype: {} for datatype in DATATYPES}
                types = {"P31": []}
                join = {
                    "items": {
                        "id_entity": i,
                        "entity": entity,
                        "description": description,
                        "labels": all_labels,
                        "aliases": all_aliases,
                        "types": types,
                        "popularity": popularity,
                        "kind": category,   # kind (entity, type or predicate, disambiguation or category)
                        ######################
                        # new updates
                        "NERtype": NERtype, # (ORG, LOC, PER or OTHERS)
                        "URLs" : url_dict
                        ######################
                    },
                    "objects": { 
                        "id_entity": i,
                        "entity": entity,
                        "objects":objects
                    },
                    "literals": { 
                        "id_entity": i,
                        "entity": entity,
                        "literals": literals
                    },
                    "types": { 
                        "id_entity": i,
                        "entity": entity,
                        "types": types
                    },
                }
            
                predicates = item["claims"]
                for predicate in predicates:
                    for obj in predicates[predicate]:
                        datatype = obj["mainsnak"]["datatype"]
            
                        if check_skip(obj, datatype):
                            continue
            
                        if datatype == "wikibase-item" or datatype == "wikibase-property":
                            value = obj["mainsnak"]["datavalue"]["value"]["id"]
            
                            if predicate == "P31" or predicate == "P106":
                                types["P31"].append(value)
            
                            if value not in objects:
                                objects[value] = []
                            objects[value].append(predicate)    
                        else:
                            value = get_value(obj, datatype)                
                            lit = literals[DATATYPES_MAPPINGS[datatype]]
            
                            if predicate not in lit:
                                lit[predicate] = []
                            lit[predicate].append(value)   
            
                 
            
                for key in buffer:
                    buffer[key].append(join[key])            
            
                if len(buffer["items"]) == BATCH_SIZE:
                    flush_buffer(buffer)
    
        except json.decoder.JSONDecodeError:
            continue
    pbar.close()

  1%|          | 25608/2214523 [1:58:22<308:07:32,  1.97it/s] 

In [None]:
json_file_path = "./yago_wiki_classification.json"

data = {
    "ORG": ORG,
    "LOC": LOC,
    "PERS": PERS,
    "OTHERS": OTHERS
}

# Write the categorized data to a JSON file
try:
    with open(json_file_path, 'w') as json_file:
        json.dump(data, json_file, indent=4)
    print(f"Data saved successfully to {json_file_path}")
except Exception as e:
    print(f"Error saving data to JSON file: {e}")

In [None]:
logs = parser.parse_all_logs(log_dir="./")
first_log = logs[0]

print(f"Output file name: {first_log['output_filename']}")
print(f"Standard file name: {first_log['standard_filename']}")
print(f"Stopped early: {first_log['early_stop']}")
print(f"Measured consumption: {first_log['actual']}")
print(f"Predicted consumption: {first_log['pred']}")
print(f"Measured GPU devices: {first_log['components']['gpu']['devices']}")

In [None]:
total_length_PERS = len(PERS)
total_length_ORG = len(ORG)
total_length_LOC = len(LOC)
total_length_OTHERS = len(OTHERS)

# Print the total lengths
print("Total lengths:")
print(f"Length of PERS: {total_length_PERS}")
print(f"Length of ORG: {total_length_ORG}")
print(f"Length of LOC: {total_length_LOC}")
print(f"Length of OTHERS: {total_length_OTHERS}")

# Calculate the sum of lengths
total_length = total_length_PERS + total_length_ORG + total_length_LOC + total_length_OTHERS

# Print the sum of lengths
print(f"Total length: {total_length}")

In [None]:
for el in OTHERS:
    if el in PERS:
        print(f"PERS and ORG --> Entity ID: {PERS.index(el)}")
    if el in LOC:
        print(f"LOC and ORG --> Entity ID: {LOC.index(el)}")
    if el in ORG:
        print(f"OTHERS and ORG --> Entity ID: {ORG.index(el)}")

In [None]:
# Convert lists to sets for faster intersection operation
ORG_set = set(ORG)
PERS_set = set(PERS)
LOC_set = set(LOC)
OTHERS_set = set(OTHERS)

# Initialize counters for each set
ORG_counter = 0
PERS_counter = 0
LOC_counter = 0
OTHERS_counter = 0

# Find the overlapping items and update the counters
for item in ORG_set.union(PERS_set, LOC_set, OTHERS_set):
    num_overlaps = 0
    if item in ORG_set:
        print("item")
        num_overlaps += 1
    if item in PERS_set:
        num_overlaps += 1
    if item in LOC_set:
        num_overlaps += 1
    if item in OTHERS_set:
        num_overlaps += 1
    
    # Update the corresponding counter based on the number of overlaps
    if num_overlaps == 1:
        ORG_counter += 1
    elif num_overlaps == 2:
        PERS_counter += 1
    elif num_overlaps == 3:
        LOC_counter += 1
    elif num_overlaps == 4:
        OTHERS_counter += 1

# Print the counts for each set
print("Number of overlaps for each set:")
print(f"ORG: {ORG_counter}")
print(f"PERS: {PERS_counter}")
print(f"LOC: {LOC_counter}")
print(f"OTHERS: {OTHERS_counter}")

## URL Construction

In [None]:
#! /usr/bin/env python3
# This Python file uses the following encoding: utf-8

__author__ = 'jgeiss'


#############################################################################
# authors: Johanna Geiß, Heidelberg University, Germany                     #
# email: geiss@informatik.uni-heidelberg.de                                 #
# Copyright (c) 2017 Database Research Group,                               #
#               Institute of Computer Science,                              #
#               University of Heidelberg                                    #
#   Licensed under the Apache License, Version 2.0 (the "License");         #
#   you may not use this file except in compliance with the License.        #
#   You may obtain a copy of the License at                                 #
#                                                                           #
#   http://www.apache.org/licenses/LICENSE-2.0                              #
#                                                                           #
#   Unless required by applicable law or agreed to in writing, software     #
#   distributed under the License is distributed on an "AS IS" BASIS,       #
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.#
#   See the License for the specific language governing permissions and     #
#   limitations under the License.                                          #
#############################################################################
# last updated 21.3.2017 by Johanna Geiß

from pymongo import *
from pymongo import errors
import configparser



wikidata_dump_path = './my-data/latest-all.json.bz2'

with bz2.open(wikidata_dump_path, 'rt', encoding='utf-8') as f:
    count = 0
    
             
    for i, line in tqdm(enumerate(f), total=1000):
        if count == 10000:
            break
        try:
            count += 1
            # Parse JSON data from each line
            data = json.loads(line[:-2])
         
            labels = data.get("labels", {})
            lang = labels.get("en", {}).get("language", "")
            entry={}
            entry["WD_id"] = data['id']
            entry["WP_id"] = labels.get("en", {}).get("value", "")

            entry["WD_id_URL"] = "http://www.wikidata.org/wiki/"+entry["WD_id"]
            entry["WP_id_URL"] = "http://"+lang+".wikipedia.org/wiki/"+entry["WP_id"].replace(" ","_")
            entry["dbpedia_URL"] = "http://dbpedia.org/resource/"+entry["WP_id"].capitalize().replace(" ","_")
            
            print("------------------")
            print(entry["WD_id_URL"])
            print(entry["WP_id_URL"])
            print(entry["dbpedia_URL"])
            print("------------------")
    
        except json.decoder.JSONDecodeError:
            continue







In [None]:
from carbontracker import parser

logs = parser.parse_all_logs(log_dir="./")
print(logs)
first_log = logs[0]

print(f"Output file name: {first_log['output_filename']}")
print(f"Standard file name: {first_log['standard_filename']}")
print(f"Stopped early: {first_log['early_stop']}")
print(f"Measured consumption: {first_log['actual']}")
print(f"Predicted consumption: {first_log['pred']}")
print(f"Measured GPU devices: {first_log['components']['gpu']['devices']}")