In [1]:
! pip install SPARQLWrapper



In [2]:
import bz2
import json
from tqdm import tqdm
import traceback
import os
from pymongo import MongoClient
from pymongo import *
from pymongo import errors
import configparser
from json.decoder import JSONDecodeError
from collections import Counter
from SPARQLWrapper import SPARQLWrapper, JSON
import time
from requests import get

In [3]:
# MongoDB connection setup.
# MONGO_ENDPOINT is read as "host:port"; the credentials also come from the environment.
MONGO_ENDPOINT, MONGO_ENDPOINT_PORT = os.environ["MONGO_ENDPOINT"].split(":")
MONGO_ENDPOINT_PORT = int(MONGO_ENDPOINT_PORT)
MONGO_ENDPOINT_USERNAME = os.environ["MONGO_INITDB_ROOT_USERNAME"]
MONGO_ENDPOINT_PASSWORD = os.environ["MONGO_INITDB_ROOT_PASSWORD"]
DB_NAME = "wikidata"

client = MongoClient(MONGO_ENDPOINT, MONGO_ENDPOINT_PORT, username=MONGO_ENDPOINT_USERNAME, password=MONGO_ENDPOINT_PASSWORD)
print(client)

# Every collection is resolved through DB_NAME so that renaming the database
# only requires touching one constant (log_c previously hard-coded "wikidata").
db = client[DB_NAME]
log_c = db.log
items_c = db.items
objects_c = db.objects
literals_c = db.literals
types_c = db.types

# Maps the buffer keys used by flush_buffer() to their target collections.
c_ref = {
    "items": items_c,
    "objects": objects_c,
    "literals": literals_c,
    "types": types_c
}

MongoClient(host=['mongo:27017'], document_class=dict, tz_aware=False, connect=True)


In [4]:
def flush_buffer(buffer):
    """Write every non-empty pending batch to its collection and reset it.

    :param buffer: dict mapping a collection key of ``c_ref`` to the list of
        documents waiting to be inserted. Emptied lists are replaced by a
        fresh ``[]`` in place; empty ones are left untouched.
    """
    for name, pending in buffer.items():
        if not len(pending) > 0:
            continue
        c_ref[name].insert_many(pending)
        # Rebinding the value of an existing key is safe while iterating.
        buffer[name] = []

def _build_item_tree_query(root_items, forward_properties=None, backward_properties=None):
    """Build the SPARQL query for the item tree, or return None when there is nothing to query."""
    roots = ['wd:Q%s' % root for root in root_items]

    # Forward properties take precedence over backward ones, as before.
    if forward_properties:
        properties, pattern = forward_properties, '?tree0 (%s)* ?WD_id .'
    elif backward_properties:
        properties, pattern = backward_properties, '?WD_id (%s)* ?tree0 .'
    else:
        return None
    if not roots:
        return None

    # Alternatives in a property path are separated by "|", not ",".
    pattern = pattern % '|'.join('wdt:P%s' % prop for prop in properties)

    if len(roots) == 1:
        # Single root: inline the constant so the endpoint sees a bound term.
        where = pattern.replace('?tree0', roots[0])
    else:
        # Several roots: bind them through VALUES.
        where = 'VALUES ?tree0 { %s }\n  %s' % (' '.join(roots), pattern)

    return '\n'.join([
        'PREFIX wikibase: <http://wikiba.se/ontology#>',
        'PREFIX wd: <http://www.wikidata.org/entity/>',
        'PREFIX wdt: <http://www.wikidata.org/prop/direct/>',
        'PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>',
        'SELECT ?WD_id WHERE {',
        '  ' + where,
        '}',
    ])


def get_wikidata_item_tree_item_idsSPARQL(root_items, forward_properties=None, backward_properties=None):
    """Return ids of WikiData items, which are in the tree spanned by the given root items and claims relating them
        to other items.

    :param root_items: iterable[int] One or multiple item entities that are the root elements of the tree
    :param forward_properties: iterable[int] | None property-claims to follow forward; that is, if root item R has
        a claim P:I, and P is in the list, the search will branch recursively to item I as well.
    :param backward_properties: iterable[int] | None property-claims to follow in reverse; that is, if (for a root
        item R) an item I has a claim P:R, and P is in the list, the search will branch recursively to item I as well.
        Only used when ``forward_properties`` is empty or None.
    :return: list[int]: ids of WikiData items in the tree; an empty list when no root item or no property is
        given (no request is sent in that case).
    :raises json.JSONDecodeError: when the endpoint answers with something that is not JSON.
    """
    query = _build_item_tree_query(root_items, forward_properties, backward_properties)
    if query is None:
        return []

    url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
    data = get(url, params={'query': query, 'format': 'json'}).json()

    ids = []
    for binding in data['results']['bindings']:
        # The value is an entity URI such as http://www.wikidata.org/entity/Q5.
        local_name = binding["WD_id"]["value"].split("/")[-1]
        try:
            ids.append(int(local_name.lstrip("Q")))
        except ValueError:
            # Not a plain Q-item (e.g. a lexeme or property URI): ignore it.
            continue
    return ids

In [5]:
# Location of the mapping file; set DEF_MAPPING_PATH to override the original default.
json_file_path = os.environ.get(
    "DEF_MAPPING_PATH",
    "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/def_mapping.json",
)

# Start from an empty mapping so that later cells see a defined (if empty)
# `mapping` instead of raising NameError when the file cannot be loaded.
mapping = {}

try:
    # JSON text is UTF-8 by specification; do not rely on the platform default encoding.
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        mapping = json.load(json_file)

except FileNotFoundError:
    print(f"Error: File '{json_file_path}' not found.")
except json.JSONDecodeError as e:
    print(f"Error decoding JSON data: {e}")
except Exception as e:
    print(f"Error loading data from JSON file: {e}")

In [6]:
total_size_processed = 0
num_entities_processed = 0

def update_average_size(new_size):
    global total_size_processed, num_entities_processed
    total_size_processed += new_size
    num_entities_processed += 1
    return total_size_processed / num_entities_processed



In [9]:
def retrieve_superclasses(entity_id):
    """
    Retrieve all superclasses of a given Wikidata entity ID.

    The P279 ("subclass of") chain is followed transitively; because the
    property path is zero-or-more, the entity itself is part of the result.

    Args:
        entity_id (str): The ID of the entity (e.g., "Q207784").

    Returns:
        list[str]: Distinct superclass IDs in the order the endpoint returned
        them. An empty list when ``entity_id`` is not of the form ``Q<digits>``
        or when the query failed.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    # Reject malformed ids (e.g. "QNone") before spending a request on them;
    # this also keeps arbitrary text out of the query string.
    if not isinstance(entity_id, str) or re.fullmatch(r"Q[0-9]+", entity_id) is None:
        print(f"Skipping invalid entity id: {entity_id!r}")
        return []

    # Define the SPARQL endpoint and query. Labels are not requested: only the
    # IDs are returned, and de-duplicating by label used to merge distinct
    # superclasses that happen to share a label.
    endpoint_url = "https://query.wikidata.org/sparql"
    query = f"""
    SELECT DISTINCT ?superclass WHERE {{
      wd:{entity_id} (wdt:P279)* ?superclass.
    }}
    """

    # Function to query the SPARQL endpoint with retries
    def query_wikidata(sparql_client, query, retries=3, delay=5):
        """Run the query; retry on HTTP 429, give up immediately on any other error."""
        for attempt in range(retries):
            try:
                sparql_client.setQuery(query)
                sparql_client.setReturnFormat(JSON)
                results = sparql_client.query().convert()
                return results
            except Exception as e:
                if "429" in str(e):  # Handle Too Many Requests error
                    print(f"Rate limit hit. Retrying in {delay} seconds... (Attempt {attempt + 1}/{retries})")
                    time.sleep(delay)
                else:
                    print(f"An error occurred: {e}")
                    break
        return None

    # Set up the SPARQL client
    sparql = SPARQLWrapper(endpoint_url)

    # Execute the query with retries
    results = query_wikidata(sparql, query)

    if not results:
        print("Failed to retrieve data after multiple attempts.")
        # Same (list) type as the success path, so callers can always extend/len it.
        return []

    # A dict is used as an insertion-ordered set of IDs.
    superclass_ids = {}
    for result in results["results"]["bindings"]:
        superclass_id = result["superclass"]["value"].split("/")[-1]  # Extract entity ID from the URI
        superclass_ids.setdefault(superclass_id, None)
    return list(superclass_ids)


In [10]:
# Compressed Wikidata JSON dump; set WIKIDATA_DUMP_PATH to override the original default.
wikidata_dump_path = os.environ.get(
    "WIKIDATA_DUMP_PATH",
    'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/latest-all.json.bz2',
)
# Initial guess used to size the progress bar before any line has been measured.
initial_estimated_average_size = 800
BATCH_SIZE = 100 # Number of entities to insert in a single batch
compressed_file_size = os.path.getsize(wikidata_dump_path)
initial_total_lines_estimate = compressed_file_size / initial_estimated_average_size

# Wikidata snak datatype -> literal bucket under which its values are stored.
DATATYPES_MAPPINGS = dict([
    ('external-id', 'STRING'),
    ('quantity', 'NUMBER'),
    ('globe-coordinate', 'STRING'),
    ('string', 'STRING'),
    ('monolingualtext', 'STRING'),
    ('commonsMedia', 'STRING'),
    ('time', 'DATETIME'),
    ('url', 'STRING'),
    ('geo-shape', 'GEOSHAPE'),
    ('math', 'MATH'),
    ('musical-notation', 'MUSICAL_NOTATION'),
    ('tabular-data', 'TABULAR_DATA'),
])
# Distinct literal buckets (order is not significant).
DATATYPES = list({*DATATYPES_MAPPINGS.values()})

# Pending documents per collection key, filled during the dump scan.
buffer = {name: [] for name in ("items", "objects", "literals", "types")}

def check_skip(obj, datatype):
    """Tell whether a claim (or bare snak) should be ignored.

    A claim is skipped when its snak carries no "datavalue", or when its
    datatype is one of the lexicographical types (lexeme, form, sense).

    :param obj: claim dict with a "mainsnak" entry, or the snak dict itself.
    :param datatype: datatype string of the snak.
    :return: True to skip the claim, False to process it.
    """
    snak = obj.get("mainsnak", obj)
    if "datavalue" in snak:
        return datatype in {"wikibase-lexeme", "wikibase-form", "wikibase-sense"}
    return True


def get_value(obj, datatype):
    """Extract the stored value of a claim (or bare snak) for the given datatype.

    :param obj: claim dict with a "mainsnak" entry, or the snak dict itself;
        the snak must contain ``["datavalue"]["value"]``.
    :param datatype: datatype string of the snak.
    :return: "latitude,longitude" for globe coordinates; the "amount", "text"
        or "time" field for quantity, monolingualtext and time respectively;
        the raw value for every other datatype.
    """
    payload = obj.get("mainsnak", obj)["datavalue"]["value"]

    if datatype == "globe-coordinate":
        return f'{payload["latitude"]},{payload["longitude"]}'

    # Datatypes whose value is a dict from which a single field is kept.
    field_by_datatype = {
        "quantity": "amount",
        "monolingualtext": "text",
        "time": "time",
    }
    field = field_by_datatype.get(datatype)
    return payload if field is None else payload[field]

def _fetch_subclass_ids(root_qid, label):
    """Return the numeric ids in the P279 subclass tree rooted at Q<root_qid>.

    An empty list is returned (with a warning) when the endpoint's answer is
    not valid JSON, so the run continues in best-effort mode.
    """
    try:
        return get_wikidata_item_tree_item_idsSPARQL([root_qid], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        print(f"Warning: could not fetch subclasses of Q{root_qid} ({label}); using an empty list")
        return []


# Organization (Q43229) and the classes excluded from it.
organization_subclass = _fetch_subclass_ids(43229, "organization")
country_subclass = _fetch_subclass_ids(6256, "country")
city_subclass = _fetch_subclass_ids(515, "city")
capitals_subclass = _fetch_subclass_ids(5119, "capital")
admTerr_subclass = _fetch_subclass_ids(15916867, "administrative territorial entity of a single country")
# The family/venue QIDs used to be swapped between these two variables;
# both are subtracted below, so organization_subclass itself is unaffected.
family_subclass = _fetch_subclass_ids(8436, "family")
sportLeague_subclass = _fetch_subclass_ids(623109, "sports league")
venue_subclass = _fetch_subclass_ids(17350442, "venue")

# Removing overlaps for organization_subclass
organization_subclass = list(
    set(organization_subclass).difference(
        country_subclass,
        city_subclass,
        capitals_subclass,
        admTerr_subclass,
        family_subclass,
        sportLeague_subclass,
        venue_subclass,
    )
)

# Geographic location (Q2221906) and the classes excluded from it.
geolocation_subclass = _fetch_subclass_ids(2221906, "geographic location")
food_subclass = _fetch_subclass_ids(2095, "food")
edInst_subclass = _fetch_subclass_ids(2385804, "educational institution")
govAgency_subclass = _fetch_subclass_ids(327333, "government agency")
intOrg_subclass = _fetch_subclass_ids(484652, "international organization")
timeZone_subclass = _fetch_subclass_ids(12143, "time zone")

# Removing overlaps for geolocation_subclass
geolocation_subclass = list(
    set(geolocation_subclass).difference(
        food_subclass,
        edInst_subclass,
        govAgency_subclass,
        intOrg_subclass,
        timeZone_subclass,
    )
)



# Membership is tested once per P31 claim of every processed entity, so use sets.
geolocation_ids = set(geolocation_subclass)
organization_ids = set(organization_subclass)

# Entity ids are strings, so only string values of `mapping` can ever match;
# building the set once replaces a list rebuild + linear scan on every line.
mapped_entities = {value for value in mapping.values() if isinstance(value, str)}

# Ids collected per NER bucket (saved and analysed by later cells).
ORG = []
PERS = []
LOC = []
OTHERS = []
ner_lists = {"ORG": ORG, "PERS": PERS, "LOC": LOC, "OTHERS": OTHERS}

with bz2.open(wikidata_dump_path, 'rt', encoding='utf-8') as f, \
        tqdm(total=initial_total_lines_estimate) as pbar:

    for i, line in enumerate(f):
        try:
            # Entity lines look like '{...},' and the last one has no trailing
            # comma, so strip whitespace and an optional comma instead of
            # blindly cutting two characters. The '[' / ']' lines fail to parse.
            item = json.loads(line.rstrip().rstrip(","))
        except json.decoder.JSONDecodeError:
            continue

        entity = item['id']
        labels = item.get("labels", {})
        english_label = labels.get("en", {}).get("value", "")
        aliases = item.get("aliases", {})
        description = item.get('descriptions', {}).get('en', {})
        category = "entity"
        sitelinks = item.get("sitelinks", {})
        popularity = len(sitelinks) if len(sitelinks) > 0 else 1
        claims = item.get("claims", {})

        print(f"{english_label} - {entity}")

        if entity not in mapped_entities:
            continue

        all_labels = {lang: labels[lang]["value"] for lang in labels}
        all_aliases = {
            lang: list({alias["value"] for alias in aliases[lang]})
            for lang in aliases
        }

        # Kind of the entity. The disambiguation/category test looks at the
        # P31 *values* (it used to compare the entity's own id, which could
        # never match for ordinary entities).
        p31_item_ids = set()
        for claim in claims.get("P31", []):
            p31_value = claim.get("mainsnak", {}).get("datavalue", {}).get("value", {})
            if isinstance(p31_value, dict) and "id" in p31_value:
                p31_item_ids.add(p31_value["id"])

        if "P279" in claims:
            category = "type"
        elif 'Q4167410' in p31_item_ids:
            category = "disambiguation"
        elif 'Q4167836' in p31_item_ids:
            category = "category"

        if entity[0] == "P":
            category = "predicate"

        line_size = len(line)
        current_average_size = update_average_size(line_size)
        pbar.total = round(compressed_file_size / current_average_size)
        pbar.update(1)

        ###############################################################
        # ORGANIZATION EXTRACTION
        # All items with the root class Organization (Q43229) excluding country (Q6256), city (Q515), capitals (Q5119),
        # administrative territorial entity of a single country (Q15916867), venue (Q17350442), sports league (Q623109)
        # and family (Q8436)

        # LOCATION EXTRACTION
        # All items with the root class Geographic Location (Q2221906) excluding: food (Q2095), educational institution (Q2385804),
        # government agency (Q327333), international organization (Q484652) and time zone (Q12143)

        # PERSON EXTRACTION
        # All items with the statement is instance of (P31) human (Q5) are classified as person.

        NERtype = None
        # Reset on every entity so a non-item never reuses the previous entity's types.
        types_list = []

        if item.get("type") == "item" and "claims" in item:
            ner_counter = Counter()
            numeric_id = None

            for claim in claims.get("P31", []):
                mainsnak = claim.get("mainsnak", {})
                datavalue = mainsnak.get("datavalue", {})
                numeric_id = datavalue.get("value", {}).get("numeric-id")

                # Claims without a value have no numeric id; they still count
                # as OTHERS below but are not looked up as a type.
                if numeric_id is not None:
                    types_list.append("Q" + str(numeric_id))

                # Classify NER types
                if numeric_id == 5:
                    ner_counter['PERS'] += 1
                elif numeric_id in geolocation_ids:
                    ner_counter['LOC'] += 1
                elif numeric_id in organization_ids:
                    ner_counter['ORG'] += 1
                else:
                    ner_counter['OTHERS'] += 1

            # The stored NER type is the bucket hit by most P31 claims
            # (ties go to the bucket seen first). It was previously always None.
            if ner_counter:
                NERtype = ner_counter.most_common(1)[0][0]

            # NOTE(review): as in the original, the id appended here is the
            # numeric id of the *last P31 class*, not of the entity itself.
            # Later cells label these values "Entity ID" -- confirm the intent.
            for ner_type in ner_counter:
                print(f"--> {ner_type} ")
                ner_lists[ner_type].append(numeric_id)

        ################################################################
        # TRANSITIVE CLOSURE

        total = []
        for el in types_list:
            # One request per type (the result used to be fetched twice).
            superclasses = retrieve_superclasses(el)
            total += superclasses
            print(f"[{el}] - # superclasses: {len(superclasses)}")
        print(f"Superclasses: [{len(set(total))}] {set(total)}")

        print(f"types_list: [{len(types_list)}] {types_list}")
        print("_______________________")

        ################################################################
        # URL EXTRACTION

        url_dict = {"wikidata": "http://www.wikidata.org/wiki/" + entity}
        enwiki_title = sitelinks.get("enwiki", {}).get("title")
        if enwiki_title is not None:
            # The enwiki sitelink is always the English Wikipedia, whether or
            # not the entity also has an English label.
            page_name = enwiki_title.replace(' ', '_')
            url_dict["wikipedia"] = "http://en.wikipedia.org/wiki/" + page_name
            url_dict["dbpedia"] = "http://dbpedia.org/resource/" + page_name

        ################################################################

        objects = {}
        literals = {datatype: {} for datatype in DATATYPES}
        types = {"P31": []}
        join = {
            "items": {
                "id_entity": i,
                "entity": entity,
                "description": description,
                "labels": all_labels,
                "aliases": all_aliases,
                "types": types,
                "popularity": popularity,
                "kind": category,   # kind (entity, type or predicate, disambiguation or category)
                "NERtype": NERtype, # (ORG, LOC, PERS or OTHERS)
                "URLs": url_dict
            },
            "objects": {
                "id_entity": i,
                "entity": entity,
                "objects": objects
            },
            "literals": {
                "id_entity": i,
                "entity": entity,
                "literals": literals
            },
            "types": {
                "id_entity": i,
                "entity": entity,
                "types": types
            },
        }

        for predicate, statements in claims.items():
            for obj in statements:
                datatype = obj.get("mainsnak", {}).get("datatype")

                if check_skip(obj, datatype):
                    continue

                if datatype == "wikibase-item" or datatype == "wikibase-property":
                    value = obj["mainsnak"]["datavalue"]["value"]["id"]

                    if predicate == "P31" or predicate == "P106":
                        types["P31"].append(value)

                    objects.setdefault(value, []).append(predicate)
                else:
                    literal_kind = DATATYPES_MAPPINGS.get(datatype)
                    if literal_kind is None:
                        # Datatype without a mapping: skip the claim instead of
                        # aborting the whole run with a KeyError.
                        continue
                    value = get_value(obj, datatype)
                    literals[literal_kind].setdefault(predicate, []).append(value)

        for key in buffer:
            buffer[key].append(join[key])

        if len(buffer["items"]) >= BATCH_SIZE:
            flush_buffer(buffer)

# Write the last, partially filled batch; without this up to BATCH_SIZE - 1
# entities were silently dropped at the end of the dump.
flush_buffer(buffer)

  0%|          | 5/332984 [01:44<1938:52:00, 20.96s/it]


Belgium - Q31
--> LOC 
--> ORG 
[Q3624078] - # superclasses: 54
[Q43702] - # superclasses: 48
[Q6256] - # superclasses: 44
[Q20181813] - # superclasses: 55
[Q185441] - # superclasses: 55
[Q1250464] - # superclasses: 53




[Q113489728] - # superclasses: 55
Superclasses: [65] {'Q16334298', 'Q185441', 'Q131085629', 'Q7210356', 'Q2221906', 'Q20181813', 'Q56061', 'Q123349660', 'Q21157127', 'Q27096213', 'Q4835091', 'Q155076', 'Q43229', 'Q7275', 'Q1063239', 'Q106559804', 'Q82794', 'Q488383', 'Q26713767', 'Q16887380', 'Q28108', 'Q103940464', 'Q106668099', 'Q1048835', 'Q58415929', 'Q1896989', 'Q3624078', 'Q16334295', 'Q211606', 'Q6671777', 'Q27096235', 'Q25404640', 'Q124711467', 'Q3455524', 'Q96196009', 'Q124711484', 'Q1140229', 'Q8191099', 'Q24229398', 'Q1639378', 'Q22676603', 'Q98119401', 'Q15642541', 'Q43702', 'Q61961344', 'Q1646605', 'Q99527517', 'Q3778211', 'Q6256', 'Q177634', 'Q53617489', 'Q58778', 'Q618123', 'Q16562419', 'Q23956024', 'Q16686448', 'Q35120', 'Q178706', 'Q18810687', 'Q1250464', 'Q113489728', 'Q854457', 'Q874405', 'Q53617407', 'Q484652'}
types_list: [7] ['Q3624078', 'Q43702', 'Q6256', 'Q20181813', 'Q185441', 'Q1250464', 'Q113489728']
_______________________
happiness - Q8
--> OTHERS 
[Q331769



[Q60539479] - # superclasses: 17
Superclasses: [31] {'Q9415', 'Q7268708', 'Q282250', 'Q12047512', 'Q5127848', 'Q2996394', 'Q813912', 'Q17320256', 'Q1293220', 'Q7048977', 'Q488383', 'Q331769', 'Q58415929', 'Q781413', 'Q64732777', 'Q937228', 'Q30241068', 'Q1190554', 'Q3249551', 'Q3505845', 'Q67518978', 'Q41537118', 'Q12047513', 'Q1322005', 'Q99527517', 'Q483247', 'Q111752858', 'Q35120', 'Q60539479', 'Q20937557', 'Q3968640'}
types_list: [2] ['Q331769', 'Q60539479']
_______________________
George Washington - Q23
--> PERS 




[Q5] - # superclasses: 30
Superclasses: [30] {'Q164509', 'Q5', 'Q223557', 'Q5127848', 'Q106559804', 'Q488383', 'Q7048977', 'Q103940464', 'Q110551885', 'Q27043950', 'Q66394244', 'Q26401003', 'Q729', 'Q215627', 'Q21871294', 'Q12898224', 'Q159344', 'Q154954', 'Q795052', 'Q24229398', 'Q98119401', 'Q99527517', 'Q3778211', 'Q72638', 'Q10855152', 'Q53617489', 'Q35120', 'Q4406616', 'Q7239', 'Q53617407'}
types_list: [1] ['Q5']
_______________________
Jack Bauer - Q24
--> OTHERS 
[Q15632617] - # superclasses: 21
[Q15773317] - # superclasses: 12




[Q20085850] - # superclasses: 12
Superclasses: [23] {'Q28020127', 'Q64728693', 'Q15632617', 'Q95074', 'Q30017383', 'Q115537581', 'Q2593744', 'Q96789464', 'Q3542731', 'Q14897293', 'Q7048977', 'Q97498056', 'Q488383', 'Q103940464', 'Q115257598', 'Q24229398', 'Q15773317', 'Q27277631', 'Q20085850', 'Q53617489', 'Q35120', 'Q16686448', 'Q122192387'}
types_list: [3] ['Q15632617', 'Q15773317', 'Q20085850']
_______________________
Douglas Adams - Q42
--> PERS 




[Q5] - # superclasses: 30
Superclasses: [30] {'Q164509', 'Q5', 'Q223557', 'Q5127848', 'Q106559804', 'Q488383', 'Q7048977', 'Q103940464', 'Q110551885', 'Q27043950', 'Q66394244', 'Q26401003', 'Q729', 'Q215627', 'Q21871294', 'Q12898224', 'Q159344', 'Q154954', 'Q795052', 'Q24229398', 'Q98119401', 'Q99527517', 'Q3778211', 'Q72638', 'Q10855152', 'Q53617489', 'Q35120', 'Q4406616', 'Q7239', 'Q53617407'}
types_list: [1] ['Q5']
_______________________
Paul Otlet - Q1868
--> PERS 
[Q5] - # superclasses: 30
Superclasses: [30] {'Q164509', 'Q5', 'Q223557', 'Q5127848', 'Q106559804', 'Q488383', 'Q7048977', 'Q103940464', 'Q110551885', 'Q27043950', 'Q66394244', 'Q26401003', 'Q729', 'Q215627', 'Q21871294', 'Q12898224', 'Q159344', 'Q154954', 'Q795052', 'Q24229398', 'Q98119401', 'Q99527517', 'Q3778211', 'Q72638', 'Q10855152', 'Q53617489', 'Q35120', 'Q4406616', 'Q7239', 'Q53617407'}
types_list: [1] ['Q5']
_______________________
Wikidata - Q2013





KeyboardInterrupt: 

In [31]:
# NOTE(review): `english_labels` is not defined in any visible cell of this
# notebook, and the recorded output of this cell is a KeyError -- this looks
# like a leftover debugging lookup; confirm before keeping it.
(english_labels['fulda'])

KeyError: 'fulda'

In [None]:
json_file_path = "./yago_wiki_classification.json"

# One list of collected ids per NER bucket.
data = dict(ORG=ORG, LOC=LOC, PERS=PERS, OTHERS=OTHERS)

# Persist the buckets as pretty-printed JSON; any failure is reported, not raised.
try:
    with open(json_file_path, mode='w') as out_fh:
        json.dump(data, out_fh, indent=4)
    print(f"Data saved successfully to {json_file_path}")
except Exception as e:
    print(f"Error saving data to JSON file: {e}")

In [None]:
# Imported here so this cell does not depend on a later cell having been run first.
from carbontracker import parser

logs = parser.parse_all_logs(log_dir="./")

if logs:
    first_log = logs[0]

    print(f"Output file name: {first_log['output_filename']}")
    print(f"Standard file name: {first_log['standard_filename']}")
    print(f"Stopped early: {first_log['early_stop']}")
    print(f"Measured consumption: {first_log['actual']}")
    print(f"Predicted consumption: {first_log['pred']}")
    print(f"Measured GPU devices: {first_log['components']['gpu']['devices']}")
else:
    # Nothing was parsed: report it instead of failing with IndexError on logs[0].
    print("No carbontracker logs found in ./")

In [None]:
# Size of every NER bucket.
total_length_PERS, total_length_ORG, total_length_LOC, total_length_OTHERS = (
    len(bucket) for bucket in (PERS, ORG, LOC, OTHERS)
)

# Report the individual sizes.
print("Total lengths:")
print(f"Length of PERS: {total_length_PERS}")
print(f"Length of ORG: {total_length_ORG}")
print(f"Length of LOC: {total_length_LOC}")
print(f"Length of OTHERS: {total_length_OTHERS}")

# Grand total over the four buckets.
total_length = sum(
    (total_length_PERS, total_length_ORG, total_length_LOC, total_length_OTHERS)
)

print(f"Total length: {total_length}")

In [None]:
# Report every id of OTHERS that also appears in another bucket.
# Sets make each membership test O(1); dict.fromkeys() removes duplicate ids
# while keeping their first-seen order, so each overlap is printed once.
_pers_ids, _loc_ids, _org_ids = set(PERS), set(LOC), set(ORG)

for el in dict.fromkeys(OTHERS):
    # Each message names the pair actually compared and prints the id itself
    # (it used to print the position inside the other list).
    if el in _pers_ids:
        print(f"OTHERS and PERS --> Entity ID: {el}")
    if el in _loc_ids:
        print(f"OTHERS and LOC --> Entity ID: {el}")
    if el in _org_ids:
        print(f"OTHERS and ORG --> Entity ID: {el}")

In [None]:
# Convert lists to sets for faster intersection operation
ORG_set = set(ORG)
PERS_set = set(PERS)
LOC_set = set(LOC)
OTHERS_set = set(OTHERS)

# For each bucket, count the ids it shares with at least one other bucket.
# (The previous loop incremented the counters by how many buckets an id
# belonged to, so the value printed as "ORG" was really the number of ids
# found in exactly one bucket, and so on.)
ORG_counter = len(ORG_set & (PERS_set | LOC_set | OTHERS_set))
PERS_counter = len(PERS_set & (ORG_set | LOC_set | OTHERS_set))
LOC_counter = len(LOC_set & (ORG_set | PERS_set | OTHERS_set))
OTHERS_counter = len(OTHERS_set & (ORG_set | PERS_set | LOC_set))

# Print the counts for each set
print("Number of overlaps for each set:")
print(f"ORG: {ORG_counter}")
print(f"PERS: {PERS_counter}")
print(f"LOC: {LOC_counter}")
print(f"OTHERS: {OTHERS_counter}")

## URL Construction

In [None]:
#! /usr/bin/env python3
# This Python file uses the following encoding: utf-8

__author__ = 'jgeiss'


#############################################################################
# authors: Johanna Geiß, Heidelberg University, Germany                     #
# email: geiss@informatik.uni-heidelberg.de                                 #
# Copyright (c) 2017 Database Research Group,                               #
#               Institute of Computer Science,                              #
#               University of Heidelberg                                    #
#   Licensed under the Apache License, Version 2.0 (the "License");         #
#   you may not use this file except in compliance with the License.        #
#   You may obtain a copy of the License at                                 #
#                                                                           #
#   http://www.apache.org/licenses/LICENSE-2.0                              #
#                                                                           #
#   Unless required by applicable law or agreed to in writing, software     #
#   distributed under the License is distributed on an "AS IS" BASIS,       #
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.#
#   See the License for the specific language governing permissions and     #
#   limitations under the License.                                          #
#############################################################################
# last updated 21.3.2017 by Johanna Geiß

from pymongo import *
from pymongo import errors
import configparser



wikidata_dump_path = './my-data/latest-all.json.bz2'
# Number of dump lines to inspect; also used as the progress-bar total so the
# bar and the stop condition agree.
MAX_PREVIEW_LINES = 10000

with bz2.open(wikidata_dump_path, 'rt', encoding='utf-8') as f:
    count = 0

    for i, line in tqdm(enumerate(f), total=MAX_PREVIEW_LINES):
        if count == MAX_PREVIEW_LINES:
            break
        count += 1

        try:
            # Entity lines end with ',' except the last one; strip it if present.
            data = json.loads(line.rstrip().rstrip(","))
        except json.decoder.JSONDecodeError:
            continue

        labels = data.get("labels", {})
        entry = {}
        entry["WD_id"] = data['id']
        entry["WP_id"] = labels.get("en", {}).get("value", "")

        # Prefer the real English Wikipedia title; fall back to the English label.
        title = data.get("sitelinks", {}).get("enwiki", {}).get("title") or entry["WP_id"]
        page_name = title.replace(" ", "_")

        entry["WD_id_URL"] = "http://www.wikidata.org/wiki/" + entry["WD_id"]
        if page_name:
            entry["WP_id_URL"] = "http://en.wikipedia.org/wiki/" + page_name
            # Only the first character is upper-cased: str.capitalize() would
            # lower-case the rest ("George Washington" -> "George washington").
            entry["dbpedia_URL"] = "http://dbpedia.org/resource/" + page_name[:1].upper() + page_name[1:]
        else:
            # No English title or label: there is no page to link to.
            entry["WP_id_URL"] = None
            entry["dbpedia_URL"] = None

        print("------------------")
        print(entry["WD_id_URL"])
        print(entry["WP_id_URL"])
        print(entry["dbpedia_URL"])
        print("------------------")







In [None]:
from carbontracker import parser

# Parse every carbontracker log found in the current directory.
logs = parser.parse_all_logs(log_dir="./")
print(logs)

if logs:
    first_log = logs[0]

    print(f"Output file name: {first_log['output_filename']}")
    print(f"Standard file name: {first_log['standard_filename']}")
    print(f"Stopped early: {first_log['early_stop']}")
    print(f"Measured consumption: {first_log['actual']}")
    print(f"Predicted consumption: {first_log['pred']}")
    print(f"Measured GPU devices: {first_log['components']['gpu']['devices']}")
else:
    # Nothing was parsed: report it instead of failing with IndexError on logs[0].
    print("No carbontracker logs found in ./")