In [2]:
import bz2
import json
import os
import sys
import traceback
from pymongo import MongoClient
from tqdm import tqdm
from datetime import datetime
from requests import get
import csv

In [3]:
csv_file_path = "./my-data/Dataset/Dataset/2T_Round4/gt/cea.csv"
urls = []
cont = 1000

# Open the CSV file
with open(csv_file_path, mode='r') as file:
    # Create a CSV reader object
    csv_reader = csv.reader(file)

    # Iterate over each row in the CSV file
    for row in csv_reader:
        if cont == 0:
            break
        cont -= 1
        
        # Q_id extraction
        url = row[3].split(' ')
        for el in url:
            parts = el.split('/')
            el = parts[-1]
            urls.append(el)
        
        # unique Q_id creation
        unique_urls = set(urls)
        unique_urls_list = list(unique_urls)        

    print((unique_urls_list))
                

['Q19746800', 'Q13159838', 'Q65298685', 'Q19747625', 'Q25534363', 'Q27', 'Q3637557', 'Q884', 'Q19747489', 'Q336306', 'Q25706135', 'Q13177145', 'Q19747388', 'Q763', 'Q13109239', 'Q491552', 'Q214449', 'Q13187202', 'Q22022623', 'Q20087860', 'Q65451409', 'Q482988', 'Q31835482', 'Q679322', 'Q75', 'Q3613002', 'Q19746317', 'Q88278890', 'Q19739660', 'Q19747583', 'Q16044283', 'Q869', 'Q65453658', 'Q191802', 'Q12098965', 'Q19747385', 'Q97265250', 'Q20087872', 'Q25161558', 'Q755676', 'Q12577442', 'Q19747631', 'Q20086235', 'Q13057957', 'Q336291', 'Q19445112', 'Q22236711', 'Q754', 'Q13161638', 'Q12248450', 'Q12944354', 'Q12828850', 'Q25581992', 'Q15723389', 'Q20087198', 'Q25649051', 'Q31820683', 'Q326886', 'Q215077', 'Q13177462', 'Q25580745', 'Q19735258', 'Q484161', 'Q146502', 'Q845868', 'Q19747757', 'Q65450436', 'Q1870', 'Q65425691', 'Q974', 'Q65453589', 'Q243996', 'Q15689659', 'Q64861647', 'Q65453624', 'Q702', 'Q273244', 'Q20085956', 'Q13425766', 'Q242723', 'Q736', 'Q157999', 'Q5513', 'Q15722975'

In [23]:

def create_indexes(db):
    # Specify the collections and their respective fields to be indexed
    index_specs = {
        'cache': ['cell', 'lastAccessed'],  # Example: Indexing 'cell' and 'type' fields in 'cache' collection
        'items': ['id_entity', 'entity', 'category', 'popularity'],
        'literals': ['id_entity', 'entity'],
        'mappings': ['curid', 'wikipedia_id', 'wikidata_id', 'dbpedia_id'],
        'objects': ['id_entity', 'entity'],
        'types': ['id_entity', 'entity']
    }

    for collection, fields in index_specs.items():
        if collection == "cache":
            db[collection].create_index([('cell', 1), ('fuzzy', 1), ('type', 1), ('kg', 1), ('limit', 1)], unique=True)
        elif collection == "items":
            db[collection].create_index([('entity', 1), ('category', 1)], unique=True)    
        for field in fields:
            db[collection].create_index([(field, 1)])  # 1 for ascending order


# Initial Estimation
initial_estimated_average_size = 800  # Initial average size in bytes, can be adjusted
BATCH_SIZE = 100 # Number of entities to insert in a single batch

if len(sys.argv) < 2:
    print("Usage: python script_name.py <path_to_wikidata_dump>")
    sys.exit(1)

file_path = './my-data/latest-all.json.bz2'  # Get the file path from command line argument
compressed_file_size = os.path.getsize(file_path)
initial_total_lines_estimate = compressed_file_size / initial_estimated_average_size

file = bz2.BZ2File(file_path, "r")

# MongoDB connection setup
MONGO_ENDPOINT, MONGO_ENDPOINT_PORT = os.environ["MONGO_ENDPOINT"].split(":")
MONGO_ENDPOINT_PORT = int(MONGO_ENDPOINT_PORT)
MONGO_ENDPOINT_USERNAME = os.environ["MONGO_INITDB_ROOT_USERNAME"]
MONGO_ENDPOINT_PASSWORD = os.environ["MONGO_INITDB_ROOT_PASSWORD"]
current_date = datetime.now()
formatted_date = current_date.strftime("%d%m%Y")
DB_NAME = f"wikidata{formatted_date}"

client = MongoClient(MONGO_ENDPOINT, MONGO_ENDPOINT_PORT, username=MONGO_ENDPOINT_USERNAME, password=MONGO_ENDPOINT_PASSWORD)
log_c = client.wikidata.log
items_c = client[DB_NAME].items
objects_c = client[DB_NAME].objects
literals_c = client[DB_NAME].literals
types_c = client[DB_NAME].types

c_ref = {
    "items": items_c,
    "objects":objects_c, 
    "literals":literals_c, 
    "types":types_c
}

create_indexes(client[DB_NAME])

buffer = {
    "items": [],
    "objects": [], 
    "literals": [], 
    "types": []
}

DATATYPES_MAPPINGS = {
    'external-id':'STRING',
    'quantity': 'NUMBER',
    'globe-coordinate': 'STRING',
    'string': 'STRING',
    'monolingualtext': 'STRING',
    'commonsMedia': 'STRING',
    'time': 'DATETIME',
    'url': 'STRING',
    'geo-shape': 'GEOSHAPE',
    'math': 'MATH',
    'musical-notation': 'MUSICAL_NOTATION',
    'tabular-data': 'TABULAR_DATA'
}
DATATYPES = list(set(DATATYPES_MAPPINGS.values()))
total_size_processed = 0
num_entities_processed = 0



def update_average_size(new_size):
    global total_size_processed, num_entities_processed
    total_size_processed += new_size
    num_entities_processed += 1
    return total_size_processed / num_entities_processed


def check_skip(obj, datatype):
    temp = obj.get("mainsnak", obj)
    if "datavalue" not in temp:
        return True

    skip = {
        "wikibase-lexeme",
        "wikibase-form",
        "wikibase-sense"
    }
    
    return datatype in skip


def get_value(obj, datatype):
    temp = obj.get("mainsnak", obj)
    if datatype == "globe-coordinate":
        latitude = temp["datavalue"]["value"]["latitude"]
        longitude = temp["datavalue"]["value"]["longitude"]
        value = f"{latitude},{longitude}"
    else:
        keys = {
            "quantity": "amount",
            "monolingualtext": "text",
            "time": "time",
        }
        if datatype in keys:
            key = keys[datatype]
            value = temp["datavalue"]["value"][key]
        else:
            value = temp["datavalue"]["value"]
    return value


def flush_buffer(buffer):
    for key in buffer:
        if len(buffer[key]) > 0:
            c_ref[key].insert_many(buffer[key])
            buffer[key] = []
        
            
def parse_data(item, i, geolocation_subclass, organization_subclass):
    entity = item["id"]
    labels = item.get("labels", {})
    aliases = item.get("aliases", {})
    description = item.get('descriptions', {}).get('en', {})
    category = "entity"
    sitelinks = item.get("sitelinks", {})
    popularity = len(sitelinks) if len(sitelinks) > 0 else 1
    
    all_labels = {}
    for lang in labels:
        all_labels[lang] = labels[lang]["value"]

    all_aliases = {}
    for lang in aliases:
        all_aliases[lang] = []
        for alias in aliases[lang]:
            all_aliases[lang].append(alias["value"])
        all_aliases[lang] = list(set(all_aliases[lang]))

    found = False
    for predicate in item["claims"]:
        if predicate == "P279":
            found = True

    if found:
        category = "type"
    if entity[0] == "P":
        category = "predicate"

    
    objects = {}
    literals = {datatype: {} for datatype in DATATYPES}
    types = {"P31": []}
    join = {
        "items": {
            "id_entity": i,
            "entity": entity,
            "description": description,
            "labels": all_labels,
            "aliases": all_aliases,
            "types": types,
            "popularity": popularity,
            "category": category,   # kind (entity, type or predicate)
            ######################
            # new updates
            "NERtype": NERtype, # (ORG, LOC, PER or OTHERS)
            "URLs" : url_dict
            ######################
        },
        "objects": { 
            "id_entity": i,
            "entity": entity,
            "objects":objects
        },
        "literals": { 
            "id_entity": i,
            "entity": entity,
            "literals": literals
        },
        "types": { 
            "id_entity": i,
            "entity": entity,
            "types": types
        },
    }

    predicates = item["claims"]
    for predicate in predicates:
        for obj in predicates[predicate]:
            datatype = obj["mainsnak"]["datatype"]

            if check_skip(obj, datatype):
                continue

            if datatype == "wikibase-item" or datatype == "wikibase-property":
                value = obj["mainsnak"]["datavalue"]["value"]["id"]

                if predicate == "P31" or predicate == "P106":
                    types["P31"].append(value)

                if value not in objects:
                    objects[value] = []
                objects[value].append(predicate)    
            else:
                value = get_value(obj, datatype)                
                lit = literals[DATATYPES_MAPPINGS[datatype]]

                if predicate not in lit:
                    lit[predicate] = []
                lit[predicate].append(value)   

     

    for key in buffer:
        buffer[key].append(join[key])            

    if len(buffer["items"]) == BATCH_SIZE:
        flush_buffer(buffer)


def parse_wikidata_dump(el):            
    global initial_total_lines_estimate

    file = bz2.BZ2File(file_path, "r")
    pbar = tqdm(total=initial_total_lines_estimate)
    for i, line in enumerate(file):
        try:
            item = json.loads(line[:-2])  # Remove the trailing characters
            
            if int(item["id"][1:]) > int(el[1:]):
                break
            if item["id"] != el:
                continue
                
            print(f"id: {item['id']}, description: {item['description']}")
            line_size = len(line)
            current_average_size = update_average_size(line_size)

            # Dynamically update the total based on the current average size
            pbar.total = round(compressed_file_size / current_average_size)
            pbar.update(1)

            parse_data(item, i, geolocation_subclass, organization_subclass)
        except json.decoder.JSONDecodeError:
            continue
        except Exception as e:
            traceback_str = traceback.format_exc()
            log_c.insert_one({"entity": item["id"], "error": str(e), "traceback_str": traceback_str})

    pbar.close()

csv_file_path = "./my-data/Dataset/Dataset/2T_Round4/gt/cea.csv"
urls = []
cont = 1000

# Open the CSV file
with open(csv_file_path, mode='r') as file_gt:
    # Create a CSV reader object
    csv_reader = csv.reader(file_gt)

    # Iterate over each row in the CSV file
    for row in csv_reader:
        if cont == 0:
            break
        cont -= 1
        
        # Q_id extraction
        url = row[3].split(' ')
        for el in url:
            parts = el.split('/')
            el = parts[-1]
            urls.append(el)
            print(el)
            parse_wikidata_dump(el)
        
        # unique Q_id creation
        unique_urls = set(urls)
        unique_urls_list = list(unique_urls)        

    print((unique_urls_list))


Q5484




  0%|          | 0/109787984.95125 [00:53<?, ?it/s][A[A
  0%|          | 0/109787984.95125 [00:03<?, ?it/s]


Q25678769


  0%|          | 0/109787984.95125 [32:04<?, ?it/s]


Q26858014


  0%|          | 0/109787984.95125 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [16]:
text = "Q124"
number = int(text[1:])  # Extract all characters starting from index 1 and convert to integer
print(number)  # Output: 124


124
