In [1]:
from tqdm import tqdm
import argparse
from utils import *
from freebase import *
from propagation import *
import random
import concurrent.futures



parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str,
                    default="cwq", help="choose the dataset from {webqsp, cwq}.")
parser.add_argument("--max_length", type=int,
                    default=1024, help="the max length of LLMs output.")
parser.add_argument("--temperature", type=float,
                    default=0., help="the temperature")
parser.add_argument("--llm", type=str,
                    default="llama-3", help="choose base LLM model from {llama, gpt-3.5-turbo, gpt-4}.")
parser.add_argument("--openai_api_key", type=str,
                    default="", help="if the LLM is gpt-3.5-turbo or gpt-4, you need add your own openai api key.")
parser.add_argument('--verbose', action='store_true', help="print LLM input and output.")
args = parser.parse_args(["--verbose"])
# args = parser.parse_args("")


datas, question_string = prepare_dataset(args.dataset)

In [2]:
data = datas[3]
question = data[question_string]
topics = data['topic_entity']
paths = {topics[topic]: {} for topic in topics}
print(question)

What country bordering France contains an airport that serves Nijmegen?


In [3]:
# for topic in topics:
for topic in topics:
    topic_name = topics[topic]
    # 1-hop propagation
    relations = get_relations(question, topic, topic_name, args, 3)
    entities_id, entities_name = get_entities(topic, relations)
    [paths[topic_name].update({r: {"entities_id": entities_id[i], "entities_name": entities_name[i]}}) for i, r in enumerate(relations)]
    facts = propagate(question, topic_name, relations, paths[topic_name], 1, args)
    [paths[topic_name][r].update({"fact": facts[i]}) for i, r in enumerate(relations)]
    # 2-hop propagation
    relations = get_relations_distant(question, topic, topic_name, relations, paths[topic_name], args, 3)
    entities_id, entities_name = get_entities_distant(paths[topic_name], relations)
    [paths[topic_name].update({r: {"entities_id": entities_id[i], "entities_name": entities_name[i]}}) for i, r in enumerate(relations)]
    facts = propagate(question, topic_name, relations, paths[topic_name], 2, args)
    [paths[topic_name][r].update({"fact": facts[i], "entities_id": sum(entities_id[i], []), "entities_name": sum(entities_name[i], [])}) for i, r in enumerate(relations)]
    # # 3-hop propagation
    relations = get_relations_distant(question, topic, topic_name, relations, paths[topic_name], args, 3)
    entities_id, entities_name = get_entities_distant(paths[topic_name], relations)
    [paths[topic_name].update({r: {"entities_id": entities_id[i], "entities_name": entities_name[i]}}) for i, r in enumerate(relations)]
    facts = propagate(question, topic_name, relations, paths[topic_name], 3, args)
    [paths[topic_name][r].update({"fact": facts[i]}) for i, r in enumerate(relations)]
    # # clean paths
    [paths[topic_name].update({r: paths[topic_name][r]['fact']}) for r in paths[topic_name]]


Given the question, we have the topic of the question and its relations.

question: What country bordering France contains an airport that serves Nijmegen?
topic: France

Based on the question and its topic, please select the most relevant relations to answer the question from below and just return top 3 relations in a numbered list without explanation.
relations: location.dated_location.date_founded, location.location.area, location.location.containedby, location.location.contains, location.location.events, location.location.geolocation, location.location.people_born_here, location.location.time_zones, location.statistical_region.population, organization.organization_founder.organizations_founded, sports.sports_team_location.teams, symbols.name_source.namesakes, topic_server.population_number, base.aareas.schema.administrative_area.administrative_area_type, base.aareas.schema.administrative_area.administrative_children, base.aareas.schema.administrative_area.administrative_parent, bas

In [7]:
facts = construct_facts(paths, topics)
prompt = question_prompt.format(facts, question)
response = run_llm(prompt, args.temperature, args.max_length, args.openai_api_key, args.llm, args.verbose)
output = {"question": question, "result": response, "paths": paths}

Based on the given the facts and your own knowledge, please the answer the question as simple as possible and return all the possible answers in a numbered list.

information: Here are some facts about topic France that may related to the question.
1. France is located in Western Europe and Europe, which may help narrow down the search for the country bordering France.
	1.1. Europe and Western Europe have population statistics for various years, but this information is not directly relevant to finding the country with an airport serving Nijmegen.
		1.1.1. This fact provides various population statistics for different years, but it is not directly relevant to finding the country with an airport serving Nijmegen.
		1.1.2. This fact provides various years, but it is not directly relevant to finding the country with an airport serving Nijmegen.

2. The geolocation of France (latitude: 47.0, longitude: 2.0) is not directly relevant to finding the country with an airport serving Nijmegen, bu

In [4]:
relations

['location.location.containedby->location.country.administrative_divisions->location.statistical_region.population',
 'location.location.containedby->location.country.administrative_divisions->location.administrative_division.capital',
 'location.location.nearby_airports->aviation.airport.serves->location.location.geolocation']

In [134]:
@timer_func
def get_entities_distant_1(paths, relations):
    ids, names = [], []
    for relation in relations:
        entities_id, entities_name = [], []
        topics = paths[relation.rsplit('->', 1)[0]]['entities_id']
        for topic in topics:
            tail_entities = execute_sparql(sparql_tail_entities % (topic, relation.rsplit('->', 1)[1]))
            ### !!! some relations like m.04n32 --> music.artist.track has 8477 tail entities
            tail_entities_id, tail_entities_name = filter_entities(tail_entities, topic)
            # tail_entities_id, tail_entities_name = list(set(tail_entities_id)), list(set(tail_entities_name))
            # assert len(tail_entities_id) == len(tail_entities_name), 'Entities with same name exist!'
            entities_id.append(tail_entities_id)
            entities_name.append(tail_entities_name)
        ids.append(entities_id)
        names.append(entities_name)
    return ids, names


def filter_entity(i, topic):
    entity_id = i['e']['value']
    if entity_id.startswith("http://rdf.freebase.com/ns/"):
        entity_id = entity_id.replace("http://rdf.freebase.com/ns/", "")
        entity_name = get_entity_name(entity_id, topic)
    elif i['e']['type'] in ['literal', 'typed-literal']: # text entities (has no id, no head relations)
        entity_id = i['e']['type']
        entity_name = entity_id

    return entity_id, entity_name

def filter_entities(sparql_output, topic, remove_na=False):
    entities_id, entities_name = [], []

    with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
        results = list(executor.map(filter_entity, [(i, topic) for i in sparql_output]))
        # futures = [executor.submit(filter_entity, i, topic) for i in sparql_output]
        # for future in concurrent.futures.as_completed(futures):
        #     entity_id, entity_name = future.result()
        #     entities_id.append(entity_id)
        #     entities_name.append(entity_name)
    # entities_id, entities_name = [i[0] for i in results], [i[1] for i in results]

    if remove_na:
        keep_index = [i for i, name in enumerate(entities_name) if name != 'NA']
        entities_id = [entities_id[i] for i in keep_index]
        entities_name = [entities_name[i] for i in keep_index]

    return entities_id, entities_name

In [123]:
a, b = get_entities_distant(paths[topic_name], relations)

Function 'get_entities_distant' executed in 51.5400s


In [136]:
c, d = get_entities_distant_1(paths[topic_name], relations)

Function 'get_entities_distant_1' executed in 18.1169s


In [139]:
d

[[['number: 3600784, year: 2015-01-01',
   'number: 3577032, year: 2014-01-01',
   'number: 3560205, year: 2012-09-30'],
  ['number: 473941, year: 2007, source: Population, City population, UNdata',
   'number: 463826, year: 2003, source: Population, City population, UNdata',
   'number: 482742',
   'number: 485818',
   'number: 481864, year: 2009, source: Population, City population, UNdata',
   'number: 475904',
   'number: 495083, year: 2011, source: Population, City population, UNdata',
   'number: 475627, year: 2006, source: Population, City population, UNdata',
   'number: 472096, year: 2005, source: Population, City population, UNdata',
   'number: 488553, year: 2010, source: Population, City population, UNdata',
   'number: 475681, year: 2008, source: Population, City population, UNdata',
   'number: 474244'],
  ['number: 582640, year: 2014-01-01'],
  ['number: 380621, year: 2014-01-01',
   'number: 381077, year: 2012-12-31',
   'number: 381202, year: 2012-09-30'],
  ['number: 

In [138]:
str(b) == str(d)

False

In [86]:
sum([len(i) for i in a])

66

In [4]:
import concurrent.futures
import time

def io_bound_task(task_id, delay):
    print(f"Task {task_id} starting...")
    time.sleep(delay)  # Simulate an IO-bound task
    print(f"Task {task_id} completed!")
    return str(task_id), task_id

# List of tasks with their respective delays
tasks = [(1, 2), (2, 3), (3, 1), (4, 4), (5, 2)]

# Using ThreadPoolExecutor to run tasks in parallel
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(io_bound_task, task_id, delay) for task_id, delay in tasks]
    
    # Collecting results as they complete
    for future in concurrent.futures.as_completed(futures):
        result = future.result()
        print(f"Task {result} is done.")

Task 1 starting...Task 2 starting...Task 3 starting...
Task 4 starting...Task 5 starting...



Task 3 completed!
Task ('3', 3) is done.
Task 5 completed!Task 1 completed!

Task ('5', 5) is done.
Task ('1', 1) is done.
Task 2 completed!
Task ('2', 2) is done.
Task 4 completed!
Task ('4', 4) is done.


In [35]:
result

('4', 4)

In [103]:
import concurrent.futures

def process_item(item):
    # Your processing code here
    # Example: returning the item and its square
    return item, item * item

items = [1, 2, 3, 4, 5]  # Example list of items to process

def process_in_parallel(items):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results, b = list(executor.map(process_item, items))
    return results, b

ordered_results = process_in_parallel(items)
print(ordered_results)

ValueError: too many values to unpack (expected 2)

In [46]:
import concurrent.futures

def compute(x, y):
    # Simulate a time-consuming computation
    time.sleep(0.1)
    return x * y

def task(pair):
    x, y = pair
    return compute(x, y)

pairs = [(i, j) for i in range(10) for j in range(10)]
results = []

with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
    results = list(executor.map(task, pairs))
    futures = [executor.submit(task, pair) for pair in pairs]
    for future in concurrent.futures.as_completed(futures):
        results.append(future.result())



In [47]:
a, b = [1, 2]

In [48]:
a

1