In [1]:
from datasets import load_dataset
from typing import Dict, List, Tuple, Union
import re

  from .autonotebook import tqdm as notebook_tqdm


## Load Travel Planner Data

In [2]:
from enum import Enum

class REF_INFO_TYPE(Enum):
    """Kinds of reference-information entries handled by this notebook.

    ``str(member)`` gives the label that is searched for inside an entry's
    "Description" text (note the singular "Flight").
    """

    ATTRACTIONS = 0
    RESTAURANTS = 1
    FLIGHTS = 2
    ACCOMMODATIONS = 3
    TAXI = 4
    SELF_DRIVING = 5

    def __str__(self):
        # Labels are listed in member-value order, so the value doubles as the index.
        labels = (
            "Attractions",
            "Restaurants",
            "Flight",
            "Accommodations",
            "Taxi",
            "Self-driving",
        )
        return labels[self.value]


DATA_SPLIT = ("train", "validation", "test")

In [3]:
def get_query_data_list(data_split:str) -> List[Dict]:
    """Load one split of the 'osunlp/TravelPlanner' dataset.

    Args:
        data_split: Must be one of the names in DATA_SPLIT. It is passed to
            ``load_dataset`` as the second positional argument and is also the
            key used to index the object that call returns.

    Returns:
        The value stored under ``data_split`` in the loaded dataset.
        NOTE(review): annotated as List[Dict], but this is presumably a
        ``datasets.Dataset`` rather than a plain list — confirm.

    Raises:
        AssertionError: if ``data_split`` is not in DATA_SPLIT.
    """
    assert data_split in DATA_SPLIT
    loaded = load_dataset('osunlp/TravelPlanner', data_split)
    return loaded[data_split]

In [4]:
def extract_from_to(text: str) -> Tuple:
    """
    Extracts 'A' and 'B' from the format "from A to B" in the given text, with B ending at a comma or the end of the string.

    When the text contains "Flight", B is expected to look like "<destination> on <date>" and is
    split on its last " on ", so destinations made of several words (e.g. "San Diego") are kept whole.

    Args:
    - text (str): The input string.

    Returns:
    - tuple: (A, B) for ordinary text, or (A, destination, date) when "Flight" occurs in the text.
      If no "from A to B" match is found, returns (None, None).
    """
    pattern = r"from\s+(.+?)\s+to\s+([^,]+)(?=[,\s]|$)"
    matches = re.search(pattern, text)
    if not matches:
        return (None, None)
    groups = matches.groups()
    # flights match
    if "Flight" in text:
        origin, tail = groups
        # Split on the LAST " on " so a multi-word destination is not cut at its first space.
        destination, separator, date = tail.rpartition(" on ")
        if not separator:
            # No " on " separator: keep the token-based split (first word / last word).
            tokens = tail.split(" ")
            destination, date = tokens[0], tokens[-1]
        return (origin, destination, date)
    return groups

In [5]:
def extract_reference_info_list(query_data: Dict) -> List[Dict]:
    """Turn the record's 'reference_information' string into a Python object.

    Args:
        query_data: A query record whose 'reference_information' value is a
            string holding a Python expression (annotated as a list of dicts).

    Returns:
        The result of evaluating that string.
    """
    # NOTE(review): eval() executes arbitrary expressions found in the dataset text;
    # only use this on data you trust.
    serialized = query_data['reference_information']
    return eval(serialized)

In [6]:
def get_ref_info_type(ref_info:Dict) -> REF_INFO_TYPE:
    """Classify a reference-info entry by looking at its "Description" text.

    Args:
        ref_info: Entry with a "Description" string.

    Returns:
        The first REF_INFO_TYPE member, in declaration order, whose ``str()``
        label occurs as a substring of the description.

    Raises:
        AssertionError: if no member's label occurs in the description.
    """
    description = ref_info["Description"]
    matched = next(
        (kind for kind in REF_INFO_TYPE if str(kind) in description),
        None,
    )
    if matched is not None:
        return matched
    assert False, f"error when getting get_ref_info_type: {description}"


def print_ref_info(ref_info):
    """Print an entry's "Description", then its "Content" one line per row."""
    print(ref_info["Description"])
    content_lines = ref_info['Content'].split("\n")
    # Same output as printing each line in turn: lines separated and ended by "\n".
    print(*content_lines, sep="\n")
        
    
def extract_ref_info_argument(ref_info:Dict, info_type:REF_INFO_TYPE) -> Dict[str, str]:
    """Parse the lookup arguments out of a reference-info entry's "Description".

    Args:
        ref_info: Entry with a "Description" string.
        info_type: The entry's type, which decides how the description is parsed.

    Returns:
        - SELF_DRIVING / TAXI: {"origin", "destination"}
        - FLIGHTS: {"origin", "destination", "date"}
        - ATTRACTIONS / RESTAURANTS / ACCOMMODATIONS: {"city"}

    Raises:
        AssertionError: if a self-driving/taxi description has no "from A to B"
            part, or if ``info_type`` is none of the types above.
    """
    description = ref_info["Description"]
    # transportations
    if info_type in (REF_INFO_TYPE.SELF_DRIVING, REF_INFO_TYPE.TAXI):
        origin, destination = extract_from_to(description)
        assert origin is not None and destination is not None, f"extract_ref_info_argument failed: {description}"
        return {
            "origin": origin.strip(" "),
            "destination": destination.strip(" ")
        }
    if info_type == REF_INFO_TYPE.FLIGHTS:
        # Relies on extract_from_to returning a 3-tuple for descriptions containing "Flight".
        origin, destination, date = extract_from_to(description)
        return {
            "origin": origin.strip(" "),
            "destination": destination.strip(" "),
            "date": date.strip(" ")
        }
    # pois
    if info_type in (REF_INFO_TYPE.ATTRACTIONS, REF_INFO_TYPE.RESTAURANTS, REF_INFO_TYPE.ACCOMMODATIONS):
        # Split on the word " in " once. Splitting on the bare substring "in" would
        # also cut inside city names such as "Washington" or "Minneapolis".
        city = description.split(" in ", 1)[-1]
        return { "city": city.strip(" ") }
    assert False, f"wrong input info type: {info_type}"
    


In [7]:
# Smoke test over the whole train split: for every reference-info entry print its
# description, the detected type with the parsed arguments, and the raw content.
# The description is printed first so a failing entry is visible before the
# classification/parsing helpers raise.
query_data_list = get_query_data_list("train")
for query_data in query_data_list:
    ref_info_list = extract_reference_info_list(query_data)
    for info in ref_info_list:
        print(info["Description"])
        info_type = get_ref_info_type(info)
        arguments = extract_ref_info_argument(info, info_type)
        print(f"info_type: {info_type}, arguments: {arguments}")
        print(info["Content"])
        print()

Attractions in Rockford
info_type: Attractions, arguments: {'city': 'Rockford'}
                                   Name  Latitude  Longitude                                                          Address          Phone                                                                 Website     City
       Burpee Museum of Natural History 42.277324 -89.088142                           737 N Main St, Rockford, IL 61103, USA (815) 965-3433                                                  http://www.burpee.org/ Rockford
                  Midway Village Museum 42.280499 -88.984640                        6799 Guilford Rd, Rockford, IL 61107, USA (815) 397-9112                                          https://www.midwayvillage.com/ Rockford
                Discovery Center Museum 42.277105 -89.089430                           711 N Main St, Rockford, IL 61103, USA (815) 963-6769                                   http://www.discoverycentermuseum.org/ Rockford
Tinker Swiss Cottage Museum and 

In [8]:
# Inspect the first query record to see which fields a record carries.
query_data_list[0]

{'org': 'St. Petersburg',
 'dest': 'Rockford',
 'days': 3,
 'visiting_city_number': 1,
 'date': "['2022-03-16', '2022-03-17', '2022-03-18']",
 'people_number': 1,
 'local_constraint': "{'house rule': None, 'cuisine': None, 'room type': None, 'transportation': None}",
 'budget': 1700,
 'query': 'Please help me plan a trip from St. Petersburg to Rockford spanning 3 days from March 16th to March 18th, 2022. The travel should be planned for a single person with a budget of $1,700.',
 'level': 'easy',
 'annotated_plan': "[{'org': 'St. Petersburg', 'dest': 'Rockford', 'days': 3, 'visiting_city_number': 1, 'date': ['2022-03-16', '2022-03-17', '2022-03-18'], 'people_number': 1, 'local_constraint': {'house rule': None, 'cuisine': None, 'room type': None, 'transportation': None}, 'budget': 1700, 'query': 'Please help me plan a trip from St. Petersburg to Rockford spanning 3 days from March 16th to March 18th, 2022. The travel should be planned for a single person with a budget of $1,700.', 'leve

## Load Tools

In [9]:
import os
import sys
# Make the parent of the current working directory importable — presumably where the
# `tools` package used by the following imports lives (TODO confirm).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [10]:
from tools.accommodations.apis import Accommodations
from tools.attractions.apis import Attractions
from tools.cities.apis import Cities
from tools.flights.apis import Flights
from tools.restaurants.apis import Restaurants
from tools.googleDistanceMatrix.apis import GoogleDistanceMatrix

In [11]:
# Instantiate the flights tool in its own cell; it prints a load message when constructed.
flight = Flights()

Flights API loaded.


In [12]:
# Instantiate the remaining tools; each prints a load message when constructed.
# These module-level instances are the ones get_structured_ref_info calls into.
accommodation = Accommodations()
attraction = Attractions()
restaurant = Restaurants()
google_dist = GoogleDistanceMatrix()

Accommodations loaded.
Attractions loaded.
Restaurants loaded.
GoogleDistanceMatrix loaded.


In [13]:
# Size of each tool's `data` attribute, as a quick sanity check that everything loaded.
len(accommodation.data), len(attraction.data), len(restaurant.data), len(flight.data), len(google_dist.data)

(4285, 5302, 9551, 3827360, 17602)

In [14]:
# Look at the record at position 100 of the distance data to see its fields.
google_dist.data.iloc[100]

origin                 Lubbock
destination         San Angelo
cost                       NaN
duration       2 hours 51 mins
distance                295 km
Name: 100, dtype: object

## Mobi Data Conversion

In [47]:
# Groups the reference-info types into two categories:
#   "poi"            - types whose parsed arguments are a single city
#   "transportation" - types whose parsed arguments are an origin/destination pair
#                      (plus a date for flights)
REF_INFO_CATEGORY = {
    "poi": [
        REF_INFO_TYPE.ACCOMMODATIONS,
        REF_INFO_TYPE.ATTRACTIONS,
        REF_INFO_TYPE.RESTAURANTS
    ],
    "transportation": [
        REF_INFO_TYPE.FLIGHTS,
        REF_INFO_TYPE.SELF_DRIVING,
        REF_INFO_TYPE.TAXI
    ]
}


def get_structured_ref_info(ref_info_type, arguments) -> Tuple[Union[dict, None], int]:
    """Look up the structured data for one reference-info entry.

    Dispatches to the ``run_for_mobi`` method of the module-level tool instance
    that matches ``ref_info_type``.

    Args:
        ref_info_type: A REF_INFO_TYPE member.
        arguments: Parsed arguments for the entry: ``{"city"}`` for
            accommodations/attractions/restaurants, ``{"origin", "destination",
            "date"}`` for flights, ``{"origin", "destination"}`` for
            self-driving/taxi.

    Returns:
        ``(content, number)``:
        - ``(None, -1)`` when the tool returned None or the type is not handled.
        - For POI types and flights: the result converted with ``to_dict()``
          after resetting its index, and ``len(result)``.
          NOTE(review): assumes ``run_for_mobi`` returns a pandas DataFrame here — confirm.
        - For self-driving/taxi: the tool's result unchanged, and 1.
    """
    # 3 different categories of pois
    if ref_info_type == REF_INFO_TYPE.ACCOMMODATIONS:
        result = accommodation.run_for_mobi(arguments["city"])
    elif ref_info_type == REF_INFO_TYPE.ATTRACTIONS:
        result = attraction.run_for_mobi(arguments["city"])
    elif ref_info_type == REF_INFO_TYPE.RESTAURANTS:
        result = restaurant.run_for_mobi(arguments["city"])
    # travel options
    elif ref_info_type == REF_INFO_TYPE.FLIGHTS:
        result = flight.run_for_mobi(origin=arguments["origin"],
                                     destination=arguments["destination"],
                                     departure_date=arguments["date"])
    elif ref_info_type in (REF_INFO_TYPE.SELF_DRIVING, REF_INFO_TYPE.TAXI):
        result = google_dist.run_for_mobi(origin=arguments["origin"],
                                          destination=arguments["destination"],
                                          mode=str(ref_info_type))
    else:
        result = None

    if result is None:
        return None, -1

    # result is a dataframe
    if ref_info_type in REF_INFO_CATEGORY["poi"] or ref_info_type == REF_INFO_TYPE.FLIGHTS:
        number = len(result)
        # drop=True discards the old index directly, so this also works when the
        # index is named or the frame already has a column called "index".
        return result.reset_index(drop=True).to_dict(), number

    # ref_info_type is SELF_DRIVING or TAXI, the result is just a dict
    return result, 1

In [48]:
def get_structured_ref_info_list(ref_info_list):
    """Build one structured record per reference-info entry.

    Args:
        ref_info_list: Iterable of reference-info entries.

    Returns:
        A list with, for each entry in order, a dict holding its type label
        ("Info Type"), parsed arguments ("Arguments"), looked-up content
        ("Structured Content") and the accompanying count ("Number").
    """
    def _to_record(ref_info):
        # Classify, parse the arguments, then fetch the structured content.
        kind = get_ref_info_type(ref_info)
        parsed_args = extract_ref_info_argument(ref_info, kind)
        content, count = get_structured_ref_info(kind, parsed_args)
        return {
            "Info Type": str(kind),
            "Arguments": parsed_args,
            "Structured Content": content,
            "Number": count
        }

    return [_to_record(entry) for entry in ref_info_list]

In [49]:
def mobi_data_conversion(data_split:str):
    """Convert every query of one dataset split into a list of Mobi records.

    Each record keeps the query's original fields (with the string-encoded ones
    evaluated into Python objects) and adds the structured reference
    information. The layout matches the one built by the notebook's top-level
    conversion loop.

    Args:
        data_split: One of the names in DATA_SPLIT.

    Returns:
        A list with one dict per query, in dataset order.

    Raises:
        AssertionError: if ``data_split`` is not in DATA_SPLIT.
    """
    assert data_split in DATA_SPLIT
    mobi_data_list = []
    query_data_list = get_query_data_list(data_split)
    for i, query_data in enumerate(query_data_list):
        # Process the query of the current iteration, not always the first one.
        ref_info_list = extract_reference_info_list(query_data)
        structured_ref_info_list = get_structured_ref_info_list(ref_info_list)

        # NOTE(review): eval() executes arbitrary expressions found in the dataset
        # strings; only use this on data you trust.
        mobi_data_list.append({
            "original_data_index": i,
            "org": query_data['org'],
            "dest": query_data['dest'],
            "days": query_data['days'],
            "date": eval(query_data['date']),
            "people_number": query_data["people_number"],
            "budget": query_data["budget"],
            "query": query_data["query"],
            "local_constraint": eval(query_data["local_constraint"]),
            "annotated_plan": eval(query_data["annotated_plan"]),
            # Already evaluated from 'reference_information' above.
            "origin_ref_info": ref_info_list,
            "structured_ref_info": structured_ref_info_list
        })
    return mobi_data_list
    

In [50]:
import os
from collections import defaultdict
from tqdm import tqdm

# Convert the chosen split into Mobi records: one dict per query holding the original
# fields plus the structured reference information.
data_split = "train"
# NOTE(review): not referenced anywhere else in the visible part of this notebook.
possible_local_constraints = defaultdict(list)

mobi_data_list = []
query_data_list = get_query_data_list(data_split)
for i, query_data in tqdm(enumerate(query_data_list), total=len(query_data_list)):
    ref_info_list = extract_reference_info_list(query_data)
    structured_ref_info_list = get_structured_ref_info_list(ref_info_list)
    
    # NOTE(review): eval() executes arbitrary expressions found in the dataset strings;
    # only use this on data you trust. 'reference_information' is evaluated a second
    # time here (extract_reference_info_list already evaluated it above).
    mobi_data_list.append({
        "original_data_index": i,
        "org": query_data['org'],
        "dest": query_data['dest'],
        "days": query_data['days'],
        "date": eval(query_data['date']),
        "people_number": query_data["people_number"],
        "budget": query_data["budget"],
        "query": query_data["query"],
        "local_constraint": eval(query_data["local_constraint"]),
        "annotated_plan": eval(query_data["annotated_plan"]),
        "origin_ref_info": eval(query_data['reference_information']),
        "structured_ref_info": structured_ref_info_list
    })

100%|██████████| 45/45 [00:17<00:00,  2.62it/s]


In [53]:
mobi_data_list[-1]  # last converted record; index with ['origin_ref_info'] to view only the raw reference info

{'original_data_index': 44,
 'org': 'Minneapolis',
 'dest': 'Ohio',
 'days': 7,
 'date': ['2022-03-17',
  '2022-03-18',
  '2022-03-19',
  '2022-03-20',
  '2022-03-21',
  '2022-03-22',
  '2022-03-23'],
 'people_number': 2,
 'budget': 5100,
 'query': 'We require a 7-day travel itinerary for two leaving from Minneapolis and covering three cities in Ohio, starting from March 17th to March 23rd, 2022. Our budget is up to $5,100. We will be accompanied by our pets, so we need pet-friendly accommodations. Our meals should preferably include American, Mediterranean, Chinese, and Italian cuisines. Please note we prefer not to take any flights so our travel plan should not include them.',
 'local_constraint': {'house rule': 'pets',
  'cuisine': ['American', 'Mediterranean', 'Chinese', 'Italian'],
  'room type': None,
  'transportation': 'no flight'},
 'annotated_plan': [{'org': 'Minneapolis',
   'dest': 'Ohio',
   'days': 7,
   'visiting_city_number': 3,
   'date': ['2022-03-17',
    '2022-03-18