In [6]:
import warnings
from typing import List
import math

import pandas as pd
from glob import glob
import re
from bs4 import BeautifulSoup

warnings.filterwarnings("ignore", category=DeprecationWarning)

# Glob pattern matching the experiment workbooks to parse.
DATA_GLOB = "./CPs exp 2a data/*.xlsx"
# How many workbooks (in sorted order) to process; set to None to process all.
MAX_FILES = 1


def is_missing_cell(cell) -> bool:
    """Return True when *cell* is a float NaN.

    This is the same test the notebook has always used to detect a row
    without a situation; it is now applied to the actions column as well.
    """
    return isinstance(cell, float) and math.isnan(cell)


def parse_items(container) -> list:
    """Collect ``(class_name, text)`` tuples for each ``div.item`` in *container*.

    Parameters
    ----------
    container
        A parsed BeautifulSoup document or tag that supports ``find_all``.

    Returns
    -------
    list
        One tuple per matching div. ``class_name`` is the second CSS class
        when the div carries more than one, otherwise the first. ``text`` is
        every text node of the div stripped, joined with single spaces and
        lower-cased, so adjacent child elements do not run together.
    """
    parsed = []
    for item in container.find_all("div", class_="item"):
        classes = item.get("class")
        class_name = classes[1] if len(classes) > 1 else classes[0]
        # `string=True` is the current spelling of the deprecated `text=True`.
        texts = item.find_all(string=True)
        text_content = " ".join(text.strip() for text in texts).strip().lower()
        parsed.append((class_name, text_content))
    return parsed


files = sorted(glob(DATA_GLOB))[:MAX_FILES]

# Parallel lists: entry i of each comes from the same spreadsheet row.
situations = []
actions = []

print(f"number of files: {len(files)}\n")
for file in files:
    dataframe = pd.read_excel(file)

    for situation, action in zip(dataframe["situation"], dataframe["actions"]):
        # A missing situation is recorded as None so row alignment is kept.
        if is_missing_cell(situation):
            situations.append(None)
        else:
            situations.append(parse_items(BeautifulSoup(situation, "html.parser")))

        # A missing action used to crash BeautifulSoup; treat it like a
        # missing situation instead.
        if is_missing_cell(action):
            actions.append(None)
            continue

        soup = BeautifulSoup(action, "html.parser")
        # One entry per action box, keyed by the box's `id` attribute.
        actions.append(
            {
                box.get("id"): parse_items(box)
                for box in soup.find_all("div", class_="box box_action")
            }
        )

assert len(situations) == len(actions)

# Pair each parsed situation with the action parsed from the same row.
situation_action_pairs = list(zip(situations, actions))
# print(f"The number of sitation_action pairs: {len(situation_action_pairs)}")

number of files: 1



In [7]:
# Sanity check: the parsing loop appends to both lists once per row, so the
# two lengths should be equal.
len(situations), len(actions)

(29, 29)

In [12]:
# Dump every pair: its index, the parsed situation, the parsed action, and
# then an empty line as a separator.
for idx, (s, a) in enumerate(situation_action_pairs):
    print(idx, s, a, "", sep="\n")

0
[('locations', 'top of rock pile'), ('objects', 'large rock')]
{'patternbox_1': [('agents', 'robot'), ('actions', 'move to object'), ('objects', 'large rock')], 'patternbox_2': [('agents', 'human'), ('actions', 'move to object'), ('objects', 'large rock')], 'patternbox_3': [('agents', 'robot'), ('actions', 'break object in location'), ('objects', 'large rock'), ('locations', 'right side of rock pile')], 'patternbox_4': [('agents', 'robot'), ('actions', 'break object in location'), ('objects', 'large rock'), ('locations', 'top of rock pile')], 'patternbox_5': [('agents', 'human'), ('actions', 'pick up object in location'), ('objects', 'small rock'), ('locations', 'top of rock pile')], 'patternbox_6': [('agents', 'robot'), ('actions', 'pick up object in location'), ('objects', 'small rock'), ('locations', 'top of rock pile')]}

1
[('locations', 'top of rock pile'), ('objects', 'large rock')]
{'patternbox_1': [('agents', 'robot'), ('actions', 'move to object'), ('objects', 'large rock')

In [13]:
# Peek at the first parsed situation: a list of (class_name, text) tuples,
# or None when the source cell was NaN.
situations[0]

[('locations', 'top of rock pile'), ('objects', 'large rock')]

In [14]:
# Peek at the first parsed action.
# NOTE(review): the saved output below is a list of lists, but the parsing
# cell builds a dict keyed by box id - the output looks stale; re-run to confirm.
actions[0]

[[('agents', 'human'),
  ('actions', 'pick up object in location'),
  ('objects', 'large rock'),
  ('locations', 'top of rock pile')],
 [('agents', 'robot'),
  ('actions', 'break object in location'),
  ('objects', 'large rock'),
  ('locations', 'top of rock pile')],
 [('agents', 'robot'),
  ('actions', 'break object in location'),
  ('objects', 'large rock'),
  ('locations', 'top of rock pile')]]

In [6]:
# Repeat of the earlier length check.
# NOTE(review): the saved output below shows unequal lengths, which the
# `assert len(situations) == len(actions)` in the parsing loop would reject -
# presumably left over from an older version of the notebook; re-run to confirm.
len(situations), len(actions)

(969, 219)

In [32]:
# Repeat of the earlier peek at the first parsed action.
# NOTE(review): the saved output below is a flat list of tuples rather than
# the dict the parsing cell builds - looks stale; re-run to confirm.
actions[0]

[('agents', 'human'),
 ('actions', 'pick up object in location'),
 ('objects', 'large rock'),
 ('locations', 'top of rock pile')]

In [None]:
def print_total_and_unique(label: str, values) -> None:
    """Print the total and distinct number of entries in *values*.

    Entries are compared by their ``str()`` form because the parsed
    situations and actions are lists/dicts, which cannot be put in a set.
    """
    as_text = [str(value) for value in values]
    print(f"number of total and unique {label}: {len(as_text)}, {len(set(as_text))}")


# NOTE(review): this cell used to start by printing counts for
# `all_string_values`, but that name is never assigned in the visible notebook,
# so the line raised NameError on a fresh kernel. Re-enable it once the list
# is built again:
# print(f"number of total and unique all_string_values: {len(all_string_values)}, {len(set(all_string_values))}")

print_total_and_unique("situation_action_pairs", situation_action_pairs)
print_total_and_unique("situations", (s for s, a in situation_action_pairs))
print_total_and_unique("actions", (a for s, a in situation_action_pairs))