In [47]:
import warnings
from typing import List
import math

import pandas as pd
from glob import glob
import re
from bs4 import BeautifulSoup

warnings.filterwarnings("ignore", category=DeprecationWarning)

# Get a list of all xlsx files
files = sorted(glob("CPs exp 2a data/*.xlsx"))

# all_string_values = []
situations = []
actions = []

print(f"number of files: {len(files)}\n")
# Loop through the list of files and read each file
for file in files:
    print(f"processing `{file}` ...")
    dataframe = pd.read_excel(file)
    dataframe.drop_duplicates(subset="name", keep="last", inplace=True)
    # Now you can do something with the data
    assert len(dataframe.situation) == len(dataframe.actions)
    print(
        f"There are in total of {len(dataframe.situation)} situation-action pairs "
        f"in the file {file}.\n"
    )

    for situation, action in zip(dataframe.situation, dataframe.actions):
        # Parse HTML content
        situation_parsed = []

        if isinstance(situation, float) and math.isnan(situation):
            situation_parsed = None

        else:
            soup = BeautifulSoup(situation, "html.parser")

            # Extract data
            for div in soup.find_all("div", class_="item"):
                # Extract class and text content

                class_name = (
                    div.get("class")[1]
                    if len(div.get("class")) > 1
                    else div.get("class")[0]
                )
                # Process the text content more carefully to add spaces around buttons
                # or special divs
                texts = div.find_all(text=True)
                text_content = " ".join(text.strip() for text in texts).strip().lower()
                situation_parsed.append((class_name, text_content))

        situations.append(situation_parsed)

        soup = BeautifulSoup(action, "html.parser")
        action_parsed = {}

        # Extract boxes
        boxes = soup.find_all("div", class_="box box_action")

        for box in boxes:
            box_id = box.get("id")
            action_parsed[box_id] = []

            items = box.find_all("div", class_="item")
            for item in items:
                class_name = (
                    item.get("class")[1]
                    if len(item.get("class")) > 1
                    else item.get("class")[0]
                )
                # Process the text content more carefully to add spaces around buttons
                # or special divs
                texts = item.find_all(text=True)
                text_content = " ".join(text.strip() for text in texts).strip().lower()
                action_parsed[box_id].append((class_name, text_content))

        actions.append(action_parsed)
        assert len(situations) == len(actions)

situation_action_pairs = list(zip(situations, actions))

print(
    f"In total of {len(situation_action_pairs)} situation-action pairs have "
    f"been processed.\n"
)

# Create a new list with elements that don't have None as the situation
situation_action_pairs = [
    (situation, action)
    for situation, action in situation_action_pairs
    if situation is not None
]
print(
    f"After removing None, now there is {len(situation_action_pairs)} "
    f"situation-action pairs.\n"
)

for idx, (situation, actions_dict) in enumerate(situation_action_pairs.copy()):
    actions_list = []
    for action in actions_dict.values():
        if action:
            actions_list.append(action)

    if actions_list:
        situation_action_pairs[idx] = situation, actions_list

print(
    f"After removing empty actions, now there is {len(situation_action_pairs)} "
    f"situation-action pairs.\n"
)

number of files: 10

processing `CPs exp 2a data/p01_interaction_patterns.xlsx` ...
There are in total of 14 situation-action pairs in the file CPs exp 2a data/p01_interaction_patterns.xlsx.

processing `CPs exp 2a data/p03_interaction_patterns.xlsx` ...
There are in total of 15 situation-action pairs in the file CPs exp 2a data/p03_interaction_patterns.xlsx.

processing `CPs exp 2a data/p04_interaction_patterns.xlsx` ...
There are in total of 13 situation-action pairs in the file CPs exp 2a data/p04_interaction_patterns.xlsx.

processing `CPs exp 2a data/p05_interaction_patterns.xlsx` ...
There are in total of 14 situation-action pairs in the file CPs exp 2a data/p05_interaction_patterns.xlsx.

processing `CPs exp 2a data/p06_interaction_patterns.xlsx` ...
There are in total of 13 situation-action pairs in the file CPs exp 2a data/p06_interaction_patterns.xlsx.

processing `CPs exp 2a data/p07_interaction_patterns.xlsx` ...
There are in total of 23 situation-action pairs in the file C

In [50]:
for situation, actions in situation_action_pairs:
    print(f"situation: {situation}")
    print(f"actions: {actions}")
    print()


situation: [('locations', 'top of rock pile'), ('objects', 'large rock'), ('locations', 'bottom of rock pile'), ('objects', 'large rock')]
actions: [[('agents', 'robot'), ('actions', 'move to object'), ('objects', 'large rock')], [('agents', 'human'), ('actions', 'move to object'), ('objects', 'large rock')], [('agents', 'robot'), ('actions', 'break object in location'), ('objects', 'large rock'), ('locations', 'right side of rock pile')], [('agents', 'robot'), ('actions', 'break object in location'), ('objects', 'large rock'), ('locations', 'bottom of rock pile')], [('agents', 'human'), ('actions', 'pick up object in location'), ('objects', 'small rock'), ('locations', 'top of rock pile')], [('agents', 'robot'), ('actions', 'pick up object in location'), ('objects', 'small rock'), ('locations', 'top of rock pile')]]

situation: [('counters', 'all'), ('objects', 'small rock'), ('locations', 'top of rock pile')]
actions: [[('agents', 'human'), ('actions', 'move back and forth in locatio

In [66]:
vocab  = {"text": set(), "class": set()}

for situation, actions in situation_action_pairs:
    for class_name, text_content in situation:
        vocab["text"].add(text_content)
        vocab["class"].add(class_name)

    for action in actions:
        for class_name, text_content in action:
            vocab["text"].add(text_content)
            vocab["class"].add(class_name)


In [63]:
for key, val in vocab.items():
    for v in val:
        print(v)

text
above rock pile
one
move to location
zero
break object in location
right side of rock pile
move to actor
robot
move back and forth in location
on top of location
drop object in location
move to object
human
stand still in location
left side of field
on top of object
top of rock pile
bottom of rock pile
left side of rock pile
all
brown rock
pick up object in location
on top of actor
small rock
victim
large rock
right side of field
class
locations
actions
counters
objects
agents


In [55]:
class_name

'locations'

In [54]:
vocab

{'above rock pile',
 'all',
 'bottom of rock pile',
 'break object in location',
 'brown rock',
 'drop object in location',
 'human',
 'large rock',
 'left side of field',
 'left side of rock pile',
 'move back and forth in location',
 'move to actor',
 'move to location',
 'move to object',
 'on top of actor',
 'on top of location',
 'on top of object',
 'one',
 'pick up object in location',
 'right side of field',
 'right side of rock pile',
 'robot',
 'small rock',
 'stand still in location',
 'top of rock pile',
 'victim',
 'zero'}