In [None]:
import benepar, spacy
# Load the spaCy English pipeline that every later cell uses through `nlp`.
nlp = spacy.load('en_core_web_md')

# Sample document; note it is parsed *before* the benepar component is added
# below, so it only carries the annotations of the base pipeline.
doc = nlp("The time for action is now. It's never too late to do something.")

# Attach the benepar component. The registration API differs between spaCy 2
# (component object) and later versions (registered factory name + config).
is_spacy_v2 = spacy.__version__.startswith('2')
if not is_spacy_v2:
    nlp.add_pipe("benepar", config={"model": "benepar_en3"})
else:
    nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))

In [None]:
def find_root_verb_and_its_dobj(tree_root):
    """Find the first verb at or below ``tree_root`` and its direct object.

    The search is depth-first and follows the order of ``tree_root.children``.

    Args:
        tree_root: A token-like object exposing ``pos_``, ``dep_``, ``lemma_``
            and an iterable ``children`` of objects of the same kind.

    Returns:
        A ``(verb_lemma, noun_lemma)`` tuple.
        ``noun_lemma`` is ``None`` when the verb has no child that is both a
        ``dobj`` dependency and a ``NOUN``.
        ``(None, None)`` is returned when no verb exists in the whole subtree.
    """
    # first check if the current node and its children satisfy the condition
    if tree_root.pos_ == "VERB":
        for child in tree_root.children:
            if child.dep_ == "dobj" and child.pos_ == "NOUN":
                return tree_root.lemma_, child.lemma_
        return tree_root.lemma_, None
    # if not, search every child's subtree in order; keep going when a subtree
    # contains no verb instead of giving up after the first child
    for child in tree_root.children:
        verb, noun = find_root_verb_and_its_dobj(child)
        if verb is not None:
            return verb, noun
    # if no children satisfy the condition, return None
    return None, None

def find_root_verb_and_its_dobj_in_string(s):
    """Extract the verb / direct-object pair from the first sentence of ``s``.

    The text is parsed with the module-level ``nlp`` pipeline and the root
    token of the first sentence is passed to ``find_root_verb_and_its_dobj``.

    Args:
        s: The text to analyse.

    Returns:
        The ``(verb_lemma, noun_lemma)`` tuple produced by
        ``find_root_verb_and_its_dobj``, or ``(None, None)`` when the parsed
        document contains no sentences.
    """
    doc = nlp(s)
    sentences = list(doc.sents)
    # Guard against a document without sentences (e.g. empty input), which
    # would otherwise raise IndexError on the [0] lookup.
    if not sentences:
        return None, None
    return find_root_verb_and_its_dobj(sentences[0].root)

In [None]:
import os
import pandas as pd
import json
from tqdm import tqdm

generated_data_path = "/mnt/16t/oyx/EasyInstruct/data/evol_instruct_5k.jsonl"

# Load the generated tasks. ".jsonl" is read as one JSON document per line,
# ".json" as a single JSON document. Files are opened with `with` so the
# handles are closed as soon as parsing finishes.
if generated_data_path.endswith(".jsonl"):
    with open(generated_data_path, "r") as f:
        # Skip blank lines (such as a trailing newline) that json.loads rejects.
        machine_generated_tasks = [json.loads(line) for line in f if line.strip()]
elif generated_data_path.endswith(".json"):
    with open(generated_data_path, "r") as f:
        machine_generated_tasks = json.load(f)
else:
    raise ValueError("Unknown file format")

def process_instruction(instruction):
    """Build the verb/noun record for a single instruction.

    Args:
        instruction: The instruction text to analyse.

    Returns:
        A dict with the keys ``"verb"``, ``"noun"`` and ``"instruction"``, or
        ``None`` when the extraction raised. In that case the exception and
        the offending instruction are printed so the failure stays visible.
    """
    try:
        root_verb, direct_object = find_root_verb_and_its_dobj_in_string(instruction)
    except Exception as err:
        # Best effort: report the problem and let the caller drop this entry.
        print(err)
        print(instruction)
        return None
    return {"verb": root_verb, "noun": direct_object, "instruction": instruction}
    
# Cache file for the extracted phrases; the name is derived from the input
# file's base name, so it is a path relative to the current working directory.
raw_phrases_file = os.path.splitext(os.path.basename(generated_data_path))[0] + "_raw_phrases.json"

if os.path.exists(raw_phrases_file):
    print("Raw phrases file already exists, skip processing")
else:
    # dict.fromkeys de-duplicates while keeping first-seen order, so the cache
    # file has the same record order on every run (iteration order of a set of
    # strings can change between interpreter runs).
    unique_instructions = dict.fromkeys(task["instruction"] for task in machine_generated_tasks)
    # Still exposed as a set, as before, for any code that relies on it.
    instructions = set(unique_instructions)
    raw_phrases = []

    for instruction in tqdm(unique_instructions):
        # Entries are None for instructions whose processing failed.
        raw_phrases.append(process_instruction(instruction))

    with open(raw_phrases_file, "w") as f:
        json.dump(raw_phrases, f, indent=4)

In [None]:
# Reload the phrase records from the cache file (closing the handle afterwards).
with open(raw_phrases_file, "r") as f:
    raw_phrases = json.load(f)
# Drop the None placeholders left by instructions that failed to process.
raw_phrases = [p for p in raw_phrases if p is not None]
# Naming the columns keeps the frame well-formed even when no record survived,
# so the column selection below cannot raise KeyError.
raw_phrases = pd.DataFrame(raw_phrases, columns=["verb", "noun", "instruction"])
# Keep only rows where both a verb and a noun were found.
phrases = raw_phrases.dropna()
# Frequency of every (verb, noun) pair, most common first.
phrases[["verb", "noun"]].groupby(["verb", "noun"]).size().sort_values(ascending=False)

In [None]:
# How many of the most frequent verbs to keep, and how many nouns per verb.
TOP_VERB_COUNT = 20
NOUNS_PER_VERB = 4

# The most frequent verbs; columns are "verb" and 0 (the occurrence count).
top_verbs = phrases[["verb"]].groupby(["verb"]).size().nlargest(TOP_VERB_COUNT).reset_index()

# Count each (verb, noun) pair among the top verbs.
verb_noun_counts = (
    phrases[phrases["verb"].isin(top_verbs["verb"].tolist())]
    .groupby(["verb", "noun"])
    .size()
    .reset_index(name="count")
)

# Keep the most frequent nouns of every verb. Sorting first and then taking
# groupby().head() avoids groupby().apply(), whose callback only sees the
# "verb" column in pandas versions that still pass grouping columns to it.
df = (
    verb_noun_counts
    .sort_values(["verb", "count"], ascending=[True, False])
    .groupby("verb")
    .head(NOUNS_PER_VERB)
    .reset_index(drop=True)
)
df

In [None]:
import plotly.graph_objects as go
import plotly.express as px

# df["blank"] = "ROOT"
# df = phrases.groupby(["verb", "noun"]).size().sort_values(ascending=False).head(5).reset_index().rename(columns={0: "count"})

# Keep only the verb/noun pairs that occur more than 5 times.
df = df.loc[df["count"] > 5]

# Sunburst chart: verbs on the inner ring, their nouns on the outer ring,
# with slice sizes taken from the "count" column.
fig = px.sunburst(data_frame=df, path=['verb', 'noun'], values='count')
fig.update_layout(
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
    font_family="Times New Roman",
)
fig.show()

# Also export the chart as HTML; the file name is derived from the input
# file's base name, so it is written relative to the working directory.
source_stem = os.path.splitext(os.path.basename(generated_data_path))[0]
verb_noun_file = source_stem + "_verb_noun.html"
fig.write_html(verb_noun_file)