In [1]:
# FrameNetのデータの前処理でのデータ数の変化を確認する
from pathlib import Path
import pandas as pd
import re
from nltk.corpus import framenet
import xml.etree.ElementTree as ET 
import glob

# Relax pandas' display limits before anything is printed: no cap on the number
# of rows, and a 1000-character budget for both the line width and each cell.
pd.options.display.max_rows = None
pd.options.display.width = 1000
pd.options.display.max_colwidth = 1000

# Location of the preprocessed FrameNet exemplars (JSON Lines: one record per line).
input_file = Path("../data/preprocessing/framenet/preprocess/original/exemplars.jsonl")

# Load the preprocessed exemplars into the frame that the check_* functions read.
df = pd.read_json(input_file, lines=True)


In [2]:
def check_multi_word_lu(text=None):
    """Print exemplar and type counts for LUs consisting of 1 to 5 words.

    Reads the module-level ``df``. The number of words of an LU is the number
    of whitespace-separated tokens in its ``lu_name`` value. For each word
    count the function prints the number of exemplars, the number of distinct
    LU names, and the three most frequent LU names.

    Args:
        text: Unused. It is optional so the function can be called without
            arguments like the other ``check_*`` helpers, while calls that
            still pass a value keep working.

    Returns:
        None. All results are written to stdout.
    """
    print(f"FrameNet（前処理後）の用例数：{len(df)}")
    print()

    # The token count per row does not depend on the loop variable,
    # so compute it once instead of once per iteration.
    word_counts = df["lu_name"].apply(lambda lu_name: len(lu_name.split()))

    for i in range(1, 6):
        lu = df[word_counts == i]
        print(f"{i}単語LUの用例数：{len(lu)}")
        value_counts = lu["lu_name"].value_counts()
        print(f"{i}単語LUの種類数：{len(value_counts)}")
        print(value_counts.head(3))
        print()

In [None]:
def check_verb_lu():
    """Print, as a markdown table, the exemplar count of every (verb, LU) pair.

    Reads the module-level ``df``. Each output row holds the exemplar count of
    one LU (``lu_size``) together with the total exemplar count of its verb
    (``verb_size``). Rows are ordered by ``verb_size`` and then ``lu_size``,
    both descending.

    Returns:
        None. The table is written to stdout, followed by a blank line.
    """
    # Exemplars per verb; kept as a Series indexed by verb for the join below.
    per_verb = df.groupby('verb').size().rename('verb_size')

    # Optional diagnostics (disabled): mean group sizes.
    # print(per_verb.mean())
    # print(df.groupby(['verb', 'lu_name']).size().mean())

    # Exemplars per (verb, LU) pair, flattened back into ordinary columns.
    per_lu = (
        df.groupby(['verb', 'lu_name'])
        .size()
        .rename('lu_size')
        .reset_index()
    )

    # Attach each verb's total to its LU rows, then rank: biggest verbs first,
    # and within one verb the LUs with the most exemplars first.
    ranked = per_lu.merge(per_verb, on='verb').sort_values(
        ['verb_size', 'lu_size'], ascending=[False, False]
    )

    print(ranked.to_markdown())
    print()

    # Alternative ordering by LU size only (disabled):
    # ranked = per_lu.merge(per_verb, on='verb').sort_values(['lu_size'], ascending=[False])
    # print(ranked.to_markdown())
# check_verb_lu()

|      | verb             | lu_name              |   lu_size |   verb_size |
|-----:|:-----------------|:---------------------|----------:|------------:|
| 2356 | take             | take.v               |       163 |         424 |
| 2353 | take             | take place.v         |       123 |         424 |
| 2350 | take             | take off.v           |        75 |         424 |
| 2354 | take             | take to task.v       |        25 |         424 |
| 2347 | take             | take after.v         |        20 |         424 |
| 2351 | take             | take out.v           |        11 |         424 |
| 2348 | take             | take captive.v       |         3 |         424 |
| 2352 | take             | take part.v          |         2 |         424 |
| 2349 | take             | take effect.v        |         1 |         424 |
| 2355 | take             | take to the air.v    |         1 |         424 |
| 1969 | say              | say.v                |       381 |         381 |

In [16]:
def check_polysemous_lu():
    """Print every LU that occurs with more than one frame.

    Reads the module-level ``df``. An LU counts as polysemous when its
    ``lu_name`` is paired with more than one distinct ``frame_name``. The
    function prints the number of such LUs and then one line per LU, in
    ascending order of frame count, with the LU name padded to 20 characters,
    the number of frames, and the frames themselves.

    Returns:
        None. All results are written to stdout.
    """
    frames_per_lu = df.groupby('lu_name')['frame_name']

    # Collect the distinct frames of every LU in one grouped pass rather than
    # re-filtering the whole DataFrame once per polysemous LU.
    frames_by_lu = frames_per_lu.unique()

    lu_frame_counts = frames_per_lu.nunique()
    polysemous_lu = lu_frame_counts[lu_frame_counts > 1].sort_values()
    print(f"多義性のあるLU数：{len(polysemous_lu)}")

    for lu_name in polysemous_lu.index:
        frames = frames_by_lu.loc[lu_name]
        print(f"{lu_name.ljust(20)}: {len(frames)} frames -> {frames}")
check_polysemous_lu()

多義性のあるLU数：750
zigzag.v            : 2 frames -> ['Motion' 'Path_shape']
stalk.v             : 2 frames -> ['Cotheme' 'Self_motion']
judge.v             : 2 frames -> ['Assessing' 'Legal_rulings']
jab.v               : 2 frames -> ['Cause_harm' 'Cause_impact']
issue.v             : 2 frames -> ['Creating' 'Emanating']
steal.v             : 2 frames -> ['Self_motion' 'Theft']
invent.v            : 2 frames -> ['Achieving_first' 'Coming_up_with']
intermix.v          : 2 frames -> ['Amalgamation' 'Cause_to_amalgamate']
instruct.v          : 2 frames -> ['Education_teaching' 'Request']
install.v           : 2 frames -> ['Change_of_leadership' 'Installing']
injure.v            : 2 frames -> ['Cause_harm' 'Experience_bodily_harm']
stew.v              : 2 frames -> ['Apply_heat' 'Emotion_heat']
inform.v            : 2 frames -> ['Reporting' 'Telling']
sting.v             : 2 frames -> ['Experiencer_obj' 'Perception_body']
kid.v               : 2 frames -> ['Giving_birth' 'Prevarication']
induc

In [4]:
def check_corpus(lu_xml_path="/data/data/fndata-1.7/lu/lu2.xml"):
    """Print the corpora and the target-annotated sentences of one FrameNet LU file.

    Two tables are printed:

    1. One row per ``header/corpus`` element, with its ``ID``, ``name`` and
       ``description`` attributes.
    2. One row per annotation set that contains a layer named ``"Target"``,
       with the sentence's ``corpID`` (``-1`` when the attribute is absent),
       the annotation set's ``ID`` and ``cDate``, the root element's ``name``
       attribute, and the sentence text.

    Args:
        lu_xml_path: Path of the lexical-unit XML file to inspect. Defaults to
            the location that was previously hard-coded in the function.

    Returns:
        None. Both tables are written to stdout.
    """
    ns = {'fn': 'http://framenet.icsi.berkeley.edu'}

    tree = ET.parse(lu_xml_path)
    root = tree.getroot()

    corpuses = root.findall("fn:header/fn:corpus", ns)
    corpus_df = pd.DataFrame(
        [corpus.attrib for corpus in corpuses],
        columns=["ID", "name", "description"],
    )
    print(corpus_df)

    name = root.attrib["name"]
    sentences = root.findall("fn:subCorpus/fn:sentence", ns)

    # Collect plain records and build the frame once at the end; concatenating
    # inside the loop copies the whole frame for every added row.
    rows = []
    for sentence in sentences:
        corp_id = sentence.attrib.get("corpID", -1)
        text = sentence.find("fn:text", ns).text
        for annotationSet in sentence.findall("fn:annotationSet", ns):
            layers = annotationSet.findall("fn:layer", ns)
            # At most one row per annotation set, and only when it has a Target layer.
            if any(layer.attrib["name"] == "Target" for layer in layers):
                rows.append(
                    {
                        "corpID": corp_id,
                        "annotationSet_ID": annotationSet.attrib["ID"],
                        "cDate": annotationSet.attrib["cDate"],
                        "name": name,
                        "text": text,
                    }
                )

    sentence_df = pd.DataFrame(
        rows, columns=["corpID", "annotationSet_ID", "cDate", "name", "text"]
    )
    print(sentence_df)