In [1]:
# FrameNetのデータの前処理でのデータ数の変化を確認する
from pathlib import Path
import pandas as pd
import re

# Display settings: never truncate rows, and allow very wide lines and cells
# so that long text columns are printed in full.
for _option, _value in (
    ('display.max_rows', None),
    ('display.width', 1000),
    ('display.max_colwidth', 1000),
):
    pd.set_option(_option, _value)

# Load the preprocessed FrameNet data from a JSON Lines file.
input_file = Path("../data/preprocessing/framenet/preprocess/original/exemplars.jsonl")
df = pd.read_json(input_file, lines=True)


In [2]:
# Report, for each LU-name length in whitespace-separated words, how many
# rows of `df` have an LU of that length, how many distinct LU names there
# are, and the three most frequent LU names.
print(f"FrameNet（前処理後）の用例数：{len(df)}")
print()

# Count the words of every LU name once, up front. The count does not depend
# on the loop variable, so recomputing it inside the loop only repeats work.
lu_word_counts = df["lu_name"].str.split().str.len()

# Always report lengths 1-5, and extend the range when longer LU names exist
# so that no row is silently left out of the per-length breakdown.
longest_lu = lu_word_counts.max()
max_words = 5 if pd.isna(longest_lu) else max(5, int(longest_lu))

for i in range(1, max_words + 1):
    lu = df[lu_word_counts == i]
    print(f"{i}単語LUの用例数：{len(lu)}")
    value_counts = lu["lu_name"].value_counts()
    print(f"{i}単語LUの種類数：{len(value_counts)}")
    print(value_counts.head(3))
    print()

FrameNet（前処理後）の用例数：81270

1単語LUの用例数：78864
1単語LUの種類数：2465
lu_name
say.v     381
tell.v    333
use.v     281
Name: count, dtype: int64

2単語LUの用例数：2261
2単語LUの種類数：157
lu_name
take place.v    123
there be.v      120
had better.v     80
Name: count, dtype: int64

3単語LUの用例数：131
3単語LUの種類数：10
lu_name
take to task.v    25
do the trick.v    22
set on fire.v     20
Name: count, dtype: int64

4単語LUの用例数：12
4単語LUの種類数：4
lu_name
nip in the bud.v    8
come to a end.v     2
put an end to.v     1
Name: count, dtype: int64

5単語LUの用例数：2
5単語LUの種類数：1
lu_name
fly in the face of.v    2
Name: count, dtype: int64



In [19]:
# Break down the row counts of `df` by verb and by LU within each verb.

# Number of rows per (verb, lu_name) pair, with the group keys turned back
# into ordinary columns.
verb_lu_sizes = (
    df.groupby(['verb', 'lu_name'])
    .size()
    .rename('lu_size')
    .reset_index()
)

# Number of rows per verb, as a Series named 'verb_size'.
verb_sizes = df.groupby('verb').size().rename('verb_size')

# Optional summary statistics (disabled):
# print(verb_sizes.mean())
# print(df.groupby(['verb', 'lu_name']).size().mean())

# Attach each verb's total to every one of its LU rows.
merged = verb_lu_sizes.merge(verb_sizes, on='verb')

# Largest verbs first; ties on verb_size are ordered by descending lu_size.
sorted_group = merged.sort_values(by=['verb_size', 'lu_size'], ascending=False)

print(sorted_group.to_markdown())
print()
# Alternative view, ordered by lu_size only (disabled):
# sorted_group = merged.sort_values(['lu_size'], ascending=[False])
# print(sorted_group.to_markdown())

|      | verb             | lu_name              |   lu_size |   verb_size |
|-----:|:-----------------|:---------------------|----------:|------------:|
| 2356 | take             | take.v               |       163 |         424 |
| 2353 | take             | take place.v         |       123 |         424 |
| 2350 | take             | take off.v           |        75 |         424 |
| 2354 | take             | take to task.v       |        25 |         424 |
| 2347 | take             | take after.v         |        20 |         424 |
| 2351 | take             | take out.v           |        11 |         424 |
| 2348 | take             | take captive.v       |         3 |         424 |
| 2352 | take             | take part.v          |         2 |         424 |
| 2349 | take             | take effect.v        |         1 |         424 |
| 2355 | take             | take to the air.v    |         1 |         424 |
| 1969 | say              | say.v                |       381 |         381 |