In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Set the pandas display options to None so DataFrames are shown in full
# (rows, columns, overall width and per-cell column width).
for display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(display_option, None)

In [2]:
# Path of the folder that holds the CSV data
data_folder = 'edit_data'

# Collect every CSV file in the data folder. os.listdir returns names in
# arbitrary order, so sort them to make the row order of df reproducible.
csv_files = sorted(f for f in os.listdir(data_folder) if f.endswith('.csv'))

# Read each file into its own frame and concatenate once at the end;
# calling pd.concat inside the loop re-copies all accumulated rows every time.
frames = []
for file_name in csv_files:
    file_path = os.path.join(data_folder, file_name)
    frames.append(pd.read_csv(file_path))

# Fail with a clear message instead of a KeyError on 'participant_id' below.
if not frames:
    raise FileNotFoundError(f"No CSV files found in '{data_folder}'")

df = pd.concat(frames, ignore_index=True)

# Report the number of distinct participants
num_participants = df['participant_id'].nunique()
print(f'被験者数: {num_participants}')

被験者数: 30


In [3]:
# Keep only the rows where TorF is 'F', input_text is present and angle is not 0.
row_mask = df['TorF'].eq('F') & df['input_text'].notna() & df['angle'].ne(0)
df_F = df.loc[row_mask].copy()

# Turn scale into an ordered categorical so that sorting follows this order
scale_order = ['intended', '50cm', '75cm', '100cm']
df_F['scale'] = pd.Categorical(df_F['scale'], categories=scale_order, ordered=True)

# Coerce the condition columns to numbers (values that cannot be parsed become NaN)
for numeric_col in ['angle', 'period', 'line_width']:
    df_F[numeric_col] = pd.to_numeric(df_F[numeric_col], errors='coerce')

# Sort by scale, then angle, period and line_width (all ascending)
df_F = df_F.sort_values(by=['scale', 'angle', 'period', 'line_width']).reset_index(drop=True)

# Convert input_text to integers: use the nullable Int64 dtype when coercion
# produced missing values, and plain int otherwise.
df_F['input_text'] = pd.to_numeric(df_F['input_text'], errors='coerce')
target_dtype = 'Int64' if df_F['input_text'].isna().any() else int
df_F['input_text'] = df_F['input_text'].astype(target_dtype)

In [17]:
import difflib

# Name of the column holding the reference value that input_text is compared against
reference_col = 'text' 

def levenshtein(a: str, b: str) -> int:
    """Return the Levenshtein (edit) distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``a`` into ``b``.
    Falsy arguments (``None`` or ``""``) are treated as the empty string.
    """
    a = a or ""
    b = b or ""
    if not a:
        return len(b)
    if not b:
        return len(a)

    # previous_row[k] is the distance between the processed prefix of `a`
    # and the first k characters of `b`.
    previous_row = list(range(len(b) + 1))
    for row_no, char_a in enumerate(a, start=1):
        current_row = [row_no]
        for col_no, char_b in enumerate(b, start=1):
            diagonal = previous_row[col_no - 1]
            if char_a == char_b:
                # Matching characters cost nothing extra
                cost = diagonal
            else:
                # Cheapest of substitution, deletion and insertion
                cost = 1 + min(diagonal, previous_row[col_no], current_row[col_no - 1])
            current_row.append(cost)
        previous_row = current_row
    return previous_row[-1]

def label_miss(ref, hyp):
    """Describe how ``hyp`` differs from ``ref`` as a readable list of edits.

    Both values are converted with ``str`` and compared using
    ``difflib.SequenceMatcher``. Every non-equal opcode becomes one entry:
    ``"replace: X -> Y"``, ``"delete: X"`` or ``"insert: Y"``.

    Args:
        ref: Reference value.
        hyp: Value to compare against the reference.

    Returns:
        ``None`` if either value is missing according to ``pd.isna``;
        otherwise the entries joined with ``"; "`` (an empty string when
        the two string forms are identical).
    """
    if pd.isna(ref) or pd.isna(hyp):
        return None
    ref = str(ref)
    hyp = str(hyp)

    parts = []
    for tag, i1, i2, j1, j2 in difflib.SequenceMatcher(None, ref, hyp).get_opcodes():
        if tag == 'equal':
            continue
        ref_seg = ref[i1:i2]
        hyp_seg = hyp[j1:j2]
        if tag == 'replace':
            parts.append(f"replace: {ref_seg} -> {hyp_seg}")
        elif tag == 'delete':
            parts.append(f"delete: {ref_seg}")
        else:  # insert
            parts.append(f"insert: {hyp_seg}")

    # The edit distance that used to be computed here was never used, so the
    # per-row Levenshtein call has been removed; the returned text is unchanged.
    return "; ".join(parts)

# Add a 'miss' column to df_F describing how input_text differs from the reference column
if reference_col not in df_F.columns:
    # Without the reference column there is nothing to compare against, so the
    # column is filled with None (create the reference column first if needed).
    df_F['miss'] = None
else:
    def _row_miss(row):
        # Compare the reference value of this row with its input_text value
        return label_miss(row[reference_col], row.get('input_text'))

    df_F['miss'] = df_F.apply(_row_miss, axis=1)

#display(df_F[['participant_id', 'scale', 'angle', 'period', 'line_width','text', 'input_text','miss']])

In [24]:
# Count the occurrences of each miss label within every angle (missing labels included)
miss_counts = df_F.groupby('angle')['miss'].value_counts(dropna=False)
angle_miss = miss_counts.rename('count').reset_index()

# Order by angle ascending, then by count descending within each angle
angle_miss = angle_miss.sort_values(
    ['angle', 'count'], ascending=[True, False]
).reset_index(drop=True)

# Keep only the rows whose count is at least 1.
# NOTE(review): a threshold of 1 presumably keeps every row; raise it to drop rare labels.
angle_miss_filtered = angle_miss.loc[angle_miss['count'].ge(1)]

# For every angle, show its 10 most frequent miss labels and save them to a CSV file
for angle in angle_miss_filtered['angle'].unique():
    print(f"Angle: {angle}")
    same_angle = angle_miss_filtered['angle'] == angle
    top_miss = angle_miss_filtered.loc[same_angle].head(10)
    display(top_miss[['miss', 'count']])

    # Write the same rows, including the angle column, to a per-angle CSV
    output_filename = f'angle_{angle}_miss.csv'
    top_miss[['angle', 'miss', 'count']].to_csv(output_filename, index=False)

Angle: 45.0


Unnamed: 0,miss,count
0,replace: 5 -> 6,8
1,replace: 0 -> 6,6
2,replace: 0 -> 8,6
3,replace: 0 -> 9,6
4,replace: 3 -> 2,6
5,replace: 9 -> 5,6
6,replace: 8 -> 9,5
7,replace: 6 -> 5,4
8,replace: 8 -> 2,4
9,replace: 8 -> 5,4


Angle: 90.0


Unnamed: 0,miss,count
110,replace: 6 -> 5,37
111,replace: 5 -> 6,25
112,replace: 8 -> 3,25
113,replace: 8 -> 6,23
114,replace: 1 -> 0,19
115,replace: 3 -> 8,18
116,replace: 5 -> 3,17
117,replace: 9 -> 3,17
118,replace: 6 -> 3,13
119,replace: 8 -> 5,12


Angle: 135.0


Unnamed: 0,miss,count
441,replace: 6 -> 5,25
442,replace: 8 -> 6,19
443,replace: 9 -> 5,18
444,replace: 5 -> 8,17
445,replace: 6 -> 8,17
446,replace: 8 -> 5,15
447,replace: 9 -> 8,12
448,replace: 1 -> 3,11
449,replace: 8 -> 9,11
450,replace: 7 -> 1,9
