In [1]:
from pathlib import Path
import json

input_path = Path("./output/flattend_koksl2022_ann.json")

with open(input_path, "r", encoding="utf-8") as file:
    input_records = json.loads(file.read())


In [2]:
import re

def get_left_wds(target_wd: str, ksl_snt: str):
    ksl_wds = ksl_snt.split("/")
    ksl_wds_norm = [re.sub(r"\[.+?\]$", "", wd) for wd in ksl_wds]

    target_idxs = (i for i, wd in enumerate(ksl_wds_norm) if wd == target_wd)

    def get_left_wd(idx: int):
        if idx < 1:
            return "Init"
        else:
            return ksl_wds[idx - 1]
    return [get_left_wd(i) for i in target_idxs]

left_wds = {
    "아니다": [],
    "없다": [],
    "않다": [],
    "안하다": [],
}

for record in input_records:
    ksl_snt = record["ksl_simple"]
    left_wds["아니다"] += get_left_wds("아니다", ksl_snt)
    left_wds["없다"] += get_left_wds("없다", ksl_snt)
    left_wds["않다"] += get_left_wds("않다", ksl_snt)
    left_wds["안하다"] += get_left_wds("안하다", ksl_snt)

left_wds["아니다"]

['냉동',
 'Init',
 '화면',
 '문제',
 'Init',
 'Init',
 'Init',
 '담당',
 'Init',
 '사다',
 '어렵다',
 '고장(결함)',
 '고장(결함)',
 'Init',
 '오르다',
 'Init',
 '에어컨',
 '세우다',
 '들어(상태)오다',
 '들어(상태)오다',
 'Init',
 'Init',
 '전화번호',
 '#완산구',
 '달라',
 '오다',
 '불량',
 '구입',
 '통세탁기',
 'Init',
 'Init',
 'Init',
 'Init',
 'Init',
 'Init',
 '사다',
 'Init',
 'Init',
 'Init',
 '다르다',
 '문제',
 '들리다',
 '켜지다',
 'Init',
 '통돌이',
 '담당',
 '100 200만원',
 '이사(행동)',
 '사용',
 '무료(공짜)',
 '늦다',
 'Init',
 '작다',
 '말(대화)주다',
 '잘못',
 'Init',
 '원하다',
 '동의',
 '꺾이다',
 'Init',
 '사용',
 '나오다',
 '맞다(옳다)',
 '2',
 '어렵다',
 'Init',
 'Init',
 '멈추다',
 'Init',
 'Init',
 'Init',
 'Init',
 '오래',
 '들어가다',
 'Init',
 '곳',
 'Init',
 'Init',
 '원하다',
 '원하다',
 'Init',
 '앞 열다',
 '모두',
 '다',
 '멀다',
 'Init',
 '지금',
 'Init',
 '담당',
 'Init',
 'Init',
 '같다(추측)',
 '버튼누르다',
 'Init',
 '내거',
 '묻다',
 '그것',
 'Init',
 '전(시간)',
 '대로(어미)',
 '현금',
 'Init',
 'Init',
 'Init',
 'Init',
 'Init',
 '새(신)거',
 'Init',
 '사용',
 'Init',
 '들어(상태)가다',
 '담당',
 '충분',
 '결제',
 '물건',
 'Init',
 'Init'

In [8]:
left_wd_set = {
    *left_wds["아니다"], 
    *left_wds["없다"], 
    *left_wds["않다"], 
    *left_wds["안하다"],
    }

def get_count_by_neg(wd):
    num_ani = left_wds["아니다"].count(wd)
    num_eps = left_wds["없다"].count(wd)
    num_anh = left_wds["않다"].count(wd)
    num_anha = left_wds["안하다"].count(wd)
    return (
        wd,
        num_ani,
        num_eps,
        num_anh,
        num_anha
    )

left_wd_count = [get_count_by_neg(wd) for wd in left_wd_set if not wd == "Init"]
left_wd_count = sorted(left_wd_count, key=lambda wd: wd[1], reverse=True)
left_wd_count


[('다르다', 169, 47, 0, 0),
 ('좋다', 164, 7, 23, 0),
 ('심하다', 97, 21, 5, 0),
 ('많다', 68, 15, 3, 1),
 ('먹다', 66, 103, 49, 4),
 ('주다', 59, 99, 3, 4),
 ('맞다(옳다)', 58, 5, 0, 0),
 ('문제', 55, 319, 0, 0),
 ('뿐', 47, 2, 0, 0),
 ('나쁘다', 47, 34, 6, 0),
 ('있다', 46, 217, 0, 0),
 ('쉽다', 41, 1, 14, 0),
 ('담당', 41, 1, 0, 0),
 ('아프다', 40, 170, 7, 0),
 ('오래', 36, 1, 0, 0),
 ('회원', 35, 40, 0, 0),
 ('암', 32, 1, 0, 0),
 ('원하다', 32, 3, 10, 0),
 ('정도', 31, 1, 0, 0),
 ('필요', 30, 106, 0, 1),
 ('오다', 29, 16, 5, 1),
 ('곳', 28, 61, 0, 0),
 ('크다', 23, 22, 1, 1),
 ('수술', 23, 17, 0, 6),
 ('사람', 22, 88, 0, 0),
 ('가능', 21, 4, 0, 0),
 ('되다', 21, 3, 0, 0),
 ('특별', 20, 60, 0, 0),
 ('병(질병)', 20, 10, 0, 0),
 ('끝', 19, 2, 0, 0),
 ('어렵다', 18, 28, 4, 0),
 ('높다', 17, 1, 4, 0),
 ('여기', 17, 2, 0, 0),
 ('길다', 17, 1, 1, 0),
 ('그것', 17, 3, 0, 0),
 ('가다', 17, 4, 11, 2),
 ('꼭', 16, 2, 0, 0),
 ('포함', 16, 24, 0, 0),
 ('다름', 15, 2, 0, 0),
 ('일(업무)', 15, 48, 0, 2),
 ('목적', 15, 6, 0, 0),
 ('자주(빈번)', 15, 2, 0, 0),
 ('검사(검진)', 14, 23, 0, 1),
 

In [9]:
from openpyxl import Workbook

wb = Workbook()
sheet = wb["Sheet"]

header = ("left_word", "with_ani", "with_eps", "with_anh", "with_anha")

for row in [header, *left_wd_count]:
    sheet.append(row)

wb.save("./output/left_word_count.xlsx")
