## Regex to label for KoBERT_01
- 각 regex별로 적용시켜야 하는 labeling이 다르기 때문에 항목별로 나누어서 진행.
- 해당 노트에서는 patients, medical staff 항목을 다룸.

In [1]:
import os
import copy
import re

import pandas as pd

In [2]:
# Load the note table from phis.csv.
dat = pd.read_csv('phis.csv', encoding='utf-8')

# Tag vocabulary: every entity type gets a -B and a -I variant. Each entry
# keeps its surrounding single quotes as part of the string.
label = [
    "'" + entity + "-" + position + "'"
    for entity in ('PER', 'DAT', 'ORG', 'LOC', 'NUM', 'ETC')
    for position in ('B', 'I')
]

### Patients

In [3]:
# Map each note id to its text, reading both columns by row label
# 0 .. len(dat) - 1.
notes = {dat['note_id'][row]: dat['note_text'][row] for row in range(len(dat))}

# Independent copy of the id -> text mapping, taken before any tagging.
save_origins = copy.deepcopy(notes)

In [4]:
# Path of the pattern file; only its first line is compiled.
regex = 'filters/regex/name_patients/patients.txt'

# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm it matches how the pattern file was saved.
with open(regex) as r:
    # readline() keeps the line terminator; strip it so it does not become a
    # literal newline that the compiled pattern would have to match.
    patterns = re.compile(r.readline().rstrip('\r\n'))

1. 노트 정보를 불러와서 NAME_PATIENTS 패턴과 매치되는 문자열을 찾는다.
2. 해당 문자열을 split()등을 이용해 'O'와 'PER-B'로 완전히 바꾼다.
  - 단, 이름 안에 띄어쓰기가 있는 환자 이름에는 이 방식이 올바르게 적용되지 않음.
3. 2번에서 바꾼 string에 대해 'PER-B'라면 그대로 저장하고 그렇지 않으면 'O'로 바꾼다.
4. 검수
5. original string과 label을 저장한다.
6. 다음 카테고리에서는 5번의 string과 label을 가지고 1번부터 작업한다. 

In [5]:
def convert_pname(matchObj):
    """Return the replacement tag string for one regex match.

    The matched text is split on whitespace. Every token except the last is
    tagged 'O' and the last one is tagged 'PER-B'. Each tag is preceded by a
    single space, so the result always begins with a space.
    """
    token_count = len(matchObj.group().split())
    tags = ["'O'"] * (token_count - 1) + ["'PER-B'"]
    return "".join(" " + tag for tag in tags)

In [6]:
import pandas as pd

In [7]:
# Empty result table with one column each for the note id, the original
# text and the tagged text.
result_columns = ['idx', 'origin', 'label']
transformed = pd.DataFrame(columns=result_columns)

In [8]:
# Apply the compiled pattern to every note and store one row per note.
i = 0
for k, text in notes.items():
    # Matched spans are replaced by quoted tags; everything else is kept.
    convNote = patterns.sub(convert_pname, text)
    # Per-token view in which anything that is not a known tag becomes 'O'.
    # It is built here but not written to the table.
    newNotes = [tok if tok in label else "'O'" for tok in convNote.split()]
    transformed.loc[i] = [k, text, convNote]
    i += 1

---

### Medical Staff
- 이 항목은 전부 labeling 처리가 달라서 어쩔 수 없이 txt 단위로 일일이 처리.

In [9]:
# Alternative input: the label column can be taken from a saved file, e.g.
#   dat = pd.read_csv('lb.csv', encoding='utf-8')
#   transformed['label'] = dat

# Continue from the previous table (same object, not a copy).
dat = transformed

# The next pass works on the 'label' column, keyed by note id.
notes = {dat['idx'][row]: dat['label'][row] for row in range(len(dat))}


In [10]:
# Path of the pattern file; only its first line is compiled.
regex = './filters/regex/name_medicalStaff/PF_NAME_kor_transformed.txt'
# Pattern recorded by the author for this file. It is kept as a comment,
# because in a normal (non-raw) string literal "\s" is an invalid escape
# sequence:
#   [(P|p)(F|f)](.)?\s?[가-힣]{2,}

# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm it matches how the pattern file was saved.
with open(regex) as r:
    # readline() keeps the line terminator; strip it so it does not become a
    # literal newline that the compiled pattern would have to match.
    patterns = re.compile(r.readline().rstrip('\r\n'))
    
def convert_staffname01(matchObj):
    """Return the replacement tag string for one regex match.

    The matched text is split on whitespace. Every token except the last is
    tagged 'O' and the last one is tagged 'PER-B'. Tags are separated by
    single spaces, with no leading or trailing space.
    """
    leading_tokens = len(matchObj.group().split()) - 1
    return " ".join(["'O'"] * leading_tokens + ["'PER-B'"])

In [11]:
# Fresh result table for this pass.
transformed = pd.DataFrame(columns=['idx', 'origin', 'label'])

i = 0
for k, text in notes.items():
    # Matched spans are replaced by quoted tags; everything else is kept.
    convNote = patterns.sub(convert_staffname01, text)
    # Per-token view in which anything that is not a known tag becomes 'O'.
    # It is built here but not written to the table.
    newNotes = [tok if tok in label else "'O'" for tok in convNote.split()]
    # 'origin' always comes from the untouched copy of the notes.
    transformed.loc[i] = [k, save_origins[k], convNote]
    i += 1

In [12]:
# Once more: feed the previous pass's output into the next pass.
dat = copy.deepcopy(transformed)

# The next pass works on the 'label' column, keyed by note id.
notes = {dat['idx'][row]: dat['label'][row] for row in range(len(dat))}

In [13]:
# Path of the pattern file; only its first line is compiled.
regex = './filters/regex/name_medicalStaff/confirmed_01_medical_staff_transformed.txt'
# Pattern recorded by the author for this file. It is kept as a comment,
# because in a normal (non-raw) string literal "\s" is an invalid escape
# sequence:
#   확인판독\s[가-힣]{2,}(([,&/\s]?[가-힣]{2,}){1,})?|판독의\s[가-힣]{2,}(([,&/\s]?[가-힣]{2,}){1,})?
# Author's note: a match may contain two or more medical staff names.

# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm it matches how the pattern file was saved.
with open(regex) as r:
    # readline() keeps the line terminator; strip it so it does not become a
    # literal newline that the compiled pattern would have to match.
    patterns = re.compile(r.readline().rstrip('\r\n'))
    
def convert_staffname02(matchObj):
    """Return the replacement tag string for one regex match.

    The matched text is split on whitespace. The first token is tagged 'O'
    and every following token is tagged 'PER-B'. Tags are separated by
    single spaces.
    """
    following_tokens = len(matchObj.group().split()) - 1
    return " ".join(["'O'"] + ["'PER-B'"] * following_tokens)

In [14]:
# Fresh result table for this pass.
transformed2 = pd.DataFrame(columns=['idx', 'origin', 'label'])

i = 0
for k, text in notes.items():
    # Matched spans are replaced by quoted tags; everything else is kept.
    convNote = patterns.sub(convert_staffname02, text)
    # Per-token view in which anything that is not a known tag becomes 'O'.
    # It is built here but not written to the table.
    newNotes = [tok if tok in label else "'O'" for tok in convNote.split()]
    # 'origin' always comes from the untouched copy of the notes.
    transformed2.loc[i] = [k, save_origins[k], convNote]
    i += 1

### 중간 메모 
- `판독의 : 홍길동` 이런 표현도 존재
- 몇 개의 표현식은 공백을 선택 사항(`\s?`)에서 필수(`\s`)로 변경함 (예: `홍길동\s?선생님` -> `홍길동\s선생님`)

In [15]:
# Feed the previous pass's output (copied) into the next pass.
dat = copy.deepcopy(transformed2)

# The next pass works on the 'label' column, keyed by note id.
notes = {dat['idx'][row]: dat['label'][row] for row in range(len(dat))}

In [16]:
# Path of the pattern file; only its first line is compiled.
regex = './filters/regex/name_medicalStaff/confirmed_02_medical_staff_transformed.txt'
# Pattern recorded by the author for this file. It is kept as a comment,
# because in a normal (non-raw) string literal "\s" is an invalid escape
# sequence:
#   확인함 by [가-힣]{2,}(([,&/\s]?[가-힣]{2,}){1,})?
# Author's note: a match may contain two or more medical staff names.

# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm it matches how the pattern file was saved.
with open(regex) as r:
    # readline() keeps the line terminator; strip it so it does not become a
    # literal newline that the compiled pattern would have to match.
    patterns = re.compile(r.readline().rstrip('\r\n'))
    
def convert_staffname03(matchObj):
    """Return the replacement tag string for one regex match.

    The matched text is split on whitespace. The first two tags are always
    'O' and each token beyond the second is tagged 'PER-B'. Tags are
    separated by single spaces.
    """
    name_tokens = len(matchObj.group().split()) - 2
    return " ".join(["'O'", "'O'"] + ["'PER-B'"] * name_tokens)

In [17]:
# Fresh result table for this pass.
transformed3 = pd.DataFrame(columns=['idx', 'origin', 'label'])

i = 0
for k, text in notes.items():
    # Matched spans are replaced by quoted tags; everything else is kept.
    convNote = patterns.sub(convert_staffname03, text)
    # Per-token view in which anything that is not a known tag becomes 'O'.
    # It is built here but not written to the table.
    newNotes = [tok if tok in label else "'O'" for tok in convNote.split()]
    # 'origin' always comes from the untouched copy of the notes.
    transformed3.loc[i] = [k, save_origins[k], convNote]
    i += 1

### 중간 메모
- 확인함 패턴은 한명만 쓴다.
- 확인함 by 홍길동 이런식으로 잡는 것으로 변경함

In [18]:
# Feed the previous pass's output (copied) into the next pass.
dat = copy.deepcopy(transformed3)

# The next pass works on the 'label' column, keyed by note id.
notes = {dat['idx'][row]: dat['label'][row] for row in range(len(dat))}

In [19]:
# Path of the pattern file; only its first line is compiled.
regex = './filters/regex/name_medicalStaff/confirmed_03_medical_staff_transformed.txt'
# Pattern recorded by the author for this file. It is kept as a comment,
# because in a normal (non-raw) string literal "\d" and "\s" are invalid
# escape sequences:
#   판독의\d? : [가-힣]{2,}(([,&/\s]?[가-힣]{2,}){1,})?
# Author's note: a match may contain two or more medical staff names.

# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm it matches how the pattern file was saved.
with open(regex) as r:
    # readline() keeps the line terminator; strip it so it does not become a
    # literal newline that the compiled pattern would have to match.
    patterns = re.compile(r.readline().rstrip('\r\n'))
    
def convert_staffname03(matchObj):
    """Return the replacement tag string for one regex match.

    The result starts with two 'O' tags and adds one 'PER-B' tag for each
    whitespace-separated token of the match beyond the second. Tags are
    separated by single spaces.
    """
    extra_tokens = len(matchObj.group().split()) - 2
    # range() of a non-positive count is empty, so nothing is appended then.
    return "'O' 'O'" + "".join(" 'PER-B'" for _ in range(extra_tokens))

In [20]:
# Fresh result table for this pass.
transformed4 = pd.DataFrame(columns=['idx', 'origin', 'label'])

i = 0
for k, text in notes.items():
    # Matched spans are replaced by quoted tags; everything else is kept.
    convNote = patterns.sub(convert_staffname03, text)
    # Per-token view in which anything that is not a known tag becomes 'O'.
    # It is built here but not written to the table.
    newNotes = [tok if tok in label else "'O'" for tok in convNote.split()]
    # 'origin' always comes from the untouched copy of the notes.
    transformed4.loc[i] = [k, save_origins[k], convNote]
    i += 1

In [21]:
# Feed the previous pass's output (copied) into the next pass.
dat = copy.deepcopy(transformed4)

# The next pass works on the 'label' column, keyed by note id.
notes = {dat['idx'][row]: dat['label'][row] for row in range(len(dat))}

In [22]:
# Path of the pattern file; only its first line is compiled.
regex = './filters/regex/name_medicalStaff/salutations_transformed.txt'
# Pattern recorded by the author for this file. It is kept as a comment,
# because in a normal (non-raw) string literal "\s" is an invalid escape
# sequence:
#   [가-힣]{2,}(([,&/\s]?[가-힣]{2,}){1,})?\s(선생님|간호사|교수님)
# Author's note: a match may contain two or more medical staff names.

# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm it matches how the pattern file was saved.
with open(regex) as r:
    # readline() keeps the line terminator; strip it so it does not become a
    # literal newline that the compiled pattern would have to match.
    patterns = re.compile(r.readline().rstrip('\r\n'))
    
def convert_staffname04(matchObj):
    """Return the replacement tag string for one regex match.

    The matched text is split on whitespace. The last token is tagged 'O'
    and every token before it is tagged 'PER-B', so the number of tags
    equals the number of tokens. Always returning "'PER-B' 'O'" dropped a
    token whenever the match contained more than one whitespace-separated
    token before the last one.

    A two-token match still yields "'PER-B' 'O'". At least one 'PER-B' is
    always emitted, so a match with fewer than two tokens also yields that
    same two-tag string.

    NOTE(review): every token before the last one is tagged 'PER-B',
    whatever it is -- confirm the pattern in use cannot pick up non-name
    words there.
    """
    name_tokens = len(matchObj.group().split()) - 1
    return " ".join(["'PER-B'"] * max(name_tokens, 1) + ["'O'"])

In [23]:
# Fresh result table for this pass.
transformed5 = pd.DataFrame(columns=['idx', 'origin', 'label'])

i = 0
for k, text in notes.items():
    # Matched spans are replaced by quoted tags; everything else is kept.
    convNote = patterns.sub(convert_staffname04, text)
    # Per-token view in which anything that is not a known tag becomes 'O'.
    # It is built here but not written to the table.
    newNotes = [tok if tok in label else "'O'" for tok in convNote.split()]
    # 'origin' always comes from the untouched copy of the notes.
    transformed5.loc[i] = [k, save_origins[k], convNote]
    i += 1

In [24]:
# Feed the previous pass's output (copied) into the next pass.
dat = copy.deepcopy(transformed5)

# The next pass works on the 'label' column, keyed by note id.
notes = {dat['idx'][row]: dat['label'][row] for row in range(len(dat))}

In [25]:
# Path of the pattern file; only its first line is compiled.
regex = './filters/regex/name_medicalStaff/prepos_NAME_kor_transformed.txt'
# Pattern recorded by the author for this file. It is kept as a comment,
# because in a normal (non-raw) string literal "\s" is an invalid escape
# sequence:
#   (By|BY|by|from|From|FROM)(.)?\s[가-힣]{2,}
#   # (([,&/\s]?[가-힣]{2,}){1,})?
# Author's note: a match may contain two or more medical staff names.

# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm it matches how the pattern file was saved.
with open(regex) as r:
    # readline() keeps the line terminator; strip it so it does not become a
    # literal newline that the compiled pattern would have to match.
    patterns = re.compile(r.readline().rstrip('\r\n'))
    
def convert_staffname05(matchObj):
    """Return the fixed replacement "'O' 'PER-B'" for one regex match.

    The result does not depend on the matched text.
    """
    matchObj.group()  # still called as before; its value is not used
    return " ".join(["'O'", "'PER-B'"])

In [26]:
# Fresh result table for this pass.
transformed6 = pd.DataFrame(columns=['idx', 'origin', 'label'])

i = 0
for k, text in notes.items():
    # Matched spans are replaced by quoted tags; everything else is kept.
    convNote = patterns.sub(convert_staffname05, text)
    # Per-token view in which anything that is not a known tag becomes 'O'.
    # It is built here but not written to the table.
    newNotes = [tok if tok in label else "'O'" for tok in convNote.split()]
    # 'origin' always comes from the untouched copy of the notes.
    transformed6.loc[i] = [k, save_origins[k], convNote]
    i += 1

In [27]:
# Write the table from the last pass to disk, without the row index.
output_csv = './PHI_tagging/lb_test.csv'
transformed6.to_csv(output_csv, index=False)