# Data Cleansing 

* 라벨링 제거

## Data 불러오기

In [1]:
import pandas as pd

In [60]:
# Load the raw CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='rawdata/lung_cancer.csv')
data.head(n=5)

Unnamed: 0,Rank,NCT Number,Title,Acronym,Status,Study Results,Conditions,Interventions,Outcome Measures,Sponsor/Collaborators,...,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents,URL,Inclusion Criteria,Exclusion Criteria
0,17,NCT01376856,Characteristics of Mediastinal Lymph Node With...,,Terminated,No Results Available,Lung Cancer,,false positive rate of Mediastinal Lymph Node,Yonsei University,...,Nov.11,Nov.11,20.Jun.11,,4.Jun.13,"Severance Hospital, Seoul, Korea, Republic of",,https://ClinicalTrials.gov/show/NCT01376856,"1. age 20-75, men and women\n2. person who per...",1. age < 20\n2. person who don't agree with en...
1,19,NCT01310387,Prospective Study of Active Pain Management in...,APM,Completed,No Results Available,Lung Cancer,Behavioral: Active pain management,The percent of pain intensity difference|Patie...,Chonnam National University Hospital|Janssen K...,...,Jan.12,Jun.12,8.Mar.11,,5.Feb.16,"Chonnam National University Hwasun Hospital, J...",,https://ClinicalTrials.gov/show/NCT01310387,1. Outpatients with lung cancer.\n2. The patie...,1. Drug or alcohol abusers.\n2. Child-bearing ...
2,27,NCT03394703,Korean Lung Cancer Screening Project,K-LUCAS,Unknown status,No Results Available,Lung Cancer,,Early stage lung cancer detection rate|False p...,"National Cancer Center, Korea|Korean Associati...",...,31.Dec.17,31.Dec.18,9.Jan.18,,9.Jan.18,"National Cancer Center, Goyang, Gyeonggi, Kore...",,https://ClinicalTrials.gov/show/NCT03394703,Inclusion Criteria (1)\n1. Age : 55-74 years o...,1. Lung cancer diagnosed and treated\n2. Inabi...
3,46,NCT01441297,BIBF 1120 as Second Line Treatment for Small C...,,Completed,No Results Available,Small Cell Lung Cancer|Small Cell Lung Cancer ...,Drug: BIBF 1120,Overall response rate|Overall survival rate|Pr...,"Ji-youn Han|National Cancer Center, Korea",...,31.Oct.15,31.Mar.16,27.Sep.11,,25.Aug.17,"National Cancer Center, Goyang-si, Gyeonggi-do...",,https://ClinicalTrials.gov/show/NCT01441297,1. Histologically confirmed SCLC\n2. Progressi...,1. Previous therapy with other VGFR inhibitors...
4,59,NCT00736814,First-Line Combination Chemotherapy in Treatin...,,Unknown status,No Results Available,Lung Cancer,Drug: carboplatin|Drug: docetaxel|Drug: gemcit...,Response rate (complete and partial responses)...,Yonsei University|National Cancer Institute (NCI),...,,,18.Aug.08,,24.Feb.11,Yonsei Cancer Center at Yonsei University Medi...,,https://ClinicalTrials.gov/show/NCT00736814,DISEASE CHARACTERISTICS:\n1. Histologically pr...,


In [61]:
# Report how many rows the loaded frame contains.
print(f'데이터 갯수 {data.shape[0]}')

데이터 갯수 459


## Criteria 형식 변경

> ordered list를 개행문자 단위로 쪼개기

* NaN 예외처리 => 빈 문자열

* 아래의 예시 상황의 경우 하나의 문장으로 패턴 일치화

```
DISEASE CHARACTERISTICS:
1. Histologically confirmed invasive breast cancer
1) Stage II or III disease
2) No evidence of metastasis (M0)
3) No inflammatory breast cancer (T4d)
```

* 아래의 예시 상황의 경우 문장에서 제외 (미반영)

```
DISEASE CHARACTERISTICS:
1. Histologically confirmed invasive breast cancer
```

In [94]:
# Keep only the two free-text eligibility columns and replace NaN with an
# empty string. DataFrame.fillna returns a new frame instead of modifying the
# caller, so its result must be assigned; as a bare statement it has no effect
# and the NaN values stay in place.
creteria = data[['Inclusion Criteria', 'Exclusion Criteria']].fillna(value='')
print(f'데이터 갯수 {creteria.shape[0]}')

creteria

데이터 갯수 459


Unnamed: 0,Inclusion Criteria,Exclusion Criteria
0,"1. age 20-75, men and women\n2. person who per...",1. age < 20\n2. person who don't agree with en...
1,1. Outpatients with lung cancer.\n2. The patie...,1. Drug or alcohol abusers.\n2. Child-bearing ...
2,Inclusion Criteria (1)\n1. Age : 55-74 years o...,1. Lung cancer diagnosed and treated\n2. Inabi...
3,1. Histologically confirmed SCLC\n2. Progressi...,1. Previous therapy with other VGFR inhibitors...
4,DISEASE CHARACTERISTICS:\n1. Histologically pr...,
...,...,...
454,1. Pathologically confirmed stage IIIB/IV aden...,1. More than two prior cytotoxic chemotherapy ...
455,1. Male or female patients aged 20 years or ol...,"1. Prior systemic chemotherapy, immunotherapy ..."
456,"1. Eligible for, or on active study drug treat...",1. History of hypersensitivity to the active s...
457,1. Pathologically confirmed diagnosis of Stage...,1. Prior chemotherapy for relapsed and/or meta...


### 정규식 및 텍스트 처리

1. 정규식

    * `숫자.` 을 기준으로 split

    * 좌측 공백 제거, 우측 공백 유지(\n)

2. 텍스트 처리

    * 좌측 공백/개행 제거, 우측 개행 유지(\n)

    * 이후 한 문장으로 변환

In [90]:
import re

In [91]:
# 예시
text = '''
DISEASE CHARACTERISTICS:
1. Histologically confirmed invasive breast cancer
1) Stage II or III disease
2) No evidence of metastasis (M0)
2. Must have a primary tumor
3. Operable disease
4. Triple-negative disease, meeting the following criteria:
1) Estrogen receptor-, progesterone receptor-, and HER2-negative by immunohistochemistry (IHC) 0 or 1+ OR fluorescence in situ hybridization negative (in case IHC is 2+)
PATIENT CHARACTERISTICS:
5. ECOG performance status 0-1
'''
# 문장 구분
print(list(map(lambda x: x.lstrip(), re.split('\d[.]', text))))

# Text 변환
print(''.join(map(lambda x: x.lstrip(), re.split('\d[.]', text))))

['DISEASE CHARACTERISTICS:\n', 'Histologically confirmed invasive breast cancer\n1) Stage II or III disease\n2) No evidence of metastasis (M0)\n', 'Must have a primary tumor\n', 'Operable disease\n', 'Triple-negative disease, meeting the following criteria:\n1) Estrogen receptor-, progesterone receptor-, and HER2-negative by immunohistochemistry (IHC) 0 or 1+ OR fluorescence in situ hybridization negative (in case IHC is 2+)\nPATIENT CHARACTERISTICS:\n', 'ECOG performance status 0-1\n']
DISEASE CHARACTERISTICS:
Histologically confirmed invasive breast cancer
1) Stage II or III disease
2) No evidence of metastasis (M0)
Must have a primary tumor
Operable disease
Triple-negative disease, meeting the following criteria:
1) Estrogen receptor-, progesterone receptor-, and HER2-negative by immunohistochemistry (IHC) 0 or 1+ OR fluorescence in situ hybridization negative (in case IHC is 2+)
PATIENT CHARACTERISTICS:
ECOG performance status 0-1



In [107]:
def parse_text(text):
    """Strip ordered-list numbering ("1.", "2.", ... "10.") from a criteria string.

    Only markers at the start of a line are removed, so decimals such as
    "1.5 x ULN" and sentences ending in a digit and a period ("ECOG 0-1.")
    are left untouched. Sub-item markers of the form "1)" are kept.

    Args:
        text: The criteria text, or a missing value (NaN / None).

    Returns:
        The text with list markers removed and the whitespace that followed
        each marker (and any leading whitespace of the whole text) stripped.
        Missing values yield an empty string.
    """
    if pd.isna(text):
        return ''
    # (?m)^      -> match only at the beginning of a line
    # [ \t]*     -> tolerate indentation before the marker
    # \d+[.]     -> multi-digit markers, so "10." does not leave a stray "1"
    # (?!\d)     -> do not treat a line-leading decimal like "1.5" as a marker
    segments = re.split(r'(?m)^[ \t]*\d+[.](?!\d)', text)
    return ''.join(segment.lstrip() for segment in segments)

In [111]:
# Apply parse_text element-wise to every cell of both criteria columns,
# then print the row count of the input frame and display the cleaned frame.
clean_creteria = creteria.applymap(func=parse_text)
print(len(creteria.index))
clean_creteria

459


Unnamed: 0,Inclusion Criteria,Exclusion Criteria
0,"age 20-75, men and women\nperson who performed...",age < 20\nperson who don't agree with enrollme...
1,Outpatients with lung cancer.\nThe patients wi...,Drug or alcohol abusers.\nChild-bearing women ...
2,Inclusion Criteria (1)\nAge : 55-74 years old\...,Lung cancer diagnosed and treated\nInability t...
3,Histologically confirmed SCLC\nProgression dur...,Previous therapy with other VGFR inhibitors (o...
4,DISEASE CHARACTERISTICS:\nHistologically prove...,
...,...,...
454,Pathologically confirmed stage IIIB/IV adenoca...,More than two prior cytotoxic chemotherapy tre...
455,Male or female patients aged 20 years or older...,"Prior systemic chemotherapy, immunotherapy or ..."
456,"Eligible for, or on active study drug treatmen...",History of hypersensitivity to the active subs...
457,Pathologically confirmed diagnosis of Stage II...,Prior chemotherapy for relapsed and/or metasta...


## Export csv

In [113]:
# Write the cleaned criteria frame to CSV (the row index is written as well).
clean_creteria.to_csv(path_or_buf='result/cleaned_lung_cancer.csv')