# 의약품 처방정보 

In [2]:
# 라이브러리 로드
import numpy as np
import pandas as pd

In [3]:
from glob import glob

file_name = sorted(glob("data/HP*.CSV"))
file_name

['data/HP_T60_2020_1.CSV', 'data/HP_T60_2020_2.CSV', 'data/HP_T60_2020_3.CSV']

In [4]:
raw = pd.read_csv(file_name[2], encoding="cp949")
raw.shape

(10923225, 15)

In [5]:
raw.head()

Unnamed: 0,기준년도,가입자 일련번호,처방내역일련번호,일련번호,성별코드,연령대코드(5세단위),시도코드,요양개시일자,약품일반성분명코드,1회 투약량,1일투약량,총투여일수,단가,금액,데이터 공개일자
0,2020,666668,5260626,2,1,11,43,2020-04-15,134103ATB,2.0,2,4,476.0,7616,2021-12-29
1,2020,666668,6859861,1,1,11,43,2020-04-03,146432ASS,20.0,2,3,14.0,1680,2021-12-29
2,2020,666668,769595,1,1,11,43,2020-11-25,451302ATB,0.25,2,7,53.0,186,2021-12-29
3,2020,666668,6859861,4,1,11,43,2020-04-03,185102ACH,1.0,1,3,30.0,90,2021-12-29
4,2020,666668,9307313,2,1,11,43,2020-01-30,260100ACR,1.0,1,3,302.0,906,2021-12-29


# 표본 조사를 하기 위해 sampling

In [6]:
# 가입자 일련번호 고유값 수
# 33만명의 환자 데이터가 있음
raw["가입자 일련번호"].nunique()

333333

## numpy 사용

In [7]:
# 10,000명 샘플림

# 문서에서 권장하는 random generator
# rng = np.random.default_rng(42)

# 기존에 쓰이던 random generator 방식
# np.random.seed(42)

sample_no = np.random.choice(raw["가입자 일련번호"].unique(), 10000)
sample_no

array([852193, 932641, 724519, ..., 811241, 885040, 737506])

In [8]:
# 샘플링한 가입자 일련번호로 전체 데이터 추출
df_temp = raw[raw["가입자 일련번호"].isin(sample_no)]
df_temp.shape

(316049, 15)

In [9]:
df_temp["가입자 일련번호"].nunique()

9849

## pandas 사용

In [10]:
sample_no = raw["가입자 일련번호"].sample(10000, random_state=42)
sample_no

1762599    720717
3591389    776285
6467703    864183
5303579    828872
7208786    886495
            ...  
267310     674961
9148996    945733
7664696    900371
9529452    957477
8173269    915880
Name: 가입자 일련번호, Length: 10000, dtype: int64

In [11]:
df_temp = raw[raw["가입자 일련번호"].isin(sample_no)]
df_temp.shape

(769852, 15)

In [12]:
# 중복된 값이 있을 수 있기 때문에 정확히 10,000개의 데이터는 아님
df_temp["가입자 일련번호"].nunique()

9660

# 샘플링 파일로 저장

In [13]:
df_temp.to_csv("data/HP_2020_sample.csv", index=False)

In [14]:
pd.read_csv("data/HP_2020_sample.csv")

Unnamed: 0,기준년도,가입자 일련번호,처방내역일련번호,일련번호,성별코드,연령대코드(5세단위),시도코드,요양개시일자,약품일반성분명코드,1회 투약량,1일투약량,총투여일수,단가,금액,데이터 공개일자
0,2020,666669,8553063,3,2,9,26,2020-10-11,430102ATB,1.0,2,10,205.0,4100,2021-12-29
1,2020,666669,8553063,2,2,9,26,2020-10-11,438901ATB,1.0,2,10,70.0,1400,2021-12-29
2,2020,666669,8553063,4,2,9,26,2020-10-11,374602ATB,1.0,1,10,765.0,7650,2021-12-29
3,2020,666669,3116013,1,2,9,26,2020-05-02,546600CCM,1.0,1,1,1720.0,1720,2021-12-29
4,2020,666669,2241712,2,2,9,26,2020-05-30,451202ATE,1.0,2,7,41.0,574,2021-12-29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
769847,2020,999989,1296609,6,2,16,44,2020-09-08,163801ATB,1.0,1,60,30.0,1800,2021-12-29
769848,2020,999989,1296609,5,2,16,44,2020-09-08,454001ATB,1.0,1,60,612.0,36720,2021-12-29
769849,2020,999989,1296609,4,2,16,44,2020-09-08,513000ATB,1.0,1,60,190.0,11400,2021-12-29
769850,2020,999989,1296609,3,2,16,44,2020-09-08,614901ATB,1.0,2,60,152.0,18240,2021-12-29
