## IMPORT

In [1]:
import psycopg2 as pg
import pandas.io.sql as psql
import pandas as pd

from os import getenv, listdir
from os.path import isfile, join

from dotenv import load_dotenv
import datetime
from datetime import date

## DB 정보 로딩

In [2]:
load_dotenv(dotenv_path = 'insert_data/db_info.env')

DB_HOST = getenv('DB_HOST', None)
DB_PORT = getenv('DB_PORT', 5432)
DB_USER = getenv('DB_USER', None)
DB_PASS = getenv('DB_PASS', None)
DB_NAME = getenv('DB_NAME', None)

conn_string = f"host='{DB_HOST}' dbname='{DB_NAME}' user='{DB_USER}' password='{DB_PASS}' port={DB_PORT}"
conn = pg.connect(conn_string)

## DATA LOAD

In [3]:
def select_all(table_name: str):
    return eval(f"psql.read_sql('SELECT * FROM {table_name}', conn)")

In [4]:
person_data = select_all(table_name="person") # 환자에 대한 정보
death_data = select_all(table_name="death") # 환자의 사망 정보 
visit_occurrence_data = select_all(table_name="visit_occurrence") # 방문에 대한 정보
condition_occurrence_data = select_all(table_name="condition_occurrence") # 병명에 대한 정보
drug_exposure_data = select_all(table_name="drug_exposure") # 의약품 처방 정보

## 과제 문서에 언급된 column만 선정하여 데이터를 재가공

In [5]:
persons = person_data[["person_id", "gender_concept_id", "ethnicity_concept_id", "year_of_birth"]]
persons.head(2)

Unnamed: 0,person_id,gender_concept_id,ethnicity_concept_id,year_of_birth
0,402435,8532,0,1997
1,1022983,8507,0,1950


In [6]:
death = death_data[["person_id", "death_date"]]
death.head(2)

Unnamed: 0,person_id,death_date
0,1691806,2015-06-02
1,99181,2018-11-04


In [7]:
visit_occurrences = visit_occurrence_data[["person_id", "visit_occurrence_id", "visit_concept_id", "visit_end_date", "visit_start_date"]]
visit_occurrences.head(2)

Unnamed: 0,person_id,visit_occurrence_id,visit_concept_id,visit_end_date,visit_start_date
0,116496,36112943,9202,1962-04-13,1962-04-13
1,116496,36112944,9202,1962-04-24,1962-04-24


In [8]:
condition_occurrences = condition_occurrence_data[["person_id", "visit_occurrence_id", "condition_concept_id", "condition_source_value"]]
condition_occurrences.head(2)

Unnamed: 0,person_id,visit_occurrence_id,condition_concept_id,condition_source_value
0,116496,36112954,0,162864005
1,116496,36112952,0,840544004


In [9]:
drug_exposures = drug_exposure_data[["person_id", "visit_occurrence_id", "drug_concept_id", "drug_source_value"]]
drug_exposures.head(2)

Unnamed: 0,person_id,visit_occurrence_id,drug_concept_id,drug_source_value
0,26922,99499216,19073183,308182
1,2955,9251642,40231925,1049221


### person 데이터와 visit_occurrence 데이터의 새로운 컬럼 생성
- person: 나이 (year_of_birth)
- visit_occurrence: 내원일수 (visit_end_date - visit_start_date)

In [10]:
today = date.today().strftime("%Y")
persons["age"] = int(today) - persons["year_of_birth"] + 1 # 한국은 만나이가 아니므로 1을 더해줌

# year_of_birth 컬럼 제거
if "year_of_birth" in list(persons.columns):
    persons = persons.drop(["year_of_birth"], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  persons["age"] = int(today) - persons["year_of_birth"] + 1 # 한국은 만나이가 아니므로 1을 더해줌


In [11]:
persons.head(2)

Unnamed: 0,person_id,gender_concept_id,ethnicity_concept_id,age
0,402435,8532,0,24
1,1022983,8507,0,71


In [12]:
visit_occurrences["visit"] = pd.to_datetime(visit_occurrences['visit_end_date']) \
                                - pd.to_datetime(visit_occurrences['visit_start_date'])

# visit_end_date, visit_start_date 컬럼 제거
for c in visit_occurrences.columns:
    if c == "visit_end_date" or c == "visit_start_date":
        visit_occurrences = visit_occurrences.drop([c], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  visit_occurrences["visit"] = pd.to_datetime(visit_occurrences['visit_end_date']) \


In [13]:
visit_occurrences.head(3)

Unnamed: 0,person_id,visit_occurrence_id,visit_concept_id,visit
0,116496,36112943,9202,0 days
1,116496,36112944,9202,0 days
2,116496,36112954,9202,0 days


## JOIN DATA
- person_id와 visit_occurrence_id로 join하여 데이터를 하나로 만든다.

In [14]:
print(len(persons), len(death), len(visit_occurrence_data), len(condition_occurrence_data), len(drug_exposure_data))

1000 152 41810 12167 46579


#### person과 death를 조인하면 사망한 사람의 정보를 알 수 있음

In [15]:
new_data = pd.merge(persons, death, how='left', on='person_id')
new_data.tail(2)

Unnamed: 0,person_id,gender_concept_id,ethnicity_concept_id,age,death_date
998,2565313,8532,0,56,
999,277792,8507,0,67,


#### person+death+visit_occurrence

In [16]:
new_data = pd.merge(new_data, visit_occurrences, on='person_id')
new_data.tail(2)

Unnamed: 0,person_id,gender_concept_id,ethnicity_concept_id,age,death_date,visit_occurrence_id,visit_concept_id,visit
41808,277792,8507,0,67,,67292323,9202,0 days
41809,277792,8507,0,67,,67292311,9202,0 days


#### person+death+visit_occurrence+condition_occurrences

In [17]:
new_data = pd.merge(new_data, condition_occurrences, on=['person_id', 'visit_occurrence_id'])
new_data.tail(2)

Unnamed: 0,person_id,gender_concept_id,ethnicity_concept_id,age,death_date,visit_occurrence_id,visit_concept_id,visit,condition_concept_id,condition_source_value
12165,277792,8507,0,67,,67292311,9202,0 days,0,840544004
12166,277792,8507,0,67,,67292311,9202,0 days,37311061,840539006


#### person+death+visit_occurrence+condition_occurrences

In [18]:
new_data = pd.merge(new_data, drug_exposures, on=['person_id', 'visit_occurrence_id'])
new_data.tail(2)

Unnamed: 0,person_id,gender_concept_id,ethnicity_concept_id,age,death_date,visit_occurrence_id,visit_concept_id,visit,condition_concept_id,condition_source_value,drug_concept_id,drug_source_value
8204,2565313,8532,0,56,,41635384,9202,0 days,4294548,75498004,1713671,562251
8205,277792,8507,0,67,,67292308,9202,0 days,260139,10509002,1127433,313782


### COPY DATA

In [19]:
data_copy = new_data.copy()
data_copy.tail(2)

Unnamed: 0,person_id,gender_concept_id,ethnicity_concept_id,age,death_date,visit_occurrence_id,visit_concept_id,visit,condition_concept_id,condition_source_value,drug_concept_id,drug_source_value
8204,2565313,8532,0,56,,41635384,9202,0 days,4294548,75498004,1713671,562251
8205,277792,8507,0,67,,67292308,9202,0 days,260139,10509002,1127433,313782


### NaN value 체크

In [20]:
print(data_copy.isnull().sum())

person_id                    0
gender_concept_id            0
ethnicity_concept_id         0
age                          0
death_date                5436
visit_occurrence_id          0
visit_concept_id             0
visit                        0
condition_concept_id         0
condition_source_value       0
drug_concept_id              0
drug_source_value            0
dtype: int64
