In [16]:
import pandas as pd
import numpy as np
from pathlib import Path
import urllib.request

In [27]:
# Resolve the current working directory and make sure it has a 'data'
# subfolder; creating it is a no-op when the folder already exists.
workdir = Path.cwd()
datadir = workdir.joinpath('data')
datadir.mkdir(parents=True, exist_ok=True)

In [65]:
# Column groups: `var_glob` is shared by every subset, the remaining lists
# hold the columns assigned to each contributor.
var_glob = ['기준년도', '연령대코드(5세단위)']
var_mk = ['총콜레스테롤', '트리글리세라이드', 'HDL콜레스테롤', 'LDL콜레스테롤']
var_jk = ['구강검진수검여부', '치아우식증유무']
var_yy = ['수축기혈압', '이완기혈압', '식전혈당(공복혈당)']
var_jj = ['혈청크레아티닌']
var_jl = ['혈청지오티(AST)', '혈청지피티(ALT)', '감마지티피']
var_tj = ['체중(5kg단위)', '허리둘레', '청력(좌)', '청력(우)', '시력(좌)', '시력(우)']

# Contributor key -> that contributor's own columns (insertion order matters:
# it fixes both the key order of `var_dict` and the column order of 'all').
_contributor_vars = {
    'mk': var_mk,
    'jk': var_jk,
    'yy': var_yy,
    'jj': var_jj,
    'jl': var_jl,
    'tj': var_tj,
}

# Every subset starts with the shared columns; 'all' is the shared columns
# followed by every contributor's columns in the order listed above.
var_dict = {key: var_glob + cols for key, cols in _contributor_vars.items()}
var_dict['all'] = var_glob + [col for cols in _contributor_vars.values() for col in cols]

# Alternative header spellings (extra spaces, different capitalisation,
# differently placed parentheses) mapped to the canonical names used above.
names_fix = {
    '연령대 코드(5세단위)': '연령대코드(5세단위)',
    '총 콜레스테롤': '총콜레스테롤',
    'HDL 콜레스테롤': 'HDL콜레스테롤',
    'LDL 콜레스테롤': 'LDL콜레스테롤',
    '구강검진 수검여부': '구강검진수검여부',
    '수축기 혈압': '수축기혈압',
    '이완기 혈압': '이완기혈압',
    '(혈청지오티)AST': '혈청지오티(AST)',
    # NOTE(review): the key below carries the same prefix as the AST key but
    # maps to the ALT column; presumably it mirrors the raw header — confirm.
    '(혈청지오티)ALT': '혈청지피티(ALT)',
    '감마 지티피': '감마지티피',
    # Two spellings of the weight header map to the same canonical name.
    '체중(5Kg단위)': '체중(5kg단위)',
    '체중(5Kg 단위)': '체중(5kg단위)',
}

In [73]:
# Github info
github_user = 'tjohns94'
github_repo = 'csd230finalproject'
github_branch = 'main'
github_folder = 'data'

# Data years
years_range = range(2013, 2023)

# Two files per year: nhis_data_<year>-1.csv and nhis_data_<year>-2.csv
file_names = [f'nhis_data_{year}-{version}.csv' for year in years_range for version in range(1, 3)]

# Collect one frame per file and concatenate once after the loop; calling
# pd.concat inside the loop re-copies every previously loaded row each time.
loaded_frames = []

# Download (if needed) and load each file
for file_name in file_names:
    url = f'https://raw.githubusercontent.com/{github_user}/{github_repo}/{github_branch}/{github_folder}/{file_name}'
    file_path = datadir / file_name
    # Only hit the network when there is no local copy yet
    if file_path.exists():
        print(f'{file_name} already exists')
    else:
        urllib.request.urlretrieve(url, file_path)
        print(f'Downloaded {file_name} to {file_path}')
    # Read data
    temp_data = pd.read_csv(file_path, encoding='cp949', low_memory=False)
    # Normalise header spellings; rename() skips mapping keys that are not
    # present in the frame, so no per-column membership check is needed.
    temp_data = temp_data.rename(columns=names_fix)
    loaded_frames.append(temp_data)

# Single combined frame with a fresh 0..n-1 index
nhis_data = pd.concat(loaded_frames, ignore_index=True)

# Fix 2013 age groups
# The update must be one .loc[row_mask, column] operation on nhis_data itself:
# the previous form, nhis_data[mask].loc[:, col] += 4, modified a temporary
# copy and left nhis_data unchanged.
# NOTE(review): the +4 offset presumably aligns the 2013 age codes with the
# numbering used in later years — confirm against the data dictionary.
is_2013 = nhis_data['기준년도'] == 2013
nhis_data.loc[is_2013, '연령대코드(5세단위)'] += 4

nhis_data_2013-1.csv already exists
nhis_data_2013-2.csv already exists
nhis_data_2014-1.csv already exists
nhis_data_2014-2.csv already exists
nhis_data_2015-1.csv already exists
nhis_data_2015-2.csv already exists
nhis_data_2016-1.csv already exists
nhis_data_2016-2.csv already exists
nhis_data_2017-1.csv already exists
nhis_data_2017-2.csv already exists
nhis_data_2018-1.csv already exists
nhis_data_2018-2.csv already exists
nhis_data_2019-1.csv already exists
nhis_data_2019-2.csv already exists
nhis_data_2020-1.csv already exists
nhis_data_2020-2.csv already exists
nhis_data_2021-1.csv already exists
nhis_data_2021-2.csv already exists
nhis_data_2022-1.csv already exists
nhis_data_2022-2.csv already exists


In [74]:
# Translate column names to English
kor_to_eng = {
    '기준년도': 'year',
    '연령대코드(5세단위)': 'age_code',
    '총콜레스테롤': 'total_cholesterol',
    '트리글리세라이드': 'triglycerides',
    'HDL콜레스테롤': 'hdl_cholesterol',
    'LDL콜레스테롤': 'ldl_cholesterol',
    '구강검진수검여부': 'oral_exam',
    '치아우식증유무': 'dental_caries',
    '수축기혈압': 'systolic_bp',
    '이완기혈압': 'diastolic_bp',
    '식전혈당(공복혈당)': 'fasting_glucose',
    '혈청크레아티닌': 'serum_creatinine',
    '혈청지오티(AST)': 'serum_got_ast',
    '혈청지피티(ALT)': 'serum_gpt_alt',
    '감마지티피': 'gamma_gtp',
    '체중(5kg단위)': 'weight',
    '허리둘레': 'waist_circum',
    '청력(좌)': 'left_ear',
    '청력(우)': 'right_ear',
    '시력(좌)': 'left_eye',
    '시력(우)': 'right_eye'
}

# Edit dataframe to include only used variables. The explicit .copy() makes
# the selection an independent frame, so adding a column to it below does not
# trigger SettingWithCopyWarning.
nhis_data = nhis_data[var_dict['all']].copy()

# Label every age code with its 5-year band (code 5 -> '20-24').
# Labels are built once per distinct code and then mapped onto the column,
# instead of formatting a string for every row. Codes are cast to int so a
# float-typed column still yields '20-24' rather than '20.0-24.0', and rows
# with a missing code get a missing label rather than the string 'nan-nan'.
age_codes = nhis_data['연령대코드(5세단위)']
age_labels = {
    code: f'{(int(code) - 1) * 5}-{int(code) * 5 - 1}'
    for code in age_codes.dropna().unique()
}
nhis_data['age_group'] = age_codes.map(age_labels)

# Save dataframes to csv files: one file per var_dict key, containing only
# that key's columns (renamed to English), with incomplete rows dropped and
# rows ordered by year and age code.
for name in var_dict.keys():
    try:
        data = nhis_data[var_dict[name]].rename(columns=kor_to_eng).dropna().sort_values(by=['year', 'age_code'])
    except KeyError:
        # Defensive guard: skip a subset whose columns cannot be selected or sorted
        print(f'No data for {name}')
        continue
    # utf-8-sig writes a BOM at the start of the file
    data.to_csv(datadir / f'nhis_data_{name}.csv', index=False, encoding='utf-8-sig')