In [1]:
import urllib.parse
import urllib.request
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Directory the notebook runs from, and the folder below it that holds the data files
workdir = Path.cwd()
datadir = workdir.joinpath('data')
# Create the data folder (plus any missing parents); no error if it already exists
datadir.mkdir(parents=True, exist_ok=True)

In [3]:
class GithubDownloader:
    """Download the files of one folder of a GitHub repository.

    Constructing an instance lists the remote folder through the GitHub
    contents API (a network request) and creates the local save directory.

    Parameters
    ----------
    _user : str
        Owner of the repository.
    _repo : str
        Name of the repository.
    _branch : str
        Branch (ref) to read from.
    _folder : str
        Folder inside the repository, relative to its root.
    _savepath : str or pathlib.Path, optional
        Local directory the files are written to. Defaults to ``datadir``.

    Attributes
    ----------
    files : pandas.Series
        Names of the files found in the remote folder.
    """

    def __init__(self, _user, _repo, _branch, _folder, _savepath=datadir):
        self.user = _user
        self.repo = _repo
        self.branch = _branch
        self.folder = _folder
        # Accept plain strings as well as Path objects
        self._savepath = Path(_savepath)
        self.files = self.list_files()
        self._savepath.mkdir(exist_ok=True, parents=True)

    def check(self, filename):
        """Return True when ``filename`` already exists in the save directory."""
        return (self._savepath / filename).exists()

    def list_files(self):
        """List the files of the remote folder.

        Returns
        -------
        pandas.Series
            File names with a fresh 0..n-1 index; empty when the folder
            contains no files.
        """
        folder = urllib.parse.quote(str(self.folder))
        ref = urllib.parse.quote(str(self.branch), safe='')
        url = f'https://api.github.com/repos/{self.user}/{self.repo}/contents/{folder}?ref={ref}'
        listing = pd.read_json(url)
        # An empty folder yields a frame without a 'name' column
        if listing.empty:
            return pd.Series([], dtype=object, name='name')
        # The listing also contains sub-directories, which download() cannot fetch
        if 'type' in listing.columns:
            listing = listing[listing['type'] == 'file']
        return listing['name'].reset_index(drop=True)

    def download(self, filename):
        """Download ``filename`` into the save directory unless it is already there.

        The data is first written to ``<filename>.part`` and renamed once the
        transfer has finished, so an interrupted download never leaves a
        truncated file that ``check`` would report as present.

        Raises
        ------
        urllib.error.URLError
            If the file cannot be fetched.
        """
        if self.check(filename):
            print(f'{filename} already exists')
            return
        # Percent-encode the path so names with spaces or non-ASCII characters work
        repo_path = urllib.parse.quote(f'{self.branch}/{self.folder}/{filename}')
        url = f'https://raw.githubusercontent.com/{self.user}/{self.repo}/{repo_path}'
        target = self._savepath / filename
        partial = target.with_name(target.name + '.part')
        try:
            urllib.request.urlretrieve(url, partial)
            partial.replace(target)
        finally:
            # Only still present when the transfer or the rename failed
            if partial.exists():
                partial.unlink()
        print(f'{filename} downloaded')

    def download_all(self):
        """Download every file in ``self.files``, skipping those already present."""
        for file in self.files:
            self.download(file)

    def __str__(self):
        """Return a summary: repository URL, branch, folder and file listing."""
        url = f'https://github.com/{self.user}/{self.repo}'
        # Joined line by line so no source indentation leaks into the text
        return '\n'.join([
            f'URL: {url}',
            f'Branch: {self.branch}',
            f'Folder: {self.folder}',
            f'Files: {self.files}',
        ])

In [4]:
# Location of the GitHub folder to download from
github_user = 'tjohns94'
github_repo = 'csd230finalproject'
github_branch = 'main'
github_folder = 'data'

# Build the downloader for that folder and print its summary
gd = GithubDownloader(
    _user=github_user,
    _repo=github_repo,
    _branch=github_branch,
    _folder=github_folder,
)

print(gd)

URL: https://github.com/tjohns94/csd230finalproject
Branch: main
Folder: data        
Files: 0     nhis_data_2013-1.csv
1     nhis_data_2013-2.csv
2     nhis_data_2014-1.csv
3     nhis_data_2014-2.csv
4     nhis_data_2015-1.csv
5     nhis_data_2015-2.csv
6     nhis_data_2016-1.csv
7     nhis_data_2016-2.csv
8     nhis_data_2017-1.csv
9     nhis_data_2017-2.csv
10    nhis_data_2018-1.csv
11    nhis_data_2018-2.csv
12    nhis_data_2019-1.csv
13    nhis_data_2019-2.csv
14    nhis_data_2020-1.csv
15    nhis_data_2020-2.csv
16    nhis_data_2021-1.csv
17    nhis_data_2021-2.csv
18    nhis_data_2022-1.csv
19    nhis_data_2022-2.csv
Name: name, dtype: object


In [5]:
# Columns shared by every contributor
var_glob = ['기준년도', '연령대코드(5세단위)']

# Columns assigned to each contributor (the suffix is the contributor's tag)
var_mk = ['총콜레스테롤', '트리글리세라이드', 'HDL콜레스테롤', 'LDL콜레스테롤']
var_jk = ['구강검진수검여부', '치아우식증유무']
var_yy = ['수축기혈압', '이완기혈압', '식전혈당(공복혈당)']
var_jj = ['혈청크레아티닌']
var_jl = ['혈청지오티(AST)', '혈청지피티(ALT)', '감마지티피']
var_tj = ['체중(5kg단위)', '허리둘레', '청력(좌)', '청력(우)', '시력(좌)', '시력(우)']

# Column selection per contributor: the shared columns followed by their own.
# 'all' holds the shared columns followed by every contributor's columns.
var_dict = dict(
    mk=[*var_glob, *var_mk],
    jk=[*var_glob, *var_jk],
    yy=[*var_glob, *var_yy],
    jj=[*var_glob, *var_jj],
    jl=[*var_glob, *var_jl],
    tj=[*var_glob, *var_tj],
    all=[*var_glob, *var_mk, *var_jk, *var_yy, *var_jj, *var_jl, *var_tj],
)

# Alternative header spellings mapped to the spelling used in the lists above
names_fix = {
    # Spacing variants
    '연령대 코드(5세단위)': '연령대코드(5세단위)',
    '총 콜레스테롤': '총콜레스테롤',
    'HDL 콜레스테롤': 'HDL콜레스테롤',
    'LDL 콜레스테롤': 'LDL콜레스테롤',
    '구강검진 수검여부': '구강검진수검여부',
    '수축기 혈압': '수축기혈압',
    '이완기 혈압': '이완기혈압',
    # Different placement of the parentheses
    '(혈청지오티)AST': '혈청지오티(AST)',
    # NOTE(review): this key carries the same Korean prefix as the AST key but maps
    # to the ALT column; presumably it mirrors a raw header. Confirm against the data.
    '(혈청지오티)ALT': '혈청지피티(ALT)',
    '감마 지티피': '감마지티피',
    # Capitalisation / spacing of the unit
    '체중(5Kg단위)': '체중(5kg단위)',
    '체중(5Kg 단위)': '체중(5kg단위)'
}

In [6]:
# Read every listed file and combine them into a single frame
nhis_frames = []
for file in gd.files:
    # No earlier cell downloads the data, so fetch any file that is not on disk yet
    if not gd.check(file):
        gd.download(file)
    # low_memory=False makes pandas infer each column's dtype from the whole file
    temp_data = pd.read_csv(gd._savepath / file, encoding='cp949', low_memory=False)
    # Harmonise header spellings; keys that do not occur in this file are ignored
    nhis_frames.append(temp_data.rename(columns=names_fix))

# Concatenate once instead of growing the frame on every iteration
nhis_data = pd.concat(nhis_frames, ignore_index=True)
# Drop the list so the per-file frames are not kept in memory next to the combined one
del nhis_frames

# Fix 2013 age groups.
# A single .loc call writes into nhis_data itself; the chained form
# nhis_data[mask].loc[...] += 4 only modified a temporary copy.
is_2013 = nhis_data['기준년도'] == 2013
nhis_data.loc[is_2013, '연령대코드(5세단위)'] += 4

In [7]:
# Translate column names to English
kor_to_eng = {
    '기준년도': 'year',
    '연령대코드(5세단위)': 'age_code',
    '총콜레스테롤': 'total_cholesterol',
    '트리글리세라이드': 'triglycerides',
    'HDL콜레스테롤': 'hdl_cholesterol',
    'LDL콜레스테롤': 'ldl_cholesterol',
    '구강검진수검여부': 'oral_exam',
    '치아우식증유무': 'dental_caries',
    '수축기혈압': 'systolic_bp',
    '이완기혈압': 'diastolic_bp',
    '식전혈당(공복혈당)': 'fasting_glucose',
    '혈청크레아티닌': 'serum_creatinine',
    '혈청지오티(AST)': 'serum_got_ast',
    '혈청지피티(ALT)': 'serum_gpt_alt',
    '감마지티피': 'gamma_gtp',
    '체중(5kg단위)': 'weight',
    '허리둘레': 'waist_circum',
    '청력(좌)': 'left_ear',
    '청력(우)': 'right_ear',
    '시력(좌)': 'left_eye',
    '시력(우)': 'right_eye'
}

# Keep only the used variables. The explicit copy makes the result an
# independent frame, so adding a column below does not write into a slice.
nhis_data = nhis_data[var_dict['all']].copy()
# Label each age code with the 5-year range it stands for (code x -> "(x-1)*5-(x*5-1)")
nhis_data['age_group'] = nhis_data['연령대코드(5세단위)'].apply(lambda x: f'{(x-1)*5}-{x*5-1}')

# Work out which output files still have to be written.
# var_dict itself is left untouched so this cell can be re-run; existing
# files are skipped, so delete a file to have it rebuilt.
names_to_save = []
for name in var_dict:
    if (datadir / f'nhis_data_{name}.csv').exists():
        print(f'nhis_data_{name}.csv already exists')
    else:
        names_to_save.append(name)

# Save dataframes to csv files
for name in names_to_save:
    try:
        data = nhis_data[var_dict[name]].rename(columns=kor_to_eng).sort_values(by=['year', 'age_code'])
    except KeyError:
        print(f'No data for {name}')
        continue
    out_path = datadir / f'nhis_data_{name}.csv'
    try:
        data.to_csv(out_path, index=False, encoding='utf-8-sig')
    except OSError as e:
        # Report the failing file and carry on with the remaining ones
        print(f'Could not write {out_path.name}: {e}')

nhis_data_mk.csv already exists
nhis_data_jk.csv already exists
nhis_data_yy.csv already exists
nhis_data_jj.csv already exists
nhis_data_jl.csv already exists
nhis_data_tj.csv already exists
nhis_data_all.csv already exists
