In [1]:
import os, itertools, pickle
import numpy as np
import pandas as pd

import statsmodels.formula.api as smf

from sklearn import impute, preprocessing

# Directory that holds the SPSS survey file read below and receives the
# pickled analysis frame written at the end of the notebook.
RAWDATA_PATH = "./rawData/"

In [2]:
## 1. Import data
# Participant-level columns requested from the survey file.  Most are raw
# questionnaire codes; info_rename_col below gives them readable names.
info_col = [
    'address',
    'lat',
    'lon',
    "age",
    "A1",
    "A2",
    "A4",
    "A6",
    "A13",
    "B1",
    "C3a",
    "Q5_1",
    "Q5_2",
    "身份证号",
]

# Raw column name -> analysis column name ('address', 'lat' and 'lon' keep
# their original names).
info_rename_col = {
    "age": "Age",
    "A1": "gender",
    "A2": "Ethnic",
    "A4": "Household",
    "A6": "Education",
    "B1": "Smoking",
    "C3a": "Drinking",
    "Q5_1": "Height",
    "Q5_2": "Weight",
    "身份证号": "ID_card_number",
    "A13": "income",
}

# Survey item code -> disease name, one entry per self-reported disease item.
# The insertion order of this mapping defines the order of dis_col.
dis_rename_col = {
    "E3A": "hypertension",
    "E3B": "diabetes",
    "E3CA_A": "hyperlipidemia",
    "E3C_A": "coronary_heart_disease",
    "E3C2_A": "stroke",
    "E3C3_A": "rheumatic heart disease",
    "E3CB_A": "cor pulmonale",
    "E3C4_A": "tuberculosis",
    "E3C5_A": "chronic Bronchitis or emphysema",
    "E3C6_A": "asthma",
    "E3C7_A": "chronic hepatitis/cirrhosis",
    "E3C8_A": "peptic ulcer",
    "E3C9_A": "gastroenteritis",
    "E3C10_A": "gallstones_or_cholecyst",
    "E3C11_A": "fracture",
    "E3C12_A": "rheumatoid_arthritis",
    "E3CC_A": "chronic infectious arthritis",
    "E3C13_A": "intervertebral disc disease",
    "E3C14_A": "mental illness",
    "E3C15_A": "neurasthenia",
    "E3C16_A": "traumatic brain injury",
    "E3C17_A": "malignant_tumor",
}
# Raw disease item codes, in mapping order.
dis_col = [*dis_rename_col]

# Yearly exposure columns, one inner list per exposure family.
# The names are generated from their regular patterns; the 250 group is
# written out literally because its 2016 column is spelled 'NDV_2016_250'
# (no 'I') - presumably that is how the column is named in the data file.
ap_list = [
    *[[f"{pollutant}_year_{year}" for year in range(2015, 2019)]
      for pollutant in ("PM25", "O3", "NO2")],
    ['NDVI_2015_250', 'NDV_2016_250', 'NDVI_2017_250', 'NDVI_2018_250'],
    *[[f"NDVI_{year}_{suffix}" for year in range(2015, 2019)]
      for suffix in (500, 1000)],
    *[[f"wj_{pollutant}_{year}" for year in range(2014, 2019)]
      for pollutant in ("PM1", "PM25", "PM10", "NO2", "O3")],
    [f"t2m_mean{year}" for year in range(2013, 2019)],
]
# Flat list of every yearly exposure column, in family order.
ap_col = list(itertools.chain.from_iterable(ap_list))

# Read only the columns the analysis needs (participant info, disease items
# and yearly exposures) from the SPSS survey file.
sav_path = os.path.join(RAWDATA_PATH, "20230310.sav")
raw_data = pd.read_spss(sav_path, usecols=[*info_col, *dis_col, *ap_col])

# Recode the disease answers to 0/1 ('无'/'否' -> 0, '有'/'是' -> 1).
# The recoded Series is assigned back to the frame on purpose:
# `raw_data[col].replace(..., inplace=True)` operates on a temporary Series,
# which pandas warns about and which is a silent no-op under Copy-on-Write,
# leaving the labels unrecoded.
dis_val_map = {'无':0, '有':1, '是':1, '否':0}
for col_name in dis_col:
    raw_data[col_name] = raw_data[col_name].replace(dis_val_map)

# Drop rows in which every disease column is missing
raw_data = raw_data.dropna(axis=0, how="all", subset=dis_col)
# Drop rows in which every exposure column is missing
raw_data = raw_data.dropna(axis=0, how="all", subset=ap_col)
# Drop rows whose address is the empty string (rows with a missing/NaN
# address do not compare equal to "" and are therefore kept)
raw_data = raw_data.drop(index=raw_data[raw_data["address"]==""].index)

# The province is inferred from the first three characters of the address.
raw_data["address_short"] = raw_data["address"].str.slice(0, 3)

# Three-character address prefix -> province name (Chinese).  Prefixes that
# are not listed here pass through `replace` unchanged.
province_name_map = {
    '重庆市': "重庆市",
    '重庆渝': "重庆市",
    '重庆九': "重庆市",
    '四川省': "四川省",
    '都江堰': "四川省",
    '崇州市': "四川省",
    '简阳市': "四川省",
    '若尔盖': "四川省",
    '贵州省': "贵州省",
    '贵阳市': "贵州省",
    '云南省': "云南省",
    '大理白': "云南省",
    '西藏自': "西藏自治区",
    '西藏拉': "西藏自治区",
}
raw_data["province_chinese"] = raw_data["address_short"].replace(province_name_map)

# Discard rows whose (unmapped) prefix belongs to a province outside the
# five covered by the mapping above.
excluded_prefixes = ['广东省', '浙江省', '北京市', '广西壮', '广西贵', '广西德', '广西南']
outside_study = raw_data["province_chinese"].isin(excluded_prefixes)
raw_data = raw_data.drop(index=raw_data.index[outside_study.to_numpy()])

# Chinese province name -> English name
province_english_map = {
    '重庆市': "Chongqing",
    '四川省': "Sichuan",
    '贵州省': "Guizhou",
    '云南省': "Yunnan",
    '西藏自治区': "Xizang",
}
raw_data["province_english"] = raw_data["province_chinese"].replace(province_english_map)

## 2. Get disease columns sum
# Disease class -> member diseases (renamed column names).  The member
# containers must stay lists because they are used to select DataFrame columns.
dis_class = dict(
    Infectious_disease=['tuberculosis', 'gastroenteritis', 'peptic ulcer'],
    Immunopathy=['rheumatoid_arthritis', 'chronic infectious arthritis'],
    Tumor=['malignant_tumor'],
    Endocrine_nutritional_or_metabolic_disease=['diabetes', 'hyperlipidemia'],
    Mental_disorders=['mental illness', 'neurasthenia'],
    Circulatory_system_disease=[
        'hypertension',
        'coronary_heart_disease',
        'stroke',
        'rheumatic heart disease',
        'cor pulmonale',
    ],
    Respiratory_diseases=['chronic Bronchitis or emphysema', 'asthma'],
    Digestive_system_disease=['chronic hepatitis/cirrhosis', 'gallstones_or_cholecyst'],
    Musculoskeletal_disorders=['intervertebral disc disease'],
    Injury=['traumatic brain injury', 'fracture'],
)
# Give the disease and participant-info columns their readable names
# (the two mappings share no keys, so one combined rename is enough).
raw_data = raw_data.rename(columns={**dis_rename_col, **info_rename_col})

disease_names = list(dis_rename_col.values())
# A missing answer for an individual disease is treated as 0
raw_data = raw_data.fillna(dict.fromkeys(disease_names, 0))

# One indicator per disease class: 1 as soon as any member disease is present
for class_name, members in dis_class.items():
    member_total = raw_data[members].sum(axis=1, skipna=True)
    raw_data[class_name] = member_total.mask(member_total > 0, 1)

# Number of reported diseases per participant
raw_data["dis_sum"] = raw_data[disease_names].sum(axis=1, skipna=True)
disease_count = raw_data["dis_sum"]

# has_dis: 1 if at least one disease is reported
raw_data["has_dis"] = disease_count.mask(disease_count > 0, 1)

# mul_dis: 1 if two or more diseases are reported, otherwise 0
fewer_than_two_zeroed = disease_count.mask(disease_count < 2, 0)
raw_data["mul_dis"] = fewer_than_two_zeroed.mask(fewer_than_two_zeroed >= 2, 1)

def _add_joint_flags(frame, combos):
    """Add one co-occurrence indicator column per disease combination.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding one column per name used in ``combos``; modified in place.
    combos : list of str
        Comma-separated column names, e.g. ``'hypertension,stroke'``.

    Returns
    -------
    pandas.DataFrame
        ``frame`` itself, with one new column per combination named by joining
        the members with ``"AND"``.  The column is 1 where the row sum of the
        members equals the number of members and 0 where it is smaller.
    """
    for combo in combos:
        members = combo.split(",")
        required = len(members)
        total = frame[members].sum(axis=1)
        # Same two-step recode as before (below threshold -> 0, at threshold -> 1);
        # `mask` keeps the dtype of the summed column.
        flag = total.mask(total < required, 0)
        frame["AND".join(members)] = flag.mask(flag == required, 1)
    return frame


# Disease pairs for which a joint indicator is built
two_dis_list = [
    'hypertension,stroke',
    'hypertension,coronary_heart_disease',
    'hypertension,diabetes',
    'hypertension,malignant_tumor',
    'hypertension,Mental_disorders',
    'stroke,coronary_heart_disease',
    'stroke,diabetes',
    'coronary_heart_disease,diabetes',
    'coronary_heart_disease,malignant_tumor',
    'gastroenteritis,gallstones_or_cholecyst',
    'hyperlipidemia,gallstones_or_cholecyst',
    'hypertension,gallstones_or_cholecyst',
    'hypertension,hyperlipidemia',
]

# Disease triples for which a joint indicator is built
three_dis_list = [
    'hypertension,stroke,coronary_heart_disease',
    'hypertension,stroke,diabetes',
    'hypertension,stroke,Mental_disorders',
    'hypertension,coronary_heart_disease,diabetes',
    'hypertension,coronary_heart_disease,malignant_tumor',
    'hypertension,coronary_heart_disease,Mental_disorders',
    'hypertension,diabetes,malignant_tumor',
    'hypertension,diabetes,Mental_disorders',
    'hyperlipidemia,gastroenteritis,gallstones_or_cholecyst',
    'hypertension,hyperlipidemia,gallstones_or_cholecyst',
    'gastroenteritis,gallstones_or_cholecyst,rheumatoid_arthritis',
]

# The pair and triple loops were copy-pasted and differed only in the
# threshold (2 vs 3); the helper takes it from the size of each combination.
_add_joint_flags(raw_data, two_dis_list)
_add_joint_flags(raw_data, three_dis_list)

# Get mean of air pollution
# Short exposure name -> the yearly columns that are averaged into it.
# The NDVI_250 group is spelled out because its 2016 column is named
# 'NDV_2016_250' (no 'I'); the other groups follow a regular pattern.
ap_d = {
    "PM25": [f"PM25_year_{year}" for year in range(2015, 2019)],
    "O3": [f"O3_year_{year}" for year in range(2015, 2019)],
    "NO2": [f"NO2_year_{year}" for year in range(2015, 2019)],
    "NDVI_250": ['NDVI_2015_250', 'NDV_2016_250', 'NDVI_2017_250', 'NDVI_2018_250'],
    "NDVI_500": [f"NDVI_{year}_500" for year in range(2015, 2019)],
    "NDVI_1000": [f"NDVI_{year}_1000" for year in range(2015, 2019)],
    "PM1_wj": [f"wj_PM1_{year}" for year in range(2014, 2019)],
    "PM25_wj": [f"wj_PM25_{year}" for year in range(2014, 2019)],
    "PM10_wj": [f"wj_PM10_{year}" for year in range(2014, 2019)],
    "NO2_wj": [f"wj_NO2_{year}" for year in range(2014, 2019)],
    "O3_wj": [f"wj_O3_{year}" for year in range(2014, 2019)],
    "t2m": [f"t2m_mean{year}" for year in range(2013, 2019)],
}
# Collapse each group of yearly exposure columns into its row-wise mean,
# stored in a new column named after the group's key in ap_d.
raw_data = raw_data.assign(
    **{short_name: raw_data[yearly_cols].mean(axis=1)
       for short_name, yearly_cols in ap_d.items()}
)
    
# Encode the categorical covariates as integer codes stored with category dtype.
# Each recoded Series is assigned back to the frame: calling
# `raw_data[col].replace(..., inplace=True)` acts on a temporary Series, which
# pandas warns about and which silently does nothing under Copy-on-Write.

# gender: '女' -> 0, '男' -> 1
raw_data["gender"] = raw_data["gender"].replace({'女': 0, '男': 1}).astype("category")

# Ethnic: seven groups coded 0-6; the category set is pinned to exactly these
# codes (declared as ordered, as in the original), so any value outside the
# mapping becomes missing.
raw_data["Ethnic"] = (
    raw_data["Ethnic"]
    .replace({'汉族': 0,'藏族': 1,'侗族': 2,'白族': 3,'彝族': 4,'布依族': 5,'苗族': 6})
    .astype("category")
    .cat.set_categories([0, 1, 2, 3, 4, 5, 6], ordered=True)
)

# Smoking: non-smoker -> 0, smoker -> 1, quit -> 2
raw_data["Smoking"] = (
    raw_data["Smoking"]
    .replace({'不吸烟': 0, '吸烟(至今吸烟共计超过100支)': 1, '已戒烟(戒烟超过半年及以上)': 2})
    .astype("category")
)

# Drinking: '否' -> 0, '是' -> 1
raw_data["Drinking"] = raw_data["Drinking"].replace({'否': 0, '是': 1}).astype("category")

# BMI = weight / height^2, with Height divided by 100 first
# (presumably cm -> m and Weight in kg; confirm the questionnaire units)
raw_data["BMI"] = raw_data["Weight"] / (raw_data["Height"]/100)**2


# Weighted comorbidity score ("CCI" - presumably a Charlson-style index):
# malignant tumor counts 2, chronic hepatitis/cirrhosis counts 3, and every
# other disease counts 1.
# The weighted Series are assigned back explicitly; the former chained
# `raw_data[col].replace(..., inplace=True)` is a no-op under pandas
# Copy-on-Write, which would silently turn CCI into an unweighted count.
# NOTE(review): as before, the weights are written into the disease columns
# themselves, so from here on (including in the saved frame) these two columns
# hold 0/2 and 0/3 rather than 0/1 - confirm downstream code expects that.
raw_data["malignant_tumor"] = raw_data["malignant_tumor"].replace({1: 2})
raw_data["chronic hepatitis/cirrhosis"] = raw_data["chronic hepatitis/cirrhosis"].replace({1: 3})
# list(...) for a plain list-like column selector, matching how dis_sum is built
raw_data["CCI"] = raw_data[list(dis_rename_col.values())].sum(axis=1)

def getAgeCat(x):
    """Map an age to its age-band score.

    Bands (upper bounds inclusive): <=40 -> 0, <=50 -> 1, <=60 -> 2,
    <=70 -> 3, above 70 -> 4.

    Parameters
    ----------
    x : number
        Age; may be missing (NaN / None / pd.NA).

    Returns
    -------
    int or float
        The band score, or ``np.nan`` when ``x`` is missing.  Previously a NaN
        age failed every ``<=`` comparison and fell through to the last
        branch, so missing ages were scored as the oldest band (4).
    """
    if pd.isna(x):
        return np.nan
    for score, upper_bound in enumerate((40, 50, 60, 70)):
        if x <= upper_bound:
            return score
    return 4

# Score each participant's age band with getAgeCat
age_band = raw_data["Age"].map(getAgeCat)
raw_data["age_cat"] = age_band

# Age-adjusted comorbidity score: age-band score plus CCI
raw_data["CCI_age"] = raw_data["age_cat"].add(raw_data["CCI"])

In [3]:
# Persist the unscaled analysis frame next to the raw data.
output_path = os.path.join(RAWDATA_PATH, "analysis_data_no_scale.pickle")
with open(output_path, "wb") as out_file:
    pickle.dump(raw_data, out_file)