/
process.py
130 lines (108 loc) · 5.05 KB
/
process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
'''
Reads and cleans data
'''
import json
import pandas as pd
import numpy as np
# Path to the JSON file mapping ICD codes to long-form diagnosis strings.
DIAGNOSIS_PATH = 'data/icd_codes.json'
# Filename the cleaned DataFrame is exported to.
OUTPUT_FNAME = 'cleaned.csv'
# Column names assigned to the raw CSV on read (header row is replaced).
COL_NAMES = ['AGE', 'AGE_CAT', 'SEX', 'PREGNANT', 'RACE_ETHNICITY',
'TOBACCO', 'INJURY', 'VISIT_REASON_1', 'VISIT_REASON_2',
'VISIT_REASON_3', 'VISIT_REASON_CAT', 'DIAGNOSIS_LONG_1',
'DIAGNOSIS_LONG_2', 'DIAGNOSIS_LONG_3', 'DIAGNOSIS_SHORT_1',
'DIAGNOSIS_SHORT_2', 'DIAGNOSIS_SHORT_3', 'ARTHRITIS',
'ASTHMA', 'CANCER', 'CEREBROVASCULAR_DIS', 'COPD',
'CHRONIC_RENAL_FAIL', 'CONGESTIVE_HEART_FAIL', 'DEPRESSION',
'DIABETES', 'HYPERLIPIDEMIA', 'HYPERTENSION',
'ISCHEMIC_HEART_DIS', 'OBESITY', 'OSTEOPOROSIS',
'NO_CONDITIONS', 'NUM_CONDITIONS', 'HEIGHT_INCHES',
'WEIGHT_POUNDS', 'TEMP_FAHRENHEIT', 'REGION',
'CENSUS_DIVISION', 'STATE']
# Per-column value replacements applied after the generic "-9"/"Blank" -> NaN
# pass: recode capped/sentinel survey values to usable numbers or NaN.
# NOTE(review): the backslash continuation inside the second HEIGHT_INCHES key
# embeds the next line's leading whitespace into the string, so the key is
# "77 inches (capped value for<many spaces>males)" — confirm this matches the
# raw data, otherwise that replacement never fires.
REPLACEMENT_DICT = {'AGE': {"92 years or older": "92", "Under 1 year": "0"},
'TOBACCO': {"3": np.nan},
'INJURY': {"2": np.nan},
'VISIT_REASON_CAT': {"6": "Preventive care"},
'HEIGHT_INCHES': {"72 inches (capped value for females)":
"72", "77 inches (capped value for\
males)": "77"},
'WEIGHT_POUNDS': {"350 lbs. or more": "350"},
'STATE': {"71": np.nan, "72": np.nan, "73": np.nan,
"74": np.nan, "96": "ESC_Div_Remainder",
"97": "WSC_Div_Remainder"}}
# Diagnosis codes that carry no usable information; rows with these are dropped.
BAD_DIAGNOSES = ["V990", "V990-", "V991", "V992", "V997", "-9", "V99", "V97",
np.nan]
# INJURY values that cause a row to be dropped.
BAD_INJURY = ["Yes"]
# Visit-reason strings that are administrative rather than symptomatic.
BAD_SYMPTOMS = ["Blank", "visit", "medication", "counseling", "injury",
"surgery"]
# Columns used for the symptom unpivot and for lowercasing in the output.
VISIT_REASON_COL = 'VISIT_REASON_1'
DIAGNOSIS_COL = 'DIAGNOSIS_SHORT_1'
# NOTE(review): LONG_NAMES appears unused — go_long() builds its column list
# dynamically; presumably a leftover from an earlier fixed-width version.
LONG_NAMES = ['VISIT_REASON_1', 'SYMP1', 'SYMP2', 'SYMP3', 'SYMP4']
def get_diagnosis_map():
    '''
    Load the ICD-code lookup table from the JSON file at DIAGNOSIS_PATH.

    Output:
        Returns a dictionary matching ICD codes to long-form diagnosis strings
    '''
    with open(DIAGNOSIS_PATH, "r") as src:
        return json.load(src)
def read_and_process_data(filename):
    '''
    Reads the raw input csv data, performs cleaning, filling, and replacement
    tasks, exports a csv file of cleaned data, and returns the cleaned df

    Input:
        filename (str): csv to be processed

    Output:
        Returns the cleaned pandas DataFrame (also written to OUTPUT_FNAME)
    '''
    df = pd.read_csv(filename, header=0, names=COL_NAMES, dtype=str)
    # Drop rows whose diagnosis, visit reason, or injury flag is uninformative.
    df.query("DIAGNOSIS_LONG_1 not in @BAD_DIAGNOSES", inplace=True)
    df.query("DIAGNOSIS_SHORT_1 not in @BAD_DIAGNOSES", inplace=True)
    df.query("VISIT_REASON_1 not in @BAD_SYMPTOMS", inplace=True)
    df.query("INJURY not in @BAD_INJURY", inplace=True)
    # na=False: a missing visit reason would otherwise put NaN in the boolean
    # mask, and indexing with an NA-containing mask raises. Missing reasons do
    # not contain 'examination', so they are kept here.
    df = df[~df["VISIT_REASON_1"].str.contains('examination', na=False)]
    # Normalize the survey's sentinel values to NaN, then apply the
    # column-specific recodings. (The original df.fillna(np.nan) was a no-op
    # — filling NaN with NaN — and has been removed.)
    df.replace({"-9": np.nan, "Blank": np.nan}, inplace=True)
    df.replace(REPLACEMENT_DICT, inplace=True)
    dm = get_diagnosis_map()
    # Strip trailing dashes from the ICD code, then map it to its long-form
    # string; codes absent from the map become '' (single pass replaces the
    # original pair of .apply() calls).
    df.loc[:, 'DIAGNOSIS_SHORT_1'] = df['DIAGNOSIS_SHORT_1'].apply(
        lambda x: dm.get(str(x).strip('-'), ''))
    # Attach one row per individual symptom word (see go_long) to each visit.
    sorted_df = go_long(df)
    df = pd.merge(df, sorted_df, on=VISIT_REASON_COL, how='left')
    df.loc[:, VISIT_REASON_COL] = df[VISIT_REASON_COL].str.lower()
    df.loc[:, DIAGNOSIS_COL] = df[DIAGNOSIS_COL].str.lower()
    df.to_csv(OUTPUT_FNAME, index=False)
    return df
def go_long(df):
    '''
    Unpivot comma-separated symptom strings into (string, word) pairs.

    Each visit-reason string is split on commas into its individual
    symptom words; the frame is then melted from wide to long format so
    every row holds (1) the full symptom string and (2) one word found in
    it. This is what makes one-hot encoding of every possible symptom
    possible downstream.

    Input:
        A dataframe

    Output:
        The long dataframe with two columns: the symptom string and each
        symptom found in the string
    '''
    reasons = df[VISIT_REASON_COL]
    # One column per comma-separated fragment, named SYMP1..SYMPn.
    fragments = reasons.str.split(',', expand=True)
    fragments.columns = ['SYMP' + str(i + 1)
                         for i in range(fragments.shape[1])]
    wide = reasons.to_frame().join(fragments)
    symptom_cols = [name for name in wide.columns if name.startswith('SYMP')]
    # Wide -> long: one row per (full string, fragment) pair.
    long_df = wide.melt(id_vars=VISIT_REASON_COL, value_vars=symptom_cols,
                        value_name='KEY')
    long_df = long_df.drop_duplicates()
    # Order by string then by original fragment position before dropping the
    # melt's bookkeeping column and the empty fragments.
    long_df = long_df.sort_values([VISIT_REASON_COL, 'variable'])
    long_df = long_df.drop('variable', axis=1)
    long_df = long_df.dropna()
    # Discard truncated fragments (literal '...' marker, not a regex).
    truncated = long_df['KEY'].str.contains('...', regex=False)
    long_df = long_df[~truncated]
    # Normalize each word: trim surrounding whitespace, lowercase.
    long_df.loc[:, 'KEY'] = long_df['KEY'].str.strip()
    long_df.loc[:, 'KEY'] = long_df['KEY'].str.lower()
    return long_df