This notebook encodes the race attribute as a one-hot encoded feature.

In [1]:
# Location of the symptoms CSV that is read into `df` further down.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
symptoms_file = "/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/04_06_new_data/data/symptoms_04_16_4.csv"

In [2]:
import numpy as np
import pandas as pd

In [3]:
import os
import json
import pathlib

In [4]:
# Locations of the JSON definition files for symptoms and conditions.
# NOTE(review): absolute, machine-specific paths — adjust before running on another machine.
symptom_db_file = "/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/definitions/qce/symptom_db.json"
condition_db_file = "/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/definitions/qce/condition_db.json"

In [5]:
# Load the symptom and condition definition databases from their JSON files.
# The encoding is stated explicitly: without it open() falls back to the
# platform's locale encoding, which is not UTF-8 on every system.
with open(symptom_db_file, encoding="utf-8") as fp:
    symptom_db = json.load(fp)
with open(condition_db_file, encoding="utf-8") as fp:
    condition_db = json.load(fp)

In [6]:
# Assign every condition code an integer label. The codes are sorted first, so
# the numbering does not depend on the key order inside condition_db.
condition_labels = dict(zip(sorted(condition_db), range(len(condition_db))))

In [7]:
# Assign every symptom code its position in the sorted list of codes, stored
# as a string so the positions can later be joined into one text field.
symptom_map = {
    code: str(position)
    for position, code in enumerate(sorted(symptom_db))
}

In [8]:
# Columns to read from the symptoms CSV.
usecols = [
    'GENDER',
    'RACE',
    'AGE_BEGIN',
    'PATHOLOGY',
    'NUM_SYMPTOMS',
    'SYMPTOMS',
]

In [9]:
# Read the symptoms CSV, restricted to the columns listed in `usecols`.
df = pd.read_csv(symptoms_file, usecols=usecols)

In [10]:
# Keep only the rows whose NUM_SYMPTOMS value is positive.
df = df.loc[df['NUM_SYMPTOMS'] > 0]

In [11]:
# Look up the integer label of each row's pathology; a pathology that is not a
# key of condition_labels yields None.
df['LABEL'] = df['PATHOLOGY'].apply(condition_labels.get)

In [12]:
# Binary-encode gender: 'F' becomes 0, every other value becomes 1.
df['GENDER'] = df['GENDER'].apply(lambda gender: 1 if gender != 'F' else 0)

In [13]:
# Expose the AGE_BEGIN column under the shorter name AGE.
df = df.rename({'AGE_BEGIN': 'AGE'}, axis='columns')

In [14]:
def _symptom_transform(val, labels):
    """
    Val is a string in the form: "symptom_0;symptom_1;...;symptom_n"
    :param val:
    :param labels:
    :return:
    """
    parts = val.split(";")
    res = ",".join([labels.get(item) for item in parts])
    return res

In [15]:
# Rewrite each row's ";"-separated symptom codes as the ","-separated entries
# that symptom_map holds for those codes.
df['SYMPTOMS'] = df['SYMPTOMS'].apply(_symptom_transform, args=(symptom_map,))

In [16]:
# Integer code of each race category, numbered in the order listed here.
RACE_CODE = {
    race: code
    for code, race in enumerate(('white', 'black', 'asian', 'native', 'other'))
}

# One-hot pattern associated with each race:
# white: 00001
# black: 00010
# asian: 00100
# native: 01000
# other: 10000

In [17]:
def transform_race(race):
    """
    One-hot encode a race name as a pandas Series of uint8 values.

    The Series has one element per known race. The element set to 1 is counted
    from the end: 'white' (code 0) sets the last element, 'other' (code 4)
    sets the first.

    :param race: one of 'white', 'black', 'asian', 'native', 'other'
    :return: pd.Series of dtype uint8 with exactly one element equal to 1
    :raises ValueError: if race is not one of the known categories
    """
    race_codes = {'white': 0, 'black': 1, 'asian': 2, 'native': 3, 'other': 4}
    idx = race_codes.get(race)
    if idx is None:
        # An unknown race used to fall through to `5 - None - 1` and fail with
        # a TypeError that did not mention the offending value.
        raise ValueError("unknown race: %r" % (race,))
    num_races = len(race_codes)
    value = np.zeros(num_races, dtype=np.uint8)
    value[num_races - idx - 1] = 1
    return pd.Series(value)

In [18]:
# race_df = df.RACE.transform(transform_race)

In [19]:
# def transform_race_df(row):
#     row['RACE_0'] = row['RACE'] == 'other'
#     row['RACE_1'] = row['RACE'] == 'native'
#     row['RACE_2'] = row['RACE'] == 'asian'
#     row['RACE_3'] = row['RACE'] == 'black'
#     row['RACE_4'] = row['RACE'] == 'white'
    
#     return row

In [20]:
# df_with_race = df.transform(transform_race_df, axis=1)
# The row-wise transform above takes too long.
# The idea is instead to apply the one-hot encoding at train/test time using the SparseMaker.

In [21]:
from thesislib.utils.ml import models

In [22]:
# Build the ThesisSymptomRaceSparseMaker, passing it the number of entries in symptom_db.
# NOTE(review): presumably this maker expands RACE into one-hot columns — confirm in thesislib.
sparsifier = models.ThesisSymptomRaceSparseMaker(len(symptom_db))

In [23]:
# Swap each race name for its integer code from RACE_CODE; a name that is not
# a key of RACE_CODE becomes None. RACE_CODE is keyed by name, so running this
# cell a second time would turn the integer codes into None.
df['RACE'] = df['RACE'].apply(RACE_CODE.get)

In [24]:
# Put the columns in a fixed order, dropping any column not listed, before df
# is handed to the sparse makers.
ordered_keys = [
    'LABEL',
    'GENDER',
    'RACE',
    'AGE',
    'SYMPTOMS',
]
df = df.loc[:, ordered_keys]

In [25]:
# Fit and transform df with the race sparse maker. The comparison cell below
# reads columns 3:8 of `data` as the one-hot race block.
data = sparsifier.fit_transform(df)

In [26]:
# Build the plain ThesisSymptomSparseMaker with the same size argument; its
# output serves as the reference in the comparison cell below.
sparsifier2 = models.ThesisSymptomSparseMaker(len(symptom_db))

In [27]:
# Fit and transform the same df with the reference sparse maker. The comparison
# cell below reads column 2 of `data2` as the integer race code.
data2 = sparsifier2.fit_transform(df)

In [73]:
# this approach, much faster!
labels1 = data[:, 0]
labels2 = data2[:, 0]
labels_diff = np.sum(labels1 != labels2)
label_equal = labels_diff == 0

age1 = data[:, 1]
age2 = data2[:, 3]
age_diff = np.sum(age1 != age2)
age_equal = age_diff == 0

gender1 = data[:, 2]
gender2 = data2[:, 1]
gender_diff = np.sum(gender1 != gender2)
gender_equal = gender_diff == 0

symptoms1 = data[:, 8:]
symptoms2 = data2[:, 4:]
symptoms_diff = np.sum(symptoms1 != symptoms2)
symptoms_equal = symptoms_diff == 0

# need to compare the race, 
race1 = data[:, 3:8]
race1_conv = np.argmax(race1, axis=1)
race2 = data2[:, 2].toarray()
race_equal = np.array_equal(race1_conv, race2)

print("Labels are equal: %d" %label_equal)
print("Age is equal: %d" %age_equal)
print("Gender is equal: %d" %gender_equal)
print("Symptoms are equal: %d" %symptoms_equal)
print("Race is equal: %d" %race_equal)

Labels are equal: 1
Age is equal: 1
Gender is equal: 1
Symptoms are equal: 1
Race is equal: 1


In [74]:
# Number of entries in the symptom database, i.e. the value passed to both sparse makers.
len(symptom_db)

376