In [1]:
import numpy as np
from sklearn.feature_extraction import FeatureHasher
import pandas as pd

# Prepare Training data

In [103]:
local_raw_data = '../data/ordered_ex1_train.csv'

In [104]:
raw_data_df = pd.read_csv(local_raw_data)

In [105]:
# target (percentage of having adverse events) = number_of_sae_subjects / enrollment
raw_data_df['target'] = raw_data_df['number_of_sae_subjects'] / raw_data_df['enrollment']

In [106]:
# drop number_of_sae_subjects and enrollment
raw_data_df.drop(raw_data_df.columns[[7,8]], axis=1, inplace=True)

In [107]:
# convert dataframe to ndarray
raw_data = raw_data_df.as_matrix()

In [108]:
# map has_us_facilities value ('t','f') -> (1, 0)
raw_data[raw_data[:,4] == 't',4]= 1
raw_data[raw_data[:,4] == 'f',4]= 0

## Feature hashing

In [130]:
def merge_category(id_cat):
    output = []
    prev_id = None
    tmp = {}
    for nct_id, cat in id_cat:
        if prev_id is None or prev_id == nct_id:
            if prev_id is None:
                prev_id = nct_id
            tmp[cat] = 1
        else:
            output.append(tmp)
            # reset
            tmp = {}
            tmp[cat] = 1
            prev_id = nct_id
    output.append(tmp)
    print('output number of merged unique ids: {}'.format(len(output)))
    return output

### Convert conditions, interventions and countries to hash feature

In [135]:
unique_ids = np.unique(raw_data[:,0])
number_of_uniqueID = unique_ids.shape[0]
print('number of unique nct_ids: {}'.format(number_of_uniqueID))

number of unique nct_ids: 16677


In [158]:
conditions = raw_data[:,[0,1]]
interventions = raw_data[:,[0,2]]
countries = raw_data[:,[0,5]]

> TODO: draw frequency distribution of the three features

In [164]:
interventions[:,1]

array(['ganciclovir', 'ganciclovir', 'ganciclovir', ..., 'oxycodone',
       'fentanyl', 'adenosine'], dtype=object)

In [159]:
# preprocess the high dimentional features before feed into feature hasher
merged_conditions = merge_category(conditions)
merged_interventions = merge_category(interventions)
merged_countries = merge_category(countries)

output number of merged unique ids: 16677
output number of merged unique ids: 16677
output number of merged unique ids: 16677


In [166]:
number_of_conditions = np.unique(conditions[:,1]).shape[0]
print('number of unique conditions: {}'.format(number_of_conditions))

number_of_interventions = np.unique(interventions[:,1]).shape[0]
print('number of unique interventions: {}'.format(number_of_interventions))

number_of_countries = np.unique(countries[:,1]).shape[0]
print('number of unique countries: {}'.format(number_of_countries))

number of unique conditions: 1909
number of unique interventions: 1846
number of unique countries: 142


In [181]:
# feature hasher
conditions_hasher = FeatureHasher(n_features=int(number_of_conditions * 0.2),
                                                             non_negative=True,input_type='dict')
interventions_hasher = FeatureHasher(n_features=int(number_of_interventions * 0.2),
                                                             non_negative=True,input_type='dict')
countries_hasher = FeatureHasher(n_features=int(number_of_countries),
                                                             non_negative=True,input_type='dict')

In [182]:
# apply feature hashing
conditions_feature = conditions_hasher.fit_transform(merged_conditions).toarray()
print('conditions_feature shape: {}'.format(conditions_feature.shape))

interventions_feature = interventions_hasher.fit_transform(merged_interventions).toarray()
print('interventions_feature shape: {}'.format(interventions_feature.shape))

countries_feature = countries_hasher.fit_transform(merged_countries).toarray()
print('countries_feature shape: {}'.format(countries_feature.shape))

conditions_feature shape: (16677, 381)
interventions_feature shape: (16677, 369)
countries_feature shape: (16677, 142)


In [90]:
np.unique(interventions[:,1]).shape

(1846,)

### Appending the hashed feature to training data

In [193]:
raw_data[0]

array(['NCT00000143', 'hiv infections', 'ganciclovir', 19, 1,
       'United States', 1, 0.0], dtype=object)

In [202]:
# drop the old conditions, interventions and countries
prev_id = None
new_data = []
tmp = []
idx = 0
for data in raw_data:
    cur_id = data[0]
    if prev_id is None or cur_id != prev_id:
        tmp.append(cur_id)
        tmp.append(data[3])
        tmp.append(data[4])
        tmp.append(data[6])
        
        tmp += conditions_feature[idx].tolist()
        tmp += interventions_feature[idx].tolist()
        tmp += countries_feature[idx].tolist()
        
        tmp.append(data[7])
        
        new_data.append(tmp)
        
        # update
        prev_id = cur_id
        tmp = []
        idx += 1

# new data shape: nct_id, number_of_facilities, has_us_facility, 
# number_of_sponsors, conditions_features, interventions_features, contries_features, percentage_of_adverse_event (target)
new_data = np.array(new_data)
print('reconstructed training data shape: {}'.format(new_data.shape))

reconstructed training data shape: (16677, 897)


> note: remove the first nct_id column before send to model

In [206]:
# randomly shuffle the data before categorization
np.random.shuffle(new_data)

In [205]:
train_size = int(new_data.shape[0] * 0.7)
train_features  = new_data[:train_size, :-1]
train_features

array([['NCT00000143', '19', '1', ..., '0.0', '0.0', '0.0'],
       ['NCT00000378', '1', '1', ..., '0.0', '0.0', '0.0'],
       ['NCT00000620', '7', '1', ..., '0.0', '0.0', '0.0'],
       ...,
       ['NCT01295320', '11', '0', ..., '0.0', '0.0', '0.0'],
       ['NCT01295814', '1', '1', ..., '0.0', '0.0', '0.0'],
       ['NCT01295840', '1', '0', ..., '0.0', '0.0', '0.0']], dtype='<U22')

In [208]:
train_size = int(new_data.shape[0] * 0.7)
train_features  = new_data[:train_size, :-1]
train_labels = new_data[:train_size, -1]
print('train_features shape: {}'.format(train_features.shape))

validation_size = int(new_data.shape[0] * 0.2)
validation_features = new_data[train_size:train_size + validation_size, :-1]
validation_labels = new_data[train_size:train_size + validation_size, -1]
print('validation_features shape: {}'.format(validation_features.shape))

test_features = new_data[train_size + validation_size:, :-1]
test_labels = new_data[train_size + validation_size:, -1]
print('test_features shape: {}'.format(test_features.shape))

train_features shape: (11673, 896)
validation_features shape: (3335, 896)
test_features shape: (1669, 896)


## Data Convertion
convert the numpy array into recordIO-protobuf or CSV format that can be used by sagemaker linear_learner
model specs: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html