# Code to extract and treat the dataset

This code is not treating the data in a deep way because this is a preliminary project.
To treat this big data set we would need to train several neural networks to fill in NaN values.
This is big data set for that reason we need to be careful when applying 'usual' methods as filling 
with the median or any other trick as filling with the highest frequency data element of a column.

__Why do we use a neural network?__

Because we have a big dataset. Big dataset, then neural network. In this case we can train a neural network, deeply. The only problem we have is that we don't computational power or resources and for 
that reason we cannot train the neural network using the entire dataset.

If we would have computational time, online, or in a serve, we could train this network
and reach certain precission when prediction probabilities.

In [1]:
# Libraries

import pandas as pd
import numpy as np



In [2]:
# years on a list 

csv_file_list = 11*[0]

j = 0
for i in range(5,16):
    if i < 10: 
       csv_file_list[j] = '200'+str(i)
    else:
       csv_file_list[j] = '20'+str(i)
    j += 1

csv_file_list
 

['2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015']

In [3]:
# Function which extracts the data from all these years

# col_list is the list of variables you want to extract; example -> col_list = ['sex','race']
# csv_file is the list of years -> These are the years of the files we load.

def extract(col_list,csv_file):
    df2 = pd.DataFrame()
    for i in csv_file:
        df1 = pd.read_csv('archive/' + i + "_data.csv", usecols = col_list)
        df2 = pd.concat([df2,df1], axis=1, sort=False)
        print(i)
        del(df1)
    return df2



In [4]:
csv_file_list = ['2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2013',
 '2014',
 '2015']

# 2012 name is 'icd_code_10' and not 'icd_code_10th_revision' we have to be careful 
# when extracting this data set.

# we can also modify the column on the dataset which is easier. 
# I don't know if you will read the code for that reason I will
# work with the original dataset

In [5]:
# Extracting the data

col_list = ['age_recode_52','education_2003_revision','sex','race','marital_status','manner_of_death','entity_condition_2','icd_code_10th_revision']
df = extract(col_list, ['2005'])

for i in ['2006','2007','2008','2009','2010','2011']:
    df.append(extract(col_list, [i]))
    
df.append(extract(['age_recode_52','education_2003_revision','sex','race','marital_status','manner_of_death','entity_condition_2','icd_code_10'],['2012']).rename(columns={'icd_code_10':'icd_code_10th_revision'}))

for i in ['2013','2014','2015']:
    df.append(extract(col_list, [i]))

2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015


In [6]:
# Categorical variables to dummy variables

# categorical: sex, entity_condition_2, entity_condition_4

categorical = ['sex','marital_status']

def intodummy(df,categorical):
    for name in categorical:  
        dummy = pd.get_dummies(df[name])
        df = pd.concat([df,dummy], axis=1)
        del(dummy)
        df = df.drop(columns=[name])
    return df



In [7]:
df = intodummy(df,categorical)



In [8]:
# We extract the data for our target - in this case 

data = df
data

del(df) # free memory

In [9]:
# Extracting the concrete dataset (I251 or C349 or G309)


#I25.1: Atherosclerotic heart disease 
#C34.9: Malignant neoplasm of bronchus or lung, unspecified
#G30.9: Alzheimer's disease, unspecified 

data = data[ (data['icd_code_10th_revision'] == 'I251') | (data['icd_code_10th_revision'] == 'C349' ) | (data['icd_code_10th_revision'] == 'G309' )]

In [10]:
data['manner_of_death'] = data['manner_of_death'].fillna(int(data['manner_of_death'].median()))
data['age_recode_52'] = data['age_recode_52'].fillna(int(data['age_recode_52'].median())) # data with those three conditions
data['education_2003_revision'] = data['education_2003_revision'].fillna(float(int(data['education_2003_revision'].median()))) # data with those three conditions




In [11]:
# Filling with the most frequent values

data.entity_condition_2.value_counts()

21I251    93233
21C349    31373
61F179    19254
21G309    17765
21I500    14101
          ...  
12C919        1
21T58         1
12G932        1
61K560        1
62I701        1
Name: entity_condition_2, Length: 2927, dtype: int64

In [29]:
# dropping entity conditions which appear residually
# list of these entity conditions

# drop some of these values --> these are changing our result 
# and the info contained there is not relevant to train the neural network

data.entity_condition_2.value_counts() < 100

NameError: name 'data' is not defined

In [12]:
data['entity_condition_2'] = data.entity_condition_2.fillna('21I251')

In [13]:
data




Unnamed: 0,education_2003_revision,age_recode_52,manner_of_death,icd_code_10th_revision,entity_condition_2,race,F,M,D,M.1,S,U,W
4,3.0,39,7.0,C349,61F179,1,1,0,0,1,0,0,0
15,3.0,37,7.0,C349,61J459,1,1,0,0,1,0,0,0
22,3.0,38,7.0,C349,21I251,1,1,0,0,1,0,0,0
30,3.0,42,7.0,C349,21I251,1,1,0,0,0,0,0,1
35,3.0,39,7.0,C349,21C349,3,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2452484,3.0,40,7.0,I251,21I251,1,0,1,0,0,0,1,0
2452486,9.0,44,7.0,I251,21I251,2,1,0,0,0,0,1,0
2452490,9.0,39,7.0,I251,21I251,2,0,1,0,0,0,1,0
2452499,9.0,42,7.0,I251,21I251,2,0,1,0,0,0,1,0


In [14]:
data = intodummy(data,['entity_condition_2'])

In [15]:
data = intodummy(data,['icd_code_10th_revision'])




In [16]:
data.reset_index(drop = True)




Unnamed: 0,education_2003_revision,age_recode_52,manner_of_death,race,F,M,D,M.1,S,U,...,62R54,62R568,62R628,62S720,62T08,62T179,62W80,C349,G309,I251
0,3.0,39,7.0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,3.0,37,7.0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3.0,38,7.0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,3.0,42,7.0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,3.0,39,7.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428942,3.0,40,7.0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
428943,9.0,44,7.0,2,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
428944,9.0,39,7.0,2,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
428945,9.0,42,7.0,2,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


# Classification neural network




In [17]:
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline



In [18]:
Y = pd.concat([data['C349'],data['G309'],data['I251']], axis = 1).values

In [19]:
Y

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       ...,
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1]], dtype=uint8)

In [20]:
X = data.drop(columns = ['C349','G309','I251'], axis =1)
del(data) # free memory
X

Unnamed: 0,education_2003_revision,age_recode_52,manner_of_death,race,F,M,D,M.1,S,U,...,62R13,62R522,62R53,62R54,62R568,62R628,62S720,62T08,62T179,62W80
4,3.0,39,7.0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
15,3.0,37,7.0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
22,3.0,38,7.0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
30,3.0,42,7.0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35,3.0,39,7.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2452484,3.0,40,7.0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2452486,9.0,44,7.0,2,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2452490,9.0,39,7.0,2,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2452499,9.0,42,7.0,2,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [21]:
X = X.values[0:10000]

In [22]:
Y = Y[0:10000]

In [23]:
# Classification neural network

import numpy as np
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

# 0.25

model = Sequential()
model.add(Dense(5000, input_dim=X.shape[1], activation='relu',kernel_initializer='random_normal'))
model.add(Dense(2500,activation='relu',kernel_initializer='random_normal'))
model.add(Dense(2500,activation='relu',kernel_initializer='random_normal'))
model.add(Dense(1250,activation='relu',kernel_initializer='random_normal'))
model.add(Dense(625,activation='relu',kernel_initializer='random_normal'))
model.add(Dense(Y.shape[1],activation='softmax',kernel_initializer='random_normal'))
model.compile(loss='categorical_crossentropy', optimizer=tensorflow.keras.optimizers.Adam(),metrics =['accuracy'])
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto', restore_best_weights=True)
model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=2,epochs=1000)


Epoch 1/1000
235/235 - 44s - loss: 7.5352 - accuracy: 0.4877 - val_loss: 0.9646 - val_accuracy: 0.5428
Epoch 2/1000
235/235 - 44s - loss: 0.8025 - accuracy: 0.6203 - val_loss: 0.7632 - val_accuracy: 0.6464
Epoch 3/1000
235/235 - 44s - loss: 0.7521 - accuracy: 0.6581 - val_loss: 0.7341 - val_accuracy: 0.6768
Epoch 4/1000
235/235 - 44s - loss: 0.7064 - accuracy: 0.6745 - val_loss: 0.6916 - val_accuracy: 0.6884
Epoch 5/1000
235/235 - 43s - loss: 0.6869 - accuracy: 0.6768 - val_loss: 0.7738 - val_accuracy: 0.6488
Epoch 6/1000
235/235 - 43s - loss: 0.6765 - accuracy: 0.6845 - val_loss: 0.7969 - val_accuracy: 0.6472
Epoch 7/1000
235/235 - 43s - loss: 0.6683 - accuracy: 0.6968 - val_loss: 0.8023 - val_accuracy: 0.6472
Epoch 8/1000
235/235 - 44s - loss: 0.6596 - accuracy: 0.6925 - val_loss: 0.6986 - val_accuracy: 0.6844
Epoch 9/1000
Restoring model weights from the end of the best epoch.
235/235 - 44s - loss: 0.6480 - accuracy: 0.6937 - val_loss: 0.6948 - val_accuracy: 0.6892
Epoch 00009: earl

<tensorflow.python.keras.callbacks.History at 0x7fe2674e3a30>

In [24]:
pred = model.predict(x_test)
pred

array([[0.6769034 , 0.05165191, 0.27144465],
       [0.3703455 , 0.23339689, 0.39625758],
       [0.21183269, 0.4122175 , 0.37594977],
       ...,
       [0.49107948, 0.08906168, 0.41985878],
       [0.678543  , 0.02377607, 0.29768097],
       [0.4579829 , 0.21764201, 0.32437506]], dtype=float32)

In [25]:
(pred > 0.5).astype("int32")

array([[1, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       ...,
       [0, 0, 0],
       [1, 0, 0],
       [0, 0, 0]], dtype=int32)

In [26]:
y_test

array([[0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       ...,
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0]], dtype=uint8)

In [27]:
# This code trains a neural network - this is computationally hard (we have a lot of data)
# In other conditions we should save the coefficients of this neural network
# because each execution takes a lot of time
#
# Notice that we can use this neural network to estimate the probability of death
# by certain, concrete, ICD codes (3 of them) for each individidual in a data set (these are predictions)
#
# In our particular project we are going to calculate some of these for the test set
# then we will use the results in our final report.

In [28]:
# Save the results in a txt file

with open('results.txt','wb') as f:
    for line in range(len(pred)):
        np.savetxt(f, pred[line], fmt='%.2f')

with open('y_test.txt','wb') as f:
    for line in range(len(y_test)):
        np.savetxt(f, y_test[line], fmt='%.2f')
        
with open('x_test.txt','wb') as f:
    for line in range(len(x_test)):
        np.savetxt(f, x_test[line], fmt='%.2f')