In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
from gensim.models import Word2Vec

In [3]:
# Load the working dataframe from disk; later cells read its
# READMISSION_STATUS and TEXT_AGG columns.
df_adm_dis_sum=pd.read_csv('df_adm_dis_sum.csv')

In [4]:
# Binary target: 1 when READMISSION_STATUS equals 'Readmitted', otherwise 0.
# Item assignment is used on purpose: `df.readmitted = ...` would only attach a
# plain Python attribute to the DataFrame object (pandas warns about this) and
# the values would not become a column, so they would be lost on any copy,
# filter or save of the frame.
df_adm_dis_sum['readmitted'] = np.where(
    df_adm_dis_sum['READMISSION_STATUS'] == 'Readmitted', 1, 0
)

  """Entry point for launching an IPython kernel.


## Train a word2vec model on the MIMIC-III dataset

In [None]:
# Build the training corpus: one whitespace-split list of words per TEXT_AGG row.
# Iterating over the column is positional, so it does not depend on the
# dataframe having a 0..n-1 index the way `TEXT_AGG[i]` does.
corpus = [note.split() for note in df_adm_dis_sum['TEXT_AGG']]

# Train 300-dimensional word2vec embeddings.
# `Word2Vec` is the name imported via `from gensim.models import Word2Vec`;
# the bare `gensim` module is not imported in the visible cells, so
# `gensim.models.Word2Vec(...)` would raise NameError on a fresh kernel.
word2vec_paragraph_model = Word2Vec(sentences=corpus, vector_size=300)

# Size of vocabulary
len(word2vec_paragraph_model.wv)

In [None]:
# Persist the trained model so that the later "vector matrix" cell can reload
# it with Word2Vec.load instead of retraining.
word2vec_paragraph_model.save("word2vec_paragraph.model")

## Sequencing, padding and creating vector matrix

### Tokenizer

In [None]:
# Check the maximum length of the aggregated texts
# (note: len() on the raw TEXT_AGG string counts characters, not tokens)
# max_length=0
# for i in range(len(df_adm_dis_sum)):
#    length=len(df_adm_dis_sum.TEXT_AGG[i])
#    if length>max_length:
#       max_length=length
# max_length

In [5]:
# According to the author's measurement the longest text is nearly 50000,
# which is way too large, so every sequence is capped at 4000 entries.
maxlength = 4000

from tensorflow.keras.preprocessing import sequence, text
from keras.preprocessing.text import Tokenizer

# Learn the word -> integer index vocabulary from all aggregated notes.
t = Tokenizer()
t.fit_on_texts(df_adm_dis_sum["TEXT_AGG"])

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


### Sequencing and padding

In [6]:
# Replace every note by the list of integer word indices learned by the tokenizer.
sequencing = t.texts_to_sequences(df_adm_dis_sum["TEXT_AGG"])

In [7]:
# Bring every sequence to exactly `maxlength` entries. The Keras defaults are
# spelled out: shorter sequences are zero-padded at the front, longer ones keep
# only their last `maxlength` entries.
padding = sequence.pad_sequences(
    sequencing,
    maxlen=maxlength,
    padding="pre",
    truncating="pre",
)

In [8]:
# Cache the padded sequences on disk (np.save appends the ".npy" extension).
np.save(file='dis_sum_padding', arr=padding)

### Embedding (vector) matrix

In [11]:
# Reload the trained word2vec model and build a plain dict that maps each word
# of its vocabulary to the corresponding embedding vector.
word2vec_paragraph_model = Word2Vec.load("word2vec_paragraph.model")
vocab_index = word2vec_paragraph_model.wv.key_to_index
word_vec_dict = {
    key: word2vec_paragraph_model.wv.get_vector(key)
    for key in vocab_index
}

In [12]:
# Build the embedding matrix: row i holds the word2vec vector of the word whose
# tokenizer index is i. One extra row is allocated (hence the +1) so that
# index 0, the value used for padding, maps to an all-zero vector.
vocab_size = len(t.word_index) + 1
embed_dim = 300

# The matrix must be floating point. With an integer dtype such as int8 every
# assigned word2vec vector is truncated towards zero on assignment, which
# wipes out the learned embeddings.
embed_matrix = np.zeros(shape=(vocab_size, embed_dim), dtype=np.float32)

for word, i in t.word_index.items():
    embed_vector = word_vec_dict.get(word)
    if embed_vector is not None:  # word is in the vocabulary learned by the w2v model
        embed_matrix[i] = embed_vector
    # Words unknown to the word2vec model keep their all-zero row.
    # NOTE(review): the tokenizer and the word2vec corpus are tokenised
    # differently (Tokenizer defaults vs. plain str.split), so some words may
    # miss a match only because of case/punctuation - worth confirming.

In [13]:
# Store the embedding matrix next to the notebook for later reuse.
np.save(file='embed_matrix.npy', arr=embed_matrix)

In [15]:
# Sanity check: expected shape is (len(t.word_index) + 1, embed_dim).
embed_matrix.shape

(118556, 300)

## Split the dataset into a training set (with subsampling) and a test set

In [47]:
import keras

# One-hot encode the binary readmission label into two columns
# (column 0 = class 0, column 1 = class 1).
labels = df_adm_dis_sum.readmitted
y = keras.utils.to_categorical(labels, 2)

In [48]:
# Glue the two one-hot label columns to the right of the padded sequences so
# that features and labels stay row-aligned through splitting and subsampling.
feature_and_label_blocks = [padding, y]
pre_sub = np.concatenate(feature_and_label_blocks, axis=1)

In [52]:
# Hold out 20% of the rows as the test set; the remaining 80% is the training
# pool that gets subsampled afterwards. The fixed seed keeps the split stable.
train_pre_sub, test = train_test_split(
    pre_sub,
    test_size=0.20,
    random_state=42,
)

In [57]:
# Split the pre-subsampling training set by readmitted and non-readmitted.
# The last column is the second one-hot label column, i.e. it equals 1 exactly
# for readmitted rows.
# A boolean mask replaces the row-by-row Python loop: it yields the same rows
# in the same order, avoids building Python lists of arrays, and keeps a 2-D
# (0, n_columns) shape when one class is empty so that later concatenation
# still works.
is_readmitted = train_pre_sub[:, -1] == 1

train_readm = train_pre_sub[is_readmitted]
train_nreadm = train_pre_sub[~is_readmitted]

In [62]:
# Sub-sample the non-readmitted part down to the number of readmitted rows and
# concatenate the two sets into one balanced array.
# A dedicated, seeded generator makes the subsample reproducible across kernel
# restarts (the seed matches the random_state used for the train/test split).
subsample_rng = np.random.default_rng(42)
random_indices = subsample_rng.choice(
    train_nreadm.shape[0], size=len(train_readm), replace=False
)
# NOTE(review): rows are ordered readmitted-first and are not shuffled here;
# confirm that the training code shuffles before any tail-based validation split.
train_sub = np.concatenate((train_readm, train_nreadm[random_indices, :]), axis=0)

In [72]:
# The last two columns are the one-hot label; everything before them is the
# padded token sequence.
X_train, y_train = train_sub[:, :-2], train_sub[:, -2:]
X_test, y_test = test[:, :-2], test[:, -2:]

In [73]:
# Write the four arrays to disk (np.save appends the ".npy" extension).
for filename, array in (
    ('X_train_padding', X_train),
    ('y_train_padding', y_train),
    ('X_test_padding', X_test),
    ('y_test_padding', y_test),
):
    np.save(filename, array)