In [8]:
import numpy as np
import pandas as pd

from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, AveragePooling1D, AveragePooling2D,TimeDistributed
from keras.layers import LSTM, SpatialDropout1D, Merge
from keras import backend
np.random.seed(0)
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from sklearn.preprocessing import MultiLabelBinarizer
import sklearn as sk

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#load data
train = pd.read_csv('data/features/features.csv', sep=',', header=0)

#y labels
useful = train["useful"]
cancer_types_raw = train["cancer_types"]
cancer_types_raw = [re.sub(r'[^a-z, ]', '', s.lower()) for s in cancer_types_raw.values]
cancer_types_raw = np.array([s.split(",") for s in cancer_types_raw])

#text features
text = train['abstract']+train['fulltitle']+train['subtitle']
text = text.values.astype(dtype=str)

#only useful examples have a cancer type
idxthere = np.nonzero(train['useful'] == 1)[0]
text = text[idxthere]
cancer_types_raw = cancer_types_raw[idxthere]

#prepare Y
mlb = MultiLabelBinarizer()
cancer_types = mlb.fit_transform(cancer_types_raw)
    

num_labels = len(np.unique(useful))

#text pre-processing
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
stop_words.remove('but')
stop_words.remove('not')
stop_words.remove('no')
stemmer = SnowballStemmer('english')

print("pre-processing train docs...")
processed_docs_train = []
for doc in text:
    tokens = word_tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    stemmed = [stemmer.stem(word) for word in filtered]
    processed_docs_train.append(stemmed)

dictionary = corpora.Dictionary(processed_docs_train)
dictionary_size = len(dictionary.keys())
print("dictionary size: ", dictionary_size)
#dictionary.save('dictionary.dict')
#corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

print("converting to token ids...")
word_id_set, word_id_len = [], []
for doc in processed_docs_train:
    word_ids = [dictionary.token2id[word] for word in doc]
    word_id_set.append(word_ids)
    word_id_len.append(len(word_ids))

seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)

#pad sequences
word_id_set = sequence.pad_sequences(np.array(word_id_set), maxlen=seq_len)

#y_train_enc = np_utils.to_categorical(sentiment_train, num_labels)
#y_train_enc = (sentiment_train-2)/2 #map down to: [-1,1]

pre-processing train docs...
dictionary size:  126016
converting to token ids...


In [4]:
nb_classes = cancer_types.shape[1]

#shuffle data set
random_idx = np.arange(word_id_set.shape[0])
np.random.shuffle(random_idx)
word_id_set = word_id_set[random_idx,:]
cancer_types = cancer_types[random_idx]

#split train/test
word_id_test = word_id_set[:1000,]
y_test = cancer_types[:1000,]
word_id_train = word_id_set[1000:,]
y_train = cancer_types[1000:,]

In [29]:
print(word_id_test.shape)
print(word_id_train.shape)

(1000, 334)
(20088, 334)


In [5]:
#LSTM (using text from abstract+title)
model = Sequential()
model.add(Embedding(dictionary_size, 128))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(128, recurrent_dropout=0.2, dropout=0.2))
model.add(Dense(nb_classes, kernel_initializer="uniform", activation = 'sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         16130048  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, None, 128)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 87)                11223     
Total params: 16,272,855.0
Trainable params: 16,272,855.0
Non-trainable params: 0.0
_________________________________________________________________


In [30]:
from keras.callbacks import EarlyStopping
print("fitting LSTM ...")
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model.fit(word_id_train, y_train, epochs=1, batch_size=64, verbose=1, validation_split=0.2, callbacks=[early_stopping])


fitting LSTM ...
Train on 16070 samples, validate on 4018 samples
Epoch 1/1


<keras.callbacks.History at 0x7fa6c8ffa3c8>

In [31]:
#preds = model.predict_proba(X_test)[:,1]
#test_pred = model.predict_classes(word_id_test)
#predicted = model.predict_classes(word_id_test)
predicted = model.predict(word_id_test)


In [33]:
print(predicted.shape)
print(predicted[:3])


(1000, 87)
[[ 0.00047806  0.00070874  0.00044131  0.0007472   0.00795426  0.00058558
   0.0015318   0.00052567  0.00045287  0.16891794  0.00037305  0.00043488
   0.00288673  0.00463029  0.02073579  0.01712375  0.0085775   0.01151209
   0.00735951  0.05512116  0.00316391  0.02694018  0.00039511  0.00046615
   0.00837205  0.00041351  0.00135769  0.08239596  0.03574406  0.000507
   0.03991226  0.01171867  0.00034598  0.11833775  0.00155826  0.05685746
   0.09894162  0.00042015  0.00056793  0.03071534  0.0018516   0.00042776
   0.0086377   0.01284018  0.00081857  0.00218752  0.0028321   0.00377838
   0.00055412  0.00039918  0.00442178  0.0012632   0.03136571  0.00036981
   0.0587365   0.05044375  0.00058983  0.00086663  0.03813618  0.05539796
   0.00301589  0.00268666  0.00062401  0.00160329  0.00149038  0.05383369
   0.00044662  0.10374559  0.0004661   0.00035023  0.00039408  0.0026414
   0.00167218  0.01747153  0.00809463  0.00273501  0.00051153  0.00038946
   0.00042459  0.00283815  0.0

In [26]:
word_id_test[7]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [34]:
predicted[predicted>=0.1] = 1
predicted[predicted<0.1] = 0
sk.metrics.accuracy_score(y_test, predicted)

0.001

In [35]:
pred_labels = mlb.inverse_transform(predicted)
true_labels = mlb.inverse_transform(y_test)
#print(true_labels)
for i in range(0,100):
    print(pred_labels[i])
    print(true_labels[i])
    print("################")

('basis', 'lunge', 'prostata')
('lunge',)
################
('basis', 'lunge', 'prostata')
('lunge',)
################
('basis', 'lunge', 'prostata')
('basis',)
################
('basis', 'lunge', 'prostata')
('basis',)
################
('basis', 'lunge', 'prostata')
('magen',)
################
('basis', 'lunge', 'prostata')
('leber',)
################
('basis', 'lunge', 'prostata')
('basis',)
################
('basis', 'lunge', 'prostata')
('basis', 'schilddruese')
################
('basis', 'lunge', 'prostata')
('niere',)
################
('basis', 'lunge', 'prostata')
('lunge', 'nsclc')
################
('basis', 'lunge', 'prostata')
('pankreas',)
################
('basis', 'lunge', 'prostata')
('pankreas',)
################
('basis', 'lunge', 'prostata')
('leber',)
################
('basis', 'lunge', 'prostata')
('lunge', 'nsclc')
################
('basis', 'lunge', 'prostata')
('glioblastom', 'hirn')
################
('basis', 'lunge', 'prostata')
('oesophagus',)
################
(