In [5]:
import numpy as np
import pandas as pd

from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, AveragePooling1D, AveragePooling2D,TimeDistributed
from keras.layers import LSTM, SpatialDropout1D, Merge
from keras import backend
np.random.seed(0)
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from sklearn.preprocessing import MultiLabelBinarizer


[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
#load data
train = pd.read_csv('data/features/features.csv', sep=',', header=0)

#y labels
useful = train["useful"]
cancer_types_raw = train["cancer_types"]
cancer_types_raw = [re.sub(r'[^a-z, ]', '', s.lower()) for s in cancer_types_raw.values]
cancer_types_raw = np.array([s.split(",") for s in cancer_types_raw])

#text features
text = train['abstract']+train['fulltitle']+train['subtitle']
text = text.values.astype(dtype=str)

#only useful examples have a cancer type
idxthere = np.nonzero(train['useful'] == 1)[0]
text = text[idxthere]
cancer_types_raw = cancer_types_raw[idxthere]

#prepare Y
mlb = MultiLabelBinarizer()
cancer_types = mlb.fit_transform(cancer_types_raw)
    

num_labels = len(np.unique(useful))

#text pre-processing
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
stop_words.remove('but')
stop_words.remove('not')
stop_words.remove('no')
stemmer = SnowballStemmer('english')

print("pre-processing train docs...")
processed_docs_train = []
for doc in text:
    tokens = word_tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    stemmed = [stemmer.stem(word) for word in filtered]
    processed_docs_train.append(stemmed)

dictionary = corpora.Dictionary(processed_docs_train)
dictionary_size = len(dictionary.keys())
print("dictionary size: ", dictionary_size)
#dictionary.save('dictionary.dict')
#corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

print("converting to token ids...")
word_id_train, word_id_len = [], []
for doc in processed_docs_train:
    word_ids = [dictionary.token2id[word] for word in doc]
    word_id_train.append(word_ids)
    word_id_len.append(len(word_ids))

seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)

#pad sequences
word_id_train = sequence.pad_sequences(np.array(word_id_train), maxlen=seq_len)
#y_train_enc = np_utils.to_categorical(sentiment_train, num_labels)
#y_train_enc = (sentiment_train-2)/2 #map down to: [-1,1]

pre-processing train docs...
dictionary size:  126016
converting to token ids...


In [12]:
nb_classes = cancer_types.shape[1]
#LSTM (using text from abstract+title)
model = Sequential()
model.add(Embedding(dictionary_size, 128))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(128, recurrent_dropout=0.2, dropout=0.2))
model.add(Dense(nb_classes, kernel_initializer="uniform", activation = 'sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 128)         16130048  
_________________________________________________________________
spatial_dropout1d_6 (Spatial (None, None, 128)         0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 87)                11223     
Total params: 16,272,855.0
Trainable params: 16,272,855.0
Non-trainable params: 0.0
_________________________________________________________________


In [13]:
random_idx = np.arange(word_id_train.shape[0])
np.random.shuffle(random_idx)
word_id_train = word_id_train[random_idx,:]
useful = useful[random_idx]

In [None]:
from keras.callbacks import EarlyStopping
print("fitting LSTM ...")
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model.fit(word_id_train, cancer_types, epochs=1, batch_size=64, verbose=1, validation_split=0.2, callbacks=[early_stopping])
test_pred = model.predict_classes(word_id_test)

fitting LSTM ...
Train on 16870 samples, validate on 4218 samples
Epoch 1/1

In [54]:
test_pred = model.predict(word_id_test)
test_pred_conv = np.around((test_pred*2)+2)

In [63]:
#make a submission
test_df['Sentiment'] = test_pred_conv.astype(int)
header = ['PhraseId', 'Sentiment']
test_df.to_csv('./lstm_sentiment3.csv', columns=header, index=False, header=True)