In [71]:
import pandas as pd
import numpy as np
import pickle
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from keras.preprocessing.text import Tokenizer

from keras.layers import LSTM
from keras.utils.vis_utils import plot_model as plot
from keras.callbacks import TensorBoard, ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils.np_utils import to_categorical
from keras.models import load_model
import matplotlib.pyplot as plt

%matplotlib inline

In [72]:
!ls ../input

9 _class_real_labels.docx  test_text	  training_text
submissionFile		   test_variants  training_variants


In [73]:
train_df = pd.read_csv('../input/training_variants')
test_df = pd.read_csv('../input/test_variants')
train_text_df = pd.read_csv('../input/training_text', sep='\|\|', engine='python', header=None, skiprows=1, names=['ID', 'Text'], encoding='utf-8')
test_text_df = pd.read_csv('../input/test_text', sep='\|\|', engine='python', header=None, skiprows=1, names=['ID', 'Text'],  encoding='utf-8')

In [75]:
train_df.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [76]:
test_df.head()

Unnamed: 0,ID,Gene,Variation
0,0,ACSL4,R570S
1,1,NAGLU,P521L
2,2,PAH,L333F
3,3,ING1,A148D
4,4,TMEM216,G77A


In [77]:
train_text_df.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [78]:
train = pd.merge(train_df, train_text_df, on='ID', how='inner')
test = pd.merge(test_df, test_text_df, on='ID', how='inner')

In [79]:
test.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,ACSL4,R570S,2. This mutation resulted in a myeloproliferat...
1,1,NAGLU,P521L,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,PAH,L333F,Vascular endothelial growth factor receptor (V...
3,3,ING1,A148D,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,TMEM216,G77A,Abstract Retinoblastoma is a pediatric retina...


In [80]:
total = pd.concat([train, test], ignore_index=True)

In [81]:
total.head()

Unnamed: 0,Class,Gene,ID,Text,Variation
0,1.0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations
1,2.0,CBL,1,Abstract Background Non-small cell lung canc...,W802*
2,2.0,CBL,2,Abstract Background Non-small cell lung canc...,Q249E
3,3.0,CBL,3,Recent evidence has demonstrated that acquired...,N454D
4,4.0,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V


In [82]:
total.tail()

Unnamed: 0,Class,Gene,ID,Text,Variation
8984,,SLC46A1,5663,The realization in the late 1970s that RAS har...,R113S
8985,,FOXC1,5664,Hemizygous deletions are common molecular abno...,L130F
8986,,GSS,5665,All most R267W of has with to SMARTpool invest...,R267W
8987,,CTSK,5666,Abstract Blood samples from 125 unrelated fami...,G79E
8988,,DFNB59,5667,"Loss of DNA mismatch repair (MMR) in humans, m...",T54I


In [83]:
# Replace the Text clolumn with the sentences with those only include the correponding Variation value
# Note: Some variations are represend by the 3-letter code in the text like Y371S (Tyr371Ser)

amino_acids = {'a': 'ala', 'r': 'arg', 'n': 'asn', 'd': 'asp', 'c': 'cys', 'q': 'gln', 'e':'glu', 'g': 'gly',\
               'h': 'his', 'i': 'ile', 'l': 'leu', 'k': 'lys', 'm': 'met', 'f': 'phe', 'p':'pro', 's': 'ser', \
               't': 'thr', 'w': 'trp', 'y': 'tyr', 'v': 'val'}

def convert_three_letters(s):
    """Convert one-letter amino aicd mutations to three-letter: Y371S to tyr371ser
    """
    s = list(s.lower())
    if s[0] in amino_acids and s[-1] in amino_acids:
        s[0] = amino_acids[s[0]]
        s[-1] = amino_acids[s[-1]]
    
    return ''.join(s)

def extract_relevant_sentences(text, variation):
    """Extract relevant sentences from the text
       Relevant sentences: 1. Sentences include the Variation name
                          2.
                          3. If not 1, inlcude the word 'mutation'
                          4. 
                          5. If not 1, 2 and 3, use the full text
       text: input string
       variation: gene variation name
       
       return: a list of relevant sentences
    """
    
    sentenses = []
    text = ''.join([i if ord(i) < 128 else ' ' for i in text])
   
    for s in sent_tokenize(text): 
      
        if variation in s or variation[1:-1] in s or 'mutation' in s.lower() or convert_three_letters(variation) in s.lower():
            sentenses.append(s)
    if len(sentenses) == 0:
        sentenses = sent_tokenize(text) 
    
    return sentenses

relevant_Text = []
#for var, text in zip(total['Variation'], total['Text']):
for var, text in zip(train['Variation'], train['Text']):  
    sentenses = extract_relevant_sentences(text, var)                            
    relevant_Text.append(''.join(sentenses))

In [99]:
#total['relevant_Text'] = pd.Series(relevant_Text)
train['relevant_Text'] = pd.Series(relevant_Text)
train['Class'] = train['Class'] - 1

In [105]:
MAX_NB_WORDS = 5000
MAXLEN = 5000
MAX_FEATURES = 200000
BATCH_SIZE = 32

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train['relevant_Text'])

In [106]:

print('Converting text to sequences...')
sequences_train = tokenizer.texts_to_sequences(train['relevant_Text'])

word_index = tokenizer.word_index

print('Preparing data...')
x = sequence.pad_sequences(sequences_train, maxlen=MAXLEN)
y = np.array(train['Class'])

y_binary = to_categorical(y)

Converting text to sequences...
Preparing data...


In [107]:
X_train, X_test, y_train, y_test = train_test_split(x, y_binary, test_size=0.3)

## LSTM recurrent neural networks model ##

In [103]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.layers import Dropout
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split

In [108]:
# Create the model

embedding_length = 32
top_words = 500
model = Sequential()
#model.add(Embedding(top_words, embedding_length, input_length=max_length))
model.add(Embedding(MAX_FEATURES, 128))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(9, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'] )
print(model.summary())
model.fit(X_train, y_train, nb_epoch=30, batch_size=64)

#Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print ('Accuracy: %.2f%%' % (scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 128)         25600000  
_________________________________________________________________
dropout_9 (Dropout)          (None, None, 128)         0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               91600     
_________________________________________________________________
dropout_10 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 9)                 909       
Total params: 25,692,509
Trainable params: 25,692,509
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Ep

In [109]:
from keras.models import model_from_json
# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk


In [None]:
# load json and create model
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

In [110]:
#Prepare test data set
relevant_Text = []
#for var, text in zip(total['Variation'], total['Text']):
for var, text in zip(test['Variation'], test['Text']):  
    sentenses = extract_relevant_sentences(text, var)                            
    relevant_Text.append(''.join(sentenses))

In [111]:
test['relevant_Text'] = pd.Series(relevant_Text)

In [112]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(test['relevant_Text'])

In [113]:
print('Converting text to sequences...')
sequences_test = tokenizer.texts_to_sequences(test['relevant_Text'])

word_index = tokenizer.word_index

print('Preparing test data...')
x_pred = sequence.pad_sequences(sequences_test, maxlen=MAXLEN)

Converting text to sequences...
Preparing test data...


In [115]:
# Predict for the test dataset
prediction = model.predict(x_pred)

In [117]:
prediction.shape

(5668, 9)

In [120]:
type(prediction)

numpy.ndarray

In [124]:
sub = pd.DataFrame(prediction, columns=['class1', 'class2', 'class3', 'class4', 'class5', \
                                        'class6', 'class7', 'class8', 'class9'])

sub = pd.concat([test_df['ID'], sub], axis=1)

#sub.head()

sub.to_csv('LSTM_model1.csv', index=False)