In [1]:
from __future__ import print_function
import os
import re
import tqdm
import string
import pandas as pd
import numpy as np
import keras


Using TensorFlow backend.


In [2]:
# Read Data
train_variant = pd.read_csv("../../Downloads/Dataset/keras/training_variants")
test_variant = pd.read_csv("../../Downloads/Dataset/keras/test_variants")
train_text = pd.read_csv("../../Downloads/Dataset/keras/training_text", sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])
test_text = pd.read_csv("../../Downloads/Dataset/keras/test_text", sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])

In [3]:
# Create train dataset
train = pd.merge(train_variant, train_text, how='left', on='ID')
train_y = train['Class'].values
train_x = train.drop('Class', axis=1)
train_size=len(train_x)

In [4]:
# Create test dataset
test_x = pd.merge(test_variant, test_text, how='left', on='ID')
test_size=len(test_x)

In [5]:
test_index = test_x['ID'].values

In [6]:
# all dataset
all_data = np.concatenate((train_x, test_x), axis=0)
all_data = pd.DataFrame(all_data)
all_data.columns = ["ID", "Gene", "Variation", "Text"]
all_data.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...


In [7]:
# Data Preprocessing
from nltk.corpus import stopwords
from gensim.models.doc2vec import LabeledSentence
from gensim import utils

In [8]:
def constructLabeledSentences(data):
    sentences=[]
    for index, row in data.iteritems():
        sentences.append(LabeledSentence(utils.to_unicode(row).split(), ['Text' + '_%s' % str(index)]))
    return sentences

def textClean(text):
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", str(text))
    text = text.lower().split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]    
    text = " ".join(text)
    return(text)
def cleanup(text):
    text = textClean(text)
    text= text.translate(str.maketrans("","", string.punctuation))
    return text

In [9]:
allText = all_data['Text'].apply(cleanup)

In [10]:
sentences = constructLabeledSentences(allText)


  after removing the cwd from sys.path.


In [11]:
# Data Preparation and Features Extraction
# 3.1 Text Featurizer using Doc2Vec
from gensim.models import Doc2Vec


In [12]:
Text_INPUT_DIM=300
text_model=None
filename='docEmbeddings_5_clean.d2v'
if os.path.isfile(filename):
    text_model = Doc2Vec.load(filename)
else:
    text_model = Doc2Vec(min_count=1, window=5, size=Text_INPUT_DIM, 
                         sample=1e-4, negative=5, workers=4, iter=5,seed=1)
    test = text_model.build_vocab(sentences)
    test = text_model.train(sentences, total_examples=text_model.corpus_count, epochs=text_model.iter)
    text_model.save(filename)

In [13]:
text_train_arrays = np.zeros((train_size, Text_INPUT_DIM))
text_test_arrays = np.zeros((test_size, Text_INPUT_DIM))


In [14]:
for i in range(train_size):
    text_train_arrays[i] = text_model.docvecs['Text_'+str(i)]
j=0
for i in range(train_size,train_size+test_size):
    text_test_arrays[j] = text_model.docvecs['Text_'+str(i)]
    j=j+1
 

In [15]:
from sklearn.decomposition import TruncatedSVD
Gene_INPUT_DIM=25

svd = TruncatedSVD(n_components=25, n_iter=Gene_INPUT_DIM, random_state=12)

In [16]:
one_hot_gene = pd.get_dummies(all_data['Gene'])
truncated_one_hot_gene = svd.fit_transform(one_hot_gene.values)

In [17]:
one_hot_variation = pd.get_dummies(all_data['Variation'])
truncated_one_hot_variation = svd.fit_transform(one_hot_variation.values)

In [18]:
# Output class encoding
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
test = label_encoder.fit(train_y)
encoded_y = np_utils.to_categorical((label_encoder.transform(train_y)))
print(encoded_y[0])


[1. 0. 0. 0. 0. 0. 0. 0. 0.]


In [19]:
# Merge input features
train_set=np.hstack((truncated_one_hot_gene[:train_size],truncated_one_hot_variation[:train_size],text_train_arrays))
test_set=np.hstack((truncated_one_hot_gene[train_size:],truncated_one_hot_variation[train_size:],text_test_arrays))

In [20]:
# Define Keras Model
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Embedding, Input, RepeatVector
from keras.optimizers import SGD


def baseline_model():
    model = Sequential()
    model.add(Dense(256, input_dim=Text_INPUT_DIM+Gene_INPUT_DIM*2, init='normal', activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(256, init='normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(80, init='normal', activation='relu'))
    model.add(Dense(9, init='normal', activation="softmax"))
    
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)  
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
    return model

In [21]:
model = baseline_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               89856     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 80)                20560     
_________________________________________________________________
dense_4 (Dense)              (None, 9)                 729       
Total params: 176,937
Trainable params: 176,937
Non-trainable params: 0
_________________________________________________________________


  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]
  


In [22]:
estimator=model.fit(train_set, encoded_y, validation_split=0.1, epochs=10, batch_size=64)

Train on 2988 samples, validate on 333 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
print("Training accuracy: %.2f%% / Validation accuracy: %.2f%%" % (100*estimator.history['acc'][-1], 100*estimator.history['val_acc'][-1]))

Training accuracy: 58.40% / Validation accuracy: 43.24%


In [24]:
import matplotlib.pyplot as plt

# summarize history for accuracy
plt.plot(estimator.history['acc'])
plt.plot(estimator.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'valid'], loc='upper left')
plt.show()

# summarize history for loss
plt.plot(estimator.history['loss'])
plt.plot(estimator.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'valid'], loc='upper left')
plt.show()

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

In [25]:
y_pred = model.predict_proba(test_set)


In [26]:
submission = pd.DataFrame(y_pred)
submission['id'] = test_index
submission.columns = ['class1', 'class2', 'class3', 'class4', 'class5', 'class6', 'class7', 'class8', 'class9', 'id']
submission.to_csv("submission_all.csv",index=False)
submission.head()

Unnamed: 0,class1,class2,class3,class4,class5,class6,class7,class8,class9,id
0,0.000295,0.261268,0.002418,0.000438,0.0048,0.00887,0.721011,0.000435,0.000465,0
1,0.570053,0.005565,0.005816,0.366974,0.020208,0.022675,0.006223,0.000852,0.001634,1
2,0.146819,0.129763,0.034452,0.046771,0.085754,0.16532,0.367892,0.00985,0.013379,2
3,0.000796,0.436532,0.001154,0.000878,0.001296,0.001803,0.556915,0.000345,0.000282,3
4,0.17294,0.00196,0.003292,0.800294,0.006196,0.013352,0.001652,0.0001,0.000214,4
