In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from multiprocessing import Pool
from contextlib import closing

In [2]:
# Load the test-set variants; the first CSV column is used as the row index.
test_variants = pd.read_csv('data/test_variants', index_col=0)
print(test_variants.shape)  # printed explicitly: only the cell's last expression is displayed
test_variants.head()

(5668, 2)


Unnamed: 0_level_0,Gene,Variation
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,ACSL4,R570S
1,NAGLU,P521L
2,PAH,L333F
3,ING1,A148D
4,TMEM216,G77A


In [3]:
# Load the training-set variants; the first CSV column is used as the row index.
# Unlike the test set, this frame carries a third column, Class.
training_variants = pd.read_csv('data/training_variants', index_col=0)
print(training_variants.shape)  # printed explicitly: only the cell's last expression is displayed
training_variants.head()

(3321, 3)


Unnamed: 0_level_0,Gene,Variation,Class
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,FAM58A,Truncating Mutations,1
1,CBL,W802*,2
2,CBL,Q249E,2
3,CBL,N454D,3
4,CBL,L399V,4


In [4]:
# Stack train and test into one frame so that encodings built from it cover
# every Gene / Variation value occurring in either set. The last training
# column (Class) is dropped so both frames have the same columns.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; the result is the same.
df = pd.concat([training_variants.iloc[:, :-1], test_variants])
print(df.shape)
df.head()

(8989, 2)


Unnamed: 0_level_0,Gene,Variation
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,FAM58A,Truncating Mutations
1,CBL,W802*
2,CBL,Q249E
3,CBL,N454D
4,CBL,L399V


In [5]:
def get_unique_dict(column_):
    """Assign an integer code to every distinct value of ``column_``.

    The codes are the positions of the values in the sorted array returned
    by ``np.unique``, so they run from 0 to ``n_unique - 1``.

    Args:
        column_: array-like of values to encode.

    Returns:
        dict: mapping ``{value: code}``.
    """
    return {value: code for code, value in enumerate(np.unique(column_))}

In [6]:
# One value -> integer-code lookup per categorical column, built from the
# combined train + test frame.
gene_dict, variation_dict = (
    get_unique_dict(df[column]) for column in ('Gene', 'Variation')
)

In [7]:
# Replace the Gene / Variation strings with their integer codes, overwriting
# the columns of both frames in place.
# NOTE: this cell is not idempotent. Series.map returns NaN for values that are
# not keys of the dict, so running it a second time (when the columns already
# hold integer codes) turns every entry into NaN. Reload the CSVs before
# re-running it.
training_variants['Gene'] = training_variants['Gene'].map(gene_dict)
training_variants['Variation'] = training_variants['Variation'].map(variation_dict)
test_variants['Gene'] = test_variants['Gene'].map(gene_dict)
test_variants['Variation'] = test_variants['Variation'].map(variation_dict)

In [8]:
# Sanity check: Gene and Variation should now show integer codes instead of strings.
training_variants.head()

Unnamed: 0_level_0,Gene,Variation,Class
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,447,7654,1
1,216,8255,2
2,216,5191,2
3,216,4572,3
4,216,3958,4


In [9]:
# Persist the encoded frames; to_csv writes the index as the first column by default.
training_variants.to_csv('data/train_1.csv')
test_variants.to_csv('data/test_1.csv')

In [10]:
import random

In [11]:
# Scratch check of random.shuffle: it reorders the list in place. No seed is
# set, so the displayed order differs from run to run.
l = list(range(10))
random.shuffle(l)
l

[0, 8, 6, 2, 1, 4, 5, 9, 3, 7]

In [12]:
# Number of distinct Gene values across train + test.
len(gene_dict)

1507

In [13]:
# Number of distinct Variation values across train + test.
len(variation_dict)

8609

In [14]:
# The text files separate the ID from the text with a literal '||'. A
# multi-character separator is interpreted as a regular expression and requires
# the python parsing engine. The pattern is written as a raw string because
# '\|' is not a valid escape sequence in a normal string literal (Python warns
# about it).
# The header line contains no '||', so it is read as a single column name
# ("ID,Text"); the ID field of each row becomes the index and the frame is left
# with exactly one column holding the text.
train_text = pd.read_csv('data/training_text', sep=r'\|\|', engine='python')
test_text = pd.read_csv('data/test_text', sep=r'\|\|', engine='python')
train_text.head()

Unnamed: 0,"ID,Text"
0,Cyclin-dependent kinases (CDKs) regulate a var...
1,Abstract Background Non-small cell lung canc...
2,Abstract Background Non-small cell lung canc...
3,Recent evidence has demonstrated that acquired...
4,Oncogenic mutations in the monomeric Casitas B...


In [15]:
# Preview the test texts; same single-column layout as train_text.
test_text.head()

Unnamed: 0,"ID,Text"
0,2. This mutation resulted in a myeloproliferat...
1,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,Vascular endothelial growth factor receptor (V...
3,Inflammatory myofibroblastic tumor (IMT) is a ...
4,Abstract Retinoblastoma is a pediatric retina...


In [16]:
# Stack the training texts on top of the test texts (train rows first).
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; the result is the same.
df_text = pd.concat([train_text, test_text])
df_text.shape

(8989, 1)

In [17]:
import nltk
#nltk.download()
class MySentences(object):
    """Re-iterable stream of tokenized sentences.

    Every iteration walks all documents of all given collections, splits each
    document into sentences with ``nltk.sent_tokenize`` and yields each
    sentence as the token list produced by ``nltk.word_tokenize``.

    Args:
        arrays: One or more collections (e.g. numpy arrays) whose elements
            are documents.
    """
    def __init__(self, *arrays):
        self.arrays = arrays

    def __iter__(self):
        # Flatten the collections lazily, in the order they were passed in.
        all_documents = (doc for collection in self.arrays for doc in collection)
        for doc in all_documents:
            for sentence in nltk.sent_tokenize(doc):
                yield nltk.word_tokenize(sentence)

In [18]:
import os 
import gensim

def get_word2vec(sentences, location, size=140, window=10, min_count=5, workers=4):
    """Load a cached word2vec model from ``location``, or train and save one.

    Args:
        sentences: iterable of tokenized sentences (lists of tokens). gensim
            iterates over it more than once during training, so it must be
            restartable rather than a one-shot generator.
        location (str): Path to save/load word2vec.
        size (int): Dimensionality of the word vectors.
        window (int): Maximum distance between the current and predicted word.
        min_count (int): Words with a lower total frequency are ignored.
        workers (int): Number of worker threads used for training.

    Returns:
        gensim.models.Word2Vec: the loaded or newly trained model.

    Note:
        If ``location`` already exists the model is loaded as it is;
        ``sentences`` and all hyperparameters are ignored in that case.
    """
    if os.path.exists(location):
        print('Found {}'.format(location))
        # Word2Vec.load unpickles the file: only load models you created yourself.
        return gensim.models.Word2Vec.load(location)

    print('{} not found. training model'.format(location))
    # gensim 4 renamed the ``size`` keyword to ``vector_size`` and rejects the
    # old name; choose the keyword that matches the installed release.
    major_version = int(gensim.__version__.split('.')[0])
    size_keyword = 'vector_size' if major_version >= 4 else 'size'
    model = gensim.models.Word2Vec(
        sentences,
        window=window,
        min_count=min_count,
        workers=workers,
        **{size_keyword: size}
    )
    print('Model done training. Saving to disk')
    model.save(location)
    return model

Using TensorFlow backend.


In [20]:
# Load 'word2vec_model' if it exists on disk; otherwise train word2vec on the
# sentences of every document in df_text (train + test) and save it there.
w2vec = get_word2vec(MySentences(df_text.iloc[:, 0].values),'word2vec_model')

Found word2vec_model


In [21]:
# Load 'word2vec_model_big' if it exists on disk, otherwise train and save it.
# NOTE(review): this call passes the same sentences as the 'word2vec_model'
# call and differs only in the cache path, so a model trained from here would
# not be "bigger" than the small one. Presumably the cached file was produced
# with other settings - confirm and record them.
w2vec_big = get_word2vec(MySentences(df_text.iloc[:, 0].values),'word2vec_model_big')

Found word2vec_model_big


In [22]:
class MyTokenizer:
    """Stateless transformer that turns raw documents into flat token arrays.

    Exposes ``fit`` / ``transform`` / ``fit_transform``; nothing is learned
    from the data.
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X):
        """Tokenize every document in ``X``.

        Each document is split into sentences with ``nltk.sent_tokenize`` and
        each sentence into tokens with ``nltk.word_tokenize``; the tokens of
        all sentences of a document are concatenated in order.

        Args:
            X: iterable of documents (strings).

        Returns:
            np.ndarray: 1-D object array with one entry per document; each
            entry is a 1-D array holding that document's tokens.
        """
        token_arrays = []
        for document in X:
            tokens = []
            for sent in nltk.sent_tokenize(document):
                tokens.extend(nltk.word_tokenize(sent))
            token_arrays.append(np.array(tokens))

        # Documents have different token counts. np.array(list_of_arrays) on
        # such ragged input raises ValueError on NumPy >= 1.24, and silently
        # produced a 2-D array whenever all lengths happened to match. Filling
        # an object array element by element always gives one entry per document.
        transformed_X = np.empty(len(token_arrays), dtype=object)
        for index, tokens in enumerate(token_arrays):
            transformed_X[index] = tokens
        return transformed_X

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform(X)``; ``y`` is ignored."""
        return self.transform(X)

class MeanEmbeddingVectorizer(object):
    """Represent each document by the mean of its tokens' word vectors.

    Args:
        word2vec: trained model exposing its vectors as ``word2vec.wv``,
            which must support ``token in wv`` and ``wv[token]``.
    """
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        keyed_vectors = word2vec.wv
        # ``wv.syn0`` no longer exists in gensim 4; ``vector_size`` is the
        # supported way to read the dimensionality. Fall back to the raw
        # matrix for releases where ``vector_size`` is missing or unset.
        dim = getattr(keyed_vectors, 'vector_size', None)
        if not dim:
            dim = len(keyed_vectors.syn0[0])
        self.dim = dim

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X):
        """Embed every document in ``X``.

        Documents are tokenized with ``MyTokenizer``. Tokens missing from the
        word2vec vocabulary are skipped; a document with no known token is
        mapped to a zero vector of length ``self.dim``.

        Args:
            X: iterable of documents (strings).

        Returns:
            np.ndarray: one row per document with the averaged word vector.
        """
        tokenized = MyTokenizer().fit_transform(X)
        vectors = self.word2vec.wv

        embedded = []
        for words in tokenized:
            known = [vectors[w] for w in words if w in vectors]
            if not known:
                known = [np.zeros(self.dim)]
            embedded.append(np.mean(known, axis=0))
        return np.array(embedded)

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform(X)``; ``y`` is ignored."""
        return self.transform(X)

In [None]:
# Embed every document of df_text (train rows first, then test) as the mean
# of its word vectors from the small word2vec model.
mean_embedding_vectorizer = MeanEmbeddingVectorizer(w2vec)
mean_embedded = mean_embedding_vectorizer.fit_transform(df_text.iloc[:, 0].values)

In [28]:
# Save the small-model embeddings: one row per document, in df_text order.
pd.DataFrame(mean_embedded).to_csv('data/word2vec_small_1.csv')

In [29]:
# Same embedding step as for the small model, using the vectors of w2vec_big.
mean_embedding_vectorizer_big = MeanEmbeddingVectorizer(w2vec_big)
mean_embedded_big = mean_embedding_vectorizer_big.fit_transform(df_text.iloc[:, 0].values)

In [30]:
# Save the big-model embeddings: one row per document, in df_text order.
pd.DataFrame(mean_embedded_big).to_csv('data/word2vec_big_1.csv')