In [1]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
from itertools import cycle, islice
from operator import itemgetter
import sif_embedding_wrapper
import pandas as pd
import numpy as np
import itertools
import codecs
import utils
import os

In [3]:
from gensim.models.keyedvectors import KeyedVectors

# model = KeyedVectors.load_word2vec_format('/home/stirunag/models/model_OTAR_200d-3mc-10it.bin', binary=True)
# model.save_word2vec_format('/home/stirunag/pre-trained_word_embeddings/model_OTAR_200d-3mc-10it.txt', binary=False)

# sed -i -n -e '2,$p'file.txt to remove first line from the txt file


# Files handed to the SIF wrapper: the word-vector text file and a second file
# (TF.txt -- presumably per-word frequencies used for the SIF weights; TODO confirm).
embedding_path = "/home/stirunag/pre-trained_word_embeddings/model_OTAR_200d-3mc-10it.txt"
tf_txt_path = '/home/stirunag/pre-trained_word_embeddings/OTAR/TF.txt'

# The three returned objects are passed unchanged to sentences2vecs in later cells.
words, embs, weight4ind = sif_embedding_wrapper.load_embeddings(embedding_path, tf_txt_path)


# words, embs, weight4ind = sif_embedding_wrapper.load_embeddings("/home/stirunag/pre-trained_word_embeddings/PubMed-and-PMC-FS.txt", 
#                                                      '/home/stirunag/pre-trained_word_embeddings/wiki/enwiki_vocab_min200.txt')

# words, embs, weight4ind = sif_embedding_wrapper.load_embeddings("/home/stirunag/pre-trained_word_embeddings/glove/glove.6B.300d.txt", 
#                                                       '/home/stirunag/pre-trained_word_embeddings/wiki/enwiki_vocab_min200.txt')

In [4]:
# Build the path to ../Datasets.
# NOTE: the quoted string '__file__' (not the module variable) is passed to dirname,
# which therefore returns '' -- the path is resolved against the current working directory.
data_path = os.path.abspath(os.path.join(os.path.dirname( '__file__' ), '..', 'Datasets'))+'/'

# Although the dataset files are named .csv, they are tab delimited. In addition, they
# have severe encoding problems, so they are opened through `codecs` before parsing, e.g.
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfc in position 2: invalid start byte
#
# The files are opened with mode 'r' (the former 'rU' flag is rejected by Python 3.11+)
# inside `with` blocks so every handle is closed as soon as pandas has read it.
euadr_dir = data_path+'EUADR_Corpus_IBIgroup/'

# drug-target sentences -> labelled 'other'
with codecs.open(euadr_dir+'EUADR_drug_target'+'.csv', 'r', 'UTF-8') as doc_d_t:
    EUADR_drug_target = pd.read_csv(doc_d_t, sep='\t', na_filter = False)
EUADR_drug_target['CLASS'] = 'other'

# target-disease sentences -> labelled 'gene_disease'; undecodable bytes are dropped
with codecs.open(euadr_dir+'EUADR_target_disease'+'.csv', 'r', 'UTF-8', errors='ignore') as doc_t_d:
    EUADR_target_disease = pd.read_csv(doc_t_d, sep='\t', na_filter = False)
EUADR_target_disease['CLASS'] = 'gene_disease'

# drug-disease sentences -> labelled 'other'
with codecs.open(euadr_dir+'EUADR_drug_disease'+'.csv', 'r', 'UTF-8') as doc_d_d:
    EUADR_drug_disease = pd.read_csv(doc_d_d, sep='\t', na_filter = False)
EUADR_drug_disease['CLASS'] = 'other'



In [5]:
# Stack the three EU-ADR frames into one training frame.
# pd.concat replaces the chained DataFrame.append calls (append was removed in pandas 2.0).
# reset_index() is called without drop=True on purpose: it keeps the old index as an
# 'index' column, exactly as before, so later positional column selections are unchanged.
df = pd.concat([EUADR_drug_target, EUADR_target_disease, EUADR_drug_disease]).reset_index()

# One embedding per sentence, stored row-aligned in the 'vector' column.
doc_embeddings = sif_embedding_wrapper.sentences2vecs(df["SENTENCE"], embs, words, weight4ind)
df["vector"] = pd.Series(list(doc_embeddings), index=df.index)

In [6]:
# Map every row label of df to its ground-truth class.
ground_truth = dict(zip(df.index, df['CLASS']))

# Distinct class labels, in order of first appearance.
categories = list(df["CLASS"].unique())
categories

['other', 'gene_disease']

In [7]:
# Use LDA to get the topics and assign them to classes, in order to find the
# top-performing sentences.

# --- hyper-parameters -------------------------------------------------------
min_text_length = 80
max_iter = 150
batch_size = 512
learning_offset = 300.
n_topics = len(categories)  # one topic per class label

docs = df

# Sentences shorter than min_text_length characters are excluded from topic modelling.
too_short = docs["SENTENCE"].map(len) < min_text_length
unclassifiable = list(docs[too_short].index)
filtered = docs[~docs.index.isin(unclassifiable)]
ids = list(filtered.index)[0:10]

# --- features ---------------------------------------------------------------
# Note: the vectorizer is a TfidfVectorizer, so LDA is fitted on TF-IDF weights.
n_features = 50000
tf_vectorizer = TfidfVectorizer(stop_words='english',
                                max_df=0.95,
                                min_df=0.1,
                                max_features=n_features)
tf = tf_vectorizer.fit_transform(filtered['SENTENCE'].tolist())

# --- topic model ------------------------------------------------------------
lda = LatentDirichletAllocation(n_components=n_topics,
                                max_iter=max_iter,
                                batch_size=batch_size,
                                learning_method='online',
                                learning_offset=learning_offset,
                                random_state=0)

# fit() returns the estimator itself, so the per-document topic distribution
# can be taken straight from the fitted model.
doc_topics = lda.fit(tf).transform(tf)

In [8]:
# One bucket per LDA topic, keyed "topic_<n>".
topic_leaders = {"topic_{}".format(i): [] for i in range(n_topics)}

# Rows of doc_topics are in the same order as filtered.index, so the two are walked together.
# Each document is filed under its most probable topic together with that probability.
for doc_id, probs in zip(filtered.index, doc_topics):
    topic = np.argmax(probs)
    score = max(probs)
    topic_leaders["topic_{}".format(topic)].append({"doc_id": doc_id, "score": score})

# Within every topic, order the documents from highest to lowest score.
for leaders in topic_leaders.values():
    leaders.sort(key=itemgetter('score'), reverse=True)
        
    

# topic_leaders

In [9]:
# Group the topic leaders by their ground-truth class and keep, as "selected",
# only those whose dominant-topic score is strictly greater than 0.66.

sentences = {c: [] for c in categories}             # every doc id, per class
selected_sentences = {c: [] for c in categories}    # doc ids with score > 0.66, per class
sentences_with_score = {c: [] for c in categories}  # the scores, parallel to `sentences`

for topic_docs in topic_leaders.values():
    for entry in topic_docs:
        entry_id, entry_score = entry['doc_id'], entry['score']
        gt = ground_truth[entry_id]
        sentences[gt].append(entry_id)
        sentences_with_score[gt].append(entry_score)
        if entry_score > 0.66:
            selected_sentences[gt].append(entry_id)
    
# selected_sentences

In [10]:
# Represent each class by the element-wise mean of the sentence vectors
# of its selected sentences.
category_vecs = {
    c: np.asarray(df.loc[df.index.isin(selected_sentences[c]), 'vector'].tolist()).mean(axis=0)
    for c in categories
}

category_vecs

{'gene_disease': array([  2.26870566e-01,   1.59295737e-01,   1.67848273e-01,
          1.68088293e-01,   3.19782366e-01,  -2.61122644e-01,
         -4.10503985e-01,   7.45768469e-02,   4.75212000e-01,
          4.99350315e-04,   2.83911673e-02,   2.02528075e-01,
         -3.10628164e-01,   4.83273800e-01,  -8.22246937e-03,
         -1.17849019e-01,   2.77768774e-01,   3.05515428e-01,
          6.31747180e-01,   3.05047933e-01,  -1.56340096e-01,
         -1.37995032e-01,  -2.65361064e-01,  -1.87110943e-01,
         -8.90708888e-02,   4.54369474e-01,   1.11236949e-01,
          1.96965794e-01,   6.18862159e-01,   2.33794377e-01,
          8.15756975e-01,   3.87161307e-01,   5.33904477e-02,
         -1.00010579e-01,  -8.28008654e-02,  -2.30715544e-01,
         -2.39405975e-01,  -6.26783999e-01,   9.47008441e-02,
         -1.43448400e-01,   1.89181505e-02,  -2.56588831e-01,
         -1.66810646e-01,  -1.57205745e-01,  -3.76705362e-01,
         -1.91883665e-01,  -2.08621651e-01,  -1.164049

In [11]:
# Predict a label for each selected sentence by cosine similarity to the class vectors.
# Note: these are the same sentences that were averaged into category_vecs, so the
# score computed from these predictions is a training-set figure.

predictions = {}

selected_idx = [j for i in selected_sentences.values() for j in i]
# Set copy for O(1) membership checks; scanning the list for every row was quadratic.
selected_lookup = set(selected_idx)

# The class vectors do not change inside the loop, so reshape them once.
centroids = {label: vec.reshape(1, -1) for label, vec in category_vecs.items()}

for idx, sentence_vec in zip(df.index, df["vector"]):
    if idx not in selected_lookup:
        continue
    query = sentence_vec.reshape(1, -1)
    # Default to 'other' unless some class has a strictly positive similarity.
    max_sim = 0
    winner = 'other'
    for label, centroid in centroids.items():
        sim = cosine_similarity(query, centroid).flatten()[0]
        if sim > max_sim:
            max_sim = sim
            winner = label
    predictions[idx] = winner
    

In [12]:
def get_accuracy_score(predictions, truth_dict):
    """Score predictions against ground truth and list the mismatches.

    Despite the name, the score returned is the weighted F1 score, not accuracy.

    Parameters
    ----------
    predictions : dict
        Maps an item key to its predicted label.
    truth_dict : dict
        Maps an item key to its true label; must contain every key of
        ``predictions`` (a missing key raises ``KeyError``).

    Returns
    -------
    tuple
        ``(f1, mis_classified, mis_pred)`` where ``f1`` is
        ``f1_score(labels, preds, average='weighted')``, ``mis_classified`` is the
        list of keys whose prediction differs from the truth, and ``mis_pred`` is
        the list of those wrong predictions converted to ``str``, in the same order.
    """
    # (key, predicted, expected) triples in the iteration order of `predictions`
    triples = [(k, v, truth_dict[k]) for k, v in predictions.items()]

    preds = [pred for _, pred, _ in triples]
    labels = [truth for _, _, truth in triples]

    wrong = [(k, pred) for k, pred, truth in triples if pred != truth]
    mis_classified = [k for k, _ in wrong]
    mis_pred = [str(pred) for _, pred in wrong]

    return f1_score(labels, preds, average='weighted'), mis_classified, mis_pred


# Score the predictions made for the selected training sentences.
# Despite its name, `miss_classified_df` is not a DataFrame: it is the list of keys
# (df index labels) that get_accuracy_score reports as misclassified, and `miss_pred`
# holds the corresponding wrong predictions as strings.
score, miss_classified_df, miss_pred = get_accuracy_score(predictions, ground_truth)
score

0.87189278783481672

In [13]:
# Do not truncate long cell contents (the sentences) when frames are displayed.
# Newer pandas expects None for "no limit"; the former -1 is deprecated/rejected there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas releases only accept an integer; -1 is their "no limit" value.
    pd.set_option('display.max_colwidth', -1)

# Rows are picked by the misclassified index labels, which equal the row positions
# because df was re-indexed after concatenation. Columns 12 and 13 are picked by
# position -- presumably SENTENCE and CLASS; TODO confirm against the EU-ADR layout.
# .copy() makes the slice an independent frame so adding a column below is unambiguous.
miss_calssified_df = df.iloc[miss_classified_df , [12,13]].copy()
miss_calssified_df['Predicted-CLASS'] = miss_pred
# miss_calssified_df

# ../Results relative to the working directory (dirname of the literal '__file__' is '').
result_path = os.path.abspath(os.path.join(os.path.dirname( '__file__' ), '..', 'Results'))+'/'
# Create the output directory if needed so to_csv does not fail on a fresh checkout.
os.makedirs(result_path, exist_ok=True)
miss_calssified_df.to_csv(result_path+'miss_predictions_train.csv')

In [14]:
# Generalisation on the Unseen Dataset GAD

# Tab-delimited file opened through codecs; undecodable bytes are dropped.
# Mode 'r' replaces 'rU' (rejected by Python 3.11+) and the `with` block closes the handle.
with codecs.open(data_path+'GAD_Corpus_IBIgroup/'+'GAD_Y_N'+'.csv', 'r', 'UTF-8', errors='ignore') as doc_t_d:
    GAD_target_disease = pd.read_csv(doc_t_d, sep='\t', na_filter = False)

# NOTE(review): every row is labelled 'gene_disease' regardless of any Y/N field the
# file name hints at -- confirm this is intended.
GAD_target_disease['CLASS'] = 'gene_disease'


# GAD_target_disease.head(10)
# One embedding per GAD_CONCLUSION text, stored row-aligned in the 'vector' column.
doc_embeddings = sif_embedding_wrapper.sentences2vecs(GAD_target_disease["GAD_CONCLUSION"], embs, words, weight4ind)
GAD_target_disease["vector"] = pd.Series(list(doc_embeddings))

In [16]:
# Ground truth: row label -> class, for every GAD row.
GAD_test_ground_truth = dict(zip(GAD_target_disease.index, GAD_target_disease['CLASS']))

# Predict the label of each GAD sentence: the class whose vector has the highest
# strictly positive cosine similarity wins; otherwise the label stays 'other'.
GAD_test_predictions = {}

centroids = {label: vec.reshape(1, -1) for label, vec in category_vecs.items()}

for idx, sentence_vec in zip(GAD_target_disease.index, GAD_target_disease["vector"]):
    query = sentence_vec.reshape(1, -1)
    winner, max_sim = 'other', 0.00
    for label, centroid in centroids.items():
        sim = cosine_similarity(query, centroid)[0, 0]
        if sim > max_sim:
            winner, max_sim = label, sim
    GAD_test_predictions[idx] = winner


GAD_score, GAD_miss_classified, GAD_miss_pred = get_accuracy_score(GAD_test_predictions, GAD_test_ground_truth)
GAD_score

  'recall', 'true', average, warn_for)


0.99081246622230235

In [None]:
# Misclassified GAD rows. Columns 10 and 11 are picked by position -- presumably the
# sentence text and CLASS; TODO confirm against the GAD column layout.
# .copy() makes the slice an independent frame, so adding the column below does not
# trigger pandas' SettingWithCopy warning.
miss_calssified_GAD_test = GAD_target_disease.iloc[GAD_miss_classified , [10,11]].copy()
miss_calssified_GAD_test['Predicted-CLASS'] = GAD_miss_pred
# miss_calssified_df_test
miss_calssified_GAD_test.to_csv(result_path+'miss_predictions_test.csv')

In [21]:
# Test new sentence

test_sample = 'This study assessed associations between the CYP4F2 gene and myocardial infarction (MI), using a haplotype-based case-control study of 234 MI patients and 248 controls genotyped for 5 single-nucleotide polymorphisms (rs3093105, rs3093135, rs1558139, rs2108622, rs3093200).'
# test_sample = 'Assessment of 1177 human immunodeficiency virus (HIV) resistance genotypes at an HIV/AIDS clinic showed a decrease in the incidence of the K65R mutation, from 15.2% of isolates during the period 2002-2004 to 2.7% of isolates during the period 2005-2006 (P < .001), despite elevated and stable rates of tenofovir use.'
# test_sample = 'Doxorubicin-induced DNA damage was also specifically abolished by the proteasome inhibitors bortezomib and MG132 and much reduced in top2beta(-/-) mouse embryonic fibroblasts (MEF) compared with TOP2beta(+/+) MEFs, suggesting the involvement of proteasome and DNA topoisomerase IIbeta (Top2beta).'
# test_sample = 'SLC9A6 at Xq26.3 (Gilfillan et al., 2008)X-linked mental retardation'
# test_sample = 'DLBCL was identified by a microenvironment gene expression signature and is associated with increased expression of inflammatory mediators, such as multiple components of the T-cell receptor (TCR), molecules associated with T/NK-cell activation and the complement cascade, downstream targets of IFNγ'

# Embed the single sentence and flatten it to one row for cosine_similarity.
test_embedding = sif_embedding_wrapper.sentences2vecs([test_sample], embs, words, weight4ind)
query = test_embedding.reshape(1, -1)

# Cosine similarity of the sample to each class vector.
sim = {
    label: cosine_similarity(query, centroid.reshape(1, -1)).flatten()[0]
    for label, centroid in category_vecs.items()
}

sim
    

{'gene_disease': 0.38856808612462623, 'other': -0.11055105143789226}

In [22]:
# Testing on the 30 papers dataset


import json

with open(data_path+'EBI Standard/'+'rel_data_bronze.json') as json_file:
    data = json.load(json_file)

# Flatten the JSON: one record per sentence entry found under each top-level key.
EBI_standard = [
    {'SENTENCE': each_sent['sent'], 'CLASS': each_sent['rel']}
    for each_point in data
    for each_sent in data[each_point]
]

EBI_standard_temp = pd.DataFrame(EBI_standard)

# Rewrite the relation codes into this notebook's two labels. The substitutions are
# plain substring replacements applied in this order.
label_rewrites = (('YGD', 'gene_disease'), ('NGD', 'other'), ('AMB', 'other'))
relabelled = []
for raw_label in EBI_standard_temp['CLASS']:
    for old, new in label_rewrites:
        raw_label = raw_label.replace(old, new)
    relabelled.append(raw_label)
EBI_standard_temp['CLASS'] = relabelled

EBI_standard_temp.to_csv(data_path+'EBI_bronze_standard.csv')

In [24]:
# One embedding per EBI sentence, stored row-aligned in the 'vector' column.
doc_embeddings = sif_embedding_wrapper.sentences2vecs(EBI_standard_temp["SENTENCE"], embs, words, weight4ind)
EBI_standard_temp["vector"] = pd.Series(list(doc_embeddings))


# Ground truth: row label -> class.
EBI_test_ground_truth = dict(zip(EBI_standard_temp.index, EBI_standard_temp['CLASS']))

# Predict the label of each EBI sentence: the class whose vector has the highest
# strictly positive cosine similarity wins; otherwise the label stays 'other'.
EBI_test_predictions = {}

centroids = {label: vec.reshape(1, -1) for label, vec in category_vecs.items()}

for idx, sentence_vec in zip(EBI_standard_temp.index, EBI_standard_temp["vector"]):
    query = sentence_vec.reshape(1, -1)
    winner, max_sim = 'other', 0.00
    for label, centroid in centroids.items():
        sim = cosine_similarity(query, centroid)[0, 0]
        if sim > max_sim:
            winner, max_sim = label, sim
    EBI_test_predictions[idx] = winner


EBI_score, EBI_miss_classified, EBI_miss_pred = get_accuracy_score(EBI_test_predictions, EBI_test_ground_truth)
EBI_score

0.70855977757182564

In [None]:
# Misclassified EBI rows, restricted to the first two columns (SENTENCE and CLASS,
# in whichever order the frame was built). .copy() makes the slice an independent
# frame, so adding the column below does not trigger pandas' SettingWithCopy warning.
miss_calssified_EBI_test = EBI_standard_temp.iloc[EBI_miss_classified , [0,1]].copy()
miss_calssified_EBI_test['Predicted-CLASS'] = EBI_miss_pred
miss_calssified_EBI_test

In [25]:
# Get PolySearch dataset
# The file is read with explicit column names (passed via names=).
ColNames = ['entity_1', 'entity_2', 'Association', 'PMID','Sentence']

# Mode 'r' replaces 'rU' (rejected by Python 3.11+) and the `with` block closes the handle.
with codecs.open(data_path+'PolySearch/'+'p1_disease_gene_testset.simple'+'.tsv', 'r', 'UTF-8') as Poly_doc_d_t:
    Poly_target_disease = pd.read_csv(Poly_doc_d_t, sep='\t', na_filter = False, names = ColNames)
# Every PolySearch row is labelled 'gene_disease'.
Poly_target_disease['CLASS'] = 'gene_disease'

# One embedding per sentence, stored row-aligned in the 'vector' column.
doc_embeddings = sif_embedding_wrapper.sentences2vecs(Poly_target_disease["Sentence"], embs, words, weight4ind)
Poly_target_disease["vector"] = pd.Series(list(doc_embeddings))

In [28]:
# Ground truth: row label -> class.
Poly_test_ground_truth = dict(zip(Poly_target_disease.index, Poly_target_disease['CLASS']))

# Predict the label of each PolySearch sentence: the class whose vector has the highest
# strictly positive cosine similarity wins; otherwise the label stays 'other'.
Poly_test_predictions = {}

centroids = {label: vec.reshape(1, -1) for label, vec in category_vecs.items()}

for idx, sentence_vec in zip(Poly_target_disease.index, Poly_target_disease["vector"]):
    query = sentence_vec.reshape(1, -1)
    winner, max_sim = 'other', 0.00
    for label, centroid in centroids.items():
        sim = cosine_similarity(query, centroid)[0, 0]
        if sim > max_sim:
            winner, max_sim = label, sim
    Poly_test_predictions[idx] = winner

In [29]:
def get_accuracy_score(predictions, truth_dict):
    """Score predictions against ground truth and list the mismatches.

    This repeats the helper of the same name defined earlier in the notebook;
    it is kept so that this cell can be run on its own. Despite the name, the
    score returned is the weighted F1 score, not accuracy.

    Parameters
    ----------
    predictions : dict
        Maps an item key to its predicted label.
    truth_dict : dict
        Maps an item key to its true label; must contain every key of
        ``predictions`` (a missing key raises ``KeyError``).

    Returns
    -------
    tuple
        ``(f1, mis_classified, mis_pred)`` where ``f1`` is
        ``f1_score(labels, preds, average='weighted')``, ``mis_classified`` is the
        list of keys whose prediction differs from the truth, and ``mis_pred`` is
        the list of those wrong predictions converted to ``str``, in the same order.
    """
    # (key, predicted, expected) triples in the iteration order of `predictions`
    triples = [(k, v, truth_dict[k]) for k, v in predictions.items()]

    preds = [pred for _, pred, _ in triples]
    labels = [truth for _, _, truth in triples]

    wrong = [(k, pred) for k, pred, truth in triples if pred != truth]
    mis_classified = [k for k, _ in wrong]
    mis_pred = [str(pred) for _, pred in wrong]

    return f1_score(labels, preds, average='weighted'), mis_classified, mis_pred


# Score the PolySearch predictions. The results are bound to Poly_-prefixed names,
# matching the GAD_/EBI_ naming used for the other test sets: the following cell reads
# Poly_miss_classified and Poly_miss_pred, and the training-set `score` / `miss_pred`
# are no longer overwritten by this cell.
Poly_score, Poly_miss_classified, Poly_miss_pred = get_accuracy_score(Poly_test_predictions, Poly_test_ground_truth)
Poly_score
# test_predictions

  'recall', 'true', average, warn_for)


0.82432432432432434

In [None]:
# Misclassified PolySearch rows with their text and true class.
# NOTE(review): expects Poly_miss_classified and Poly_miss_pred to have been produced by
# the PolySearch scoring step before this cell runs.
# .copy() makes the slice an independent frame, so adding the column below does not
# trigger pandas' SettingWithCopy warning.
miss_calssified_Poly_test = Poly_target_disease.loc[Poly_miss_classified,['Sentence', 'CLASS']].copy()
miss_calssified_Poly_test['Predicted-CLASS'] = Poly_miss_pred
miss_calssified_Poly_test