In [1]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
from itertools import cycle, islice
from operator import itemgetter
import sif_embedding_wrapper
import pandas as pd
import numpy as np
import itertools
import codecs
import utils
import os

In [2]:
# Conversion of the binary word2vec model to text format, kept for reference:
# from gensim.models.keyedvectors import KeyedVectors
#
# model = KeyedVectors.load_word2vec_format('/home/stirunag/pre-trained_word_embeddings/PubMed-and-PMC-w2v.bin', binary=True)
# model.save_word2vec_format('/home/stirunag/pre-trained_word_embeddings/PubMed-and-PMC-w2v.txt', binary=False)

# Root folder of the pre-trained embeddings. The original hard-coded location stays the
# default; set the EMBEDDINGS_DIR environment variable to run on another machine.
EMBEDDINGS_DIR = os.environ.get('EMBEDDINGS_DIR', '/home/stirunag/pre-trained_word_embeddings')

# First argument: embedding text file; second: vocabulary file.
# NOTE(review): presumably the vocabulary supplies the word frequencies for the SIF
# weights (weight4ind) - confirm in sif_embedding_wrapper.
words, embs, weight4ind = sif_embedding_wrapper.load_embeddings(
    os.path.join(EMBEDDINGS_DIR, 'PubMed-and-PMC-FS.txt'),
    os.path.join(EMBEDDINGS_DIR, 'wiki', 'enwiki_vocab_min200.txt'))

# Alternative embeddings (GloVe):
# words, embs, weight4ind = sif_embedding_wrapper.load_embeddings(
#     os.path.join(EMBEDDINGS_DIR, 'glove', 'glove.6B.300d.txt'),
#     os.path.join(EMBEDDINGS_DIR, 'wiki', 'enwiki_vocab_min200.txt'))

In [3]:
# Resolve the Datasets folder. '__file__' is a string literal here (not the module
# attribute), so dirname() returns '' and the result is '../Datasets' relative to the
# current working directory.
data_path = os.path.abspath(os.path.join(os.path.dirname( '__file__' ), '..', 'Datasets'))+'/'


def _read_euadr(name, relation, errors='strict'):
    """Load one EUADR corpus file and tag every row with its relation class.

    Although the files are named .csv they are tab-delimited, and they have encoding
    problems (e.g. "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfc in
    position 2: invalid start byte"), so they are decoded through ``codecs`` first.

    Parameters
    ----------
    name : str
        File name without extension, inside ``EUADR_Corpus_IBIgroup/``.
    relation : str
        Value written to the new ``CLASS`` column.
    errors : str, default 'strict'
        Decoding error policy passed to ``codecs.open``.

    Returns
    -------
    pandas.DataFrame
        The parsed file (``na_filter=False``, so empty cells stay empty strings)
        with the extra ``CLASS`` column.
    """
    path = data_path + 'EUADR_Corpus_IBIgroup/' + name + '.csv'
    # Mode 'r' instead of 'rU': the 'U' flag was removed in Python 3.11 and makes
    # open() raise ValueError there. The context manager closes the handle, which
    # the previous code never did.
    with codecs.open(path, 'r', 'UTF-8', errors=errors) as handle:
        frame = pd.read_csv(handle, sep='\t', na_filter=False)
    frame['CLASS'] = relation
    return frame


EUADR_drug_target = _read_euadr('EUADR_drug_target', 'drug_gene')

# Only this file is read with errors='ignore' (undecodable bytes are dropped).
# NOTE(review): presumably this is the file that triggered the UnicodeDecodeError - confirm.
EUADR_target_disease = _read_euadr('EUADR_target_disease', 'gene_disease', errors='ignore')

EUADR_drug_disease = _read_euadr('EUADR_drug_disease', 'drug_disease')



In [4]:
# Stack the three labelled corpora into one frame. pd.concat replaces the chained
# DataFrame.append calls (append was deprecated in pandas 1.4 and removed in 2.0).
# reset_index() without drop=True keeps the old per-file row labels in an 'index'
# column, exactly as the previous in-place call did.
df = pd.concat([EUADR_drug_target, EUADR_target_disease, EUADR_drug_disease]).reset_index()

In [5]:
# Embed every sentence through the SIF wrapper and store one vector per row.
doc_embeddings = sif_embedding_wrapper.sentences2vecs(
    df["SENTENCE"], embs, words, weight4ind
)
# The Series gets a default 0..n-1 index, which lines up with df's reset index.
df["vector"] = pd.Series([vec for vec in doc_embeddings])


In [6]:
# Ground truth lookup: row index -> relation class.
ground_truth = df['CLASS'].to_dict()

# ground_truth

# inv_map = {}
# for k, v in ground_truth.items():
#     inv_map[v] = inv_map.get(v, [])
#     inv_map[v].append(k)
    
# inv_map  

In [7]:
# Distinct relation classes, in order of first appearance in df.
categories = df["CLASS"].unique().tolist()
categories

['drug_gene', 'gene_disease', 'drug_disease']

In [8]:
# Fit an LDA topic model (one topic per class) on the longer sentences. The per-document
# topic probabilities are used afterwards to find the top-performing sentences.

# --- hyper-parameters ---
min_text_length = 80      # sentences shorter than this many characters are left out
max_iter = 150
batch_size = 128
learning_offset = 300.
n_topics = len(categories)
n_features = 5000

# --- drop the sentences that are too short ---
docs = df
too_short = docs["SENTENCE"].map(len) < min_text_length
unclassifiable = list(docs[too_short].index)
filtered = docs[~docs.index.isin(unclassifiable)]
ids = list(filtered.index)[:10]

# --- TF-IDF features (the matrix is named `tf` but holds TF-IDF weights) ---
tf_vectorizer = TfidfVectorizer(
    stop_words='english', max_df=0.95, min_df=0.1, max_features=n_features)
tf = tf_vectorizer.fit_transform(filtered['SENTENCE'].tolist())

# --- topic model ---
lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=max_iter,
    batch_size=batch_size,
    learning_method='online',
    learning_offset=learning_offset,
    random_state=0)

# fit() returns the estimator itself, so fitting and transforming can be chained.
doc_topics = lda.fit(tf).transform(tf)

In [9]:
# For every topic, list the documents whose most probable topic it is, best score first.
topic_leaders = {"topic_{}".format(k): [] for k in range(n_topics)}

for position, probs in enumerate(doc_topics):
    best_topic = np.argmax(probs)
    topic_leaders["topic_{}".format(best_topic)].append(
        {"doc_id": filtered.index[position], "score": max(probs)})

# list.sort is stable, so documents with equal scores keep their original order.
for leaders in topic_leaders.values():
    leaders.sort(key=itemgetter('score'), reverse=True)

topic_leaders

{'topic_0': [{'doc_id': 260, 'score': 0.7238185053008797},
  {'doc_id': 261, 'score': 0.7238185053008797},
  {'doc_id': 273, 'score': 0.7238185053008797},
  {'doc_id': 277, 'score': 0.7238185053008797},
  {'doc_id': 278, 'score': 0.7238185053008797},
  {'doc_id': 322, 'score': 0.7238185053008797},
  {'doc_id': 325, 'score': 0.7238185053008797},
  {'doc_id': 326, 'score': 0.7238185053008797},
  {'doc_id': 327, 'score': 0.7238185053008797},
  {'doc_id': 328, 'score': 0.7238185053008797},
  {'doc_id': 346, 'score': 0.7238185053008797},
  {'doc_id': 347, 'score': 0.7238185053008797},
  {'doc_id': 359, 'score': 0.7238185053008797},
  {'doc_id': 362, 'score': 0.7238185053008797},
  {'doc_id': 367, 'score': 0.7238185053008797},
  {'doc_id': 471, 'score': 0.7238185053008797},
  {'doc_id': 472, 'score': 0.7238185053008797},
  {'doc_id': 503, 'score': 0.7238185053008797},
  {'doc_id': 558, 'score': 0.7238185053008797},
  {'doc_id': 570, 'score': 0.7238185053008797},
  {'doc_id': 226, 'score': 0.

In [10]:
# Keep, per class, the sentences whose dominant-topic score is above 0.66.
# Documents are bucketed by their ground-truth class, not by the LDA topic id.

sentences = {label: [] for label in categories}
selected_sentences = {label: [] for label in categories}
sentences_with_score = {label: [] for label in categories}

for leaders in topic_leaders.values():
    for entry in leaders:
        doc, doc_score = entry['doc_id'], entry['score']
        label = ground_truth[doc]
        sentences[label].append(doc)
        sentences_with_score[label].append(doc_score)
        if doc_score > 0.66:
            selected_sentences[label].append(doc)

selected_sentences

{'drug_gene': [226,
  227,
  228,
  235,
  32,
  33,
  34,
  38,
  39,
  205,
  206,
  210,
  216,
  219,
  0,
  2,
  3,
  4,
  5,
  8,
  9,
  10,
  11,
  12,
  134,
  135,
  147,
  167,
  181,
  184,
  189,
  192,
  194,
  221,
  242,
  244,
  246,
  7,
  13,
  14,
  15,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  52,
  67,
  71,
  72,
  74,
  75,
  85,
  101,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  119,
  120,
  123,
  124,
  127,
  136,
  161,
  165,
  168,
  169,
  170,
  171,
  172,
  213,
  218,
  222],
 'gene_disease': [260,
  261,
  273,
  277,
  278,
  322,
  325,
  326,
  327,
  328,
  346,
  347,
  359,
  362,
  367,
  471,
  472,
  503,
  558,
  570,
  248,
  249,
  283,
  285,
  293,
  294,
  318,
  319,
  320,
  338,
  357,
  360,
  363,
  364,
  366,
  371,
  372,
  375,
  381,
  383,
  384,
  386,
  387,
  389,
  415,
  464,
  483,
  484,
  487,
  488,
  489,
  491,
  500,
  501,
  502,
  505,
  507,
  508,
  519,
  520,

In [51]:
# One centroid per category: the element-wise mean of its selected sentence vectors.
category_vecs = {}
for label in categories:
    chosen = df.index.isin(selected_sentences[label])
    stacked = np.asarray(df.loc[chosen, "vector"].tolist())
    category_vecs[label] = np.mean(stacked, axis=0)

category_vecs

{'drug_gene': array([-3.21358114e-02,  7.47584684e-02,  8.34615362e-02,  5.54770049e-02,
         1.22370782e-01,  6.47800698e-02,  1.48554523e-02, -4.29823855e-02,
        -5.56380577e-03, -1.21374275e-02,  1.48823383e-02, -7.69251622e-02,
         6.65993555e-02, -1.31682239e-03, -2.55387447e-02, -5.52268020e-02,
        -1.05548672e-01, -1.75949691e-01,  1.94938574e-02,  2.32216645e-02,
        -5.88295000e-02, -8.70603118e-02, -4.02607696e-02, -5.23781347e-02,
         3.46875427e-02, -6.53689495e-02,  3.71152185e-02, -6.13846829e-03,
        -2.33896747e-03, -7.99919579e-02,  2.72428694e-02, -1.00196057e-01,
        -1.26801660e-01, -1.43025047e-02,  6.74936439e-02, -9.01519123e-02,
        -3.06453595e-02,  3.62881264e-04, -8.61218259e-02, -9.98713983e-02,
         8.52346434e-03, -4.63008804e-02,  7.61586987e-02,  6.02405544e-02,
         1.49043512e-01, -4.74789195e-02, -1.12607301e-01, -8.31325829e-02,
         3.59362343e-02, -4.91476457e-02, -5.04195978e-02, -1.78867236e-02,

In [12]:
# Sanity check on the seed sentences: give each selected sentence the category whose
# centroid has the highest cosine similarity to its vector. These are the same sentences
# that were averaged into category_vecs, so this is not an evaluation on unseen data.

predictions = {}

selected_idx = [j for i in selected_sentences.values() for j in i]
# Set copy for O(1) membership tests; looking idx up in the list was O(n) per row.
selected_lookup = set(selected_idx)

for idx, row in df.iterrows():
    if idx not in selected_lookup:
        continue
    sentence_vec = row["vector"].reshape(1, -1)
    max_sim = 0
    winner = 'Unknown'  # kept when no centroid has a similarity above 0
    for j in category_vecs:
        sim = cosine_similarity(sentence_vec, category_vecs[j].reshape(1, -1)).flatten()[0]
        if sim > max_sim:
            max_sim = sim
            winner = j
    predictions[idx] = winner
    

In [74]:
def get_accuracy_score(predictions, truth_dict, verbose=True):
    """Score predictions against the ground truth and collect the mismatches.

    Despite the name, the returned score is the weighted F1
    (``f1_score(..., average='weighted')``), not accuracy.

    Parameters
    ----------
    predictions : dict
        Maps a sample key to its predicted label.
    truth_dict : dict
        Maps the same keys to the true label. Every key of ``predictions`` must
        be present; a missing key raises ``KeyError``.
    verbose : bool, default True
        When true, print one ``predicted--x--actual`` line per mismatch
        (the previous, unconditional behaviour).

    Returns
    -------
    tuple
        ``(weighted_f1, mis_classified, mis_pred)``: the score, the keys of the
        wrongly predicted samples, and their predicted labels as strings
        (both lists in the iteration order of ``predictions``).
    """
    preds = []
    labels = []
    mis_classified = []
    mis_pred = []

    for key, predicted in predictions.items():
        actual = truth_dict[key]
        preds.append(predicted)
        labels.append(actual)
        if predicted != actual:
            if verbose:
                print(str(predicted) + '--x--' + str(actual))
            mis_pred.append(str(predicted))
            mis_classified.append(key)

    return f1_score(labels, preds, average='weighted'), mis_classified, mis_pred


# NOTE: despite the `_df` suffix, miss_classified_df receives the list of misclassified
# row indices (second return value of get_accuracy_score), not a DataFrame.
score, miss_classified_df, miss_pred = get_accuracy_score(predictions, ground_truth)


drug_disease--x--drug_gene
drug_disease--x--drug_gene
drug_disease--x--drug_gene
drug_disease--x--drug_gene
drug_disease--x--drug_gene
drug_disease--x--drug_gene
drug_disease--x--drug_gene
drug_disease--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_disease--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_disease--x--gene_disease
drug_disease--x--gene_disease
drug_gene--x--ge

In [14]:
# Show the misclassified training sentences next to the label they were given.
# The scoring cell binds the list of misclassified indices to `miss_classified_df`;
# `miss_classified` (used here before) is not defined yet on a fresh top-to-bottom run.
# Columns are selected by name instead of by position ([12, 13]) so the cell does not
# depend on the column layout, and .copy() avoids adding a column to a slice of df.
miss_calssified_df = df.loc[miss_classified_df, ['SENTENCE', 'CLASS']].copy()
miss_calssified_df['Predicted-CLASS'] = miss_pred
miss_calssified_df

Unnamed: 0,SENTENCE,CLASS,Predicted-CLASS
11,The expression of ABCG2 may explain in part th...,drug_gene,drug_disease
14,Oral administration of GSK221149A (5 mg/kg) wa...,drug_gene,drug_disease
15,Intravenous administration of GSK221149A produ...,drug_gene,drug_disease
189,Decreased plasma soluble RAGE in patients with...,drug_gene,drug_disease
242,A continuous epidural infusion of ropivacaine ...,drug_gene,drug_disease
244,Patients received 10 mg debrisoquine (a marker...,drug_gene,drug_disease
246,A continuous epidural infusion of ropivacaine ...,drug_gene,drug_disease
264,The prognostic value of the T393C SNP was eval...,gene_disease,drug_disease
265,"However, the majority of colon cancer cells ha...",gene_disease,drug_gene
266,Lysophosphatidic acid facilitates proliferatio...,gene_disease,drug_gene


In [15]:
# '../Results' relative to the current working directory ('__file__' is a string literal).
result_path = os.path.abspath(os.path.join(os.path.dirname( '__file__' ), '..', 'Results'))+'/'
# Create the folder on first use; to_csv does not create missing directories.
os.makedirs(result_path, exist_ok=True)
miss_calssified_df.to_csv(result_path+'miss_predictions_train.csv')

In [16]:
# Generalisation on the unseen GAD dataset

# Mode 'r' instead of 'rU': the 'U' flag was removed in Python 3.11 and makes open()
# raise ValueError there. The context manager also closes the handle after parsing.
# errors='ignore' drops bytes that are not valid UTF-8.
with codecs.open(data_path+'GAD_Corpus_IBIgroup/'+'GAD_Y_N'+'.csv', 'r', 'UTF-8', errors='ignore') as doc_t_d:
    GAD_target_disease = pd.read_csv(doc_t_d, sep='\t', na_filter = False)

# Every row gets the same class label.
GAD_target_disease['CLASS'] = 'gene_disease'


GAD_target_disease


Unnamed: 0,GAD_ID,GAD_ASSOC,GAD_GENE_SYMBOL,GAD_GENE_NAME,GAD_ENTREZ_ID,NER_GENE_ENTITY,NER_GENE_OFFSET,GAD_DISEASE_NAME,NER_DISEASE_ENTITY,NER_DISEASE_OFFSET,GAD_CONCLUSION,CLASS
0,125111,N,HIF1A,"Hypoxia-inducible factor 1, alpha subunit (bas...",3091,HIF-1alpha,54#64,colorectal cancer,colorectal carcinoma,113#133,These results suggest that the C1772T polymorp...,gene_disease
1,125062,N,HFE,Hemochromatosis,3077,HFE,139#142,iron overload,iron overload,16#29,"In our setting, iron overload among alcoholic ...",gene_disease
2,125055,Y,HFE,Hemochromatosis,3077,hemochromatosis,73#88,cirrhosis,cirrhosis,35#44,MPO genotype GG is associated with cirrhosis i...,gene_disease
3,125019,N,HFE,Hemochromatosis,3077,HFE,86#89,Cardiovascular Disease,cardiovascular disease,141#163,These three studies do not provide consistent ...,gene_disease
4,125015,Y,HFE,Hemochromatosis,3077,HFE,63#66,Coronary Heart Disease,CHD,110#113,Our prospective findings suggest that individu...,gene_disease
5,125009,N,HEXB,Hexosaminidase B (beta polypeptide),3074,HEXB,225#229,Sandhoff disease,Sandhoff disease,107#123,We conclude that homozygosity for the G1514--...,gene_disease
6,124975,Y,HCRTR2,Hypocretin (orexin) receptor 2,3062,HCRTR2,26#32,cluster headache,cluster headache,93#109,The data suggest that the HCRTR2 gene or a lin...,gene_disease
7,124967,Y,HCCA2,HCCA2 protein,81532,HCCA2,15#20,liver cancer,liver cancer,79#91,The novel gene HCCA2 may be related with the i...,gene_disease
8,124920,Y,HAVCR1,Hepatitis A virus cellular receptor 1,26762,TIM-1,54#59,asthma,asthma,97#103,Our findings suggest that the genetic variants...,gene_disease
9,124896,Y,GYS1,Glycogen synthase 1 (muscle),2997,GYS1,41#45,"diabetes, type 2",type 2 diabetes,96#111,"In conclusion, the M416V polymorphism of GYS1 ...",gene_disease


In [17]:
# Embed the GAD conclusion sentences with the same embeddings and weights as the training data.
doc_embeddings = sif_embedding_wrapper.sentences2vecs(
    GAD_target_disease["GAD_CONCLUSION"], embs, words, weight4ind
)
GAD_target_disease["vector"] = pd.Series([vec for vec in doc_embeddings])

In [18]:
# Ground truth for the GAD sentences: row index -> class.
test_ground_truth = GAD_target_disease['CLASS'].to_dict()

# Label every GAD sentence with the category of the most similar centroid; a sentence
# whose best cosine similarity does not exceed 0.60 is labelled 'unknown'.

test_predictions = {}

for idx, row in GAD_target_disease.iterrows():
    sentence_vec = row["vector"].reshape(1, -1)
    max_sim, winner = 0.60, 'unknown'
    for j, centroid in category_vecs.items():
        sim = cosine_similarity(sentence_vec, centroid.reshape(1, -1)).flatten()[0]
        if sim > max_sim:
            max_sim, winner = sim, j
    test_predictions[idx] = winner

In [19]:
# Display the predicted label of every GAD row (keyed by row index).
test_predictions

{0: 'gene_disease',
 1: 'gene_disease',
 2: 'gene_disease',
 3: 'gene_disease',
 4: 'gene_disease',
 5: 'gene_disease',
 6: 'gene_disease',
 7: 'drug_gene',
 8: 'gene_disease',
 9: 'gene_disease',
 10: 'unknown',
 11: 'gene_disease',
 12: 'gene_disease',
 13: 'gene_disease',
 14: 'gene_disease',
 15: 'gene_disease',
 16: 'gene_disease',
 17: 'gene_disease',
 18: 'gene_disease',
 19: 'gene_disease',
 20: 'gene_disease',
 21: 'gene_disease',
 22: 'gene_disease',
 23: 'gene_disease',
 24: 'gene_disease',
 25: 'gene_disease',
 26: 'gene_disease',
 27: 'gene_disease',
 28: 'gene_disease',
 29: 'gene_disease',
 30: 'gene_disease',
 31: 'gene_disease',
 32: 'gene_disease',
 33: 'gene_disease',
 34: 'gene_disease',
 35: 'gene_disease',
 36: 'gene_disease',
 37: 'gene_disease',
 38: 'gene_disease',
 39: 'unknown',
 40: 'gene_disease',
 41: 'gene_disease',
 42: 'gene_disease',
 43: 'gene_disease',
 44: 'gene_disease',
 45: 'gene_disease',
 46: 'gene_disease',
 47: 'gene_disease',
 48: 'gene_dise

In [20]:
def get_accuracy_score(predictions, truth_dict):
    """Return the weighted F1 of ``predictions`` plus the misclassified samples.

    This redefines the function of the same name from an earlier cell; this
    version does not print the mismatches.

    Parameters
    ----------
    predictions : dict
        Sample key -> predicted label.
    truth_dict : dict
        Sample key -> true label; must contain every key of ``predictions``.

    Returns
    -------
    tuple
        ``(weighted_f1, mis_classified, mis_pred)`` where ``mis_classified`` holds
        the keys predicted wrongly and ``mis_pred`` the matching predicted labels
        converted to ``str``.
    """
    keys = list(predictions)
    preds = [predictions[key] for key in keys]
    labels = [truth_dict[key] for key in keys]

    # (key, predicted label) for every sample whose prediction differs from the truth.
    wrong = [(key, str(guess))
             for key, guess, actual in zip(keys, preds, labels)
             if guess != actual]
    mis_classified = [key for key, _ in wrong]
    mis_pred = [guess for _, guess in wrong]

    return f1_score(labels, preds, average='weighted'), mis_classified, mis_pred


# Weighted F1 on the GAD sentences, the misclassified row indices and their predicted labels.
# NOTE: this rebinds `score` and `miss_pred`, which held the training-set results before.
score, miss_classified, miss_pred = get_accuracy_score(test_predictions, test_ground_truth)
score

  'recall', 'true', average, warn_for)


0.9487708763370238

In [21]:
# Show the misclassified GAD sentences next to the label they were given.
# Columns are selected by name instead of by position ([10, 11]) so the cell does not
# depend on the column layout; .copy() avoids adding a column to a slice of the frame.
miss_calssified_df_test = GAD_target_disease.loc[miss_classified, ['GAD_CONCLUSION', 'CLASS']].copy()
miss_calssified_df_test['Predicted-CLASS'] = miss_pred
miss_calssified_df_test

Unnamed: 0,GAD_CONCLUSION,CLASS,Predicted-CLASS
7,The novel gene HCCA2 may be related with the i...,gene_disease,drug_gene
10,"We found no evidence that mutation in GUCA1B,G...",gene_disease,unknown
39,"Because our samples provided quite high power,...",gene_disease,unknown
52,We conclude that GRIK2 does not play a major ...,gene_disease,unknown
53,We conclude that GRIK2 does not play a major ...,gene_disease,unknown
54,We conclude that GRIK1 does not play a major ...,gene_disease,unknown
55,We conclude that GRIK1 does not play a major ...,gene_disease,unknown
60,Determination of VNTR of the GPIba gene may pr...,gene_disease,unknown
62,"We found no evidence that mutation in GUCA1B,G...",gene_disease,unknown
81,Our results suggest that GAD2 does not play a ...,gene_disease,unknown


In [22]:
# Save the misclassified GAD sentences to the Results folder (result_path).
miss_calssified_df_test.to_csv(result_path+'miss_predictions_test.csv')

In [76]:
#Final model

# Pool the EUADR and GAD sentences, leaving out the rows listed as misclassified in the
# two evaluation steps. Rows and columns are selected in a single .loc call (no chained
# indexing) and the rename returns a new frame instead of mutating a slice in place.
final_data_1 = df.loc[~df.index.isin(miss_classified_df), ['SENTENCE', 'CLASS']]
final_data_2 = (
    GAD_target_disease
    .loc[~GAD_target_disease.index.isin(miss_classified), ['GAD_CONCLUSION', 'CLASS']]
    .rename(columns={"GAD_CONCLUSION": "SENTENCE"})
)

# pd.concat replaces DataFrame.append (removed in pandas 2.0). reset_index() without
# drop=True keeps the old row labels in an 'index' column, as the in-place call did.
final_data = pd.concat([final_data_1, final_data_2]).reset_index()


# Re-embed the pooled sentences; the default Series index matches final_data's new index.
doc_embeddings = sif_embedding_wrapper.sentences2vecs(final_data["SENTENCE"], embs, words, weight4ind)
final_data["vector"] = pd.Series(list(doc_embeddings))

In [77]:
# Ground truth for the pooled data: row index -> class.
# NOTE: this rebinds `ground_truth`, which previously described the rows of df.
ground_truth = final_data['CLASS'].to_dict()

# Inverse lookup: class -> list of row indices carrying that class.
inv_map = {}
for row_idx, label in ground_truth.items():
    inv_map.setdefault(label, []).append(row_idx)

inv_map

{'drug_gene': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146,
  147,
  148,
  149,
  150,
  151,
  152,
  153,
  154,
  155,
  156

In [78]:
# Final-model centroids: per class, the element-wise mean of all its sentence vectors.

categories = final_data["CLASS"].unique().tolist()
print(categories)


category_vecs_ = {}
for label in categories:
    members = final_data.index.isin(inv_map[label])
    stacked = np.asarray(final_data.loc[members, "vector"].tolist())
    category_vecs_[label] = np.mean(stacked, axis=0)


category_vecs_

['drug_gene', 'gene_disease', 'drug_disease']


{'drug_gene': array([-0.02259501,  0.06900093,  0.08034494,  0.05128672,  0.109773  ,
         0.07043651,  0.01890659, -0.04447399,  0.00232815, -0.03183196,
         0.02219459, -0.06560161,  0.05480214, -0.0008662 , -0.03625138,
        -0.03070597, -0.10438451, -0.16020964,  0.00480927,  0.02563161,
        -0.06444762, -0.0747943 , -0.02560701, -0.05884123,  0.03714224,
        -0.07155826,  0.03770538, -0.0045408 ,  0.00113782, -0.07515798,
         0.04877341, -0.08112333, -0.10605722, -0.007455  ,  0.06109701,
        -0.08524513, -0.01665239, -0.00193276, -0.08086548, -0.09113211,
         0.01160008, -0.04256436,  0.06923264,  0.05370258,  0.13953933,
        -0.04828128, -0.10773625, -0.08710372,  0.03560877, -0.0440347 ,
        -0.04703573, -0.00654404, -0.05748476,  0.00903911, -0.0902285 ,
         0.02511453,  0.08825264,  0.00662434, -0.02267997,  0.02906621,
         0.01720275, -0.04489324, -0.01539939, -0.0211833 ,  0.00863589,
         0.01029012, -0.00341215,  0.0

In [84]:
# Test new sentence
# Sample sentence to score; the commented-out assignments that follow are alternative samples.

test_sample = 'This study assessed associations between the CYP4F2 gene and myocardial infarction (MI), using a haplotype-based case-control study of 234 MI patients and 248 controls genotyped for 5 single-nucleotide polymorphisms (rs3093105, rs3093135, rs1558139, rs2108622, rs3093200).'
# test_sample = 'Assessment of 1177 human immunodeficiency virus (HIV) resistance genotypes at an HIV/AIDS clinic showed a decrease in the incidence of the K65R mutation, from 15.2% of isolates during the period 2002-2004 to 2.7% of isolates during the period 2005-2006 (P < .001), despite elevated and stable rates of tenofovir use.'
# test_sample = 'Doxorubicin-induced DNA damage was also specifically abolished by the proteasome inhibitors bortezomib and MG132 and much reduced in top2beta(-/-) mouse embryonic fibroblasts (MEF) compared with TOP2beta(+/+) MEFs, suggesting the involvement of proteasome and DNA topoisomerase IIbeta (Top2beta).'
# test_sample = 'SLC9A6 at Xq26.3 (Gilfillan et al., 2008)X-linked mental retardation'
# test_sample = 'DLBCL was identified by a microenvironment gene expression signature and is associated with increased expression of inflammatory mediators, such as multiple components of the T-cell receptor (TCR), molecules associated with T/NK-cell activation and the complement cascade, downstream targets of IFNγ'

# Embed the sample and compute its cosine similarity to every centroid in category_vecs
# (the centroids built from the selected seed sentences).
test_embedding = sif_embedding_wrapper.sentences2vecs([test_sample], embs, words, weight4ind)

query = test_embedding.reshape(1, -1)
sim = {
    label: cosine_similarity(query, centroid.reshape(1, -1)).flatten()[0]
    for label, centroid in category_vecs.items()
}

sim
    

{'drug_gene': 0.27992935769402444,
 'gene_disease': 0.6860943582066192,
 'drug_disease': 0.36788278692274523}

In [83]:
# Test new sentence
# Sample sentence to score; the commented-out assignments that follow are alternative samples.

test_sample = 'This study assessed associations between the CYP4F2 gene and myocardial infarction (MI), using a haplotype-based case-control study of 234 MI patients and 248 controls genotyped for 5 single-nucleotide polymorphisms (rs3093105, rs3093135, rs1558139, rs2108622, rs3093200).'
# test_sample = 'Assessment of 1177 human immunodeficiency virus (HIV) resistance genotypes at an HIV/AIDS clinic showed a decrease in the incidence of the K65R mutation, from 15.2% of isolates during the period 2002-2004 to 2.7% of isolates during the period 2005-2006 (P < .001), despite elevated and stable rates of tenofovir use.'
# test_sample = 'Doxorubicin-induced DNA damage was also specifically abolished by the proteasome inhibitors bortezomib and MG132 and much reduced in top2beta(-/-) mouse embryonic fibroblasts (MEF) compared with TOP2beta(+/+) MEFs, suggesting the involvement of proteasome and DNA topoisomerase IIbeta (Top2beta).'
# test_sample = 'SLC9A6 at Xq26.3 (Gilfillan et al., 2008)X-linked mental retardation'
# test_sample = 'DLBCL was identified by a microenvironment gene expression signature and is associated with increased expression of inflammatory mediators, such as multiple components of the T-cell receptor (TCR), molecules associated with T/NK-cell activation and the complement cascade, downstream targets of IFNγ'

# Embed the sample and compute its cosine similarity to every centroid in category_vecs_
# (the centroids of the final model built from final_data).
test_embedding = sif_embedding_wrapper.sentences2vecs([test_sample], embs, words, weight4ind)

query = test_embedding.reshape(1, -1)
sim = {
    label: cosine_similarity(query, centroid.reshape(1, -1)).flatten()[0]
    for label, centroid in category_vecs_.items()
}

sim
    

{'drug_gene': 0.27115843650125687,
 'gene_disease': 0.6521062059914209,
 'drug_disease': 0.3627070600955426}