In [1]:
import codecs
import itertools
import json
import os
from itertools import cycle, islice
from operator import itemgetter

import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity

import sif_embedding_wrapper
import utils

In [2]:
# One-off conversion of the binary word2vec model to text format (kept for reference):
# from gensim.models.keyedvectors import KeyedVectors
#
# model = KeyedVectors.load_word2vec_format('/home/stirunag/pre-trained_word_embeddings/PubMed-and-PMC-w2v.bin', binary=True)
# model.save_word2vec_format('/home/stirunag/pre-trained_word_embeddings/PubMed-and-PMC-w2v.txt', binary=False)

# The two files handed to the SIF wrapper.
# NOTE(review): absolute, machine-specific paths -- adjust them for your environment.
# Alternative first argument that was tried before:
#   "/home/stirunag/pre-trained_word_embeddings/glove/glove.6B.300d.txt"
embedding_file = "/home/stirunag/pre-trained_word_embeddings/PubMed-and-PMC-FS.txt"
vocab_file = '/home/stirunag/pre-trained_word_embeddings/wiki/enwiki_vocab_min200.txt'

# The three returned objects are passed unchanged to sentences2vecs() in later cells.
words, embs, weight4ind = sif_embedding_wrapper.load_embeddings(embedding_file, vocab_file)

In [3]:
# Dataset directory: '../Datasets/' resolved against the current working directory.
# NOTE: '__file__' is a string literal here, so os.path.dirname() returns '' and the
# path is relative to the working directory, not to the location of this notebook.
data_path = os.path.abspath(os.path.join(os.path.dirname( '__file__' ), '..', 'Datasets'))+'/'
euadr_dir = data_path+'EUADR_Corpus_IBIgroup/'

# Although the files are named .csv they are tab-delimited, and they have codec
# problems, so they are decoded through `codecs` before pandas parses them, e.g.
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfc in position 2: invalid start byte
#
# Mode 'r' replaces the original 'rU': the 'U' flag is deprecated and rejected by
# Python 3.11+. Each `with` block closes its file handle once the file is parsed.

with codecs.open(euadr_dir+'EUADR_drug_target'+'.csv','r','UTF-8') as doc_d_t:
    EUADR_drug_target = pd.read_csv(doc_d_t, sep='\t', na_filter = False)
EUADR_drug_target['CLASS'] = 'drug_gene'

# Only this file is opened with errors='ignore', which skips undecodable bytes.
with codecs.open(euadr_dir+'EUADR_target_disease'+'.csv','r','UTF-8',errors='ignore') as doc_t_d:
    EUADR_target_disease = pd.read_csv(doc_t_d, sep='\t', na_filter = False)
EUADR_target_disease['CLASS'] = 'gene_disease'

with codecs.open(euadr_dir+'EUADR_drug_disease'+'.csv','r','UTF-8') as doc_d_d:
    EUADR_drug_disease = pd.read_csv(doc_d_d, sep='\t', na_filter = False)
EUADR_drug_disease['CLASS'] = 'drug_disease'



In [4]:
# Stack the three EUADR corpora into one frame. pd.concat replaces the chained
# DataFrame.append calls (deprecated in pandas 1.4, removed in pandas 2.0).
df = pd.concat([EUADR_drug_target, EUADR_target_disease, EUADR_drug_disease])

# reset_index() without drop=True gives a fresh 0..n-1 index and keeps every row's
# per-file position in a leading 'index' column, exactly as the original in-place call
# did, so the column layout seen by downstream code does not change.
df = df.reset_index()

In [5]:
# Embed the EUADR sentences with the SIF wrapper and store the result in df["vector"],
# one entry per element of the returned sequence.
doc_embeddings = sif_embedding_wrapper.sentences2vecs(
    df["SENTENCE"], embs, words, weight4ind)
df["vector"] = pd.Series([sentence_vec for sentence_vec in doc_embeddings])


In [6]:
# True relation class of every row, keyed by the row's index label in df.
ground_truth = df["CLASS"].to_dict()

# ground_truth

# inv_map = {}
# for k, v in ground_truth.items():
#     inv_map[v] = inv_map.get(v, [])
#     inv_map[v].append(k)
    
# inv_map  

In [7]:
# Distinct relation classes, in order of first appearance in df.
categories = df["CLASS"].unique().tolist()
categories

['drug_gene', 'gene_disease', 'drug_disease']

In [8]:
# Fit an LDA topic model with one topic per relation class. The per-sentence topic
# weights (doc_topics) are used further down to find the top-performing sentences.

# --- settings ---
n_topics = len(categories)
min_text_length = 80
n_features = 50000
max_iter = 150
batch_size = 512
learning_offset = 300.

docs = df

# Sentences shorter than min_text_length characters are kept out of the topic model.
too_short = docs["SENTENCE"].map(len) < min_text_length
unclassifiable = docs.index[too_short.values].tolist()
filtered = docs[~docs.index.isin(unclassifiable)]
# First ten retained row labels (appears unused by the rest of the notebook).
ids = filtered.index[:10].tolist()

# Note: despite the tf_ prefix this is a TF-IDF vectorizer.
tf_vectorizer = TfidfVectorizer(
    max_features=n_features,
    min_df=0.1,
    max_df=0.95,
    stop_words='english')
tf = tf_vectorizer.fit_transform(filtered['SENTENCE'].tolist())

lda = LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='online',
    learning_offset=learning_offset,
    batch_size=batch_size,
    max_iter=max_iter,
    random_state=0)

# fit() returns the estimator itself, so this is fit followed by transform on tf.
doc_topics = lda.fit(tf).transform(tf)

In [9]:
# For every topic, collect the sentences for which it is the strongest topic,
# ordered from the highest to the lowest topic weight.
topic_leaders = {"topic_{}".format(topic_no): [] for topic_no in range(n_topics)}

for row_pos, probs in enumerate(doc_topics):
    best_topic = np.argmax(probs)
    topic_leaders["topic_{}".format(best_topic)].append(
        {"doc_id": filtered.index[row_pos], "score": max(probs)})

# In-place stable sort of each topic's list, best score first.
for leaders in topic_leaders.values():
    leaders.sort(key=itemgetter('score'), reverse=True)
        
    

# topic_leaders

In [10]:
# Group the topic leaders by their true class and keep, per class, only those whose
# dominant topic weight is above 0.66.

sentences = {label: [] for label in categories}
sentences_with_score = {label: [] for label in categories}
selected_sentences = {label: [] for label in categories}

for leaders in topic_leaders.values():
    for leader in leaders:
        leader_id, leader_score = leader['doc_id'], leader['score']
        true_label = ground_truth[leader_id]
        sentences[true_label].append(leader_id)
        sentences_with_score[true_label].append(leader_score)
        if leader_score > 0.66:
            selected_sentences[true_label].append(leader_id)
    
# selected_sentences

In [11]:
# Class centroids: the mean sentence vector of the selected sentences of each class.
category_vecs = {}
for label in categories:
    in_class = df.index.isin(selected_sentences[label])
    class_vectors = np.asarray(df.loc[in_class, "vector"].tolist())
    category_vecs[label] = class_vectors.mean(axis=0)

category_vecs

{'drug_gene': array([-3.21358114e-02,  7.47584684e-02,  8.34615362e-02,  5.54770049e-02,
         1.22370782e-01,  6.47800698e-02,  1.48554523e-02, -4.29823855e-02,
        -5.56380577e-03, -1.21374275e-02,  1.48823383e-02, -7.69251622e-02,
         6.65993555e-02, -1.31682239e-03, -2.55387447e-02, -5.52268020e-02,
        -1.05548672e-01, -1.75949691e-01,  1.94938574e-02,  2.32216645e-02,
        -5.88295000e-02, -8.70603118e-02, -4.02607696e-02, -5.23781347e-02,
         3.46875427e-02, -6.53689495e-02,  3.71152185e-02, -6.13846829e-03,
        -2.33896747e-03, -7.99919579e-02,  2.72428694e-02, -1.00196057e-01,
        -1.26801660e-01, -1.43025047e-02,  6.74936439e-02, -9.01519123e-02,
        -3.06453595e-02,  3.62881264e-04, -8.61218259e-02, -9.98713983e-02,
         8.52346434e-03, -4.63008804e-02,  7.61586987e-02,  6.02405544e-02,
         1.49043512e-01, -4.74789195e-02, -1.12607301e-01, -8.31325829e-02,
         3.59362343e-02, -4.91476457e-02, -5.04195978e-02, -1.78867236e-02,

In [12]:
# Check on the training data: re-label every *selected* sentence (the same sentences the
# centroids in category_vecs were averaged from) with the class whose centroid has the
# highest cosine similarity to the sentence vector.

predictions = {}

selected_idx = [j for i in selected_sentences.values() for j in i]
# Set copy for O(1) membership tests inside the row loop (a list lookup per row made the
# loop quadratic). The list itself is kept under its original name.
selected_lookup = set(selected_idx)

for idx, row in df.iterrows():
    if idx not in selected_lookup:
        continue
    # Reshape once per sentence instead of once per class.
    sentence_vec = row["vector"].reshape(1, -1)
    max_sim = 0
    winner = 'Unknown'  # stays if no centroid has a positive similarity
    for j in category_vecs:
        sim = cosine_similarity(sentence_vec, category_vecs[j].reshape(1, -1)).flatten()[0]
        if sim > max_sim:
            max_sim = sim
            winner = j
    predictions[idx] = winner
    

In [13]:
def get_accuracy_score(predictions, truth_dict):
    """Score predictions against the ground truth and list the misses.

    Despite the name, the score returned is the weighted F1 score, not accuracy.

    Parameters
    ----------
    predictions : dict
        Maps a document id to its predicted label.
    truth_dict : dict
        Maps a document id to its true label. Must contain every key of
        ``predictions``; a missing key raises ``KeyError``.

    Returns
    -------
    tuple
        ``(weighted_f1, mis_classified, mis_pred)``: ``mis_classified`` lists the ids
        whose prediction differs from the truth and ``mis_pred`` the matching
        predicted labels as strings, both in the iteration order of ``predictions``.
    """
    preds = list(predictions.values())
    labels = [truth_dict[doc_id] for doc_id in predictions]

    misses = [
        (doc_id, str(predicted))
        for doc_id, predicted in predictions.items()
        if predicted != truth_dict[doc_id]
    ]
    mis_classified = [doc_id for doc_id, _ in misses]
    mis_pred = [predicted for _, predicted in misses]

    return f1_score(labels, preds, average='weighted'), mis_classified, mis_pred


# Note: miss_classified_df is a list of row labels, not a DataFrame.
score, miss_classified_df, miss_pred = get_accuracy_score(predictions, ground_truth)
score

0.8615150850521486

In [14]:
# Show sentences in full. None is the "no limit" value (pandas >= 1.0); the original -1
# was deprecated in pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)

# Select rows by label and columns by name instead of iloc[..., [12,13]]: the ids in
# miss_classified_df are index labels, and positional column numbers break silently
# when the frame layout changes. .copy() makes the result an independent frame, so
# adding a column below does not raise SettingWithCopyWarning.
miss_calssified_df = df.loc[miss_classified_df, ['SENTENCE', 'CLASS']].copy()
miss_calssified_df['Predicted-CLASS'] = miss_pred
miss_calssified_df

Unnamed: 0,SENTENCE,CLASS,Predicted-CLASS
11,The expression of ABCG2 may explain in part the ineffectiveness of doxorubicin as a single modality treatment for anaplastic thyroid cancer or for treatment of metastatic follicular thyroid cancer.,drug_gene,drug_disease
14,Oral administration of GSK221149A (5 mg/kg) was effective in inhibiting oxytocin-induced uterine contractions after single and multiple (4-day) dosing.,drug_gene,drug_disease
15,Intravenous administration of GSK221149A produced a dose-dependent decrease in oxytocin-induced uterine contractions in anesthetized rats with an ID(50) = 0.27 +/- 0.60 mg/kg (corresponding plasma concentrations were 88 ng/ml).,drug_gene,drug_disease
189,Decreased plasma soluble RAGE in patients with hypercholesterolemia: effects of statins.,drug_gene,drug_disease
242,A continuous epidural infusion of ropivacaine inhibits CYP2D6 activity in patients who are extensive metabolizers resulting in a twofold increase in the MR for debrisoquine hydroxylation.,drug_gene,drug_disease
244,Patients received 10 mg debrisoquine (a marker for CYP2D6 activity) before surgery and after 40 h epidural infusion.,drug_gene,drug_disease
246,A continuous epidural infusion of ropivacaine inhibits CYP2D6 activity in patients who are extensive metabolizers resulting in a twofold increase in the MR for debrisoquine hydroxylation.,drug_gene,drug_disease
264,"The prognostic value of the T393C SNP was evaluated in an unselected series of patients treated with curative intent for oropharyngeal and hypopharyngeal squamous cell carcinomas, including all tumor stages with different therapeutic regimens.",gene_disease,drug_disease
265,"However, the majority of colon cancer cells have deregulation of the Wnt/beta-catenin pathway.",gene_disease,drug_gene
266,Lysophosphatidic acid facilitates proliferation of colon cancer cells via induction of Krppel-like factor 5.,gene_disease,drug_gene


In [15]:
# Results directory: '../Results/' resolved against the current working directory
# ('__file__' is a string literal, so os.path.dirname() contributes an empty prefix).
results_dir = os.path.join(os.path.dirname( '__file__' ), '..', 'Results')
result_path = os.path.abspath(results_dir)+'/'

# Write the misclassified training sentences to CSV.
miss_calssified_df.to_csv(os.path.join(result_path, 'miss_predictions_train.csv'))

In [16]:
# Generalisation on the unseen GAD dataset.
#
# Mode 'r' replaces the original 'rU' (the 'U' flag is deprecated and rejected by
# Python 3.11+) and the `with` block closes the file handle after parsing.
# errors='ignore' skips bytes that cannot be decoded as UTF-8.
with codecs.open(data_path+'GAD_Corpus_IBIgroup/'+'GAD_Y_N'+'.csv','r','UTF-8',errors='ignore') as doc_t_d:
    GAD_target_disease = pd.read_csv(doc_t_d, sep='\t', na_filter = False)

# Every GAD row is given the gene_disease label.
GAD_target_disease['CLASS'] = 'gene_disease'


# GAD_target_disease.head(10)


In [17]:
# Embed the GAD conclusion sentences with the SIF wrapper and store the result in
# GAD_target_disease["vector"].
doc_embeddings = sif_embedding_wrapper.sentences2vecs(
    GAD_target_disease["GAD_CONCLUSION"], embs, words, weight4ind)
GAD_target_disease["vector"] = pd.Series([sentence_vec for sentence_vec in doc_embeddings])

In [18]:
# True label of every GAD row, keyed by the row's index label.
test_ground_truth = GAD_target_disease['CLASS'].to_dict()

# Nearest-centroid prediction: a class wins only if its cosine similarity to the
# sentence vector is above 0.60; otherwise the sentence is labelled 'unknown'.
test_predictions = {}

for row_id, gad_row in GAD_target_disease.iterrows():
    best_label, best_sim = 'unknown', 0.60
    query_vec = gad_row["vector"].reshape(1, -1)
    for label, centroid in category_vecs.items():
        similarity = cosine_similarity(query_vec, centroid.reshape(1, -1)).flatten()[0]
        if similarity > best_sim:
            best_label, best_sim = label, similarity
    test_predictions[row_id] = best_label

In [19]:
# get_accuracy_score() is defined once, in the training-evaluation cell further up.
# The equivalent second definition that used to sit here only shadowed it and has been
# removed, so there is a single definition to maintain.
#
# Weighted F1 on GAD, the misclassified row labels, and their predicted classes.
score, miss_classified, miss_pred = get_accuracy_score(test_predictions, test_ground_truth)
score

  'recall', 'true', average, warn_for)


0.9487708763370238

In [20]:
# Misclassified GAD sentences with their true and predicted class.
# Rows are selected by label and columns by name instead of iloc[..., [10,11]]:
# positional column numbers break silently when the frame layout changes. .copy()
# makes the result an independent frame, so adding a column does not raise
# SettingWithCopyWarning.
miss_calssified_df_test = GAD_target_disease.loc[miss_classified, ['GAD_CONCLUSION', 'CLASS']].copy()
miss_calssified_df_test['Predicted-CLASS'] = miss_pred
miss_calssified_df_test

Unnamed: 0,GAD_CONCLUSION,CLASS,Predicted-CLASS
7,The novel gene HCCA2 may be related with the infiltration and proliferation of liver cancer.,gene_disease,drug_gene
10,"We found no evidence that mutation in GUCA1B,GNGT1,or RGS9 gene is a cause of retinitis pigmentosa.",gene_disease,unknown
39,"Because our samples provided quite high power, these results indicate that GSK3B may not play a major role in Japanese schizophrenia.",gene_disease,unknown
52,We conclude that GRIK2 does not play a major role in the pathogenesis of schizophrenia in the Japanese population.,gene_disease,unknown
53,We conclude that GRIK2 does not play a major role in the pathogenesis of schizophrenia in the Japanese population.,gene_disease,unknown
54,We conclude that GRIK1 does not play a major role in schizophrenia pathogenesis in the Japanese population.,gene_disease,unknown
55,We conclude that GRIK1 does not play a major role in schizophrenia pathogenesis in the Japanese population.,gene_disease,unknown
60,Determination of VNTR of the GPIba gene may prove useful for identifying high-risk individuals for MI.,gene_disease,unknown
62,"We found no evidence that mutation in GUCA1B,GNGT1,or RGS9 gene is a cause of retinitis pigmentosa.",gene_disease,unknown
81,Our results suggest that GAD2 does not play a major role in type 1 diabetes in these two European populations.,gene_disease,unknown


In [21]:
# Write the misclassified GAD sentences to CSV under result_path.
miss_calssified_df_test.to_csv(os.path.join(result_path, 'miss_predictions_test.csv'))

In [22]:
# Final model: pool the EUADR and GAD sentences, leaving out the rows listed as
# misclassified (miss_classified_df for EUADR, miss_classified for GAD), and embed the
# pooled sentences again.

# .loc with a boolean mask plus .copy()/.rename() yields independent frames; the
# original renamed a slice in place, which triggers SettingWithCopyWarning.
final_data_1 = df.loc[~df.index.isin(miss_classified_df), ['SENTENCE', 'CLASS']].copy()
final_data_2 = (
    GAD_target_disease
    .loc[~GAD_target_disease.index.isin(miss_classified), ['GAD_CONCLUSION', 'CLASS']]
    .rename(columns={"GAD_CONCLUSION": "SENTENCE"})
)

# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in pandas 2.0).
# reset_index() without drop=True keeps the source row labels in an 'index' column,
# as the original in-place call did.
final_data = pd.concat([final_data_1, final_data_2]).reset_index()

doc_embeddings = sif_embedding_wrapper.sentences2vecs(final_data["SENTENCE"], embs, words, weight4ind)
final_data["vector"] = pd.Series(list(doc_embeddings))

In [23]:
# True label of every pooled sentence, keyed by the row's index label in final_data.
# NOTE: this rebinds ground_truth, which until this cell described df.
ground_truth = final_data['CLASS'].to_dict()

# Inverse mapping: class label -> list of row labels that carry it.
inv_map = {}
for row_id, label in ground_truth.items():
    inv_map.setdefault(label, []).append(row_id)
    
# inv_map  

In [24]:
# Class centroids of the final model: the mean sentence vector over all rows of
# final_data that belong to each class.

categories = final_data["CLASS"].unique().tolist()
print(categories)


category_vecs_ = {}
for label in categories:
    members = final_data.index.isin(inv_map[label])
    member_vectors = np.asarray(final_data.loc[members, "vector"].tolist())
    category_vecs_[label] = member_vectors.mean(axis=0)


category_vecs_

['drug_gene', 'gene_disease', 'drug_disease']


{'drug_gene': array([-0.02259501,  0.06900093,  0.08034494,  0.05128672,  0.109773  ,
         0.07043651,  0.01890659, -0.04447399,  0.00232815, -0.03183196,
         0.02219459, -0.06560161,  0.05480214, -0.0008662 , -0.03625138,
        -0.03070597, -0.10438451, -0.16020964,  0.00480927,  0.02563161,
        -0.06444762, -0.0747943 , -0.02560701, -0.05884123,  0.03714224,
        -0.07155826,  0.03770538, -0.0045408 ,  0.00113782, -0.07515798,
         0.04877341, -0.08112333, -0.10605722, -0.007455  ,  0.06109701,
        -0.08524513, -0.01665239, -0.00193276, -0.08086548, -0.09113211,
         0.01160008, -0.04256436,  0.06923264,  0.05370258,  0.13953933,
        -0.04828128, -0.10773625, -0.08710372,  0.03560877, -0.0440347 ,
        -0.04703573, -0.00654404, -0.05748476,  0.00903911, -0.0902285 ,
         0.02511453,  0.08825264,  0.00662434, -0.02267997,  0.02906621,
         0.01720275, -0.04489324, -0.01539939, -0.0211833 ,  0.00863589,
         0.01029012, -0.00341215,  0.0

In [25]:
# Score a single hand-picked sentence against the centroids in category_vecs.
# Un-comment one of the alternative samples below to try a different sentence.

test_sample = 'This study assessed associations between the CYP4F2 gene and myocardial infarction (MI), using a haplotype-based case-control study of 234 MI patients and 248 controls genotyped for 5 single-nucleotide polymorphisms (rs3093105, rs3093135, rs1558139, rs2108622, rs3093200).'
# test_sample = 'Assessment of 1177 human immunodeficiency virus (HIV) resistance genotypes at an HIV/AIDS clinic showed a decrease in the incidence of the K65R mutation, from 15.2% of isolates during the period 2002-2004 to 2.7% of isolates during the period 2005-2006 (P < .001), despite elevated and stable rates of tenofovir use.'
# test_sample = 'Doxorubicin-induced DNA damage was also specifically abolished by the proteasome inhibitors bortezomib and MG132 and much reduced in top2beta(-/-) mouse embryonic fibroblasts (MEF) compared with TOP2beta(+/+) MEFs, suggesting the involvement of proteasome and DNA topoisomerase IIbeta (Top2beta).'
# test_sample = 'SLC9A6 at Xq26.3 (Gilfillan et al., 2008)X-linked mental retardation'
# test_sample = 'DLBCL was identified by a microenvironment gene expression signature and is associated with increased expression of inflammatory mediators, such as multiple components of the T-cell receptor (TCR), molecules associated with T/NK-cell activation and the complement cascade, downstream targets of IFNγ'

test_embedding = sif_embedding_wrapper.sentences2vecs([test_sample], embs, words, weight4ind)

# Cosine similarity between the sentence vector and every class centroid.
sim = {
    label: cosine_similarity(test_embedding.reshape(1, -1), centroid.reshape(1, -1)).flatten()[0]
    for label, centroid in category_vecs.items()
}

sim
    

{'drug_gene': 0.27992935769402444,
 'gene_disease': 0.6860943582066192,
 'drug_disease': 0.36788278692274523}

In [31]:
# Score the same hand-picked sentence against the final-model centroids in
# category_vecs_. Un-comment one of the alternative samples to try another sentence.

test_sample = 'This study assessed associations between the CYP4F2 gene and myocardial infarction (MI), using a haplotype-based case-control study of 234 MI patients and 248 controls genotyped for 5 single-nucleotide polymorphisms (rs3093105, rs3093135, rs1558139, rs2108622, rs3093200).'
# test_sample = 'Assessment of 1177 human immunodeficiency virus (HIV) resistance genotypes at an HIV/AIDS clinic showed a decrease in the incidence of the K65R mutation, from 15.2% of isolates during the period 2002-2004 to 2.7% of isolates during the period 2005-2006 (P < .001), despite elevated and stable rates of tenofovir use.'
# test_sample = 'Doxorubicin-induced DNA damage was also specifically abolished by the proteasome inhibitors bortezomib and MG132 and much reduced in top2beta(-/-) mouse embryonic fibroblasts (MEF) compared with TOP2beta(+/+) MEFs, suggesting the involvement of proteasome and DNA topoisomerase IIbeta (Top2beta).'
# test_sample = 'SLC9A6 at Xq26.3 (Gilfillan et al., 2008)X-linked mental retardation'
# test_sample = 'DLBCL was identified by a microenvironment gene expression signature and is associated with increased expression of inflammatory mediators, such as multiple components of the T-cell receptor (TCR), molecules associated with T/NK-cell activation and the complement cascade, downstream targets of IFNγ'

test_embedding = sif_embedding_wrapper.sentences2vecs([test_sample], embs, words, weight4ind)

# Cosine similarity between the sentence vector and every final-model class centroid.
sim = {
    label: cosine_similarity(test_embedding.reshape(1, -1), centroid.reshape(1, -1)).flatten()[0]
    for label, centroid in category_vecs_.items()
}

sim
    

{'drug_gene': 0.27115843650125687,
 'gene_disease': 0.6521062059914209,
 'drug_disease': 0.3627070600955426}

In [32]:
# Testing on the 30 papers dataset

# The file is read explicitly as UTF-8. Without `encoding`, open() falls back to the
# platform locale, which can garble or fail on the non-ASCII characters that occur in
# the sentences. (`json` is imported in the import cell at the top of the notebook.)
with open(data_path+'EBI Standard/'+'rel_data_bronze.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# One record per annotated sentence: its text ('sent') and its relation tag ('rel').
EBI_standard = [
    {'SENTENCE': each_sent['sent'], 'CLASS': each_sent['rel']}
    for each_point in data
    for each_sent in data[each_point]
]

EBI_standard_temp = pd.DataFrame(EBI_standard)

# Collapse the tags to two labels: YGD -> gene_disease, NGD and AMB -> other.
EBI_standard_temp['CLASS'] = EBI_standard_temp['CLASS'].apply(lambda x: x.replace('YGD', 'gene_disease').replace('NGD', 'other').replace('AMB', 'other'))
EBI_standard_temp.to_csv(data_path+'EBI_bronze_standard.csv')

In [38]:
# Embed the EBI sentences and store the vectors next to them.
doc_embeddings = sif_embedding_wrapper.sentences2vecs(EBI_standard_temp["SENTENCE"], embs, words, weight4ind)
EBI_standard_temp["vector"] = pd.Series(list(doc_embeddings))


# True label of every EBI row, keyed by the row's index label.
EBI_test_ground_truth = {}

for idx, row in EBI_standard_temp.iterrows():
    EBI_test_ground_truth[idx] = row['CLASS']

# Nearest-centroid prediction for every EBI sentence.
# NOTE(review): the threshold starts at 0.0, so 'other' is predicted only when no
# centroid has a positive cosine similarity, although 'other' is one of the two gold
# labels here -- confirm this is intended (the GAD evaluation uses 0.60).
# NOTE(review): this uses category_vecs, not the final-model centroids category_vecs_.

EBI_test_predictions = {}

for idx, row in EBI_standard_temp.iterrows():
    sentence_vec = row["vector"].reshape(1, -1)  # reshape once per sentence
    max_sim = 0.0
    winner = 'other'
    for j in category_vecs:
        sim = cosine_similarity(sentence_vec, category_vecs[j].reshape(1, -1)).flatten()[0]
        if sim > max_sim:
            max_sim = sim
            winner = j
    EBI_test_predictions[idx] = winner


# get_accuracy_score() is defined once, in the training-evaluation cell further up.
# The equivalent third definition that used to sit here only shadowed it and has been
# removed. Note that this call rebinds score, miss_classified and miss_pred.
score, miss_classified, miss_pred = get_accuracy_score(EBI_test_predictions, EBI_test_ground_truth)
score

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.48178137651821856

In [37]:
# Misclassified EBI sentences with their true and predicted class.
# Columns are selected by name instead of iloc[..., [0,1]]: the position of 'CLASS' and
# 'SENTENCE' in a frame built from a list of dicts depends on the pandas version.
# .copy() makes the result an independent frame, so adding a column does not raise
# SettingWithCopyWarning.
miss_calssified_EBI_test = EBI_standard_temp.loc[miss_classified, ['CLASS', 'SENTENCE']].copy()
miss_calssified_EBI_test['Predicted-CLASS'] = miss_pred
miss_calssified_EBI_test

Unnamed: 0,CLASS,SENTENCE,Predicted-CLASS
0,other,"As most tobacco control programs in Bangladesh target mainly cigarette or bidi smoking, coordinated programs are needed that will also include SLT use within the tobacco control policy and prevention strategies.",drug_disease
1,other,"SLT can cause oral cancer [4], [5] and nicotine addiction [6] and is associated with several other health conditions including oral pain [7], cardiovascular diseases [8], hypertension [9], diabetes [10], loss in bone density [11], and problems during pregnancy and following childbirth [12].",drug_disease
3,other,This increased likelihood of SLT use is related to the social acceptance of SLT use by the older people and a greater appeal of cigarette among the younger generations who may be taking up smoking instead of SLT use.,gene_disease
4,other,"Use of SLT may provide a supplementation or substitution of nicotine intake for cigarette smokers [31] encouraging their continued use; this scenario could be particularly true in situations in which smoking is not socially acceptable, but SLT use is.",drug_disease
5,other,"As most tobacco control programs target mainly cigarette or bidi smoking, coordinated programs are needed that will also include SLT use within the tobacco control policy and prevention strategies.",gene_disease
6,other,"Interleukin (IL)-1β, IL-4, IL-6, IL-10, IL-12, IL-13, IL-17, interferon (IFN)-γ, chemokine C-C motif ligand 5 (CCL5) and tumor necrosis factor (TNF)-α were tested in tear samples and sera of keratoconus and control individuals by multiplex immuno-bead assays.",drug_gene
20,gene_disease,"Of the TH2-related cytokines, the decrease in IL-13 was statistically significant in severe keratoconus versus control subjects.",drug_disease
25,gene_disease,"Based on these limited data (Figure 3), there was a trend towards increased IL-17 (72.5±59.2 pg/ml) in keratoconus tear fluids compared to control samples (18.5±7.8 pg/ml).",drug_disease
31,gene_disease,"IL-12 promotes the differentiation of TH1 cells; its decrease in keratoconus is consistent with decreases in two signature TH1 cytokines, IFN-γ and TNF-α.",drug_gene
33,gene_disease,"Both IL-4 and IL-13 cytokines were also reduced in keratoconus, and the decrease in IL-4, as measured by conventional ELISA, was statistically significant.",drug_gene


In [39]:
# Get PolySearch dataset. The column names are supplied explicitly via `names`.
ColNames = ['entity_1', 'entity_2', 'Association', 'PMID','Sentence']

# Mode 'r' replaces the original 'rU' (the 'U' flag is deprecated and rejected by
# Python 3.11+) and the `with` block closes the file handle after parsing.
with codecs.open(data_path+'PolySearch/'+'p1_disease_gene_testset.simple'+'.tsv','r','UTF-8') as Poly_doc_d_t:
    Poly_target_disease = pd.read_csv(Poly_doc_d_t, sep='\t', na_filter = False, names = ColNames)

# Every PolySearch row is given the gene_disease label.
Poly_target_disease['CLASS'] = 'gene_disease'

In [40]:
# Embed the PolySearch sentences with the SIF wrapper and store the result in
# Poly_target_disease["vector"].
doc_embeddings = sif_embedding_wrapper.sentences2vecs(
    Poly_target_disease["Sentence"], embs, words, weight4ind)
Poly_target_disease["vector"] = pd.Series([sentence_vec for sentence_vec in doc_embeddings])

In [54]:
# True label of every PolySearch row, keyed by the row's index label.
Poly_test_ground_truth = {}

for idx, row in Poly_target_disease.iterrows():
    Poly_test_ground_truth[idx] = row['CLASS']

# Nearest-centroid prediction for every PolySearch sentence.
#
# Fix: the original loop iterated GAD_target_disease (copy-paste from the GAD cell) and
# stored its results in test_predictions, so PolySearch was never classified and the
# GAD predictions were overwritten. The loop now runs over Poly_target_disease and
# writes to Poly_test_predictions, which pairs with Poly_test_ground_truth.
# The threshold starts at 0.00, so 'unknown' is kept only when no centroid has a
# positive cosine similarity.

Poly_test_predictions = {}

for idx, row in Poly_target_disease.iterrows():
    sentence_vec = row["vector"].reshape(1, -1)  # reshape once per sentence
    max_sim = 0.00
    winner = 'unknown'
    for j in category_vecs:
        sim = cosine_similarity(sentence_vec, category_vecs[j].reshape(1, -1)).flatten()[0]
        if sim > max_sim:
            max_sim = sim
            winner = j
    Poly_test_predictions[idx] = winner
    



In [60]:
# def get_accuracy_score(predictions, truth_dict):
#     preds = []
#     labels = []
#     mis_classified = []
#     mis_pred = []
    
#     for k,v in predictions.items():
#         preds.append(v)
#         labels.append(truth_dict[k])
#         if v!=truth_dict[k]:
# #             print(str(v) + '--x--' + str(truth_dict[k]))
#             mis_classified.append(k)
#             mis_pred.append(str(v))

#     return f1_score(labels, preds, average='weighted'), mis_classified, mis_pred


# score, miss_classified, miss_pred = get_accuracy_score(test_predictions, test_ground_truth)
# score    