In [3]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from operator import itemgetter
from itertools import cycle, islice
import pandas as pd
import numpy as np
import sif_embedding_wrapper
import utils
import itertools
import os

In [2]:
# Root folder holding the pre-trained embedding files. The default is the
# location used originally; set the EMBEDDINGS_DIR environment variable to run
# the notebook on another machine without editing this cell.
EMBEDDINGS_DIR = os.environ.get("EMBEDDINGS_DIR", "/home/stirunag/pre-trained_word_embeddings")

# Alternative embeddings that were tried earlier, kept for reference:
# from gensim.models.keyedvectors import KeyedVectors

# model = KeyedVectors.load_word2vec_format('/home/stirunag/pre-trained_word_embeddings/PubMed-and-PMC-w2v.bin', binary=True)
# model.save_word2vec_format('/home/stirunag/pre-trained_word_embeddings/PubMed-and-PMC-w2v.txt', binary=False)

# words, embs, weight4ind = sif_embedding_wrapper.load_embeddings("/home/stirunag/pre-trained_word_embeddings/PubMed-and-PMC-FS.txt", 
#                                                      '/home/stirunag/pre-trained_word_embeddings/wiki/enwiki_vocab_min200.txt')

# Load the GloVe 6B 300d file together with the enwiki vocabulary file.
# NOTE(review): the meaning of the three returned objects is defined by
# sif_embedding_wrapper.load_embeddings — confirm there.
words, embs, weight4ind = sif_embedding_wrapper.load_embeddings(
    os.path.join(EMBEDDINGS_DIR, "glove", "glove.6B.300d.txt"),
    os.path.join(EMBEDDINGS_DIR, "wiki", "enwiki_vocab_min200.txt"))

In [4]:
# Resolve the 'Datasets' folder one level above the current working directory
# (<cwd>/../Datasets). The trailing '/' is kept because the file paths below
# are built by plain string concatenation.
data_path = os.path.abspath(os.path.join(os.pardir, 'Datasets'))+'/'

# Although the files carry a .csv extension they are tab-delimited. They also
# have encoding problems, e.g.
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfc in position 2: invalid start byte
# so the decoding policy is set explicitly for each file.

import codecs  # no longer used by this cell; kept in case other cells rely on it


def _read_euadr(name, label, errors='strict'):
    """Load one EUADR corpus file and tag every row with its relation class.

    name   -- file stem inside EUADR_Corpus_IBIgroup/ (without '.csv')
    label  -- value written to the new 'CLASS' column
    errors -- decoding error policy handed to open()

    Returns the parsed DataFrame.
    """
    path = data_path+'EUADR_Corpus_IBIgroup/'+name+'.csv'
    # The 'rU' mode used previously was removed from open() in Python 3.11.
    # Opening inside `with` also guarantees the handle is closed. newline=''
    # leaves line endings untranslated, as the codecs-based reader did.
    with open(path, 'r', encoding='UTF-8', errors=errors, newline='') as handle:
        frame = pd.read_csv(handle, sep='\t', na_filter = False)
    frame['CLASS'] = label
    return frame


EUADR_drug_target = _read_euadr('EUADR_drug_target', 'drug_gene')

# Bytes that are not valid UTF-8 are dropped for this file instead of raising.
EUADR_target_disease = _read_euadr('EUADR_target_disease', 'gene_disease', errors='ignore')

EUADR_drug_disease = _read_euadr('EUADR_drug_disease', 'drug_disease')



In [5]:
# Stack the three relation-specific frames into one corpus.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so
# pd.concat is used instead. reset_index() gives every row a unique 0..n-1
# label and, exactly as before, keeps the per-file row number in an 'index'
# column. Building df in one expression keeps this cell safe to re-run.
df = pd.concat([EUADR_drug_target, EUADR_target_disease, EUADR_drug_disease]).reset_index()

In [6]:
# Embed the SENTENCE column with the SIF wrapper, passing the embeddings, word
# list and weights returned by load_embeddings.
# NOTE(review): presumably yields one vector per sentence, in input order —
# confirm in sif_embedding_wrapper.sentences2vecs.
doc_embeddings = sif_embedding_wrapper.sentences2vecs(df["SENTENCE"], embs, words, weight4ind)
# The new Series is labelled 0..n-1 and column assignment aligns on labels, so
# this relies on df having that same index; unmatched rows would silently get NaN.
df["vector"] = pd.Series(list(doc_embeddings))


In [10]:
# Gold-standard lookup: row label of df -> relation class of that sentence.
ground_truth = dict(zip(df.index, df['CLASS']))

# ground_truth

# inv_map = {}
# for k, v in ground_truth.items():
#     inv_map[v] = inv_map.get(v, [])
#     inv_map[v].append(k)
    
# inv_map  

In [9]:
# Distinct relation classes, in order of first appearance in df.
categories = df["CLASS"].unique().tolist()
categories

['drug_gene', 'gene_disease', 'drug_disease']

In [11]:
# Use LDA to get the topics and assign to class to find top performing sentences
# NOTE(review): the model below is fitted on TF-IDF weights; scikit-learn's own
# LDA topic-extraction example fits LDA on raw term counts (CountVectorizer) —
# confirm that TF-IDF input is intended.

# Hyper-parameters for the filtering step and the LDA fit.
min_text_length=80
max_iter=150
batch_size=128
learning_offset=300.
# One topic per relation class.
n_topics = len(categories)


docs = df

# Rows whose SENTENCE is shorter than min_text_length are left out of topic
# modelling; their row labels are kept in `unclassifiable`.
unclassifiable = list(docs[docs["SENTENCE"].map(len) < min_text_length].index)
filtered = docs[~docs.index.isin(unclassifiable)]
# First ten retained row labels; not used by any other cell visible here.
ids = [d for d in list(filtered.index)[0:10]]

n_features = 5000
# Float max_df / min_df are document proportions: terms found in more than 95%
# or fewer than 10% of the sentences are ignored.
tf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.95,
    min_df=0.1,
    max_features=n_features)
tf = tf_vectorizer.fit_transform(list(filtered.loc[:, 'SENTENCE']))

# random_state is fixed so repeated runs give the same topics.
lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=max_iter,
    batch_size=batch_size,
    learning_method='online',
    learning_offset=learning_offset,
    random_state=0)

lda.fit(tf)
# One row per sentence in `filtered` (same order), one column per topic.
doc_topics = lda.transform(tf)

In [25]:
# Group the filtered sentences by their most probable LDA topic and rank each
# group by that probability.
topic_leaders = {"topic_{}".format(i): [] for i in range(n_topics)}

for idx, probs in enumerate(doc_topics):
    # doc_topics rows are in the same order as `filtered`, so position idx maps
    # back to the row label filtered.index[idx].
    topic = np.argmax(probs)
    doc_id = filtered.index[idx]
    # probs[topic] is the maximum of the row; reading it here avoids a second scan.
    topic_leaders["topic_{}".format(topic)].append({"doc_id": doc_id, "score": probs[topic]})

# Highest score first. The sort is stable, so tied sentences keep document order.
for leaders in topic_leaders.values():
    leaders.sort(key=itemgetter('score'), reverse=True)

topic_leaders

{'topic_0': [{'doc_id': 260, 'score': 0.7238185053008797},
  {'doc_id': 261, 'score': 0.7238185053008797},
  {'doc_id': 273, 'score': 0.7238185053008797},
  {'doc_id': 277, 'score': 0.7238185053008797},
  {'doc_id': 278, 'score': 0.7238185053008797},
  {'doc_id': 322, 'score': 0.7238185053008797},
  {'doc_id': 325, 'score': 0.7238185053008797},
  {'doc_id': 326, 'score': 0.7238185053008797},
  {'doc_id': 327, 'score': 0.7238185053008797},
  {'doc_id': 328, 'score': 0.7238185053008797},
  {'doc_id': 346, 'score': 0.7238185053008797},
  {'doc_id': 347, 'score': 0.7238185053008797},
  {'doc_id': 359, 'score': 0.7238185053008797},
  {'doc_id': 362, 'score': 0.7238185053008797},
  {'doc_id': 367, 'score': 0.7238185053008797},
  {'doc_id': 471, 'score': 0.7238185053008797},
  {'doc_id': 472, 'score': 0.7238185053008797},
  {'doc_id': 503, 'score': 0.7238185053008797},
  {'doc_id': 558, 'score': 0.7238185053008797},
  {'doc_id': 570, 'score': 0.7238185053008797},
  {'doc_id': 226, 'score': 0.

In [63]:
# Select only those sentences whose top-topic score is above SCORE_THRESHOLD.
# The comment here used to say 65% while the code compared against 0.66; the
# value from the code is kept and named so the two cannot drift apart again.
# NOTE(review): sentences are bucketed by their ground-truth class, not by the
# LDA topic they were assigned to — the topic only contributes the score.
# Confirm this is intended.
SCORE_THRESHOLD = 0.66

# All three dicts are keyed by class:
#   sentences            -> every scored doc_id
#   sentences_with_score -> the matching scores, in the same order
#   selected_sentences   -> doc_ids whose score exceeds the threshold
sentences = {c:[] for c in categories}
selected_sentences = {c:[] for c in categories}
sentences_with_score = {c:[] for c in categories}

for ranked_docs in topic_leaders.values():
    for doc in ranked_docs:
        doc_id, score = doc['doc_id'], doc['score']
        gt = ground_truth[doc_id]
        sentences[gt].append(doc_id)
        sentences_with_score[gt].append(score)
        if score > SCORE_THRESHOLD:
            selected_sentences[gt].append(doc_id)

selected_sentences

{'drug_gene': [226,
  227,
  228,
  235,
  32,
  33,
  34,
  38,
  39,
  205,
  206,
  210,
  216,
  219,
  0,
  2,
  3,
  4,
  5,
  8,
  9,
  10,
  11,
  12,
  134,
  135,
  147,
  167,
  181,
  184,
  189,
  192,
  194,
  221,
  242,
  244,
  246,
  7,
  13,
  14,
  15,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  52,
  67,
  71,
  72,
  74,
  75,
  85,
  101,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  119,
  120,
  123,
  124,
  127,
  136,
  161,
  165,
  168,
  169,
  170,
  171,
  172,
  213,
  218,
  222],
 'gene_disease': [260,
  261,
  273,
  277,
  278,
  322,
  325,
  326,
  327,
  328,
  346,
  347,
  359,
  362,
  367,
  471,
  472,
  503,
  558,
  570,
  248,
  249,
  283,
  285,
  293,
  294,
  318,
  319,
  320,
  338,
  357,
  360,
  363,
  364,
  366,
  371,
  372,
  375,
  381,
  383,
  384,
  386,
  387,
  389,
  415,
  464,
  483,
  484,
  487,
  488,
  489,
  491,
  500,
  501,
  502,
  505,
  507,
  508,
  519,
  520,

In [64]:
# Number of selected drug_gene sentences.
# NOTE(review): the argument was missing (a bare len() raises TypeError). The
# saved output, 85, equals the number of ids listed under 'drug_gene' in
# selected_sentences — confirm that this was the intended argument.
len(selected_sentences['drug_gene'])

85

In [56]:
# Build one centroid per class: the element-wise mean of the sentence vectors
# of that class's selected sentences.
category_vecs = {}
for c in categories:
    in_category = df.index.isin(selected_sentences[c])
    vectors = np.asarray(df.loc[in_category, "vector"].tolist())
    category_vecs[c] = vectors.mean(axis=0)

category_vecs

{'drug_gene': array([-9.21763579e-02,  6.48885958e-02,  6.85458034e-02, -7.99785523e-04,
         3.01569100e-02,  3.40098047e-02,  3.58546743e-02, -6.11273501e-02,
         4.65268562e-02, -4.79792556e-01, -6.38321862e-02, -7.82524306e-02,
        -1.78122382e-01, -8.08226686e-02,  9.03406505e-02, -1.08390579e-01,
        -5.26091398e-02, -4.97958569e-02, -1.56377439e-01,  3.29611904e-02,
        -2.63228520e-01, -1.18416566e-01, -3.56999115e-02, -4.10079990e-02,
        -3.57352800e-03, -1.79631564e-03,  1.00609849e-01, -1.13228309e-01,
        -8.49533862e-02, -6.12611405e-03, -5.09281553e-02, -1.36621885e-03,
        -2.50830594e-02, -1.39602233e-01,  2.37592623e-01,  9.86697789e-02,
         7.96388733e-02,  3.36417889e-02, -2.40634884e-02,  3.43490317e-03,
        -2.54291986e-02,  2.20848544e-01, -8.73416769e-02, -3.24884805e-01,
        -7.41249685e-02, -3.34264984e-02,  6.33560895e-02, -7.05350135e-02,
         6.05506355e-02,  4.36142824e-02, -4.70595265e-02,  2.83882513e-02,

In [74]:
# Predict a label for each selected sentence: the class whose centroid has the
# highest cosine similarity with the sentence vector.
# NOTE(review): the earlier comment spoke of "unknown sentences", but the loop
# only scores sentences that were also used to build the centroids — confirm
# whether `in` or `not in` was intended before trusting the reported score.

predictions = {}

selected_idx = [j for i in selected_sentences.values() for j in i]
# A set makes the per-row membership test O(1) instead of a list scan.
selected_idx_set = set(selected_idx)

for idx, row in df.iterrows():
    if idx not in selected_idx_set:
        continue
    # Reshape once per row; the same 2-D view is compared with every centroid.
    sentence_vec = row["vector"].reshape(1, -1)
    # Start below any possible cosine value so that a sentence whose
    # similarities are all <= 0 still gets its closest class. With the previous
    # start value of 0 such sentences silently fell through to the default.
    max_sim = -np.inf
    # Fallback kept from the original; only reached if no similarity compares
    # greater than -inf (e.g. NaN vectors).
    winner = categories[1]
    for j in category_vecs:
        sim = cosine_similarity(sentence_vec, category_vecs[j].reshape(1, -1)).flatten()[0]
        if sim > max_sim:
            max_sim = sim
            winner = j
    predictions[idx] = winner
    

In [97]:
from sklearn.metrics import f1_score

def get_accuracy_score(predictions, truth_dict):
    """Compare predicted labels with the reference labels.

    Despite the name, the score returned is the weighted F1, not accuracy.

    predictions -- dict mapping a key to its predicted label
    truth_dict  -- dict mapping the same keys to the reference label

    Prints one 'predicted--x--expected' line per mismatch and returns a tuple
    (weighted F1 score, list of keys whose prediction was wrong).
    Raises KeyError if a predicted key is missing from truth_dict.
    """
    predicted, expected, wrong_keys = [], [], []

    for key, guess in predictions.items():
        actual = truth_dict[key]
        predicted.append(guess)
        expected.append(actual)
        if guess != actual:
            print(str(guess) + '--x--' + str(actual))
            wrong_keys.append(key)

    return f1_score(expected, predicted, average='weighted'), wrong_keys


# Evaluate the predictions against the gold labels: `score` is the weighted F1
# and `miss_classified` lists the row labels that were predicted wrongly.
score, miss_classified = get_accuracy_score(predictions, ground_truth)


drug_disease--x--drug_gene
drug_disease--x--drug_gene
drug_disease--x--drug_gene
gene_disease--x--drug_gene
gene_disease--x--drug_gene
drug_disease--x--drug_gene
drug_disease--x--drug_gene
drug_disease--x--drug_gene
drug_disease--x--drug_gene
drug_disease--x--drug_gene
drug_disease--x--drug_gene
drug_disease--x--drug_gene
drug_disease--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_disease--x--gene_disease
drug_disease--x--gene_disease
drug_disease--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_disease--x--gene_disease
drug_gene--x--gene_disease
drug_disease--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene--x--gene_disease
drug_gene-

In [98]:
miss_classified

[11,
 14,
 15,
 34,
 161,
 181,
 184,
 189,
 192,
 194,
 221,
 244,
 264,
 266,
 269,
 318,
 319,
 320,
 354,
 356,
 494,
 511,
 513,
 514,
 515,
 516,
 517,
 518,
 521,
 523,
 526,
 527,
 528,
 529,
 555,
 556,
 589,
 591,
 595,
 596,
 599,
 612,
 613,
 614,
 634,
 641,
 644,
 645,
 717,
 774,
 791]