In [2]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from operator import itemgetter
from itertools import cycle, islice
import pandas as pd
import numpy as np
import sif_embedding_wrapper
import utils
import itertools
import os

In [3]:
# Root directory of the pre-trained word embeddings. The default is the path the
# notebook was originally written against; set the EMBEDDINGS_DIR environment
# variable to run it on another machine without editing the cell.
embeddings_dir = os.environ.get("EMBEDDINGS_DIR", "/home/stirunag/pre-trained_word_embeddings")

# Alternative embeddings that were tried previously (kept disabled for reference):
#
# from gensim.models.keyedvectors import KeyedVectors
#
# model = KeyedVectors.load_word2vec_format('/home/stirunag/pre-trained_word_embeddings/PubMed-and-PMC-w2v.bin', binary=True)
# model.save_word2vec_format('/home/stirunag/pre-trained_word_embeddings/PubMed-and-PMC-w2v.txt', binary=False)
#
# words, embs, weight4ind = sif_embedding_wrapper.load_embeddings("/home/stirunag/pre-trained_word_embeddings/PubMed-and-PMC-FS.txt",
#                                                      '/home/stirunag/pre-trained_word_embeddings/wiki/enwiki_vocab_min200.txt')

# Load the GloVe vectors together with the enwiki vocabulary file.
# NOTE(review): presumably the vocabulary file supplies the word frequencies
# behind `weight4ind` -- confirm against sif_embedding_wrapper.load_embeddings.
words, embs, weight4ind = sif_embedding_wrapper.load_embeddings(
    os.path.join(embeddings_dir, "glove", "glove.6B.300d.txt"),
    os.path.join(embeddings_dir, "wiki", "enwiki_vocab_min200.txt"))

In [4]:
# Display the embedding matrix returned by load_embeddings as a quick sanity check.
embs

array([[ 0.04656  ,  0.21318  , -0.0074364, ...,  0.0090611, -0.20989  ,
         0.053913 ],
       [-0.25539  , -0.25723  ,  0.13169  , ..., -0.2329   , -0.12226  ,
         0.35499  ],
       [-0.12559  ,  0.01363  ,  0.10306  , ..., -0.34224  , -0.022394 ,
         0.13684  ],
       ...,
       [ 0.075713 , -0.040502 ,  0.18345  , ...,  0.21838  ,  0.30967  ,
         0.43761  ],
       [ 0.81451  , -0.36221  ,  0.31186  , ...,  0.075486 ,  0.28408  ,
        -0.17559  ],
       [ 0.429191 , -0.296897 ,  0.15011  , ...,  0.28975  ,  0.32618  ,
        -0.0590532]])

In [5]:
# Datasets are expected in ../Datasets relative to the current working directory.
# (The previous os.path.dirname('__file__') was applied to a string literal, which
# always evaluates to '', so the resulting path is unchanged.)
data_path = os.path.abspath(os.path.join('..', 'Datasets')) + '/'


def load_euadr(name, label, errors='strict'):
    """Read one EUADR corpus file and tag every row with a CLASS label.

    Although the files carry a .csv extension they are tab-delimited, and some
    of them contain bytes that are not valid UTF-8.

    Parameters
    ----------
    name : str
        File name without extension, inside EUADR_Corpus_IBIgroup/.
    label : str
        Value written to the new 'CLASS' column.
    errors : str
        Decoding error policy passed to open() ('strict' raises
        UnicodeDecodeError on bad bytes, 'ignore' drops them).

    Returns
    -------
    pandas.DataFrame
        The parsed file with empty fields kept as '' (na_filter=False) plus
        the 'CLASS' column.
    """
    path = data_path + 'EUADR_Corpus_IBIgroup/' + name + '.csv'
    # The built-in open() replaces codecs.open(..., 'rU', ...): the 'U' mode flag
    # is rejected from Python 3.11 on, and the context manager closes the handle
    # that was previously left open. newline='' leaves line endings to the parser.
    with open(path, 'r', encoding='UTF-8', errors=errors, newline='') as handle:
        frame = pd.read_csv(handle, sep='\t', na_filter=False)
    frame['CLASS'] = label
    return frame


EUADR_drug_target = load_euadr('EUADR_drug_target', 'drug_gene')

# This file is the one read with undecodable bytes ignored; strict decoding fails with
# "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfc in position 2: invalid start byte".
EUADR_target_disease = load_euadr('EUADR_target_disease', 'gene_disease', errors='ignore')

EUADR_drug_disease = load_euadr('EUADR_drug_disease', 'drug_disease')



In [6]:
# Stack the drug-target and target-disease sentences into one two-class corpus
# (the drug-disease frame is not included here).
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the rows the same way and keeps each frame's own row labels.
temp = pd.concat([EUADR_drug_target, EUADR_target_disease])
EUADR = {
    'data': temp['SENTENCE'].tolist(),    # raw sentences
    'target': temp['CLASS'].tolist(),     # matching class labels, same order
}


In [7]:
# Number of rows in the drug-target and target-disease frames.
print('{}  {}'.format(len(EUADR_drug_target), len(EUADR_target_disease)))

247  355


In [8]:
# One record per sentence, keyed by a 1-based id stored as a string.
# Surrounding whitespace and double quotes are stripped from the text;
# category_ind is 1 for 'gene_disease' and 0 for any other label.
docs = {
    str(position): {
        "text": sentence.strip().strip('"'),
        "category_ind": 1 if label == 'gene_disease' else 0,
        "label": label,
    }
    for position, (sentence, label) in enumerate(zip(EUADR['data'], EUADR['target']), start=1)
}

In [9]:
# Flatten the docs mapping into a DataFrame with one row per document.
# The ids are strings, so this sort is lexicographic ('1', '10', '100', ...).
all_doc_ids = sorted(docs)
records = [docs[doc_id] for doc_id in all_doc_ids]
df = pd.DataFrame({
    "text": [record["text"] for record in records],
    "category_ind": [record["category_ind"] for record in records],
    "doc_id": list(all_doc_ids),
    "label": [record["label"] for record in records],
})

In [10]:
# Distinct labels in order of first appearance in df.
categories = df["label"].unique().tolist()
# Separate views of the inputs (id + text) and of the ground truth (id + label as 'gt').
text_df = df[["doc_id", "text"]].copy()
truth_df = df[["doc_id", "label"]].rename(columns={"label": "gt"})
# doc_id (as str) -> true label, for direct lookup when scoring.
truth_dict = dict(zip(truth_df["doc_id"].astype(str), truth_df["gt"]))

In [11]:
# Embed every document text with the project-local SIF wrapper, passing the
# `embs`, `words` and `weight4ind` objects obtained from load_embeddings.
# NOTE(review): assumes sentences2vecs yields one vector per row of df["text"],
# in the same order -- TODO confirm against sif_embedding_wrapper.
doc_embeddings = sif_embedding_wrapper.sentences2vecs(df["text"], embs, words, weight4ind)
# Store each document's vector as a single object in its row.
df["vector"] = pd.Series(list(doc_embeddings))

In [29]:
# Documents whose text has fewer than min_text_length characters are not predicted.
min_text_length = 10
is_too_short = df["text"].map(len) < min_text_length
skip_prediction = df.loc[is_too_short, "doc_id"].tolist()
skip_prediction

[]

In [30]:
# Hand-picked representative document id(s) for each category.
# Previously tried pair:
# category_reps = {categories[0]: ['188'], categories[1]: ["585"]}
category_reps = {categories[0]: ['124'], categories[1]: ["259"]}

# drug_gene
# gene_disease

# Category labels in the order they appear in category_reps.
categories_ = list(category_reps)
# Representatives are already labelled, so they are excluded from prediction too.
for rep_ids in category_reps.values():
    skip_prediction.extend(rep_ids)

In [31]:
# From here on `docs` refers to the DataFrame rather than the earlier dict.
docs = df
# Each category is represented by the mean embedding of its representative documents.
category_vecs = {
    label: np.mean(
        np.asarray(list(docs.loc[docs['doc_id'].isin(category_reps[label]), 'vector'])),
        axis=0,
    )
    for label in categories_
}


In [32]:
# NOTE(review): this cell repeats the category-vector computation of the
# preceding cell statement for statement; one of the two cells is redundant.
# Each category vector is the mean embedding of its representative documents.
category_vecs = {}
for c in categories_:
    vectors = np.asarray(list(docs.loc[docs['doc_id'].isin(category_reps[c])].vector))
    category_vecs[c] = np.mean(vectors, axis=0)

In [33]:
# Assign every remaining document to the category whose vector is most similar
# (cosine similarity) to the document's embedding.
predictions = {}
skipped_ids = set(skip_prediction)  # set membership instead of scanning the list per row

for _, row in docs.iterrows():
    doc_id = row["doc_id"]
    if doc_id in skipped_ids:
        continue
    doc_vec = row["vector"].reshape(1, -1)
    # Start below any possible cosine similarity. Starting at 0 sent every
    # document whose similarities were all negative to categories[0],
    # regardless of which category was actually closest.
    best_sim = -np.inf
    winner = categories[0]
    for label, centroid in category_vecs.items():
        sim = cosine_similarity(doc_vec, centroid.reshape(1, -1)).flatten()[0]
        if sim > best_sim:
            best_sim = sim
            winner = label
    predictions[doc_id] = winner
    

In [86]:
from sklearn.metrics import f1_score

def get_accuracy_score(predictions, truth_dict):
    """Score predictions against the ground truth.

    Despite its name this returns the weighted F1 score
    (``f1_score(..., average='weighted')``), not plain accuracy.

    Parameters
    ----------
    predictions : dict
        Maps doc_id -> predicted label.
    truth_dict : dict
        Maps doc_id -> true label; must contain every key of ``predictions``.

    Returns
    -------
    float
        The weighted F1 score, or 0.0 when ``predictions`` is empty.

    Raises
    ------
    KeyError
        If a predicted doc_id is missing from ``truth_dict``.
    """
    # Nothing was predicted (e.g. every document was skipped): report 0.0
    # rather than handing empty label lists to the metric.
    if not predictions:
        return 0.0

    preds = list(predictions.values())
    labels = [truth_dict[doc_id] for doc_id in predictions]
    return f1_score(labels, preds, average='weighted')

# Score the similarity-based predictions; the value shown is a weighted F1.
get_accuracy_score(predictions, truth_dict)




0.7883843263553407

In [55]:
from itertools import cycle
from collections import deque
    
    
def infer_topics(docs, n_topics, min_text_length=80, max_iter=150, batch_size=128, learning_offset=300.):
    """Rank documents by how confidently an LDA topic model assigns them to a topic.

    Documents shorter than ``min_text_length`` characters are dropped, the rest
    are vectorised with TF-IDF and fitted with online LDA. Every document is
    assigned to its most probable topic, and each topic's documents are sorted
    by that probability, highest first.

    Parameters
    ----------
    docs : pandas.DataFrame
        Must provide 'doc_id' and 'text' columns.
    n_topics : int
        Number of LDA components.
    min_text_length : int
        Minimum text length (in characters) for a document to be considered.
    max_iter, batch_size, learning_offset
        Passed through to LatentDirichletAllocation.

    Returns
    -------
    list of dict
        ``{"doc_id": ..., "score": ...}`` entries interleaved across topics:
        the best document of topic 0, the best of topic 1, ..., then the
        second best of each, and so on until every topic is exhausted.
    """
    # Drop every doc_id that has a text too short to be useful.
    unclassifiable = list(docs[docs["text"].map(len) < min_text_length].doc_id)
    filtered = docs[~docs['doc_id'].isin(unclassifiable)]

    n_features = 1000
    tf_vectorizer = TfidfVectorizer(
        stop_words='english',
        max_df=0.95,
        min_df=0.1,
        max_features=n_features)
    tf = tf_vectorizer.fit_transform(list(filtered.loc[:, 'text']))
    lda = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=max_iter,
        batch_size=batch_size,
        learning_method='online',
        learning_offset=learning_offset,
        random_state=0)
    lda.fit(tf)
    doc_topics = lda.transform(tf)

    # Rows of doc_topics are in the same order as the rows of `filtered`, so the
    # ids are paired positionally instead of being looked up by index label.
    doc_ids = list(filtered['doc_id'])
    topic_leaders = {"topic_{}".format(i): [] for i in range(n_topics)}
    for doc_id, probs in zip(doc_ids, doc_topics):
        topic = np.argmax(probs)
        topic_leaders["topic_{}".format(topic)].append(
            {"doc_id": doc_id, "score": probs[topic]})
    for name in topic_leaders:
        topic_leaders[name] = sorted(
            topic_leaders[name], key=itemgetter('score'), reverse=True)

    def roundrobin(*iterables):
        """Yield one item from each iterable in turn until all are exhausted.

        The earlier version popped from the deque while itertools.cycle was
        iterating over it. That stopped the output shortly after the first
        topic ran out, dropping the remaining documents of the other topics,
        and could raise RuntimeError if a topic had no documents at all.
        """
        pending = deque(iter(it) for it in iterables)
        while pending:
            current = pending.popleft()
            try:
                item = next(current)
            except StopIteration:
                continue  # exhausted: do not put it back in the rotation
            pending.append(current)
            yield item

    return list(roundrobin(*topic_leaders.values()))

# Rank the documents with a 2-topic LDA model (all other parameters at their defaults).
ordered_docs = infer_topics(df, 2)

In [25]:
# Inspect the ranking: entries alternate between the topics, each topic's best-scoring documents first.
ordered_docs

[{'doc_id': '120', 'score': 0.816831518758495},
 {'doc_id': '272', 'score': 0.8166679566169036},
 {'doc_id': '121', 'score': 0.816831518758495},
 {'doc_id': '549', 'score': 0.8166159896885092},
 {'doc_id': '137', 'score': 0.7927543300445525},
 {'doc_id': '599', 'score': 0.8165758595530275},
 {'doc_id': '124', 'score': 0.7927101495860739},
 {'doc_id': '456', 'score': 0.8164761838835681},
 {'doc_id': '125', 'score': 0.7927101495860739},
 {'doc_id': '602', 'score': 0.8164643767926444},
 {'doc_id': '128', 'score': 0.7927101495860739},
 {'doc_id': '261', 'score': 0.8164561185368904},
 {'doc_id': '8', 'score': 0.7927101495860739},
 {'doc_id': '262', 'score': 0.8164561185368904},
 {'doc_id': '191', 'score': 0.7927099035379565},
 {'doc_id': '571', 'score': 0.8164561185368904},
 {'doc_id': '196', 'score': 0.7927099035379565},
 {'doc_id': '393', 'score': 0.8124288015191476},
 {'doc_id': '198', 'score': 0.7927099035379565},
 {'doc_id': '394', 'score': 0.8124288015191476},
 {'doc_id': '544', 'scor

In [27]:
def auto_classify(docs, category_reps, min_text_length=80):
    """Label documents by cosine similarity to per-category representative vectors.

    Each category is represented by the mean 'vector' of its representative
    documents; every other sufficiently long document is assigned to the
    category whose representative vector is most similar to its own.

    Parameters
    ----------
    docs : pandas.DataFrame
        Must provide 'doc_id', 'text' and 'vector' columns.
    category_reps : dict
        Maps category label -> list of doc_ids chosen as its representatives.
    min_text_length : int
        Documents with shorter text (in characters) are not predicted.

    Returns
    -------
    dict
        Maps doc_id -> predicted category label. Representatives and
        too-short documents are omitted.
    """
    # Exclude docs deemed too short to classify. This reads the `docs` argument;
    # it previously read the notebook-global `df`, ignoring the frame passed in.
    skip_prediction = set(docs.loc[docs["text"].map(len) < min_text_length, "doc_id"])
    categories = list(category_reps)
    for rep_ids in category_reps.values():
        skip_prediction.update(rep_ids)  # No need to predict manually labeled docs

    # One (1, dim) row vector per category: the mean of its representatives.
    category_vecs = {}
    for c in categories:
        vectors = np.asarray(list(docs.loc[docs['doc_id'].isin(category_reps[c]), 'vector']))
        category_vecs[c] = np.mean(vectors, axis=0).reshape(1, -1)

    predictions = {}
    for _, row in docs.iterrows():
        doc_id = row["doc_id"]
        if doc_id in skip_prediction:
            continue
        doc_vec = row["vector"].reshape(1, -1)
        # Start below any possible cosine similarity. Starting at 0 sent every
        # document whose similarities were all negative to categories[0],
        # regardless of which category was actually closest.
        best_sim = -np.inf
        winner = categories[0]
        for label, centroid in category_vecs.items():
            sim = cosine_similarity(doc_vec, centroid).flatten()[0]
            if sim > best_sim:
                best_sim = sim
                winner = label
        predictions[doc_id] = winner
    return predictions

In [80]:
# Group the top_n best-ranked LDA documents by their true label, then form
# every combination that takes one candidate representative per category.
top_n = 24
representatives = {c: [] for c in categories_}
ordered_ids = [entry["doc_id"] for entry in ordered_docs]
for doc_id in ordered_ids[:top_n]:
    representatives[truth_dict[str(doc_id)]].append(doc_id)

values = [representatives[c] for c in categories]
doc_combs = list(itertools.product(*values))

doc_combs

[('120', '272'),
 ('120', '549'),
 ('120', '599'),
 ('120', '456'),
 ('120', '602'),
 ('120', '261'),
 ('120', '262'),
 ('120', '571'),
 ('120', '393'),
 ('120', '394'),
 ('120', '544'),
 ('120', '396'),
 ('120', '545'),
 ('120', '397'),
 ('121', '272'),
 ('121', '549'),
 ('121', '599'),
 ('121', '456'),
 ('121', '602'),
 ('121', '261'),
 ('121', '262'),
 ('121', '571'),
 ('121', '393'),
 ('121', '394'),
 ('121', '544'),
 ('121', '396'),
 ('121', '545'),
 ('121', '397'),
 ('137', '272'),
 ('137', '549'),
 ('137', '599'),
 ('137', '456'),
 ('137', '602'),
 ('137', '261'),
 ('137', '262'),
 ('137', '571'),
 ('137', '393'),
 ('137', '394'),
 ('137', '544'),
 ('137', '396'),
 ('137', '545'),
 ('137', '397'),
 ('124', '272'),
 ('124', '549'),
 ('124', '599'),
 ('124', '456'),
 ('124', '602'),
 ('124', '261'),
 ('124', '262'),
 ('124', '571'),
 ('124', '393'),
 ('124', '394'),
 ('124', '544'),
 ('124', '396'),
 ('124', '545'),
 ('124', '397'),
 ('125', '272'),
 ('125', '549'),
 ('125', '599'

In [87]:
# Classify and score once per combination, using a single representative per category.
accuracies = []
for comb in doc_combs:
    category_reps = {label: [str(comb[pos])] for pos, label in enumerate(categories)}
    preds = auto_classify(docs, category_reps)
    acc = get_accuracy_score(preds, truth_dict)
    accuracies.append(acc)

In [93]:
# Lowest score among all representative combinations evaluated in `accuracies`.
min(accuracies)

0.49973946789233414

In [56]:
def get_top_lda_combs(ordered_ids, docs_df, categories, truth_dict, top_n=12):
    """Build all one-representative-per-category combinations from the top ranked ids.

    The first ``top_n`` entries of ``ordered_ids`` are grouped by their true
    label (looked up in ``truth_dict`` under ``str(doc_id)``). ``docs_df`` is
    accepted but not used.

    Returns a list of tuples, one doc_id per category in the order of
    ``categories``, or None (after printing a message) when a category has no
    candidate among the top ids.
    """
    by_category = {label: [] for label in categories}
    for candidate in ordered_ids[:top_n]:
        by_category[truth_dict[str(candidate)]].append(candidate)

    without_candidates = [label for label in categories if not by_category[label]]
    if without_candidates:
        # Report only the first category that came up empty.
        print("No representatives for %s" % without_candidates[0])
        return None

    return list(itertools.product(*[by_category[label] for label in categories]))

def get_lda_accuracies(categories, doc_combs, docs_df, truth_dict):
    """Score every representative combination.

    For each tuple in ``doc_combs`` the i-th element (converted to str) becomes
    the sole representative of the i-th category; the documents are classified
    with ``auto_classify`` and scored with ``get_accuracy_score``.

    Returns the list of scores in the order of ``doc_combs``.
    """
    def score(comb):
        reps = {label: [str(comb[pos])] for pos, label in enumerate(categories)}
        return get_accuracy_score(auto_classify(docs_df, reps), truth_dict)

    return [score(comb) for comb in doc_combs]


# Build the candidate representative combinations from the LDA ranking, score
# each one, and show the best score.
# NOTE(review): get_top_lda_combs returns None when a category has no candidate;
# get_lda_accuracies would then fail while iterating over it.
top_lda_combs = get_top_lda_combs([d["doc_id"] for d in ordered_docs], 
                                  df, categories, truth_dict)
lda_accs = get_lda_accuracies(categories, top_lda_combs, df, truth_dict)
max(lda_accs)

0.6415550371765205
0.6474428956429021
0.6828230583539661
0.6268168045945823
0.7485146276306445
0.7963745656053349
0.6415550371765205
0.6474428956429021
0.6828230583539661
0.6268168045945823
0.7485146276306445
0.7963745656053349
0.6074680753612387
0.670804130332261
0.6753159892120103
0.6653696920363588
0.799339422725612
0.6259914956029453
0.669253448240315
0.7058644323987284
0.708545569656681
0.7148602569634745
0.8368835044356866
0.6986510230072914
0.669253448240315
0.7058644323987284
0.708545569656681
0.7148602569634745
0.8368835044356866
0.6986510230072914
0.669253448240315
0.7058644323987284
0.708545569656681
0.7148602569634745
0.8368835044356866
0.6986510230072914


0.8358974358974359