In [1]:
import xlrd
import nltk
import pandas as pd
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim import corpora
from gensim import models
from operator import itemgetter
from gensim import similarities

In [2]:
# Read the three pages sepparately
path = "loinc_dataset-v2.xlsx"
document = xlrd.open_workbook(path)
query1 = document.sheet_by_index(0)
query2 = document.sheet_by_index(1)
query3 = document.sheet_by_index(2)

In [3]:
# Store the loinc_num and the long_name in a dictionary of the three querys
# Glucose in blood is taken into account
data = {}
rows_query1 = query1.nrows
# String of the first query that is glucose in blood
part_list = query1.cell_value(0,0).split(":")[1].split(" ")[1:4]
text_query1 = " ".join(part_list).lower()
data[text_query1] = {}
loinc_num = query1.cell_value(2,0)
long_name = query1.cell_value(2,1)
data[text_query1][loinc_num] = []
data[text_query1][long_name] = []
for i in range(3, rows_query1):
    number_loinc = query1.cell_value(i,0)
    text_name = query1.cell_value(i,1)
    data[text_query1][loinc_num].append(number_loinc)
    data[text_query1][long_name].append(text_name)

In [4]:
# bilirubin in plasma
rows_query2 = query2.nrows
part_list = query2.cell_value(0,0).split(":")[1].split(" ")[1:4]
text_query2 = " ".join(part_list).lower()
data[text_query2] = {}
loinc_num = query2.cell_value(2,0)
long_name = query2.cell_value(2,1)
data[text_query2][loinc_num] = []
data[text_query2][long_name] = []
for i in range(3, rows_query2):
    number_loinc = query2.cell_value(i,0)
    text_name = query2.cell_value(i,1)
    data[text_query2][loinc_num].append(number_loinc)
    data[text_query2][long_name].append(text_name)

In [5]:
# White blood cells count
rows_query3 = query3.nrows
part_list = query3.cell_value(0,0).split(":")[1].split(" ")[1:4]
text_query3 = " ".join(part_list).lower()
data[text_query3] = {}
loinc_num = query3.cell_value(2,0)
long_name = query3.cell_value(2,1)
data[text_query3][loinc_num] = []
data[text_query3][long_name] = []
for i in range(3, rows_query3):
    number_loinc = query3.cell_value(i,0)
    text_name = query3.cell_value(i,1)
    data[text_query3][loinc_num].append(number_loinc)
    data[text_query3][long_name].append(text_name)

In [6]:
def preprocess_document(doc):
    stopset = set(stopwords.words('english'))
    stemmer = PorterStemmer()# object allowing a stemmer to return words in root form
    tokens = wordpunct_tokenize(doc) # tokenize docs 
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    return final

In [7]:
def create_dictionary(docs):
    pdocs = [preprocess_document(doc) for doc in docs]
    dictionary = corpora.Dictionary(pdocs)
    dictionary.save('/tmp/vsm.dict')
    return dictionary

In [8]:
# Create the bag of words-based representation for each long name in the list
def docs2bows(corpus, dictionary):
    docs = [preprocess_document(d) for d in corpus]
    vectors = [dictionary.doc2bow(doc) for doc in docs]
    corpora.MmCorpus.serialize('/tmp/vsm_docs.mm', vectors)
    return vectors

In [9]:
# TF-IDF weghted counterparts
def create_TF_IDF_model(corpus):
    dictionary = create_dictionary(corpus)
    docs2bows(corpus, dictionary)
    loaded_corpus = corpora.MmCorpus('/tmp/vsm_docs.mm')
    tfidf = models.TfidfModel(loaded_corpus)
    return tfidf, dictionary

In [10]:
def launch_query(corpus, q, filename='/tmp/vsm_docs.mm'):
    sorted_scores = []
    sorted_docs = []
    tfidf, dictionary = create_TF_IDF_model(corpus)
    loaded_corpus = corpora.MmCorpus(filename)
    index = similarities.MatrixSimilarity(loaded_corpus, num_features=len(dictionary))
    pq = preprocess_document(q)
    vq = dictionary.doc2bow(pq)
    qtfidf = tfidf[vq]
    sim = index[qtfidf]
    ranking = sorted(enumerate(sim), key=itemgetter(1), reverse=True)
    for doc, score in ranking:
        if score > 0.0:
            sorted_scores.append(score)
            sorted_docs.append(corpus[doc])
            # print("[ Score = " + "%f" % score + " ] " + corpus[doc]);
    return sorted_scores, sorted_docs

In [11]:
print(text_query3)
sorted_scores, sorted_docs = launch_query(data[text_query3][long_name], text_query3)

white blood cells


In [12]:
# file = open(r"query3.txt","w+")
# position = 1
# for i in range(len(sorted_docs)):
    # row = str(sorted_docs[i]) + "$" + str(position) + "$" + str(sorted_scores[i]) + "$" + "3" + "\n"
    # file.write(row)
    # position += 1
# file.close()

In [13]:
# Dataset already created in the directory dataset
from adarank import AdaRank
from metric import NDCGScorer_qid
import numpy as np
from sklearn.model_selection import train_test_split

X = []
y = []
qid = []
i = 1
with open("train.txt") as fp:
    for cnt, line in enumerate(fp):
        X.append(float(line.split("$")[1]))
        y.append(float(line.split("$")[2]))
        qid.append(float(line.split("$")[3]))
        i += 1
X = np.array(X)
X = X.reshape(X.shape[0],1)
y = np.array(y)
y = y.reshape(y.shape[0],1)
qid = np.array(qid)
qid = qid.reshape(qid.shape[0],1)
X_train, X_test, y_train, y_test, qid_train, qid_test = train_test_split(X, y, qid, test_size=0.2, random_state=42)

In [14]:
X_train, X_valid, y_train, y_valid, qid_train, qid_valid = train_test_split(X_train, y_train, qid_train, test_size=0.2, random_state=42)
model = AdaRank(scorer=NDCGScorer_qid(K=5))
model.fit(X, y.ravel(), qid.ravel(), X_valid, y_valid.ravel(), qid_valid.ravel())
pred = model.predict(X_test)
print(pred)
print(NDCGScorer_qid(K=5)(y_test,pred,qid_test).mean())

[14.06312482 19.68837475 30.93887461  2.81262496 36.56412454 14.06312482
 16.87574979 36.56412454 36.56412454 22.50099972 33.75149957 56.25249929
 75.94087404 53.43987432 28.12624964]
1.0
