In [1]:
import xlrd
import nltk
import pandas as pd
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim import corpora
from gensim import models
from operator import itemgetter
from gensim import similarities
import numpy as np

## Tf-idf model
This model has been used in order to orientate us when making the labelling of the dataset. Specially for the long_common_name feature the model has been very usefull for ranking each document. For the adaRank model it is not necessary to run all this functions.

In [None]:
def preprocess_document(doc):
    stopset = set(stopwords.words('english'))
    stemmer = PorterStemmer()# object allowing a stemmer to return words in root form
    tokens = wordpunct_tokenize(doc) # tokenize docs 
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    return final

In [None]:
def create_dictionary(docs):
    pdocs = [preprocess_document(doc) for doc in docs]
    dictionary = corpora.Dictionary(pdocs)
    dictionary.save('/tmp/vsm.dict')
    return dictionary

In [None]:
# Create the bag of words-based representation for each long name in the list
def docs2bows(corpus, dictionary):
    docs = [preprocess_document(d) for d in corpus]
    vectors = [dictionary.doc2bow(doc) for doc in docs]
    corpora.MmCorpus.serialize('/tmp/vsm_docs.mm', vectors)
    return vectors

In [None]:
# TF-IDF weghted counterparts
def create_TF_IDF_model(corpus):
    dictionary = create_dictionary(corpus)
    docs2bows(corpus, dictionary)
    loaded_corpus = corpora.MmCorpus('/tmp/vsm_docs.mm')
    tfidf = models.TfidfModel(loaded_corpus)
    return tfidf, dictionary

In [None]:
def launch_query(corpus, q, filename='/tmp/vsm_docs.mm'):
    sorted_scores = []
    sorted_docs = []
    tfidf, dictionary = create_TF_IDF_model(corpus)
    loaded_corpus = corpora.MmCorpus(filename)
    index = similarities.MatrixSimilarity(loaded_corpus, num_features=len(dictionary))
    pq = preprocess_document(q)
    vq = dictionary.doc2bow(pq)
    qtfidf = tfidf[vq]
    sim = index[qtfidf]
    ranking = sorted(enumerate(sim), key=itemgetter(1), reverse=True)
    for doc, score in ranking:
        if score > 0.0:
            sorted_scores.append(score)
            sorted_docs.append(corpus[doc])
            print("[ Score = " + "%f" % score + " ] " + corpus[doc]);
    return sorted_scores, sorted_docs

In [None]:
# If you want to try how the tf-idf model works on the long_common_name feature for the three csv pages:
# (text_query1, text_query2, text_query3) the variables must be changed before the execution
print(text_query3)
sorted_scores, sorted_docs = launch_query(data[text_query3][long_name], text_query3)

## adaRank model
We read the labelled data, all the data is normalized and inverted for a correct values.

In [2]:
# Read the three pages sepparately
path = "loinc_dataset-v2.xlsx"
document = xlrd.open_workbook(path)
query1 = document.sheet_by_index(0)
query2 = document.sheet_by_index(1)
query3 = document.sheet_by_index(2)

In [3]:
# Glucose in blood is taken into account
rows_query1 = query1.nrows
# String of the first query that is glucose in blood
part_list = query1.cell_value(0,0).split(":")[1].split(" ")[1:4]
text_query1 = " ".join(part_list).lower()
loinc_num = []
long_name = []
component = []
system = []
properti = []
qid = []
label_ranking = []
for i in range(3, rows_query1):
    loinc_num.append(query1.cell_value(i,0))
    long_name.append(query1.cell_value(i,1))
    component.append(query1.cell_value(i,2))
    system.append(query1.cell_value(i,3))
    properti.append(query1.cell_value(i,4))
    qid.append(query1.cell_value(i,5))
    label_ranking.append(query1.cell_value(i,6))
max_long = max(long_name)
long_name = np.array([1-(nam/max_long) for nam in long_name])
max_component = max(component)
component = np.array([1-(comp/max_component) for comp in component])
max_system = max(system)
system = np.array([1-(sys/max_system) for sys in system])
max_properti = max(properti)
properti = np.array([1-(prop/max_properti) for prop in properti])
qid = np.array(qid)
max_label = max(label_ranking)
label_ranking = np.array([1-(lab/max_label) for lab in label_ranking])
row = label_ranking.shape
data1 = np.zeros((row[0],6))
data1[:,0] = long_name
data1[:,1] = component
data1[:,2] = system
data1[:,3] = properti
data1[:,4] = qid
data1[:,5] = label_ranking
print(data1.shape)

(67, 6)


In [4]:
# bilirubin in plasma
rows_query2 = query2.nrows
part_list = query2.cell_value(0,0).split(":")[1].split(" ")[1:4]
text_query2 = " ".join(part_list).lower()
loinc_num = []
long_name = []
component = []
system = []
properti = []
qid = []
label_ranking = []
for i in range(3, rows_query2):
    loinc_num.append(query2.cell_value(i,0))
    long_name.append(query2.cell_value(i,1))
    component.append(query2.cell_value(i,2))
    system.append(query2.cell_value(i,3))
    properti.append(query2.cell_value(i,4))
    qid.append(query2.cell_value(i,5))
    label_ranking.append(query2.cell_value(i,6))
max_long = max(long_name)
long_name = np.array([1-(nam/max_long) for nam in long_name])
max_component = max(component)
component = np.array([1-(comp/max_component) for comp in component])
max_system = max(system)
system = np.array([1-(sys/max_system) for sys in system])
max_properti = max(properti)
properti = np.array([1-(prop/max_properti) for prop in properti])
qid = np.array(qid)
max_label = max(label_ranking)
label_ranking = np.array([1-(lab/max_label) for lab in label_ranking])
row = label_ranking.shape
data2 = np.zeros((row[0],6))
data2[:,0] = long_name
data2[:,1] = component
data2[:,2] = system
data2[:,3] = properti
data2[:,4] = qid
data2[:,5] = label_ranking
print(data2.shape)

(67, 6)


In [5]:
# White blood cells count
rows_query3 = query3.nrows
part_list = query3.cell_value(0,0).split(":")[1].split(" ")[1:4]
text_query3 = " ".join(part_list).lower()
loinc_num = []
long_name = []
component = []
system = []
properti = []
qid = []
label_ranking = []
for i in range(3, rows_query3):
    loinc_num.append(query3.cell_value(i,0))
    long_name.append(query3.cell_value(i,1))
    component.append(query3.cell_value(i,2))
    system.append(query3.cell_value(i,3))
    properti.append(query3.cell_value(i,4))
    qid.append(query3.cell_value(i,5))
    label_ranking.append(query3.cell_value(i,6))
max_long = max(long_name)
long_name = np.array([1-(nam/max_long) for nam in long_name])
max_component = max(component)
component = np.array([1-(comp/max_component) for comp in component])
max_system = max(system)
system = np.array([1-(sys/max_system) for sys in system])
max_properti = max(properti)
properti = np.array([1-(prop/max_properti) for prop in properti])
qid = np.array(qid)
max_label = max(label_ranking)
label_ranking = np.array([1-(lab/max_label) for lab in label_ranking])
row = label_ranking.shape
data3 = np.zeros((row[0],6))
data3[:,0] = long_name
data3[:,1] = component
data3[:,2] = system
data3[:,3] = properti
data3[:,4] = qid
data3[:,5] = label_ranking
print(data3.shape)

(67, 6)


In [6]:
# Concatenate the three arrays
final_data = np.concatenate((data1, data2, data3), axis=0)
print(final_data.shape)

(201, 6)


In [7]:
from adarank import AdaRank
from metric import NDCGScorer_qid
import numpy as np
from sklearn.model_selection import train_test_split

X = final_data[:,:4]
y = final_data[:,5].ravel()
qid = final_data[:,4].ravel()
X_train, X_test, y_train, y_test, qid_train, qid_test = train_test_split(X, y, qid, test_size=0.2, random_state=42)

In [8]:
# X_train, X_valid, y_train, y_valid, qid_train, qid_valid = train_test_split(X_train, y_train, qid_train, test_size=0.2, random_state=42)
model = AdaRank(scorer=NDCGScorer_qid(K=5))
model.fit(X, y, qid, X, y, qid)
pred = model.predict(X_test)
#print(pred)
accuracy = NDCGScorer_qid(K=5)(y_test,pred,qid_test).mean()
print("Accuracy for the adaRank model:",accuracy)
print("Accuracy percentage for the adaRank model:",accuracy*100,"%")

Accuracy for the adaRank model: 0.9258429746159355
Accuracy percentage for the adaRank model: 92.58429746159355 %
