This notebook shows how each module should be used. Each method is documented and explained. Make sure that all models, embeddings and data are included; otherwise the notebook may crash or give errors.<br>

Note that you do not need this notebook to use the modules. It only demonstrates their usage, to make them easier to understand.<br>
<br><br>
Index of this notebook:<br>
1- Hyperparameters and initial setup <br>
2- Loading embedding model<br>
3- Loading preprocessed data<br>
4- Training model (you can restore pre-trained model as well)<br>
5- Restoring pre-trained model<br>
6- Preparing system for LRP<br>
7- LRP Scores for the example in 6.<br>
8- Removing important words according to TF-IDF (method 1)<br>
9- Removing words uniquely (method 2)<br>
10- Removing Sentences<br>
11- Offering keywords to users (Method 1, better and more personal recommendations)<br>
12- Offering keywords to users (Method 2, static method)<br>

In [1]:
import helper
import lrp
import EmbedHelper
import DataLoader
import Models
import remover
import utility
import keyword_recommender

import tensorflow as tf
import numpy as np
import pickle

  from ._conv import register_converters as _register_converters


{'Dermatology': 'Deri ve Zührevi Hastalıkları (Cildiye)', 'Internal Medicine': 'İç Hastalıkları (Dahiliye)', 'Neurology': 'Nöroloji', 'Obstetrics & Gynecology': 'Kadın Hastalıkları ve Doğum', 'Ophthalmology': 'Göz Hastalıkları', 'Orthopaedic Surgery': 'Ortopedi ve Travmatoloji', 'Otolaryngology': 'Kulak Burun Boğaz Hastalıkları', 'Pediatrics': 'Çocuk Sağlığı ve Hastalıkları', 'Psychiatry': 'Ruh Sağlığı ve Hastalıkları', 'Radiology-Diagnostic': 'Radyoloji', 'Surgery-General': 'Genel Cerrahi', 'Urology': 'Üroloji'}


## 1- Hyperparameters and initial setup

In [2]:
# Mapping from numeric id to embedding name, as exposed by EmbedHelper.
embedDict = EmbedHelper.EmbeddingHandler.embedDict
print(embedDict)

# Settings passed to the training / evaluation helpers used further down.
configs = {
    "vectorSize": 300,
    "trainNewModel": True,
    "dataColumn": "question",
    "maxLength": 128,
    "batchSize": 8,
    "embeddingType": embedDict[2],  # id 2 is 'Google News' in the printed mapping
    "ELMo": True,
    "PreEmbed": True,
    "restore": True,
}

# Aliases reused when the CNN is constructed in section 5.
vectorSize = configs["vectorSize"]
inputSize = configs["maxLength"]

{1: 'Fast Text', 2: 'Google News', 3: 'HealthTap', 4: 'Pubmed', 5: 'Glove', 6: 'iCliniq Trigram', 7: 'iCliniq default'}


## 2- Loading embedding model

In [3]:
# Load the embedding model selected in configs["embeddingType"].
# NOTE(review): 300 presumably mirrors configs["vectorSize"] and "Embeddings"
# looks like the directory holding the embedding files — confirm against
# EmbedHelper.EmbeddingHandler.
EmbedModel = EmbedHelper.EmbeddingHandler(
    configs["embeddingType"],
    False,
    300,
    "Embeddings",
)

Loading Google News


## 3- Loading preprocessed data

In [4]:
# Load the preprocessed iCliniq data: questions and targets for train/test.
trainData, trainTarget, testData, testTarget = [
    np.load(path)
    for path in (
        "data//icliniq//iCliniq_14K//icliniq_14k_train_questions.npy",
        "data//icliniq//iCliniq_14K//icliniq_14k_train_target.npy",
        "data//icliniq//iCliniq_14K//icliniq_14k_test_questions.npy",
        "data//icliniq//iCliniq_14K//icliniq_14k_test_target.npy",
    )
]

# Raw versions of the questions (testData_raw is used for sentence removal in section 10).
trainData_raw, testData_raw = [
    np.load(path)
    for path in (
        "data//icliniq//iCliniq_14K//icliniq_14k_train_questions_raw.npy",
        "data//icliniq//iCliniq_14K//icliniq_14k_test_questions_raw.npy",
    )
]

## 4- Training model (you can restore pre-trained model as well)

In [5]:
# Short demonstration run: train for only 32 iterations (for example purposes).
demo_iterations = 32
sess, nnModel = helper.execute_training(
    False,
    EmbedModel,
    demo_iterations,
    trainData,
    trainTarget,
    testData,
    testTarget,
    configs,
    model_path=None,
)

fullvectorsize:  300
(?, 126, 1, 250)
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Model Created.
trainData shape :  (7903, 128)
testData shape :  (1975, 128)
trainTarget shape :  (7903,)
testTarget shape :  (1975,)

[Current iteration = 20 Train Acc:0.5 HT Test Acc:0 fold0Test: (0) ucAcc :0 dataRatio  :0 ]

# 5- Restoring pre-trained model

In [6]:
# Checkpoint of the pre-trained model and the number of output classes it was built with.
model_path = "NNModels/icliniq14k_GoogleNews_onelayer_pad128/model.ckpt"
outputSize = 12

# Build the CNN graph with the sizes taken from configs in section 1.
nnModel = Models.CNN(inputSize=inputSize, vectorSize=vectorSize, outputSize=outputSize)

# The interactive session is bound to the model's own graph.
# NOTE(review): if section 4 was run, its `sess` is replaced here without
# being closed — confirm whether that session should be closed first.
sess = tf.InteractiveSession(graph=nnModel.paperGraph)
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

# Overwrite the freshly initialised variables with the saved parameters.
saver = tf.train.Saver()
saver.restore(sess, model_path)

fullvectorsize:  300
(?, 126, 1, 250)




INFO:tensorflow:Restoring parameters from NNModels/icliniq14k_GoogleNews_onelayer_pad128/model.ckpt


## 6- Preparing system for LRP

We need a session and an NNModel to use LRP. I used the ones obtained after training the model, for example purposes.


In [7]:
# To test LRP

# Load the class dictionary stored on disk; its length gives the number of classes.
# NOTE(security): pickle.load can execute arbitrary code — only load files you trust.
with open('fold0classDict.pkl', 'rb') as f:
    ClassDict = pickle.load(f)
outputSize = len(ClassDict)

# Vectorize the first 21 training examples and one-hot encode their targets.
batch_x = EmbedModel.vectorizeBatch(trainData[0:21])
batch_y = sess.run(tf.one_hot(trainTarget[0:21], outputSize))

alpha = 1
layer_count = 1

# Executing LRP
weights, biases, activations = helper.get_weights_biases_acts(layer_count, nnModel)
backprop_layers = lrp.lrp_layers(alpha, layer_count, activations, weights, biases)

# Relevances are computed for the first example only: its vectorized form
# (batch_x[0:1]) together with its tokens (trainData[0]).
word_relevances, results_combined = lrp.get_word_relevances(
    alpha,
    backprop_layers,
    layer_count,
    batch_x[0:1],
    trainData[0],
    sess,
    nnModel,
    activations,
    weights,
    biases,
)

In [8]:
# Tokens of the first training example (the one passed to lrp.get_word_relevances above).
trainData[0]

array(['hello', 'doctor', 'i', 'have', 'burning', 'sensation', 'while',
       'urinating', 'and', 'a', 'frequent', 'urge', 'to', 'urinate',
       'can', 'it', 'be', 'due', 'to', 'sexual', 'contact', 'i', 'am',
       'a', 'year', 'old', 'male', '[None]', '[None]', '[None]', '[None]',
       '[None]', '[None]', '[None]', '[None]', '[None]', '[None]',
       '[None]', '[None]', '[None]', '[None]', '[None]', '[None]',
       '[None]', '[None]', '[None]', '[None]', '[None]', '[None]',
       '[None]', '[None]', '[None]', '[None]', '[None]', '[None]',
       '[None]', '[None]', '[None]', '[None]', '[None]', '[None]',
       '[None]', '[None]', '[None]', '[None]', '[None]', '[None]',
       '[None]', '[None]', '[None]', '[None]', '[None]', '[None]',
       '[None]', '[None]', '[None]', '[None]', '[None]', '[None]',
       '[None]', '[None]', '[None]', '[None]', '[None]', '[None]',
       '[None]', '[None]', '[None]', '[None]', '[None]', '[None]',
       '[None]', '[None]', '[None]', '[None

## 7- LRP Scores for each word (for example above)

In [13]:
# Display the (word, relevance score) pairs returned by lrp.get_word_relevances.
word_relevances

[('hello', -0.00826831935576656),
 ('doctor', -0.1080422055646184),
 ('i', 0.020342834972284517),
 ('have', -0.3017208268602165),
 ('burning', 0.44691333996067906),
 ('sensation', -0.0056952146834523204),
 ('while', -0.006010369613295131),
 ('urinating', -0.016021687008595818),
 ('and', 0.0),
 ('a', 0.0),
 ('frequent', 0.1610667028217684),
 ('urge', -0.012891035401162513),
 ('to', 0.0),
 ('urinate', -0.7028770827343872),
 ('can', 0.04179418528419647),
 ('it', 0.09591374834138497),
 ('be', 0.24346324413630724),
 ('due', -0.1294584272151936),
 ('to', 0.0),
 ('sexual', -0.08989809793069395),
 ('contact', -0.27382501712822876),
 ('i', 0.02026339918725208),
 ('am', -0.007761515742923862),
 ('a', 0.0),
 ('year', 0.001163108813086248),
 ('old', -0.002362008131448808),
 ('male', -0.0780253962949429),
 ('[None]', 0.0),
 ('[None]', 0.0),
 ('[None]', 0.0),
 ('[None]', 0.0),
 ('[None]', 0.0),
 ('[None]', 0.0),
 ('[None]', 0.0),
 ('[None]', 0.0),
 ('[None]', 0.0),
 ('[None]', 0.0),
 ('[None]', 0.0)

# 8- Removing important words according to TF-IDF (method 1)

This method removes all occurrences of the important words. For example, if remove_top_x_words is 1, this method finds the single most important word (let's assume it is "skin") and removes every occurrence of "skin" from the data instance. So it may remove more than one word even if remove_top_x_words is 1.

In [14]:
# Important words for the classes according to TF-IDF, read from the tfidf_results directory.
word_imps_all_classes = remover.get_word_imps_all_classes("data//icliniq//iCliniq_14K//tfidf_results//")

In [15]:
# Remove the 2 most important words.
remove_top_x_words = 2

# Not used by the removal itself; it is passed (as 1 - confidence) to
# helper.evaluatePerformance in the evaluation cells.
confidence = 0.1

removed_data = remover.remove_x_tfidf(
    remove_top_x_words,
    testData,
    testTarget,
    word_imps_all_classes,
)

total_words_removed 1063


In [16]:
# Evaluate the model on the test data from which the important words were removed.
results = helper.evaluatePerformance(
    configs,
    nnModel,
    EmbedModel,
    sess,
    removed_data,
    testTarget,
    1,
    1 - confidence,
)
results["Accuracy"]  # accuracy after removing important words

0.7053164556962025

# 9- Removing words uniquely (method 2)

In contrast to the previous method, this method removes only the specified number of words from the data. For example, if the 2 most important words are "skin" and "hair", this method removes only the "skin" and "hair" at the specified indexes, and does not remove other occurrences of "skin" or "hair" from the data instance. This is because the same word may have different importance values at different indexes.

You can either restore previously saved list, or create a new one (takes some time).

In [17]:
# Restore previously saved word_imps
# (np.save appends ".npy", so this is the file produced from the extension-less
# save path used in the "generate new" cell.)
word_imps_unsorted = np.load("data//icliniq//iCliniq_14K//word_importances_unsorted.npy")

In [18]:
# Or generate new, which takes a while
should_save = True
# No ".npy" suffix here: np.save appends it, which yields the file name that
# the "restore" cell loads.
save_path = "data//icliniq//iCliniq_14K//word_importances_unsorted"
word_imps_unsorted = remover.get_word_importances(testData, EmbedModel, nnModel, sess, should_save, save_path)
# Reuse save_path so the location is defined in exactly one place.
# NOTE(review): get_word_importances may already write this file when
# should_save is True — confirm whether this second save is still needed.
np.save(save_path, word_imps_unsorted)

We want the unsorted word importances list because the indexes must match the data. We sort only after the indexes have been added.

In [18]:
# Attach data indexes to the unsorted importances, then remove 5 words per instance.
word_imps_with_indexes = remover.prepare_word_imps(word_imps_unsorted, testData)
removed_data = np.array(
    remover.remove_words_with_index(5, testData, word_imps_with_indexes)
)

results = helper.evaluatePerformance(
    configs,
    nnModel,
    EmbedModel,
    sess,
    removed_data,
    testTarget,
    1,
    1 - confidence,
)
results["Accuracy"]  # accuracy after removing exactly 5 important words from each data instance

0.7048101265822785

# 10- Removing Sentences

Just like method 1 of removing words, this method may remove more than one sentence for each important word. For example, if remove_top_x_words is 1 and we assume the most important word is "hair", this method removes all sentences that include the word "hair" in the current data instance.

In [19]:
# Important words for the classes according to TF-IDF.
# (Same call as in section 8; repeated here so this section has the value it needs.)
word_imps_all_classes = remover.get_word_imps_all_classes("data//icliniq//iCliniq_14K//tfidf_results//")

In [21]:
remove_top_x_words = 2

# sentences_removed_data holds the result for all instances.
sentences_removed_data, indexes = remover.remove_important_sentences(
    remove_top_x_words,
    False,
    testData_raw,
    testTarget,
    word_imps_all_classes,
    128,
)

results = helper.evaluatePerformance(
    configs,
    nnModel,
    EmbedModel,
    sess,
    np.array(sentences_removed_data),
    testTarget,
    1,
    1 - confidence,
)
results["Accuracy"]  # accuracy after removing sentences according to top 2 words

Completed 100.00 % %
deleted_sentence_count 648


0.7154430379746836

# 11- Offering keywords to users (Method 1, better and more personal recommendations)

In [22]:
# Symcat arrays used by the keyword recommender.
description, symptoms, desc_plus_symptoms, softmax_results_symcat = [
    np.load(path)
    for path in (
        "data//symcat//symcat_data.npy",
        "data//symcat//symcat_categories.npy",
        "data//symcat//symcat_data_plus_cats.npy",
        "data//symcat//softmax_results_symcat.npy",
    )
]

# Contains the similarities of all classes with all symptoms.
# For example, cosine_sim_indexes_all_classes[0] belongs to Dermatology (see ClassDict),
# and cosine_sim_indexes_all_classes[0][0] is the index, within the symptoms array,
# of the most relevant symptom.
cosine_sim_indexes_all_classes = np.load("data//symcat//cosine_sim_indexes_all_classes.npy")

In [23]:
# example input
raw_user_input = "hello doctor i have been struggling with acne for a while now and have been postponing going to my general physician i have been very hesitant on what to use and do not ever apply anything apart from a light cleanser on my face i am not even sure what the term would be for it except the loose term rosacea i am looking for advice on what prescription or non prescription medication would be safe and effective for my acne with concerns of my nose getting worse i also have very pale skin"
confidence = 0.9

# Clean the raw text, then turn it into a word list of length 128 and keep the first entry.
cleaned_input = DataLoader.DataHandler.cleanTextData(raw_user_input)
word_lists = DataLoader.DataHandler.textIntoWordList(cleaned_input, 128)
user_input = np.array(word_lists[0])
# The user input is now ready to be handed to LRP.
# (Section 12 prepares its input with keyword_recommender.process_user_input instead.)

all_results = helper.evaluatePerformance(
    configs,
    nnModel,
    EmbedModel,
    sess,
    user_input,
    [0],
    1,
    1 - confidence,
)

# We will check the similarity between user_result and softmax_results_symcat.
user_result = all_results["Scores"][0]

In [24]:
# Get the most relevant symptom and ask the user about it.
# NOTE(review): the first argument (0) is presumably the rank of the symptom to return — confirm.
keyword_recommender.get_symptom_input_similarity(0, symptoms, softmax_results_symcat, all_results, raw_user_input)

'cloudy eye'

# 12- Offering keywords to users (Method 2, static method)

In [25]:
 # example input
raw_user_input = "my hair is transparent"

# Prepare the single input and score it with the model.
# `confidence` is the value set in the previous section's input cell.
user_input = keyword_recommender.process_user_input([raw_user_input])
all_results = helper.evaluatePerformance(
    configs,
    nnModel,
    EmbedModel,
    sess,
    user_input,
    [0],
    1,
    1 - confidence,
)

In [26]:
# Personal suggestions: the first argument runs over 0, 1, 2.
for first_arg in range(3):
    suggestion = keyword_recommender.get_symptom_input_similarity(
        first_arg, symptoms, softmax_results_symcat, all_results, raw_user_input
    )
    print("personal: ", suggestion)

# Static suggestions, called with (first, second) arguments in the order
# (0, 0), (1, 0), (0, 1), (1, 1), (0, 2), (1, 2).
for second_arg in range(3):
    for first_arg in range(2):
        suggestion = keyword_recommender.get_next_symptom_top3class(
            first_arg, second_arg, symptoms, cosine_sim_indexes_all_classes, all_results, raw_user_input
        )
        print("static: ", suggestion)

personal:  blood in urine
personal:  vulvar symptoms
personal:  lump or mass of breast
static:  skin irritation
static:  mouth symptoms
static:  problems with lymph nodes (glands)
static:  sore lymph nodes
static:  symptoms of eye
static:  visual disturbance
