In [1]:
import numpy as np
import tensorflow as tf
import pandas
import pickle
import os

  from ._conv import register_converters as _register_converters


In [2]:
# Our code
import lrp
import EmbedHelper
import DataLoader
import Models



In [3]:
# Mapping from numeric id to embedding name, exposed by EmbeddingHandler.
embedDict = EmbedHelper.EmbeddingHandler.embedDict
print(embedDict)
# Notebook-wide settings. Only "maxLength", "vectorSize", "batchSize",
# "embeddingType" and "PreEmbed" are read by the cells visible in this notebook;
# the remaining keys are not referenced here.
configs = {
    "vectorSize":300,
    "trainNewModel":True,
    "dataColumn":"question",
    "maxLength":128,
    "batchSize":8,
    "embeddingType":embedDict[2],  # key 2 prints as 'Google News' in the cell output
    "ELMo":True,
    "PreEmbed":True,  # when true, batches are passed through EmbedModel.vectorizeBatch
    "restore":True
}

# Convenience aliases used when the model is constructed.
inputSize = configs["maxLength"]
vectorSize = configs["vectorSize"]

{1: 'Fast Text', 2: 'Google News', 3: 'HealthTap', 4: 'Pubmed', 5: 'Glove', 6: 'iCliniq Trigram', 7: 'iCliniq default'}


In [4]:
# Embedding handler used by vectorizeBatch calls throughout the notebook.
# NOTE(review): the literal 300 presumably duplicates configs["vectorSize"], and
# the meaning of the False / "Embeddings" arguments is not visible here - confirm
# against EmbedHelper.EmbeddingHandler.
EmbedModel = EmbedHelper.EmbeddingHandler(configs["embeddingType"], False, 300, "Embeddings")

Loading Google News


# iCliniq Data (for training, testing model, and LRP)

In [19]:
# new data that is larger with 9800~ data instances
# Train/test questions and their targets, loaded from pre-built .npy files.
trainData = np.load("data//icliniq//iCliniq_14K//icliniq_14k_train_questions.npy")
trainTarget = np.load("data//icliniq//iCliniq_14K//icliniq_14k_train_target.npy")
testData = np.load("data//icliniq//iCliniq_14K//icliniq_14k_test_questions.npy")
testTarget = np.load("data//icliniq//iCliniq_14K//icliniq_14k_test_target.npy")

# "_raw" variants of the same questions; presumably the unprocessed text - TODO confirm.
# They are not used by any other visible cell.
trainData_raw = np.load("data//icliniq//iCliniq_14K//icliniq_14k_train_questions_raw.npy")
testData_raw = np.load("data//icliniq//iCliniq_14K//icliniq_14k_test_questions_raw.npy")

# ClassDict is inverted (value -> key) in evaluatePerformance and used as
# tfidf_words[ClassDict[category]] in get_relevant_words, i.e. it maps a class
# name to an integer index.
# SECURITY: pickle.load executes arbitrary code from the file; only load a
# fold0classDict.pkl that comes from a trusted source.
ClassDict = {}
with open('fold0classDict.pkl', 'rb') as f:
    ClassDict = pickle.load(f)
# Number of classes; NOTE(review): a later cell overwrites this with the literal 12.
outputSize = len(ClassDict)

# Symcat Data (used for asking keywords)


In [44]:
# Symcat arrays used by the keyword-asking helpers further down.
# `description` is not referenced by any other visible cell.
description = np.load("data//symcat//symcat_data.npy")
# `symptoms` is indexed with symptom indexes and its entries are split on spaces,
# so each entry is a symptom name string.
symptoms = np.load("data//symcat//symcat_categories.npy")
# Only referenced from the commented-out softmax-generation cell.
desc_plus_symptoms = np.load("data//symcat//symcat_data_plus_cats.npy")
# Pre-computed model outputs for the Symcat entries; each row is compared with the
# user's scores via cosine similarity in get_symptom_input_similarity.
softmax_results_symcat = np.load("data//symcat//softmax_results_symcat.npy")

In [375]:
# This cell is only used if we want to get softmax output for symcat data
# However, I already did that and saved as npy file

# fake_labels = np.array(["Dermatology"] * len(symptoms))

# raw_symcat_data = []
# for sentence in desc_plus_symptoms:
#     tmp = ""
#     for word in sentence:
#         tmp += word + " "
    
#     raw_symcat_data.append(tmp[0:-1])
    
# symcat_data_stacked = np.hstack((np.array(raw_symcat_data).reshape(-1,1), fake_labels.reshape(-1,1)))
# testData,testTarget,_ = DataLoader.DataHandler.masterPreprocessor(symcat_data_stacked,shuffle=False,classDict=fold0ClassDict,maxLength=configs["maxLength"])

# Train-Test

In [5]:
def getTokenLengths(token):
    """Return a list holding ``len(item)`` for every item of ``token``, in order."""
    lengths = []
    for entry in token:
        lengths.append(len(entry))
    return lengths

In [6]:
def evaluatePerformance(nnModel,sess,testData,testTarget,batchSize,uncertaintyCoef):
    """Run ``nnModel`` over ``testData`` in mini-batches and aggregate the metrics.

    Relies on the notebook globals ``ClassDict``, ``configs``, ``EmbedModel`` and
    ``getTokenLengths``.

    Args:
        nnModel: model object exposing the placeholders and tensors used below
            (``nn_inputs``, ``nn_vector_inputs``, ``nn_outputs``, ``isTraining``,
            ``token_lengths``, ``uncertaintyRatio``, ``scores``, ``prob``, ...).
        sess: TensorFlow session used for ``sess.run``.
        testData: array-like with a ``shape`` attribute; sliced along axis 0.
        testTarget: targets, sliced with the same ranges as ``testData``.
        batchSize: number of rows per evaluation batch (must be positive).
        uncertaintyCoef: value fed to ``nnModel.uncertaintyRatio``.

    Returns:
        dict with keys "Accuracy", "TotalEvidenceTrue", "TotalEvidenceFalse",
        "UncertaintyAccuracy", "DataRate" (per-batch values weighted by batch
        size and divided by the number of rows), "Truth" and "Prediction"
        (concatenated over batches), "Scores" (list with one ``scores`` array per
        batch) and "Top3" (one entry per sample: the three
        ``[class name, probability]`` pairs with the highest probability).

    Note:
        "Top3" previously used only the first row of every batch, so it did not
        line up with "Truth"/"Prediction" when ``batchSize`` was above 1. It is
        now built for every row; with ``batchSize == 1`` the result is unchanged.
    """
    # Invert ClassDict so a class index can be mapped back to its name.
    reverseClassDict = {value:key for key,value in ClassDict.items()}
    dataSize = testData.shape[0]

    top3 = []
    testScores = []
    testTruth = np.array([])
    testPred = np.array([])

    totalAcc = 0
    totalUcAcc = 0
    totalDataRate = 0
    testEvTrue = 0
    testEvFail = 0

    for start in range(0, dataSize, batchSize):
        end = start + batchSize
        data = np.array(testData[start:end])
        dataClean = data

        if configs["PreEmbed"]:
            data = EmbedModel.vectorizeBatch(data)

        outputData = np.array(testTarget[start:end])
        # The last batch may be shorter than batchSize; weight its metrics accordingly.
        cutSize = data.shape[0]
        tokens_length = getTokenLengths(data)

        fd = {nnModel.nn_inputs:dataClean,
              nnModel.nn_vector_inputs:data,
              nnModel.nn_outputs:outputData,
              nnModel.isTraining:False,
              nnModel.token_lengths:tokens_length,
              nnModel.uncertaintyRatio:uncertaintyCoef}

        fetches = [nnModel.scores, nnModel.prob, nnModel.accuracy, nnModel.truths,
                   nnModel.predictions, nnModel.correct_predictions,
                   nnModel.mean_ev_succ, nnModel.mean_ev_fail, nnModel.ucAccuracy,
                   nnModel.dataRatio]
        (scores, prob, testBAcc, nnTruth, nnPrediction, nnMatch,
         evCor, evFail, ucAcc, dataRate) = sess.run(fetches, feed_dict=fd)

        # Top 3 classes for every sample of the batch
        # (assumes prob is indexed as [sample][class] - the original read prob[0][i]).
        for sampleProb in prob:
            ranked = sorted(
                ([reverseClassDict[i], sampleProb[i]] for i in range(len(sampleProb))),
                key=lambda pair: pair[1],
                reverse=True,
            )
            top3.append(ranked[0:3])

        testTruth = np.append(testTruth,nnTruth,axis=0)
        testPred = np.append(testPred,nnPrediction,axis=0)
        testScores.append(scores)

        testEvTrue += evCor*cutSize
        testEvFail += evFail*cutSize
        totalAcc += testBAcc*cutSize
        totalUcAcc += ucAcc*cutSize
        totalDataRate += dataRate*cutSize

    outputs = {
        "Accuracy":totalAcc/dataSize,
        "TotalEvidenceTrue":testEvTrue/dataSize,
        "TotalEvidenceFalse":testEvFail/dataSize,
        "UncertaintyAccuracy":totalUcAcc/dataSize,
        "DataRate":totalDataRate/dataSize,
        "Truth":testTruth,
        "Prediction":testPred,
        "Scores":testScores,
        "Top3":top3
    }

    return outputs

In [7]:
def trainModel(nnModel, iterations, trainData, trainTarget, testData, testTarget, configs, accList):
    """Train ``nnModel`` for ``iterations`` steps with periodic evaluation.

    Relies on the notebook globals ``sess`` (TensorFlow session), ``EmbedModel``,
    ``getTokenLengths`` and ``evaluatePerformance``.

    Args:
        nnModel: model exposing the placeholders/ops used in the feed dict below.
        iterations: number of training steps to run.
        trainData, trainTarget: training inputs/targets handed to the batch iterator.
        testData, testTarget: evaluation inputs/targets for ``evaluatePerformance``.
        configs: settings dict; "batchSize" and "PreEmbed" are read here.
            (This parameter shadows the notebook-level ``configs``.)
        accList: list mutated in place; at every evaluation step a row
            ``[i, acc, htTestAcc, fold0TestAcc, los, ucAcc]`` is appended.

    Returns:
        None. Results are reported through ``accList`` and printed progress lines.
    """
    batcher = DataLoader.DataHandler.batchIterator(trainData, trainTarget, configs["batchSize"])
    # NOTE(review): this pulls one batch from the iterator and never uses it,
    # so the first batch is not trained on.
    sample,_ = next(batcher)
    
    print("trainData shape : ", trainData.shape)
    print("testData shape : ", testData.shape)
    print("trainTarget shape : ", trainTarget.shape)
    print("testTarget shape : ", testTarget.shape)
    
    # htTestAcc is never updated below, so it is always reported as 0.
    htTestAcc=0
    fold0TestAcc = 0
    ucAcc = 0
    dataRate = 0
    
    # Evidence values collected at every evaluation step (not returned).
    L_test_ev_s=[]
    L_test_ev_f=[]
    
    print("")
    for i in range(iterations):
        data, target = next(batcher)
        # Keep the un-embedded batch: the model is fed both forms.
        dataClean = data

        if(configs["PreEmbed"]):
            data = EmbedModel.vectorizeBatch(data)

        tokens_length = getTokenLengths(data)
        # annealing_step grows linearly with the iteration number.
        fd = {nnModel.nn_inputs:dataClean, nnModel.nn_vector_inputs:data,nnModel.nn_outputs:target,
              nnModel.isTraining:True,nnModel.token_lengths:tokens_length,nnModel.annealing_step:0.00005*i}
        _, acc, los = sess.run([nnModel.train_op,nnModel.accuracy,nnModel.loss],feed_dict=fd)

        # Progress line every 20 steps; "\r" makes it overwrite the previous one.
        if(i%20==0):
            title = ("[Current iteration = "+str(i)+" Train Acc:"+str(acc)+" HT Test Acc:"+str(htTestAcc)+" fold0Test: ("+str(fold0TestAcc)+') ucAcc :'+str(ucAcc)
                +" dataRatio  :"+str(dataRate)+' ]')
            title = str(title)       
            print(title, end="\r")

        # Full evaluation every 50000 steps (skipping step 0).
        if(i%50000==0 and i != 0):
            oldTestAcc = fold0TestAcc               
            testOutputs = evaluatePerformance(nnModel, sess, testData, testTarget, configs["batchSize"], 0.1)  
            
            fold0TestAcc = testOutputs["Accuracy"]
            fEvTrue = testOutputs["TotalEvidenceTrue"]
            fEvFail = testOutputs["TotalEvidenceFalse"]
            ucAcc = testOutputs["UncertaintyAccuracy"]
            dataRate = testOutputs["DataRate"]
            # fTruth / fPrediction are read but not used afterwards.
            fTruth = testOutputs["Truth"]
            fPrediction = testOutputs["Prediction"]
            
            # One extra full evaluation per confidence level, feeding
            # 1 - confidence as the uncertainty coefficient.
            # NOTE(review): confidenceMatrix is filled but never read or returned.
            confidences = [0.995,0.98,0.90,0.70,0.5]
            confidenceMatrix = np.zeros(shape=[len(confidences),3])
            for idx in range(len(confidences)):
                testOutputs = evaluatePerformance(nnModel, sess, testData, testTarget, configs["batchSize"],1-confidences[idx])
                confidenceMatrix[idx,0] = confidences[idx]
                confidenceMatrix[idx,1] = testOutputs["DataRate"]
                confidenceMatrix[idx,2] = testOutputs["UncertaintyAccuracy"]
            
            L_test_ev_s.append(fEvTrue)
            L_test_ev_f.append(fEvFail)
            
            # Checkpointing on improvement is currently disabled.
            if(fold0TestAcc>oldTestAcc):
                pass
                #saveSession(sess)

            accList.append([i, acc, htTestAcc, fold0TestAcc, los, ucAcc])
            # NOTE(review): npAccList is not used after this assignment.
            npAccList = np.array(accList)           

In [15]:
# NOTE(review): overrides the outputSize = len(ClassDict) computed when the data
# was loaded; the two must agree for the class-index lookups to work - confirm
# that ClassDict has 12 entries.
outputSize = 12

In [16]:
# Build the model, open a session on its graph and optionally restore a checkpoint.
should_load = True
model_path = "NNModels/icliniq14k_GoogleNews_onelayer_pad128/model.ckpt"

# Re-assert the settings the model is built with (same values as the config cell).
configs["maxLength"] = 128 
inputSize = configs["maxLength"]
configs["batchSize"] = 8
# ORIGINAL PART
nnModel = Models.PyramidCNNVShort(inputSize=inputSize, vectorSize=vectorSize, outputSize=outputSize)

# MY PART
# nnModel = Models.myModel_CNN_TEXT(inputSize=inputSize, vectorSize=vectorSize, outputSize=outputSize)

# `sess` is used as a global by trainModel and by the LRP cells below.
sess = tf.InteractiveSession(graph=nnModel.paperGraph)
tf.global_variables_initializer().run()
sess.run(tf.tables_initializer())

# Restoring after initialization replaces the freshly initialized values with
# the ones stored in the checkpoint.
if should_load:
    tf.train.Saver().restore(sess, model_path)

fullvectorsize:  300
(?, 126, 1, 250)
Instructions for updating:
keep_dims is deprecated, use keepdims instead
INFO:tensorflow:Restoring parameters from NNModels/icliniq14k_GoogleNews_onelayer_pad128/model.ckpt


In [41]:
# Evaluate the test set one sample at a time (batch size 1), feeding
# 1 - confidence as the uncertainty coefficient.
# `confidence` is reused by the example cells further down.
confidence = 0.9
results = evaluatePerformance(nnModel, sess, testData, testTarget, 1, 1-confidence)
# Trailing ";" suppresses the (large) cell output.
results;

# Summary org

In [150]:
def get_difference(list1, list2, truth):
    """Compare two prediction sequences element by element against ``truth``.

    Prints a summary (same / different / correct_to_wrong / wrong_to_correct /
    wrong_to_wrong) and returns ``(same, diff, correct_to_wrong, wrong_to_correct)``.
    A differing position counts as correct_to_wrong when ``list1`` matched the
    truth, as wrong_to_correct when only ``list2`` matched it, and as
    wrong_to_wrong otherwise.
    """
    tally = {"same": 0, "different": 0, "correct_to_wrong": 0, "wrong_to_correct": 0}

    for position, before in enumerate(list1):
        after = list2[position]
        if before == after:
            tally["same"] += 1
            continue

        tally["different"] += 1
        if before == truth[position]:
            tally["correct_to_wrong"] += 1
        elif after == truth[position]:
            tally["wrong_to_correct"] += 1

    for label in ("same", "different", "correct_to_wrong", "wrong_to_correct"):
        print(label, tally[label])
    print("wrong_to_wrong",
          tally["different"] - tally["correct_to_wrong"] - tally["wrong_to_correct"])

    return (tally["same"], tally["different"],
            tally["correct_to_wrong"], tally["wrong_to_correct"])

In [None]:
# Ground-truth labels collected by the evaluation run above.
truth = results["Truth"]

In [153]:
# NOTE(review): pred_sum and pred_tr_sum are not defined in any visible cell, so
# this call fails on a fresh kernel - add the cell that produces them.
get_difference(pred_sum, pred_tr_sum, truth)

same 77
different 11
correct_to_wrong 3
wrong_to_correct 4
wrong_to_wrong 4


(77, 11, 3, 4)

In [155]:
# NOTE(review): pred_desc and pred_tr_desc are not defined in any visible cell, so
# this call fails on a fresh kernel - add the cell that produces them.
get_difference(pred_desc, pred_tr_desc, truth)

same 72
different 16
correct_to_wrong 7
wrong_to_correct 7
wrong_to_wrong 2


(72, 16, 7, 7)

# Understanding NN - LRP

We get layers from output to input so that we can backpropagate.

Then we calculate word importances for each word in input.

The current model has only one conv-pool layer, so `layer_count` is 1. The medspecsearch models have 3 layers, so this model differs from them. We will use this model for LRP purposes.

( Maybe remove stop words? )

In [220]:
# Number of conv-pool layers of the loaded model; the cells below only handle 1 and 3.
layer_count = 1

In [221]:
# Get weights, biases and activations to use in lrp method.
# `scope` is used as a filter on the variable names: everything matching
# "kernel" is treated as a weight, everything matching "bias" as a bias.
weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='.*kernel.*')
biases = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='.*bias.*')

# Activation tensors listed from input to output; any layer_count other than
# 1 or 3 leaves the list empty.
activations = []
if layer_count == 1:
    activations = [nnModel.cnnInput, nnModel.conv1, nnModel.blockPool, nnModel.h_pool_flat, nnModel.fc1, nnModel.scores]
    
elif layer_count == 3:   
    activations = [nnModel.cnnInput, nnModel.conv1, nnModel.blockPool, nnModel.conv2, nnModel.blockPool2, nnModel.conv3,
             nnModel.blockPool3, nnModel.h_pool_flat, nnModel.fc1, nnModel.scores]

# Reverse in place so every list runs from output to input, the order needed for
# backpropagating relevance. Re-running this cell on its own reverses
# weights/biases again only if the get_collection lines above are skipped.
weights.reverse()
biases.reverse()
activations.reverse()

In [222]:
# We have three parallel conv-pool couples.
# We need to split them when backpropagating.
# I was experiencing lots of bugs, so I split them like this.
# Need a better way for this.
# NOTE(review): the hard-coded index lists select, for each branch, the shared
# entries plus that branch's own conv/pool entries from the reversed lists; they
# presumably depend on the exact variable order of the 3-layer model - confirm
# before reuse. This cell does nothing when layer_count is 1.
if layer_count == 3:
    biases_0 = np.array(biases)[[0,1,4]]
    weights_0 = np.array(weights)[[0,1,4]]
    activations_0 = np.array(activations)[[0,1,2,7,8,9]]

    biases_1 = np.array(biases)[[0,1,3]]
    weights_1 = np.array(weights)[[0,1,3]]
    activations_1 = np.array(activations)[[0,1,2,5,6,9]]

    biases_2 = np.array(biases)[[0,1,2]]
    weights_2 = np.array(weights)[[0,1,2]]
    activations_2 = np.array(activations)[[0,1,2,3,4,9]]

    biases_splitted = [biases_0, biases_1, biases_2]
    weights_splitted = [weights_0, weights_1, weights_2]
    activations_splitted = [activations_0, activations_1, activations_2]
    # One 4-element list per branch; presumably pooling window shapes - TODO confirm.
    pool_biases = [[1,126,1,1], [1,125,1,1], [1,124,1,1]]

In [223]:
# Small test batch for LRP: the first 21 training rows.
batch_x = trainData[0:21]
batch_y = trainTarget[0:21]
# Embed the inputs and one-hot encode the targets with outputSize classes.
batch_x = EmbedModel.vectorizeBatch(batch_x)
batch_y = sess.run(tf.one_hot(batch_y,outputSize)) 

In [224]:
# alpha is passed through to the lrp helpers; its exact meaning is defined in the
# project's lrp module (presumably the alpha of an alpha-beta LRP rule - confirm).
alpha = 1
backprop_layers = lrp.lrp_layers(alpha, layer_count, activations, weights, biases)

In [225]:
# Relevance for the first training sample: the embedded row batch_x[0:1] is passed
# together with the matching un-embedded row trainData[0].
word_importances, results_combined = lrp.get_word_relevances(alpha, backprop_layers, layer_count, batch_x[0:1], trainData[0], sess, nnModel, activations, weights, biases)

In [226]:
word_importances

[('hello', -0.008268319347522032),
 ('doctor', -0.10804220545688706),
 ('i', 0.020342834952000215),
 ('have', -0.3017208265593638),
 ('burning', 0.44691333951505163),
 ('sensation', -0.005695214677773492),
 ('while', -0.006010371839961347),
 ('urinating', -0.016021748390750764),
 ('and', 0.0),
 ('a', 0.0),
 ('frequent', 0.16106670266116513),
 ('urge', -0.012891035388308568),
 ('to', 0.0),
 ('urinate', -0.7028770820335325),
 ('can', 0.04179418524252254),
 ('it', 0.09591374824574718),
 ('be', 0.2434632438935445),
 ('due', -0.12945842708610766),
 ('to', 0.0),
 ('sexual', -0.08989809784105451),
 ('contact', -0.2738250168551916),
 ('i', 0.020263399167046985),
 ('am', -0.007761515735184679),
 ('a', 0.0),
 ('year', 0.0011631088119264858),
 ('old', -0.002362008129093596),
 ('male', -0.07802539621714201),
 ('[None]', 0.0),
 ('[None]', 0.0),
 ('[None]', 0.0),
 ('[None]', 0.0),
 ('[None]', 0.0),
 ('[None]', 0.0),
 ('[None]', 0.0),
 ('[None]', 0.0),
 ('[None]', 0.0),
 ('[None]', 0.0),
 ('[None]', 

# TF-IDF

In [33]:
def get_word_imps_all_classes(path, default=True):
    """Read every word-importance file of one variant directory.

    Args:
        path: directory prefix; it is concatenated directly with the variant
            directory name, so it must already end with a path separator.
        default: read from "default//" when true, from "stemmed//" otherwise.

    Returns:
        A list with one entry per file (in ``os.listdir`` order). Each entry is
        a list of rows, a row being the line split on single spaces; the first
        line of every file (the title) is dropped.

    Note:
        ``os.listdir`` returns names in arbitrary order, so callers that index
        the result by class id should verify the ordering.
    """
    dir_ = "default//"
    if not default:
        dir_ = "stemmed//"

    folder = path + dir_ + "//"
    word_imps_all_classes = []
    for file in os.listdir(folder):
        # The context manager closes the handle (the original leaked it).
        with open(folder + file) as f:
            # Strip only the newline so a final line without a trailing newline
            # keeps its last character.
            rows = [line.rstrip("\n").split(' ') for line in f]
        word_imps_all_classes.append(rows[1:])  # remove title

    return word_imps_all_classes

# Methods for asking keywords to user

In [38]:
def process_user_input(user_input, max_length=128):
    """Clean raw user text and convert it to the word-list form fed to the model.

    Args:
        user_input: raw text in the form accepted by
            ``DataLoader.DataHandler.cleanTextData`` (the example cells pass a
            one-element list of strings).
        max_length: length argument forwarded to
            ``DataLoader.DataHandler.textIntoWordList``. Defaults to 128, the
            value that used to be hard-coded and that matches
            ``configs["maxLength"]``.

    Returns:
        ``np.ndarray`` built from the first element returned by
        ``textIntoWordList``.
    """
    cleaned = DataLoader.DataHandler.cleanTextData(user_input)
    word_lists = DataLoader.DataHandler.textIntoWordList(cleaned, max_length)
    return np.array(word_lists[0])

def get_relevant_words(confidence_top3, amount, tfidf_words):
    """Collect the first ``amount`` TF-IDF entries of every class in ``confidence_top3``.

    Each element of ``confidence_top3`` carries the class name at position 0; the
    name is translated to an index through the global ``ClassDict`` and used to
    pick that class's entries from ``tfidf_words``. Entries are returned in one
    flat list, in the order of ``confidence_top3``.
    """
    collected = []
    for entry in confidence_top3:
        class_index = ClassDict[entry[0]]
        collected.extend(tfidf_words[class_index][0:amount])
    return collected

# User input Cosine Similarity

This is the first option for choosing which keywords (symptoms) to ask the user about.

In [100]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [129]:
def get_symptom_input_similarity(index, all_results, raw_user_input):
    """Return the ``index``-th most similar Symcat symptom not already mentioned.

    The user's model scores are compared with every row of the global
    ``softmax_results_symcat`` by cosine similarity, and the entries of the
    global ``symptoms`` array are ranked from most to least similar.

    Args:
        index: rank of the symptom to return after filtering (0 = most similar).
        all_results: dict returned by ``evaluatePerformance``; only
            ``all_results["Scores"][0]`` is read.
        raw_user_input: the user's text; it is split on single spaces.

    Returns:
        The symptom name at position ``index`` of the filtered ranking.

    Raises:
        IndexError: if fewer than ``index + 1`` symptoms remain after filtering.
    """
    user_result = all_results["Scores"][0]

    # Similarity between the user's scores and every Symcat entry.
    cosine_similarities = np.array([
        cosine_similarity(user_result, symcat_scores)[0][0]
        for symcat_scores in softmax_results_symcat
    ])

    # Most similar first. NOTE(review): the [:0:-1] slice also drops the single
    # least similar entry; kept as in the original - confirm whether [::-1] was meant.
    highest_indexes = cosine_similarities.argsort()[:0:-1]

    # Do not ask about symptoms the user already mentioned: drop every keyword
    # that contains one of the user's words as a substring. Empty tokens
    # (from repeated spaces) are ignored because "" is contained in every string.
    # Building a new list avoids deleting while iterating, which used to skip
    # the element following each removed keyword.
    user_tokens = [token for token in raw_user_input.split(" ") if token]
    highest_relation = [
        keyword for keyword in symptoms[highest_indexes]
        if not any(token in keyword for token in user_tokens)
    ]

    return highest_relation[index]

## Example

In [136]:
raw_user_input = "my hair is transparent" # example input
user_input = process_user_input([raw_user_input])
# A dummy target [0] is passed because only the scores matter here; `confidence`
# comes from the evaluation cell above.
all_results = evaluatePerformance(nnModel, sess, user_input, [0], 1, 1-confidence)

user_result = all_results["Scores"][0] # we will check similarity between user_result and softmax_results_symcat

In [137]:
# example 
# get most relevant symptom and ask this to user
get_symptom_input_similarity(0, all_results, raw_user_input)

'blood in urine'

In [138]:
# get second most relevant symptom and ask this to user
get_symptom_input_similarity(1, all_results, raw_user_input)

'vulvar symptoms'

# Cosine similarity for predicted top3 classes

This is the second option for choosing which keywords (symptoms) to ask the user about.

In [85]:
def merge(text_array):
    """Join the strings in ``text_array`` with single spaces.

    Returns an empty string for an empty input. Uses ``str.join`` instead of
    repeated concatenation, which avoids quadratic copying and the trailing-space
    trimming step.
    """
    return " ".join(text_array)

In [117]:
def get_next_symptom_top3class(index, choice,all_results, raw_user_input):
    """Return the next symptom keyword to ask about for one of the top-3 classes.

    Relies on the notebook globals ``ClassDict``, ``symptoms`` and
    ``cosine_sim_indexes_all_classes``.

    Args:
        index: starting rank in the class's symptom ranking (0 = most relevant).
        choice: which of the top-3 predicted classes to use (0 = highest
            confidence, then 1, 2).
        all_results: dict returned by ``evaluatePerformance``; only
            ``all_results["Top3"][0]`` is read.
        raw_user_input: the user's text; it is split on single spaces.

    Returns:
        The first symptom at rank ``index`` or later of which the user has not
        already covered more than 66% of the words.

    Raises:
        IndexError: if the ranking is exhausted before such a symptom is found.
    """
    # Names of the top-3 classes for this prediction.
    user_top3 = np.array(all_results["Top3"][0])[:,0]
    ranked_symptom_indexes = cosine_sim_indexes_all_classes[ClassDict[user_top3[choice]]]
    user_words = raw_user_input.split(' ')

    # The original recursed through an undefined name (get_next_symptom) with an
    # undefined argument (user_results), which raised NameError whenever a
    # symptom had to be skipped; a loop performs the intended "try the next one".
    while True:
        keyword_to_ask = symptoms[ranked_symptom_indexes[index]]
        symptom_words = keyword_to_ask.split(' ')

        # Count how many of the user's words appear in the symptom name.
        included_word_count = sum(1 for word in user_words if word in symptom_words)

        # Ask about this symptom unless more than 66% of its words were already given.
        if included_word_count / len(symptom_words) <= 0.66:
            return keyword_to_ask
        index += 1

In [118]:
# Contains, for every class, the symptom indexes ordered by similarity.
# For example, cosine_sim_indexes_all_classes[0] belongs to Dermatology (see ClassDict), and
# cosine_sim_indexes_all_classes[0][0] is the index of the most relevant symptom in the symptoms array.
# get_next_symptom_top3class reads this global, so this cell must run before it is called.
cosine_sim_indexes_all_classes = np.load("data//symcat//cosine_sim_indexes_all_classes.npy")

## Example

In [119]:
 # example input
raw_user_input = "my hair is transparent"

user_input = process_user_input([raw_user_input])
# A dummy target [0] is passed because only the model outputs matter here;
# `confidence` comes from the evaluation cell above.
all_results = evaluatePerformance(nnModel, sess, user_input, [0], 1, 1-confidence)


In [120]:
# example
# return most relevant keyword for top rated category-class and ask this to user
get_next_symptom_top3class(0, 0,all_results, raw_user_input)

'skin irritation'

In [121]:
# return second most relevant keyword for top rated category-class and ask this to user
get_next_symptom_top3class(1, 0, all_results, raw_user_input)

'mouth symptoms'

In [122]:
# return most relevant keyword for second top rated category-class and ask this to user
get_next_symptom_top3class(0, 1, all_results, raw_user_input)

'problems with lymph nodes (glands)'