In [None]:
!pip install sentence-transformers

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Source of the doctor question/answer records (JSON hosted on GitHub).
DATA_URL = 'https://raw.githubusercontent.com/LasseRegin/medical-question-answer-data/master/questionDoctorQAs.json'

# Read the remote JSON into a DataFrame and preview the first rows.
dataset = pd.read_json(DATA_URL)
dataset.head(5)

Unnamed: 0,answer,answer_author,question,question_text,tags,url
0,you can stay assured you are fine free of hiv....,Dr. Ayman Darrag,is my anti hiv test conclusive or need retest?,Is my Anti Hiv Test Conclusive or need retest?,[hiv test],https://questiondoctors.com/is-my-anti-hiv-tes...
1,hi you are 100 % hiv free good luck,Dr Ahmed Fawzy,is my anti hiv test conclusive or need retest?,Is my Anti Hiv Test Conclusive or need retest?,[hiv test],https://questiondoctors.com/is-my-anti-hiv-tes...
2,your time slots of the tests and the final res...,Dr.Honey,is my anti hiv test conclusive or need retest?,Is my Anti Hiv Test Conclusive or need retest?,[hiv test],https://questiondoctors.com/is-my-anti-hiv-tes...
3,hi i see no labrum tear however i see acetabul...,Dr Ahmed Fawzy,i have some hip pain 9 weeks. had mra image re...,I have some hip pain 9 weeks. Had MRA image re...,[magnetic resonance angiography (mra)],https://questiondoctors.com/i-have-some-hip-pa...
4,no visible tear in labrum but obviously calcif...,Dr. Ayman Darrag,i have some hip pain 9 weeks. had mra image re...,I have some hip pain 9 weeks. Had MRA image re...,[magnetic resonance angiography (mra)],https://questiondoctors.com/i-have-some-hip-pa...


In [None]:
# Sentence-embedding model used for every `model.encode(...)` call below,
# selected by its model name.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
# Keep only the question text column; it still contains one row per answer,
# so the same question can appear several times.
questions = dataset.loc[:, 'question']
questions.head(5)

0       is my anti hiv test conclusive or need retest?
1       is my anti hiv test conclusive or need retest?
2       is my anti hiv test conclusive or need retest?
3    i have some hip pain 9 weeks. had mra image re...
4    i have some hip pain 9 weeks. had mra image re...
Name: question, dtype: object

In [None]:
# De-duplicate the question texts (the dataset has one row per answer).
# Derive the array from the dataset column instead of from `questions` itself:
# `.unique()` returns a NumPy array, so a self-referential assignment would
# raise AttributeError if this cell were run a second time.
questions = dataset['question'].unique()
questions.shape

(1890,)

In [None]:
# Embed every unique question with the sentence model; one row per question.
question_embeddings = model.encode(questions)

In [None]:
# Sanity check: (number of questions, embedding width).
question_embeddings.shape

(1890, 768)

In [None]:
# Free-text query that will be matched against the known questions.
test_sentence = "What is my HIV result ?"
# encode() is given a one-element list, so the result is a single-row 2-D array.
test_embedding = model.encode([test_sentence])
test_embedding.shape

(1, 768)

In [None]:
# Cosine similarity between the query embedding and every question embedding.
# The result has shape (1, n_questions), hence the [0] when building the frame.
similarity_array = cosine_similarity(
    [test_embedding[0]],
    question_embeddings
)

question_array = questions.copy()

# Build the frame column-by-column so `score` keeps its float dtype.
# Stacking the strings and the floats with np.vstack forced both columns to
# dtype=object, which breaks numeric operations on the scores.
df = pd.DataFrame({'question': question_array, 'score': similarity_array[0]})
df.head()

Unnamed: 0,question,score
0,is my anti hiv test conclusive or need retest?,0.770897
1,i have some hip pain 9 weeks. had mra image re...,0.449711
2,i developed breathing difficulties approx 2 ye...,0.3278
3,low resting heart rate can it be a secondary a...,0.537536
4,i was recently diagnosed with an atypical mole...,0.566107


In [None]:
# Rank the questions from most to least similar to the query sentence.
df.sort_values('score', ascending=False)

Unnamed: 0,question,score
1878,hiv and aids?,0.879062
1238,can i get hiv by using infected towel?,0.806983
0,is my anti hiv test conclusive or need retest?,0.770897
530,i have severe peripheral neuropathy symptoms a...,0.743243
1660,i’m a nurse i may have pricked myself with a h...,0.738458
...,...,...
233,i take prilosec 20mgs (otc) and have taken thi...,0.144776
1695,pure blood in urine…quite a lot,0.132295
526,had 8 teeth removed swelling has increased eve...,0.127342
1818,for the past week now my heart has been poundi...,0.117249


In [None]:
from sklearn.cluster import KMeans

# Cluster the sentence embeddings. K-means starts from random centroids, so
# fix random_state: without it the cluster labels change on every run and the
# label-based lookups in the following cells are not reproducible.
n_clusters = 1000
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(question_embeddings)

In [None]:
# Positions of the questions assigned to embedding cluster 1, then their text.
indexes = np.nonzero(kmeans.labels_ == 1)
questions[indexes]

array(['i’ve been having severe headaches and my mouth feels very dry and bloated tummy?',
       'i have been experiencing very shallow breathing shivers legs feel numb?',
       'feel very depressed fingernails and toenails very brittle. hair loss. hair is kinky straw like. pale skin. insomnia. suffer from copper deficiency.',
       'i have a strange medical problem feel weak fatigued dizzy chilled and cloudy headed terrible skin sores?',
       'i have numbness in my hands and feet frequent urination forgetfulness and anxiety. what is wrong with me?'],
      dtype=object)

In [None]:
# Positions of the questions assigned to embedding cluster 2, then their text.
indexes = np.nonzero(kmeans.labels_ == 2)
questions[indexes]

array(['two years ago i was diagnosed with colon cancer?',
       'i’m 31 years old i’ve been diagnosed with digital papillary carcinoma what are the treatments?',
       'i am a 32 year old female diagnosed with gaucher disease type 1 help me understand my test results?',
       'my husband is 29 and has facial paralysis may be developing synkinesis what can be done next?'],
      dtype=object)

TF-IDF

In [None]:
# Import here as well: this cell runs before the cell that imports nltk, so on
# a fresh kernel (Restart & Run All) the downloads would raise NameError.
import nltk

# Fetch the NLTK resources used by the tokenizer and the lemmatizer below.
nltk.download('popular', quiet=True) # for downloading packages
nltk.download('punkt') # first-time use only
nltk.download('wordnet') # first-time use only

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import string

# WordNet-backed lemmatizer shared by the helpers below.
lemmer = nltk.stem.WordNetLemmatizer()


def LemTokens(word_tokens):
    """Return the lemma of every token in `word_tokens`, in order."""
    return list(map(lemmer.lemmatize, word_tokens))


# Translation table that deletes every ASCII punctuation character.
remove_punct_dict = str.maketrans('', '', string.punctuation)


def LemNormalize(text):
    """Lower-case `text`, strip ASCII punctuation, tokenize and lemmatize it."""
    lowered = text.lower()
    without_punct = lowered.translate(remove_punct_dict)
    return LemTokens(nltk.word_tokenize(without_punct))


# TF-IDF over the unique questions, using the custom tokenizer above together
# with scikit-learn's built-in English stop-word list.
TfidfVec = TfidfVectorizer(stop_words='english', tokenizer=LemNormalize)
tfidf = TfidfVec.fit_transform(questions)

  % sorted(inconsistent)


In [None]:
# Sanity check: (number of questions, vocabulary size).
tfidf.shape

(1890, 3352)

In [None]:
# Cluster the TF-IDF vectors. Pass n_clusters_tfidf, not `n_clusters` from the
# embedding section: the old call silently reused that value (1000) and
# ignored the 10 configured here. random_state keeps the cluster labels
# stable between runs so the label lookups below stay meaningful.
n_clusters_tfidf = 10
kmeans_tfidf = KMeans(n_clusters=n_clusters_tfidf, random_state=42).fit(tfidf)

In [None]:
# Positions of the questions assigned to TF-IDF cluster 1, then their text.
indexes = np.nonzero(kmeans_tfidf.labels_ == 1)
questions[indexes]

array(['i have struggled with a urinary tract infection (uti) been on antibiotics 3 times?',
       'i have short urinary tract (yuretra) and also burning in urinary tract. what medicine is good?',
       'having recurrent urinary tract infections (utis) this has been going on for the last 3 years?',
       'abdominal pain…is it a urinary tract infection?'], dtype=object)

In [None]:
# Positions of the questions assigned to TF-IDF cluster 2, then their text.
indexes = np.nonzero(kmeans_tfidf.labels_ == 2)
questions[indexes]

array(['my eldest daughter is suffering excruciating pain need second opinion?',
       'i have been suffering from prostatitis there is presence of pseudomonas fluorescens need second opinion?'],
      dtype=object)

In [None]:
# Positions of the questions assigned to TF-IDF cluster 3, then their text.
indexes = np.nonzero(kmeans_tfidf.labels_ == 3)
questions[indexes]

array(['i started with fever throat pain getting worse worried my doctor not giving me right treatment',
       'i have scoliosis i’m worried about it getting worse?'],
      dtype=object)

In [None]:
# Positions of the questions assigned to TF-IDF cluster 4, then their text.
indexes = np.nonzero(kmeans_tfidf.labels_ == 4)
questions[indexes]

array(['i have a constant dull pain in my right upper chest about even with my armpit?',
       'i have pain higher up on the left side of my chest and in my left armpit',
       'i have chronic pain on the left side of my back that spreads to the left side of my stomach?'],
      dtype=object)

In [None]:
# Positions of the questions assigned to TF-IDF cluster 9, then their text.
# `indexes` from this cell is what the feature-inspection cells below reuse.
indexes = np.nonzero(kmeans_tfidf.labels_ == 9)
questions[indexes]

array(['i’ve been diagnosed with relapsing-remitting multiple sclerosis (rrms) losing weight can it be lyme disease?',
       'i’ve been diagnosed with ms (multiple sclerosis)',
       'i’m 31 years old i’ve been diagnosed with digital papillary carcinoma what are the treatments?',
       'i have strange symptoms is it multiple sclerosis (ms) or amyotrophic lateral sclerosis (als)-lou gehrig’s disease?',
       '23 years old multiple sclerosis (ms) patient for about 10 years is medication necessary?',
       'i have suffered from random intermittent neuropathic pain is it early symtoms of multiple sclerosis (ms)?',
       'do i have multiple sclerosis i’ve been having a lot of pain in my legs and a lot of muscle spasms?',
       'i am deteriorating quickly – do i have multiple sclerosis (ms)?'],
      dtype=object)

In [None]:
# For each row selected by `indexes`, the column index of its largest TF-IDF weight.
tfidf[indexes].argmax(axis=1)

matrix([[2479],
        [2621],
        [ 925],
        [2621],
        [2023],
        [1026],
        [1819],
        [ 890]])

In [None]:
# Column count here should match the vocabulary length checked in the next cell.
tfidf.shape

(1890, 3352)

In [None]:
# Vocabulary size; should equal the number of TF-IDF columns.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use its replacement get_feature_names_out().
len(TfidfVec.get_feature_names_out())



3352

In [None]:
# Vocabulary terms, indexed by TF-IDF column. get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2; get_feature_names_out()
# is the replacement.
features = np.asarray(TfidfVec.get_feature_names_out())

# Column of the highest-weighted term for each selected question. ravel()
# always yields a 1-D index array, whereas squeeze() would collapse a
# single-row selection to a 0-d scalar.
max_value_indexes = np.asarray(tfidf[indexes].argmax(axis=1)).ravel()
features[max_value_indexes]



array(['relapsingremitting', 'sclerosis', 'digital', 'sclerosis',
       'necessary', 'early', 'lot', 'deteriorating'], dtype='<U26')

In [None]:
# Show the non-zero (row, column) weight entries of the first selected question.
first_row = tfidf[indexes][0]
for v in first_row:
    print(v)

  (0, 941)	0.23813021671569928
  (0, 1834)	0.38107915299640627
  (0, 3263)	0.2893816820844013
  (0, 1816)	0.2987124130992891
  (0, 2568)	0.40182977583117757
  (0, 2621)	0.33088292775394385
  (0, 1996)	0.2893816820844013
  (0, 2479)	0.40182977583117757
  (0, 3199)	0.21915061256934226
  (0, 3341)	0.1416183632903399
  (0, 902)	0.2021372213824986


In [None]:
# Term behind column 2479, one of the two highest weights printed for the first row.
features[2479]

'relapsingremitting'

In [None]:
# Term behind column 2568, which ties with column 2479 in the printed weights.
features[2568]

'rrms'

In [None]:
# Term behind column 2621, the argmax column reported for two of the selected rows.
features[2621]

'sclerosis'