### In this notebook we will perform the word embedding & topic modeling & Cosine Similarity

***we merged the **three** chapters to perform the topic modeling, in order to perform cosine similarity to select which chapter the new input should go with.***

In [15]:
import pandas as pd
import numpy as np
import pickle

# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

### Read the data and pickle file

In [31]:
df02 = pd.read_csv(r'C:\Users\HP\OneDrive\Desktop\Diagnosis\Notebook\Current Medical Diagnosis1.csv')

In [32]:
# reading the stop words list with pickle
with open (r'C:\Users\HP\OneDrive\Desktop\Diagnosis\Notebook\current medical diagnosis1.ob', 'rb') as fp:
    stop_words = pickle.load(fp)

In [34]:
df02.head(10)

Unnamed: 0,string_values
0,prevention health promotion tobacco tobacco ht...
1,diabetes life islet west diabetes mellitust yp...
2,beverage brainstem aura blindness field thria ...
3,kidney proteinuria found overload proteinuria ...
4,chapter ated bp reading office pharmacy coat o...
5,chapter per redness swelling conjunctiva compr...


In [35]:
# Declare a list that is to be converted into a column
ch_no = ['Cancer','Diabetes', 'Nervous System', 'Kidney Disease', 'Hypertension', 'Ear, Nose, & Throat Disorders ']
# Using 'ch_no' as the column name
# and equating it to the list
df02['Topic'] = ch_no

In [36]:
df02

Unnamed: 0,string_values,Topic
0,prevention health promotion tobacco tobacco ht...,Cancer
1,diabetes life islet west diabetes mellitust yp...,Diabetes
2,beverage brainstem aura blindness field thria ...,Nervous System
3,kidney proteinuria found overload proteinuria ...,Kidney Disease
4,chapter ated bp reading office pharmacy coat o...,Hypertension
5,chapter per redness swelling conjunctiva compr...,"Ear, Nose, & Throat Disorders"


### Word Embedding

In [37]:
df02['string_values']

0    prevention health promotion tobacco tobacco ht...
1    diabetes life islet west diabetes mellitust yp...
2    beverage brainstem aura blindness field thria ...
3    kidney proteinuria found overload proteinuria ...
4    chapter ated bp reading office pharmacy coat o...
5    chapter per redness swelling conjunctiva compr...
Name: string_values, dtype: object

In [38]:
# Create a CountVectorizer for parsing/counting words
count_vectorizer = CountVectorizer(stop_words=stop_words)

doc_word_cv = count_vectorizer.fit_transform(df02['string_values'])



In [39]:
pd.DataFrame(doc_word_cv.toarray(), index=df02['Topic'], columns = count_vectorizer.get_feature_names_out()).head()

Unnamed: 0_level_0,aa,aaa,ab,abbreviated,abcd,abdomen,abdomyolysis,abdomyolysise,abducens,abduction,...,ºmultisystem,ºnephrotic,ºnonatherosclerotic,ºperipheral,ºprimary,ºshocke,ºthe,ºtubulointerstitial,ºtypes,ºvenous
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cancer,1,0,0,0,0,11,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
Diabetes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Nervous System,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
Kidney Disease,5,0,0,1,0,0,1,1,0,0,...,1,2,0,0,0,0,0,1,0,0
Hypertension,2,2,3,0,1,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,1


In [40]:
# Create a TfidfVectorizer for parsing/counting words
tfidf = TfidfVectorizer(stop_words=stop_words)

doc_word_tfidf = tfidf.fit_transform(df02['string_values'])



In [41]:
pd.DataFrame(doc_word_tfidf.toarray(), index=df02['Topic'], columns = tfidf.get_feature_names_out()).head()

Unnamed: 0_level_0,aa,aaa,ab,abbreviated,abcd,abdomen,abdomyolysis,abdomyolysise,abducens,abduction,...,ºmultisystem,ºnephrotic,ºnonatherosclerotic,ºperipheral,ºprimary,ºshocke,ºthe,ºtubulointerstitial,ºtypes,ºvenous
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cancer,0.001125,0.0,0.0,0.0,0.0,0.017107,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.001897,0.0,0.0,0.0,0.001897,0.0
Diabetes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.008903,0.0,0.0,0.0
Nervous System,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004431,0.0,...,0.0,0.0,0.0,0.004431,0.0,0.0,0.0,0.0,0.0,0.0
Kidney Disease,0.017186,0.0,0.0,0.005794,0.0,0.0,0.005794,0.005794,0.0,0.0,...,0.005794,0.011588,0.0,0.0,0.0,0.0,0.0,0.005794,0.0,0.0
Hypertension,0.005934,0.010003,0.015004,0.0,0.005001,0.004101,0.0,0.0,0.0,0.0,...,0.0,0.0,0.005001,0.0,0.0,0.005001,0.0,0.0,0.0,0.005001


### Topic Modeling: **LDA**

In [42]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(doc_word_cv)

In [43]:
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())

In [45]:
# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus=corpus, num_topics=6, id2word=id2word, passes=5)

In [47]:
lda.print_topics(7)

[(0,
  '0.611*"aa" + 0.316*"abcd" + 0.004*"ab" + 0.003*"abdomen" + 0.002*"abbreviated" + 0.001*"aaa" + 0.000*"philia" + 0.000*"phlebitis" + 0.000*"phomas" + 0.000*"phila"'),
 (1,
  '0.670*"aaa" + 0.003*"abcd" + 0.000*"abdomen" + 0.000*"aa" + 0.000*"abbreviated" + 0.000*"ab" + 0.000*"philia" + 0.000*"phlebitis" + 0.000*"phomas" + 0.000*"phila"'),
 (2,
  '0.000*"abcd" + 0.000*"abbreviated" + 0.000*"aaa" + 0.000*"ab" + 0.000*"abdomen" + 0.000*"aa" + 0.000*"philia" + 0.000*"phlebitis" + 0.000*"phomas" + 0.000*"phila"'),
 (3,
  '0.000*"abcd" + 0.000*"aaa" + 0.000*"aa" + 0.000*"ab" + 0.000*"abdomen" + 0.000*"abbreviated" + 0.000*"philia" + 0.000*"phlebitis" + 0.000*"phomas" + 0.000*"phila"'),
 (4,
  '0.841*"ab" + 0.015*"abdomen" + 0.005*"abcd" + 0.000*"aaa" + 0.000*"aa" + 0.000*"abbreviated" + 0.000*"philia" + 0.000*"phlebitis" + 0.000*"phomas" + 0.000*"phila"'),
 (5,
  '0.539*"abdomen" + 0.328*"abbreviated" + 0.002*"abcd" + 0.001*"aa" + 0.000*"ab" + 0.000*"aaa" + 0.000*"philia" + 0.000*"phl

### Performing CorEx:

In [48]:
from corextopic import corextopic as ct
from corextopic import vis_topic as vt

words = list(np.asarray(count_vectorizer.get_feature_names_out()))


In [49]:
topic_model = ct.Corex(n_hidden=6, words=words, seed=1)
topic_model.fit(doc_word_cv, words=words, docs=df02['string_values'])



<corextopic.corextopic.Corex at 0x1a6b3ebb590>

In [50]:
topics = topic_model.get_topics()
for n,topic in enumerate(topics):
    topic_words,_,_ = zip(*topic)
    print('{}: '.format(n) + ','.join(topic_words))

0: phenytoin,characterized,hospital,analog,fetus,vasospasm,radiating,stimulating,steal,reflect
1: bedtime,advised,extremely,swelling,betes,ingestion,staging,construction,phosphorus,transformation
2: length,tenderness,concern,sleep,blindness,kept,shift,atherosclerosis,burning,ifosfamide
3: canagliflozin,bind,atlas,na,sm,ace,voice,ke,ganglion,drained
4: novo,icu,ckd,cardiol,lism,acidosis,preservation,excision,structure,alopecia
5: transplantation,doubling,occult,algorithm,bound,load,alt,assay,presented,cardiovasc


### Topic Modeling: LSA

In [51]:
lsa = TruncatedSVD(6)
doc_topic = lsa.fit_transform(doc_word_cv)
print(lsa.explained_variance_ratio_)

[0.62864254 0.1057321  0.09450078 0.08058466 0.05872876 0.03181116]


In [52]:
topic_word = pd.DataFrame(lsa.components_.round(6),
             index = ['component'+str(i) for i in range(6)],
             columns = count_vectorizer.get_feature_names_out())

topic_word.head()

Unnamed: 0,aa,aaa,ab,abbreviated,abcd,abdomen,abdomyolysis,abdomyolysise,abducens,abduction,...,ºmultisystem,ºnephrotic,ºnonatherosclerotic,ºperipheral,ºprimary,ºshocke,ºthe,ºtubulointerstitial,ºtypes,ºvenous
component0,0.001717,0.000246,0.00037,5.6e-05,0.000123,0.012531,5.6e-05,5.6e-05,0.000142,0.000125,...,5.6e-05,0.000112,0.000123,0.000142,0.001128,0.000123,4.9e-05,5.6e-05,0.001128,0.000123
component1,0.005,0.002852,0.004278,0.000379,0.001426,-0.003332,0.000379,0.000379,0.001584,0.001369,...,0.000379,0.000758,0.001426,0.001584,-0.000433,0.001426,0.000479,0.000379,-0.000433,0.001426
component2,-0.010933,-0.003888,-0.005832,-0.001261,-0.001944,-0.001737,-0.001261,-0.001261,0.00247,-0.001515,...,-0.001261,-0.002523,-0.001944,0.00247,1.9e-05,-0.001944,-0.000285,-0.001261,1.9e-05,-0.001944
component3,-0.001817,-0.002429,-0.003644,-0.000589,-0.001215,-0.001174,-0.000589,-0.000589,-0.000204,0.007108,...,-0.000589,-0.001178,-0.001215,-0.000204,4e-06,-0.001215,-0.000317,-0.000589,4e-06,-0.001215
component4,0.015781,-0.003661,-0.005491,0.00389,-0.00183,-0.002733,0.00389,0.00389,0.000595,0.000147,...,0.00389,0.00778,-0.00183,0.000595,-8.2e-05,-0.00183,0.000224,0.00389,-8.2e-05,-0.00183


In [53]:
tem_list = [] 
def display_topics(model, feature_names, no_top_words, topic_names=None):
    
    for ix, topic in enumerate(model.components_):
        inner_tem_list = []
       
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
            
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        inner_tem_list.append(", ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        tem_list.append(inner_tem_list)

In [54]:
result1 = display_topics(lsa, count_vectorizer.get_feature_names_out(), 20)


Topic  0
cancer, tumor, survival, carcinoma, prostate, metastasis, resection, screening, nausea, ct, radiation, orally, men, agent, psa, combination, inhibitor, liver, diarrhea, node

Topic  1
artery, seizure, hg, study, kidney, weakness, brain, three, agent, ace, table, given, diabetes, minute, leg, trial, ct, inhibitor, vessel, mri

Topic  2
seizure, weakness, brain, mri, dementia, ct, epilepsy, ataxia, tumor, impairment, nausea, given, brainstem, relapse, depending, gene, basis, deficit, disability, alzheimer

Topic  3
drop, glaucoma, vision, solution, occlusion, uveitis, keratitis, cataract, detachment, hemorrhage, disk, four, cornea, lens, conjunctivitis, referred, ophthalmol, degeneration, laser, herpes

Topic  4
ckd, gfr, aki, eskd, dialysis, cyst, kidney, plasma, hematuria, seizure, iga, acidosis, tract, phosphorus, obstruction, sodium, laboratory, dis, vasculitis, microscopy

Topic  5
study, kidney, given, unit, liver, glucose, dka, benefit, three, diabetes, pen, egfr, pmdiab

In [55]:
tem_list
final_dic = {}
final_dic["Cancer"] = tem_list[0]
final_dic["Diabetes"] = tem_list[1]
final_dic["Nervous System"] = tem_list[2]
final_dic["Kidney"] = tem_list[3]
final_dic["Hypertension"] = tem_list[4]
final_dic["Ear & Nose"] = tem_list[5]

In [56]:
final_dic

{'Cancer': ['cancer, tumor, survival, carcinoma, prostate, metastasis, resection, screening, nausea, ct, radiation, orally, men, agent, psa, combination, inhibitor, liver, diarrhea, node'],
 'Diabetes': ['artery, seizure, hg, study, kidney, weakness, brain, three, agent, ace, table, given, diabetes, minute, leg, trial, ct, inhibitor, vessel, mri'],
 'Nervous System': ['seizure, weakness, brain, mri, dementia, ct, epilepsy, ataxia, tumor, impairment, nausea, given, brainstem, relapse, depending, gene, basis, deficit, disability, alzheimer'],
 'Kidney': ['drop, glaucoma, vision, solution, occlusion, uveitis, keratitis, cataract, detachment, hemorrhage, disk, four, cornea, lens, conjunctivitis, referred, ophthalmol, degeneration, laser, herpes'],
 'Hypertension': ['ckd, gfr, aki, eskd, dialysis, cyst, kidney, plasma, hematuria, seizure, iga, acidosis, tract, phosphorus, obstruction, sodium, laboratory, dis, vasculitis, microscopy'],
 'Ear & Nose': ['study, kidney, given, unit, liver, gluc

In [57]:
tem_df = pd.DataFrame.from_dict(final_dic, orient ='index') 
tem_df

Unnamed: 0,0
Cancer,"cancer, tumor, survival, carcinoma, prostate, ..."
Diabetes,"artery, seizure, hg, study, kidney, weakness, ..."
Nervous System,"seizure, weakness, brain, mri, dementia, ct, e..."
Kidney,"drop, glaucoma, vision, solution, occlusion, u..."
Hypertension,"ckd, gfr, aki, eskd, dialysis, cyst, kidney, p..."
Ear & Nose,"study, kidney, given, unit, liver, glucose, dk..."


In [58]:
# Declare a list that is to be converted into a column
d_name = ['Cancer', 'Diabetes', 'Nervous System', 'Kidney', 'Hypertension', 'Ear & Nose']
 
# Using 'ch_no' as the column name
# and equating it to the list
tem_df['D_Name'] = d_name

In [59]:
tem_df.columns

Index([0, 'D_Name'], dtype='object')

In [60]:
tem_df = tem_df.rename(columns={0: 'Description'})
tem_df

Unnamed: 0,Description,D_Name
Cancer,"cancer, tumor, survival, carcinoma, prostate, ...",Cancer
Diabetes,"artery, seizure, hg, study, kidney, weakness, ...",Diabetes
Nervous System,"seizure, weakness, brain, mri, dementia, ct, e...",Nervous System
Kidney,"drop, glaucoma, vision, solution, occlusion, u...",Kidney
Hypertension,"ckd, gfr, aki, eskd, dialysis, cyst, kidney, p...",Hypertension
Ear & Nose,"study, kidney, given, unit, liver, glucose, dk...",Ear & Nose


In [63]:
tem_df.to_csv('description_of_deseases.csv', index=False)

In [65]:
import os
cwd = os.getcwd()
cwd

'c:\\Users\\HP\\Downloads'