### In this notebook we perform word embedding, topic modeling, and cosine similarity

***We merged the three chapters into a single dataset for topic modeling, so that cosine similarity can then be used to decide which chapter a new input belongs to.***

In [None]:
import pandas as pd
import numpy as np
import pickle

# gensim
from gensim import corpora, models, similarities, matutils

# sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

### Read the data and pickle file

In [None]:
# Load the merged chapter texts from CSV into df02.
df02 = pd.read_csv('all_chapters_3_rows.csv')

In [None]:
# reading the stop words list with pickle
# NOTE(review): pickle.load can execute arbitrary code embedded in a crafted file;
# only load 'stop_words.ob' when it comes from a trusted source.
with open ('stop_words.ob', 'rb') as fp:
    stop_words = pickle.load(fp)

In [None]:
# Inspect the column names of the loaded frame.
df02.columns

Index(['string_values'], dtype='object')

In [None]:
# Chapter labels, one per row of df02 and in the same order as the rows.
ch_no = ['ear_nose', 'musculoskeletal', 'respiratory']

# Using 'Ch_No' as the column name
# and equating it to the list (the list length must match the number of rows).
df02['Ch_No'] = ch_no

In [None]:
# Display the frame to check the newly added Ch_No column.
df02

Unnamed: 0,string_values,Ch_No
0,ear nose throat disorder introduction ear nose...,ear_nose
1,musculoskeletal disorder introduction complex ...,musculoskeletal
2,respiratory disorder introduction respiratory ...,respiratory


### Word Embedding

In [None]:
# Preview the text column that the vectorizers below are fitted on.
df02['string_values']

0    ear nose throat disorder introduction ear nose...
1    musculoskeletal disorder introduction complex ...
2    respiratory disorder introduction respiratory ...
Name: string_values, dtype: object

In [None]:
# Create a CountVectorizer for parsing/counting words,
# using the unpickled stop_words as its stop-word list.
count_vectorizer = CountVectorizer(stop_words=stop_words)

# Fit the vocabulary and build the document-term count matrix (one row per row of df02).
# NOTE(review): the vocabulary printed in the next cell contains tokens such as 'ﬁjacksﬂ',
# which look like mis-decoded curly quotes in the source text — consider cleaning upstream.
doc_word_cv = count_vectorizer.fit_transform(df02['string_values'])



In [None]:
# Dense preview of the count matrix: chapters as rows, vocabulary terms as columns.
pd.DataFrame(
    doc_word_cv.toarray(),
    index=df02['Ch_No'],
    columns=count_vectorizer.get_feature_names_out(),
).head()

Unnamed: 0_level_0,aap,abdomen,abduct,abducted,abducting,abduction,abductor,abducts,abgs,ability,...,ﬁjacksﬂ,ﬁmicroatelectasisﬂ,ﬁout,ﬁpunched,ﬁslowing,ﬁtennis,ﬁthumbprintﬂ,ﬁvoice,ﬁwhiteoutsﬂ,ﬁwingﬂ
Ch_No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ear_nose,0,0,0,1,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
musculoskeletal,0,1,2,2,2,10,1,1,0,2,...,1,0,0,1,0,1,0,0,0,1
respiratory,1,1,0,0,0,0,0,0,1,5,...,0,1,1,0,1,0,1,1,1,0


In [None]:
# Create a TfidfVectorizer to weight words by TF-IDF,
# using the same stop-word list as the CountVectorizer above.
tfidf = TfidfVectorizer(stop_words=stop_words)

# Fit the vocabulary and build the document-term TF-IDF matrix (one row per row of df02).
doc_word_tfidf = tfidf.fit_transform(df02['string_values'])



In [None]:
# Dense preview of the TF-IDF matrix: chapters as rows, vocabulary terms as columns.
pd.DataFrame(
    doc_word_tfidf.toarray(),
    index=df02['Ch_No'],
    columns=tfidf.get_feature_names_out(),
).head()

Unnamed: 0_level_0,aap,abdomen,abduct,abducted,abducting,abduction,abductor,abducts,abgs,ability,...,ﬁjacksﬂ,ﬁmicroatelectasisﬂ,ﬁout,ﬁpunched,ﬁslowing,ﬁtennis,ﬁthumbprintﬂ,ﬁvoice,ﬁwhiteoutsﬂ,ﬁwingﬂ
Ch_No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ear_nose,0.0,0.0,0.0,0.002908,0.0,0.0,0.0,0.0,0.0,0.004516,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
musculoskeletal,0.0,0.002918,0.007673,0.005836,0.007673,0.038366,0.003837,0.003837,0.0,0.004532,...,0.003837,0.0,0.0,0.003837,0.0,0.003837,0.0,0.0,0.0,0.003837
respiratory,0.004619,0.003513,0.0,0.0,0.0,0.0,0.0,0.0,0.004619,0.013641,...,0.0,0.004619,0.004619,0.0,0.004619,0.0,0.004619,0.004619,0.004619,0.0


### Topic Modeling: **LDA**

In [None]:
# Convert sparse matrix of counts to a gensim corpus.
# CountVectorizer produces a (documents x terms) matrix, but Sparse2Corpus
# defaults to documents_columns=True, i.e. it expects one document per COLUMN.
# Without the transpose gensim treats every vocabulary term as a "document"
# made of token ids 0..2, so LDA only ever sees the first three vocabulary
# entries ("aap", "abdomen", "abduct").
corpus = matutils.Sparse2Corpus(doc_word_cv.transpose())

In [None]:
# Invert CountVectorizer's term -> column-index mapping so terms can be looked up by id.
id2word = {idx: term for term, idx in count_vectorizer.vocabulary_.items()}

In [None]:
# Create lda model (equivalent to "fit" in sklearn).
# LDA training is stochastic; random_state pins the initialisation so the
# printed topics are reproducible after Restart Kernel -> Run All.
lda = models.LdaModel(corpus=corpus, num_topics=3, id2word=id2word, passes=5, random_state=1)

In [None]:
# Show the highest-weighted terms for each of the 3 LDA topics.
lda.print_topics(3)

[(0,
  '0.332*"abduct" + 0.321*"abdomen" + 0.279*"aap" + 0.000*"phosphate" + 0.000*"phonation" + 0.000*"phosphatase" + 0.000*"phlebotomy" + 0.000*"phrase" + 0.000*"phosphatidylglycerol" + 0.000*"physically"'),
 (1,
  '0.001*"abdomen" + 0.000*"abduct" + 0.000*"aap" + 0.000*"phosphate" + 0.000*"phonation" + 0.000*"phosphatase" + 0.000*"phlebotomy" + 0.000*"phrase" + 0.000*"phosphatidylglycerol" + 0.000*"physically"'),
 (2,
  '0.001*"aap" + 0.000*"abdomen" + 0.000*"abduct" + 0.000*"phosphate" + 0.000*"phonation" + 0.000*"phosphatase" + 0.000*"phlebotomy" + 0.000*"phrase" + 0.000*"phosphatidylglycerol" + 0.000*"physically"')]

### Performing CorEx:

In [None]:
from corextopic import corextopic as ct
from corextopic import vis_topic as vt

# Vocabulary as a plain list, in the column order of doc_word_cv.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the supported replacement and is what the other cells in this notebook use.
words = list(count_vectorizer.get_feature_names_out())




In [None]:
# Build a CorEx topic model with 3 latent topics; seed=1 is passed so runs are repeatable.
topic_model = ct.Corex(n_hidden=3, words=words, seed=1)
# Fit on the count matrix; words/docs are presumably used by CorEx to label
# columns and rows — TODO confirm against the corextopic documentation.
topic_model.fit(doc_word_cv, words=words, docs=df02['string_values'])



<corextopic.corextopic.Corex at 0x7f7fd4614e50>

In [None]:
# Print the top words of every CorEx topic, one line per topic.
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # The word is the first element of each topic entry. Indexing it directly
    # (instead of unpacking zip(*topic) into `_` placeholders) tolerates empty
    # topics and avoids rebinding `_`, which IPython uses for the last output.
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: aap,nontension,nonsmoker,nonrebreathing,nonputrid,nonpulmonaryšanxiety,nonmotile,noncaseating,noncardiac,nodosum
1: abduct,myelography,myelogram,musculature,mri,mouse,monosodium,monohydrate,moleskin,molecule
2: lingers,operating,oozing,olfactory,oily,offensive,occursñmost,option,obviously,obstructs


### Topic Modeling: LSA

In [None]:
# LSA: project the count matrix onto 3 latent components.
# TruncatedSVD uses a randomized solver by default, so random_state is fixed to
# keep the components reproducible — later cells label topics by their position.
lsa = TruncatedSVD(n_components=3, random_state=1)
doc_topic = lsa.fit_transform(doc_word_cv)
print(lsa.explained_variance_ratio_)

[0.047535   0.57893652 0.37352848]


In [None]:
# Term loadings of each LSA component, rounded to 3 decimals for readability.
# Row labels are derived from the fitted model instead of a hard-coded 3, so the
# cell keeps working if the number of components is changed.
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = ['component'+str(i) for i in range(lsa.components_.shape[0])],
             columns = count_vectorizer.get_feature_names_out())

print(topic_word)

              aap  abdomen  abduct  abducted  abducting  abduction  abductor  \
component0  0.001    0.003   0.003     0.005      0.003      0.017     0.002   
component1  0.000   -0.002  -0.004    -0.002     -0.004     -0.022    -0.002   
component2  0.004    0.003  -0.002    -0.003     -0.002     -0.010    -0.001   

            abducts   abgs  ability  ...  ﬁjacksﬂ  ﬁmicroatelectasisﬂ   ﬁout  \
component0    0.002  0.001    0.011  ...    0.002               0.001  0.001   
component1   -0.002  0.000    0.001  ...   -0.002               0.000  0.000   
component2   -0.001  0.004    0.014  ...   -0.001               0.004  0.004   

            ﬁpunched  ﬁslowing  ﬁtennis  ﬁthumbprintﬂ  ﬁvoice  ﬁwhiteoutsﬂ  \
component0     0.002     0.001    0.002         0.001   0.001        0.001   
component1    -0.002     0.000   -0.002         0.000   0.000        0.000   
component2    -0.001     0.004   -0.001         0.004   0.004        0.004   

            ﬁwingﬂ  
component0   0.002  
com

In [None]:
# Module-level accumulator: display_topics appends one single-element list
# (the comma-joined top words) per topic. Re-running this cell resets it.
tem_list = []
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the top words of every topic in a fitted decomposition model.

    Parameters
    ----------
    model : object exposing ``components_`` (one row of term weights per topic).
    feature_names : sequence mapping a column index to its term.
    no_top_words : int, number of highest-weighted terms to show per topic.
    topic_names : optional sequence of display names; a falsy or missing
        entry falls back to the numeric topic index.

    Returns
    -------
    list of list of str
        One single-element list per topic, holding the comma-joined top terms.
        The same entries are also appended to the module-level ``tem_list``
        (kept because other cells read ``tem_list`` by position).
    """
    collected = []

    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")

        # argsort is ascending; the reversed slice yields the indices of the
        # `no_top_words` largest weights, highest first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        top_words = ", ".join([feature_names[i] for i in top_indices])
        print(top_words)

        inner_tem_list = [top_words]
        tem_list.append(inner_tem_list)
        collected.append(inner_tem_list)

    return collected

In [None]:
# Print the top 20 terms of each LSA component. As a side effect the joined term
# strings are appended to tem_list, so re-running this cell alone appends them again.
result1 = display_topics(lsa, count_vectorizer.get_feature_names_out(), 20)


Topic  0
bone, muscle, ear, otitis, hearing, medium, membrane, bleeding, airway, deformity, obstruction, cord, canal, hip, abscess, throat, oxygen, ventilation, attack, nose

Topic  1
ear, otitis, hearing, medium, throat, sinusitis, nose, bleeding, membrane, externa, obstruction, sinus, septum, abscess, polyp, cord, canal, speech, airway, voice

Topic  2
ventilation, oxygen, airway, copd, breathing, alveolus, acidosis, hg, bronchiectasis, embolus, crackle, artery, collapse, silicosis, distress, inspiration, obstruction, cwp, pulse, well


In [None]:
# Label each LSA topic by its position in tem_list.
# Every value stays a one-element list holding the comma-joined top terms.
final_dic = {
    "Bone": tem_list[0],
    "Ear": tem_list[1],
    "Breathing": tem_list[2],
}

In [None]:
# Display the label -> top-terms mapping.
final_dic

{'Bone': ['bone, muscle, ear, otitis, hearing, medium, membrane, bleeding, airway, deformity, obstruction, cord, canal, hip, abscess, throat, oxygen, ventilation, attack, nose'],
 'Ear': ['ear, otitis, hearing, medium, throat, sinusitis, nose, bleeding, membrane, externa, obstruction, sinus, septum, abscess, polyp, cord, canal, speech, airway, voice'],
 'Breathing': ['ventilation, oxygen, airway, copd, breathing, alveolus, acidosis, hg, bronchiectasis, embolus, crackle, artery, collapse, silicosis, distress, inspiration, obstruction, cwp, pulse, well']}

In [None]:
# Turn the mapping into a frame: dictionary keys become the row index and the
# single list element of each value lands in column 0.
tem_df = pd.DataFrame.from_dict(final_dic, orient ='index')
tem_df

Unnamed: 0,0
Bone,"bone, muscle, ear, otitis, hearing, medium, me..."
Ear,"ear, otitis, hearing, medium, throat, sinusiti..."
Breathing,"ventilation, oxygen, airway, copd, breathing, ..."


In [None]:
# Chapter name for each row of tem_df, in row order (Bone, Ear, Breathing).
d_name = ['musculoskeletal', 'ear_nose', 'respiratory']

# Using 'D_Name' as the column name
# and equating it to the list
tem_df['D_Name'] = d_name

In [None]:
# Check the column labels before renaming the unnamed column 0.
tem_df.columns

Index([0, 'D_Name'], dtype='object')

In [None]:
# Give the unnamed column 0 (the joined top terms) a descriptive name.
tem_df = tem_df.rename(columns={0: 'Description'})
tem_df

Unnamed: 0,Description,D_Name
Bone,"bone, muscle, ear, otitis, hearing, medium, me...",musculoskeletal
Ear,"ear, otitis, hearing, medium, throat, sinusiti...",ear_nose
Breathing,"ventilation, oxygen, airway, copd, breathing, ...",respiratory


In [None]:
# Save the topic descriptions. index=False omits the Bone/Ear/Breathing row labels,
# so only the Description and D_Name columns are written.
tem_df.to_csv('diseases_with_description.csv', index=False)