In [4]:
import pandas as pd
import pickle

In [5]:
# Load the pre-tokenised metadata from the local pickle file 'meta_words'.
# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising;
# only open this file if it comes from a trusted source.
with open('meta_words', 'rb') as pickle_handle:
    meta_data = pickle.load(pickle_handle)

# Wrap the unpickled object in a DataFrame and show which columns it provides.
df = pd.DataFrame(meta_data)
df.columns

Index(['cord_uid', 'title', 'abs_n', 'abs_v', 'abs_aj', 'abs_av'], dtype='object')

In [6]:
df['title'].head()

0    [clinical, features, culture-proven, mycoplasm...
1    [nitric, oxide, pro-inflammatory, mediator, lu...
2    [surfactant, protein-d, pulmonary, host, defense]
3                  [role, endothelin-1, lung, disease]
4    [gene, expression, epithelial, cells, response...
Name: title, dtype: object

In [7]:
from gensim import corpora, models
from pprint import pprint

In [8]:
def _title_tokens(value):
    """Normalise one `title` cell into a list of string tokens.

    Parameters
    ----------
    value : object
        A cell of df['title']; expected to be a list of tokens, but may be
        a missing value or some other scalar.

    Returns
    -------
    list of str
        * list input   -> every item converted with str()
        * None / NaN   -> [] (an empty document), so that missing titles do
                          not inject the bogus tokens 'None' / 'nan' into
                          the vocabulary
        * anything else -> a single-token document [str(value)]
    """
    if isinstance(value, list):
        return [str(item) for item in value]
    # `value != value` is only true for NaN; this catches float('nan') and
    # numpy float NaNs (which subclass float) without importing anything.
    if value is None or (isinstance(value, float) and value != value):
        return []
    return [str(value)]


# One token list per row, kept aligned with df's index (missing titles
# become empty documents instead of being dropped).
data = df['title'].apply(_title_tokens)

# Map every distinct token to an integer id.
dictionary = corpora.Dictionary(data)

In [9]:
# Convert every tokenised title into its bag-of-words representation using
# the dictionary; the result is a list with one entry per document.
corpus = list(map(dictionary.doc2bow, data))

In [10]:
# Train a 10-topic LDA model on the bag-of-words corpus, making 35 passes
# over the data. LDA training is stochastic: without a fixed seed the topics
# (and their numbering) change on every run, so the printed topics and the
# saved visualisation could not be reproduced. random_state pins the RNG.
lda_model = models.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=35,
    random_state=42,
)

In [11]:
# Pretty-print the model's topics. In the recorded output each entry is a
# (topic_id, 'weight*"word" + ...') tuple.
pprint(lda_model.print_topics())

[(0,
  '0.037*"de" + 0.029*"covid-19" + 0.021*"la" + 0.017*"symptoms" + '
  '0.016*"lessons" + 0.013*"cells" + 0.012*"expression" + 0.010*"sars" + '
  '0.008*"protease" + 0.008*"gene"'),
 (1,
  '0.016*"analysis" + 0.014*"data" + 0.011*"people" + 0.011*"infectious" + '
  '0.011*"diseases" + 0.011*"online" + 0.011*"based" + 0.011*"model" + '
  '0.010*"the" + 0.010*"transmission"'),
 (2,
  '0.028*"potential" + 0.014*"era" + 0.013*"en" + 0.012*"therapeutic" + '
  '0.010*"training" + 0.010*"treatment" + 0.009*"blood" + 0.009*"in" + '
  '0.008*"therapy" + 0.008*"ace2"'),
 (3,
  '0.084*"covid-19" + 0.041*"pandemic" + 0.027*"coronavirus" + 0.026*"study" + '
  '0.023*"a" + 0.021*"covid‐19" + 0.019*"among" + 0.016*"the" + 0.016*"impact" '
  '+ 0.015*"during"'),
 (4,
  '0.041*"review" + 0.039*"a" + 0.038*"patients" + 0.037*"covid-19" + '
  '0.030*"study" + 0.029*"respiratory" + 0.022*"clinical" + 0.018*"systematic" '
  '+ 0.015*"acute" + 0.014*"syndrome"'),
 (5,
  '0.084*"covid-19" + 0.046*"patie

In [12]:
# Fetch the topics with 10 words each and print every (topic_id, terms)
# tuple on its own line, without pprint's line wrapping.
topics = lda_model.print_topics(num_words=10)
for topic_entry in topics:
    print(topic_entry)

(0, '0.037*"de" + 0.029*"covid-19" + 0.021*"la" + 0.017*"symptoms" + 0.016*"lessons" + 0.013*"cells" + 0.012*"expression" + 0.010*"sars" + 0.008*"protease" + 0.008*"gene"')
(1, '0.016*"analysis" + 0.014*"data" + 0.011*"people" + 0.011*"infectious" + 0.011*"diseases" + 0.011*"online" + 0.011*"based" + 0.011*"model" + 0.010*"the" + 0.010*"transmission"')
(2, '0.028*"potential" + 0.014*"era" + 0.013*"en" + 0.012*"therapeutic" + 0.010*"training" + 0.010*"treatment" + 0.009*"blood" + 0.009*"in" + 0.008*"therapy" + 0.008*"ace2"')
(3, '0.084*"covid-19" + 0.041*"pandemic" + 0.027*"coronavirus" + 0.026*"study" + 0.023*"a" + 0.021*"covid‐19" + 0.019*"among" + 0.016*"the" + 0.016*"impact" + 0.015*"during"')
(4, '0.041*"review" + 0.039*"a" + 0.038*"patients" + 0.037*"covid-19" + 0.030*"study" + 0.029*"respiratory" + 0.022*"clinical" + 0.018*"systematic" + 0.015*"acute" + 0.014*"syndrome"')
(5, '0.084*"covid-19" + 0.046*"patients" + 0.035*"case" + 0.027*"disease" + 0.020*"a" + 0.019*"report" + 0.01

In [26]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel, and pin the version (3.4.1 is the release this notebook was
# run against) so the notebook keeps working when newer releases appear.
%pip install pyLDAvis==3.4.1

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting numpy>=1.24.2 (from pyLDAvis)
  Downloading numpy-1.26.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting numexpr (from pyLDAvis)
  Downloading numexpr-2.8.7-cp310-cp310-macosx_11_0_arm64.whl.metadata (8.7 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Collecting scikit-learn>=1.0.0 (from pyLDAvis)
  Downloading scikit_learn-1.3.2-cp310-cp310-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn>=1.0.0->pyLDAvis)
  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Downloading numpy-1.26.2-cp310-cp310-macosx_11_0_arm64.whl (14.0 MB)
[2K

In [13]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Build the pyLDAvis data structure from the trained model, the
# bag-of-words corpus it was trained on, and the token dictionary.
vis_data = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


In [15]:
# Show the prepared LDA visualisation data in the notebook; as the cell's
# last expression, the value returned by display() becomes the cell output.
pyLDAvis.display(vis_data)

In [16]:
# Export the same visualisation data to 'lda_visualization.html'
# (a relative path, so presumably written to the kernel's working directory).
pyLDAvis.save_html(vis_data, 'lda_visualization.html')