In [1]:
import pandas as pd
import pickle

In [2]:
# Read the pickled metadata from 'meta_words' and wrap it in a DataFrame.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only run this against a 'meta_words' file from a trusted source.
with open('meta_words', mode='rb') as fh:
    meta_data = pickle.load(fh)

df = pd.DataFrame(data=meta_data)
# Show the column names as a quick check of what was loaded.
df.columns

Index(['cord_uid', 'title', 'abs_n', 'abs_v', 'abs_aj', 'abs_av'], dtype='object')

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,cord_uid,title,abs_n,abs_v,abs_aj,abs_av
0,ug7v899j,"[clinical, features, culture-proven, mycoplasm...","[objective, retrospective, chart, review, desc...","[identify, review, identify, require, occur, a...","[clinical, positive, most, community-acquired,...",[more]
1,02tnwd4m,"[nitric, oxide, pro-inflammatory, mediator, lu...","[inflammatory, disease, respiratory, tract, pr...","[associate, elevate, increase, know, presume, ...","[nitric, oxidative, anti-microbial, anti-oxida...","[commonly, often, comprehensively, instead]"
2,ejv2xln0,"[surfactant, protein-d, pulmonary, host, defense]","[surfactant, protein-d, sp-d, participates, in...","[inhale, synthesize, secrete, express, line, e...","[organic, epithelial, epithelial, various, gen...","[also, also, specifically, abnormally, appropr..."
3,2b73a28n,"[role, endothelin-1, lung, disease]","[endothelin-1, et-1, amino, acid, peptide, div...",[implicate],"[biological, numerous, pulmonary]",[]
4,9785vg6d,"[gene, expression, epithelial, cells, response...","[respiratory, virus, rsv, pneumonia, virus, mo...","[include, understand, include]","[syncytial, important, epithelial, viral, spec...","[subfamily, clinically, respectively, incomple..."


In [4]:
# Peek at the first five entries of the 'title' column, which feeds the topic model below.
df['title'].head(n=5)

0    [clinical, features, culture-proven, mycoplasm...
1    [nitric, oxide, pro-inflammatory, mediator, lu...
2    [surfactant, protein-d, pulmonary, host, defense]
3                  [role, endothelin-1, lung, disease]
4    [gene, expression, epithelial, cells, response...
Name: title, dtype: object

In [5]:
from gensim import corpora, models
from pprint import pprint

In [6]:
def _as_token_list(value):
    """Return `value` as a list of strings.

    A list is converted element by element with `str`; any other value is
    stringified and wrapped in a single-element list.
    """
    if isinstance(value, list):
        return [str(token) for token in value]
    return [str(value)]


# Normalise every title to a list of string tokens, then build the
# gensim dictionary from those token lists.
data = df['title'].apply(_as_token_list)
dictionary = corpora.Dictionary(data)

In [7]:
# Convert each tokenised title to its bag-of-words form via the dictionary.
corpus = list(map(dictionary.doc2bow, data))

In [8]:
# Train a 5-topic LDA model on the bag-of-words title corpus (15 passes).
# LDA training is stochastic: without a fixed seed the topics change on every
# run, so random_state is pinned to keep Restart & Run All reproducible.
lda_model = models.LdaModel(
    corpus,
    num_topics=5,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

In [9]:
# Pretty-print the model's topics using print_topics' default settings.
default_topics = lda_model.print_topics()
pprint(default_topics)

[(0,
  '0.068*"covid-19" + 0.042*"pandemic" + 0.024*"health" + 0.011*"impact" + '
  '0.011*"care" + 0.010*"among" + 0.009*"study" + 0.007*"social" + 0.007*"de" '
  '+ 0.007*"healthcare"'),
 (1,
  '0.080*"covid-19" + 0.039*"patients" + 0.015*"disease" + 0.015*"review" + '
  '0.014*"case" + 0.013*"study" + 0.011*"clinical" + 0.011*"acute" + '
  '0.010*"covid‐19" + 0.009*"severe"'),
 (2,
  '0.027*"using" + 0.016*"detection" + 0.011*"based" + 0.011*"model" + '
  '0.010*"learning" + 0.010*"analysis" + 0.009*"approach" + 0.009*"data" + '
  '0.008*"rapid" + 0.007*"network"'),
 (3,
  '0.030*"study" + 0.013*"trial" + 0.011*"protocol" + 0.009*"people" + '
  '0.009*"children" + 0.009*"psychological" + 0.009*"practice" + '
  '0.009*"online" + 0.008*"hospitalized" + 0.008*"randomized"'),
 (4,
  '0.048*"sars-cov-2" + 0.024*"coronavirus" + 0.014*"infection" + '
  '0.013*"virus" + 0.010*"human" + 0.009*"viral" + 0.008*"disease" + '
  '0.008*"potential" + 0.007*"respiratory" + 0.007*"protein"')]


In [9]:
# Print each topic with its top 100 weighted words, one (id, terms) tuple per line.
topics = lda_model.print_topics(num_words=100)
for topic_id, topic_terms in topics:
    print((topic_id, topic_terms))

(0, '0.031*"covid-19" + 0.018*"using" + 0.015*"de" + 0.012*"learning" + 0.012*"medical" + 0.010*"cross-sectional" + 0.008*"la" + 0.008*"based" + 0.007*"students" + 0.007*"education" + 0.007*"symptoms" + 0.007*"en" + 0.006*"data" + 0.006*"model" + 0.006*"testing" + 0.006*"medicine" + 0.006*"network" + 0.005*"approach" + 0.005*"analysis" + 0.005*"prediction" + 0.005*"deep" + 0.004*"online" + 0.004*"methods" + 0.004*"longitudinal" + 0.004*"program" + 0.004*"media" + 0.004*"study" + 0.004*"depression" + 0.004*"intervention" + 0.004*"evaluation" + 0.003*"system" + 0.003*"machine" + 0.003*"detection" + 0.003*"social" + 0.003*"exploring" + 0.003*"monitoring" + 0.003*"stress" + 0.003*"diagnostic" + 0.003*"application" + 0.003*"remote" + 0.003*"disorder" + 0.003*"face" + 0.003*"science" + 0.003*"home" + 0.003*"models" + 0.003*"society" + 0.003*"distancing" + 0.003*"chinese" + 0.003*"et" + 0.003*"randomised" + 0.003*"artificial" + 0.003*"level" + 0.003*"predicting" + 0.003*"dental" + 0.003*"beha

In [10]:
pip install pyLDAvis

Note: you may need to restart the kernel to use updated packages.


In [10]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Build the pyLDAvis data structure from the trained model, corpus and dictionary.
vis_data = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


In [11]:
# Render the prepared visualisation inline; the trailing expression is the cell output.
vis_inline = pyLDAvis.display(vis_data)
vis_inline

In [16]:
# Save the prepared visualisation to an HTML file in the working directory.
output_html = 'lda_visualization.html'
pyLDAvis.save_html(vis_data, output_html)