In [2]:
import pandas as pd
import pickle

In [3]:
# Read the pickled records from disk and wrap them in a DataFrame.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this against a 'meta_words' file from a trusted source.
with open('meta_words', 'rb') as fh:
    meta_data = pickle.load(fh)

df = pd.DataFrame(data=meta_data)
# Last expression of the cell: show which columns were loaded.
df.columns

Index(['cord_uid', 'title', 'abs_n', 'abs_v', 'abs_aj', 'abs_av'], dtype='object')

In [4]:
# Peek at the first five entries of the `abs_n` column (each cell holds a token list).
df['abs_n'].head(n=5)

0    [objective, retrospective, chart, review, desc...
1    [inflammatory, disease, respiratory, tract, pr...
2    [surfactant, protein-d, sp-d, participates, in...
3    [endothelin-1, et-1, amino, acid, peptide, div...
4    [respiratory, virus, rsv, pneumonia, virus, mo...
Name: abs_n, dtype: object

In [5]:
from gensim import corpora, models
from pprint import pprint

In [6]:
def _as_token_list(value):
    """Return `value` as a list of string tokens.

    * a list is converted element-wise with ``str``;
    * a missing value (``None`` or float NaN) becomes an empty document;
    * any other scalar becomes a one-token list.
    """
    if isinstance(value, list):
        return [str(item) for item in value]
    # Without this guard a missing entry would be stringified into the bogus
    # token 'None' / 'nan' and end up in the dictionary and the topics.
    # NOTE(review): assumes non-list cells are either missing or a single
    # token -- confirm against the pipeline that produced 'meta_words'.
    if value is None or (isinstance(value, float) and value != value):
        return []
    return [str(value)]


# One token list per document, then the gensim token <-> id mapping.
data = df['abs_n'].apply(_as_token_list)
dictionary = corpora.Dictionary(data)

In [7]:
# Convert every tokenised document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, data))

In [11]:
# Hyper-parameters for the topic model, named so they are easy to tune.
NUM_TOPICS = 5
NUM_PASSES = 15
# Fix the RNG seed: without `random_state` the LDA initialisation is random,
# so the fitted topics (and their numbering) change on every re-run.
RANDOM_SEED = 42

lda_model = models.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=NUM_PASSES,
    random_state=RANDOM_SEED,
)

In [12]:
# Pretty-print the (topic_id, weighted-terms string) pairs using the default
# number of terms per topic.
topic_summaries = lda_model.print_topics()
pprint(topic_summaries)

[(0,
  '0.021*"model" + 0.011*"method" + 0.011*"data" + 0.009*"result" + '
  '0.009*"covid-19" + 0.007*"system" + 0.006*"number" + 0.006*"case" + '
  '0.006*"analysis" + 0.006*"study"'),
 (1,
  '0.053*"patient" + 0.037*"covid-19" + 0.015*"study" + 0.013*"disease" + '
  '0.010*"case" + 0.010*"risk" + 0.009*"group" + 0.009*"symptom" + '
  '0.009*"result" + 0.008*"treatment"'),
 (2,
  '0.024*"cell" + 0.015*"protein" + 0.010*"disease" + 0.010*"virus" + '
  '0.010*"infection" + 0.009*"drug" + 0.008*"immune" + 0.008*"response" + '
  '0.008*"study" + 0.007*"human"'),
 (3,
  '0.052*"sars-cov-2" + 0.029*"vaccine" + 0.025*"virus" + 0.023*"infection" + '
  '0.017*"antibody" + 0.017*"0" + 0.016*"respiratory" + 0.014*"coronavirus" + '
  '0.014*"vaccination" + 0.013*"de"'),
 (4,
  '0.021*"health" + 0.018*"covid-19" + 0.016*"pandemic" + 0.013*"study" + '
  '0.007*"care" + 0.007*"result" + 0.006*"data" + 0.006*"research" + '
  '0.006*"public" + 0.006*"impact"')]


In [10]:
# Expanded view: the 100 highest-weighted terms of each topic, one
# (topic_id, terms) tuple per output line.
topics = lda_model.print_topics(num_words=100)
for topic_id, terms in topics:
    print((topic_id, terms))

(0, '0.027*"cell" + 0.017*"protein" + 0.014*"0" + 0.013*"covid-19" + 0.010*"sars-cov-2" + 0.010*"drug" + 0.010*"virus" + 0.008*"immune" + 0.008*"human" + 0.007*"expression" + 0.007*"gene" + 0.007*"study" + 0.007*"activity" + 0.007*"response" + 0.006*"host" + 0.006*"receptor" + 0.006*"mechanism" + 0.006*"infection" + 0.006*"effect" + 0.006*"target" + 0.006*"role" + 0.005*"binding" + 0.005*"interaction" + 0.005*"ace2" + 0.005*"molecular" + 0.005*"rna" + 0.005*"mouse" + 0.004*"development" + 0.004*"pathway" + 0.004*"compound" + 0.004*"treatment" + 0.004*"system" + 0.004*"function" + 0.004*"replication" + 0.004*"inhibitor" + 0.004*"disease" + 0.004*"analysis" + 0.004*"structure" + 0.004*"acid" + 0.003*"coronavirus" + 0.003*"novel" + 0.003*"result" + 0.003*"tissue" + 0.003*"review" + 0.003*"spike" + 0.003*"surface" + 0.003*"vitro" + 0.003*"property" + 0.003*"molecule" + 0.003*"type" + 0.003*"enzyme" + 0.003*"activation" + 0.003*"process" + 0.003*"agent" + 0.003*"site" + 0.003*"found" + 0.00

In [13]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Build the pyLDAvis data from the fitted model, the bag-of-words corpus and
# the dictionary, then render the interactive view as the cell's output.
vis_data = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.display(vis_data)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
