In [44]:
import pandas as pd
import pickle
from gensim import corpora
from gensim.models import LdaModel
from pprint import pprint

In [2]:
# Load the pickled object stored in 'meta_words' and wrap it in a DataFrame.
# NOTE(review): pickle.load can execute arbitrary code — only open files from a trusted source.
with open('meta_words', 'rb') as file:
    meta_data = pickle.load(file)

df = pd.DataFrame(meta_data)
# Last expression of the cell: displays the available column names.
df.columns

Index(['cord_uid', 'title', 'abs_n', 'abs_v', 'abs_aj', 'abs_av'], dtype='object')

In [43]:
# (rows, columns) of the current working frame.
# NOTE(review): the recorded output carries execution count 43 and reports 17 columns,
# while the frame built from 'meta_words' lists 6 — the output looks stale; re-run top to bottom to confirm.
df.shape

(389899, 17)

In [3]:
# Preview the first few values of the 'cord_uid' column.
df['cord_uid'].head()

0    ug7v899j
1    02tnwd4m
2    ejv2xln0
3    2b73a28n
4    9785vg6d
Name: cord_uid, dtype: object

In [4]:
# Preview the first few values of the 'title' column.
df['title'].head()

0    [clinical, features, culture-proven, mycoplasm...
1    [nitric, oxide, pro-inflammatory, mediator, lu...
2    [surfactant, protein-d, pulmonary, host, defense]
3                  [role, endothelin-1, lung, disease]
4    [gene, expression, epithelial, cells, response...
Name: title, dtype: object

In [12]:
#corpus[1:5]

[[(13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)],
 [(19, 1), (20, 1), (21, 1), (22, 1), (23, 1)],
 [(13, 1), (14, 1), (24, 1), (25, 1)],
 [(26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1)]]

In [19]:
#lda_model1 = LdaModel(corpus, id2word=dictionary, num_topics=2, passes=1)
#pprint(lda_model1.print_topics())

[(0,
  '0.026*"covid-19" + 0.022*"sars-cov-2" + 0.021*"patients" + '
  '0.011*"infection" + 0.010*"disease" + 0.009*"coronavirus" + '
  '0.007*"respiratory" + 0.007*"clinical" + 0.006*"virus" + 0.006*"acute"'),
 (1,
  '0.051*"covid-19" + 0.023*"pandemic" + 0.013*"health" + 0.012*"study" + '
  '0.008*"care" + 0.008*"impact" + 0.007*"among" + 0.005*"covid‐19" + '
  '0.005*"analysis" + 0.004*"social"')]


In [7]:
from gensim.models import Phrases
from gensim import corpora
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import nltk
import re

In [8]:
# Shared lemmatizer instance used by the token-cleaning cells.
wnl = nltk.WordNetLemmatizer()
# English stop words, kept in a set: the cleaning cells test membership once per token,
# and a set lookup is O(1) where scanning the original list was O(len(list)).
# NOTE: this name rebinds the `stopwords` module imported from nltk.corpus, which is why
# the corpus is reached through `nltk.corpus` here.
stopwords = set(nltk.corpus.stopwords.words("english"))

In [9]:
# Replace the working frame `df` with the object pickled in 'valid_metadata'.
# NOTE(review): pickle.load can execute arbitrary code — only open files from a trusted source.
with open('valid_metadata', 'rb') as file:
    # Use pickle to load the object from the file
    df = pickle.load(file)
    
def clean_text(text):
    """Return `text` with every round bracket, '(' or ')', stripped out."""
    bracket_pattern = re.compile(r'[\(\)]')
    return bracket_pattern.sub('', text)

# Strip round brackets from each title. Missing titles become an empty string so the
# column stays text-only; the previous `[]` fallback was stringified to "[]" by the
# tokenizing step below and produced stray '[' / ']' tokens.
df['cleaned'] = df['title'].apply(lambda title: clean_text(title) if pd.notnull(title) else '')

# Lower-case and tokenize every cleaned title; the last expression previews the result.
df['words'] = df['cleaned'].apply(lambda text: nltk.word_tokenize(str(text).lower()))
df['words'].head()

0    [clinical, features, of, culture-proven, mycop...
1    [nitric, oxide, :, a, pro-inflammatory, mediat...
2    [surfactant, protein-d, and, pulmonary, host, ...
3          [role, of, endothelin-1, in, lung, disease]
4    [gene, expression, in, epithelial, cells, in, ...
Name: words, dtype: object

In [29]:
#print(df['words'].dtype)

object


In [10]:
def _lemmatize_content_words(word_list):
    """Lemmatize the purely alphabetic, non-stop-word tokens of one token list."""
    kept = []
    for word in word_list:
        # isalpha() discards hyphenated and numeric tokens as well as punctuation.
        if not word.isalpha() or word in stopwords:
            continue
        kept.append(wnl.lemmatize(word))
    return kept

# Overwrites the raw token lists with their filtered, lemmatized form.
df['words'] = df['words'].apply(_lemmatize_content_words)
df['words'].head()

0    [clinical, feature, mycoplasma, pneumoniae, in...
1             [nitric, oxide, mediator, lung, disease]
2               [surfactant, pulmonary, host, defense]
3                                [role, lung, disease]
4    [gene, expression, epithelial, cell, response,...
Name: words, dtype: object

In [16]:
# Pair every token with its successor: the consecutive bigrams of each title, as tuples.
df['bigrams'] = df['words'].apply(lambda tokens: list(zip(tokens, tokens[1:])))

In [17]:
# Preview the per-title bigram lists.
df['bigrams'].head()

0    [(clinical, feature), (feature, mycoplasma), (...
1    [(nitric, oxide), (oxide, mediator), (mediator...
2    [(surfactant, pulmonary), (pulmonary, host), (...
3                      [(role, lung), (lung, disease)]
4    [(gene, expression), (expression, epithelial),...
Name: bigrams, dtype: object

In [18]:
# Slide a window of three over the tokens: the consecutive trigrams of each title, as tuples.
df['trigrams'] = df['words'].apply(lambda tokens: list(zip(tokens, tokens[1:], tokens[2:])))

In [20]:
# Preview the per-title trigram lists.
df['trigrams'].head()

0    [(clinical, feature, mycoplasma), (feature, my...
1    [(nitric, oxide, mediator), (oxide, mediator, ...
2    [(surfactant, pulmonary, host), (pulmonary, ho...
3                              [(role, lung, disease)]
4    [(gene, expression, epithelial), (expression, ...
Name: trigrams, dtype: object

In [24]:
# Print the dtype of the trigram column (the recorded run reports `object`).
print(df['trigrams'].dtype)

object


In [31]:
# Inspect the Python type of a single trigram: entry [1] of the column, then element [1] of that list.
# NOTE(review): the first [1] is a label lookup when the index is integer-valued — TODO confirm.
type(df['trigrams'][1][1])

tuple

In [33]:
# Bigrams whose presence marks a title as relevant.
bi = [('infectious', 'disease'), ('virus', 'infection'), ('viral', 'infection')]


def _title_has_target_bigram(title_bigrams):
    """Return True when at least one bigram listed in `bi` occurs in `title_bigrams`."""
    for target in bi:
        if target in title_bigrams:
            return True
    return False


# One boolean per row: does the title contain any target bigram?
df['bi_in_title'] = df['bigrams'].apply(_title_has_target_bigram)

In [34]:
# Number of titles flagged by the bigram check. The vectorized Series.sum() avoids
# iterating the column element by element in Python; int() keeps a plain Python integer.
int(df['bi_in_title'].sum())

5133

In [35]:
# Preview the boolean flag column produced by the title bigram check.
df['bi_in_title'].head()

0    False
1    False
2    False
3    False
4    False
Name: bi_in_title, dtype: bool

In [37]:
# Show the first rows whose title matched a target bigram. The column is already boolean,
# so it is used directly as the mask instead of being compared with `== True`.
df[df['bi_in_title']].head()

Unnamed: 0,cord_uid,abstract,title,pdf_json_files,pmc_json_files,cleaned,words,bigrams,trigrams,bi_in_title
13,mcuixluu,We examined the role of the microtubule cytosk...,Vaccinia virus infection disrupts microtubule ...,document_parses/pdf_json/44102e3e69e70ad2a73e7...,document_parses/pmc_json/PMC306617.xml.json,Vaccinia virus infection disrupts microtubule ...,"[vaccinia, virus, infection, disrupts, microtu...","[(vaccinia, virus), (virus, infection), (infec...","[(vaccinia, virus, infection), (virus, infecti...",True
20,wnnsmx60,In the 1980's and 1990's HIV/AIDS was the emer...,Managing emerging infectious diseases: Is a fe...,document_parses/pdf_json/025339bfce1cb8efa81c5...,document_parses/pmc_json/PMC544965.xml.json,Managing emerging infectious diseases: Is a fe...,"[managing, emerging, infectious, disease, fede...","[(managing, emerging), (emerging, infectious),...","[(managing, emerging, infectious), (emerging, ...",True
83,1y5nej0m,Prophylaxis with high doses of neutralizing an...,Neutralizing Antibody Fails to Impact the Cour...,document_parses/pdf_json/b6353f8b0fcd86c2fd1e6...,document_parses/pmc_json/PMC1779296.xml.json,Neutralizing Antibody Fails to Impact the Cour...,"[neutralizing, antibody, fails, impact, course...","[(neutralizing, antibody), (antibody, fails), ...","[(neutralizing, antibody, fails), (antibody, f...",True
121,6lezilfv,BACKGROUND: Despite the seriousness of dengue-...,Host Gene Expression Profiling of Dengue Virus...,document_parses/pdf_json/63008713691bdb1d8eed0...,document_parses/pmc_json/PMC2100376.xml.json,Host Gene Expression Profiling of Dengue Virus...,"[host, gene, expression, profiling, dengue, vi...","[(host, gene), (gene, expression), (expression...","[(host, gene, expression), (gene, expression, ...",True
146,1dus0u4m,"BACKGROUND: AIDS, SARS, and the recent epidemi...","Can ""presumed consent"" justify the duty to tre...",document_parses/pdf_json/07abbba28dd64fdb2f811...,document_parses/pmc_json/PMC2311313.xml.json,"Can ""presumed consent"" justify the duty to tre...","[presumed, consent, justify, duty, treat, infe...","[(presumed, consent), (consent, justify), (jus...","[(presumed, consent, justify), (consent, justi...",True


In [38]:
# Trigrams whose presence marks a title as relevant.
tri = [
    ('respiratory', 'tract', 'infection'),
    ('infectious', 'bronchitis', 'virus'),
    ('infection', 'case', 'report'),
    ('acute', 'respiratory', 'infection'),
]

# One boolean per row: does any trigram of the title belong to the target list?
df['tri_in_title'] = df['trigrams'].apply(
    lambda title_trigrams: any(gram in tri for gram in title_trigrams)
)

In [39]:
# Abstract pipeline: strip brackets, tokenize, filter + lemmatize, then build n-grams.
# Missing abstracts become an empty string so the column stays text-only; the previous
# `[]` fallback was stringified to "[]" by the tokenizing step and produced stray
# '[' / ']' tokens.
df['cleaned_abs'] = df['abstract'].apply(lambda abstract: clean_text(abstract) if pd.notnull(abstract) else '')

# Lower-case and tokenize every cleaned abstract.
df['words_abs'] = df['cleaned_abs'].apply(lambda text: nltk.word_tokenize(str(text).lower()))

# Keep purely alphabetic, non-stop-word tokens and lemmatize them.
df['words_abs'] = df['words_abs'].apply(lambda word_list: [wnl.lemmatize(word) for word in word_list if word.isalpha() and word not in stopwords])

# Consecutive bigrams / trigrams of each abstract.
df['bigrams_abs'] = df['words_abs'].apply(lambda x: list(nltk.bigrams(x)))
df['trigrams_abs'] = df['words_abs'].apply(lambda x: list(nltk.trigrams(x)))

In [40]:
def _contains_any(ngrams, targets):
    """Return True when at least one element of `targets` occurs in `ngrams`."""
    return any(target in ngrams for target in targets)


# Bigrams whose presence marks an abstract as relevant.
# NOTE(review): unlike the title list `bi`, this one omits ('virus', 'infection') — confirm that is intended.
bi_abs = [('infectious', 'disease'), ('viral', 'infection')]

# One boolean per row: does the abstract contain any target bigram?
df['bi_in_abs'] = df['bigrams_abs'].apply(lambda abs_bigrams: _contains_any(abs_bigrams, bi_abs))

# Trigrams whose presence marks an abstract as relevant.
tri_abs = [('respiratory', 'tract', 'infection'), ('syndrome', 'coronavirus', 'infection')]

# One boolean per row: does the abstract contain any target trigram?
df['tri_in_abs'] = df['trigrams_abs'].apply(lambda abs_trigrams: _contains_any(abs_trigrams, tri_abs))

In [41]:
# Final selection: keep every row flagged by at least one of the four n-gram checks.
flag_columns = ['bi_in_abs', 'bi_in_title', 'tri_in_abs', 'tri_in_title']
selected_rows = df[flag_columns].any(axis=1)
df_sub = df[selected_rows]

In [42]:
# (rows, columns) of the selected subset.
df_sub.shape

(27362, 17)

In [46]:
# Column names carried by the selected subset.
df_sub.columns

Index(['cord_uid', 'abstract', 'title', 'pdf_json_files', 'pmc_json_files',
       'cleaned', 'words', 'bigrams', 'trigrams', 'bi_in_title',
       'tri_in_title', 'cleaned_abs', 'words_abs', 'bigrams_abs',
       'trigrams_abs', 'bi_in_abs', 'tri_in_abs'],
      dtype='object')

In [48]:
# Identifier column plus the two parse-file path columns.
# NOTE(review): this is taken from the full frame `df`, not the filtered `df_sub` — confirm that is intended.
id_columns = ['cord_uid', 'pdf_json_files', 'pmc_json_files']
df_ID = df[id_columns]

In [49]:
# Preview the identifier / file-path table.
df_ID.head()

Unnamed: 0,cord_uid,pdf_json_files,pmc_json_files
0,ug7v899j,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json
1,02tnwd4m,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json
2,ejv2xln0,document_parses/pdf_json/06ced00a5fc04215949aa...,document_parses/pmc_json/PMC59549.xml.json
3,2b73a28n,document_parses/pdf_json/348055649b6b8cf2b9a37...,document_parses/pmc_json/PMC59574.xml.json
4,9785vg6d,document_parses/pdf_json/5f48792a5fa08bed9f560...,document_parses/pmc_json/PMC59580.xml.json


In [45]:
# Persist the selected subset `df_sub` to 'df_Nov21.pkl' (overwrites any existing file).
with open('df_Nov21.pkl', 'wb') as f:
    pickle.dump(df_sub, f)

In [50]:
# Persist the identifier / file-path table `df_ID` to 'ID_Nov21.pkl' (overwrites any existing file).
with open('ID_Nov21.pkl', 'wb') as f:
    pickle.dump(df_ID, f)

In [38]:
#all_words = [word for sublist in df['words'] for word in sublist]
#all_words[:5]

['clinical', 'feature', 'mycoplasma', 'pneumoniae', 'infection']

In [41]:
#bigrams = list(nltk.bigrams(all_words))

In [42]:
#bigrams[1:5]

[('feature', 'mycoplasma'),
 ('mycoplasma', 'pneumoniae'),
 ('pneumoniae', 'infection'),
 ('infection', 'king')]

In [44]:
# Create a dictionary and a corpus
#dictionary = corpora.Dictionary(bigrams)
#corpus = [dictionary.doc2bow(doc) for doc in bigrams]

In [52]:
#corpus[1:5]

[[(1, 1), (2, 1)], [(2, 1), (3, 1)], [(3, 1), (4, 1)], [(4, 1), (5, 1)]]

In [54]:
#lda_model2 = LdaModel(corpus, id2word=dictionary, num_topics=2, passes=10)

In [55]:
#pprint(lda_model2.print_topics())

[(0,
  '0.027*"patient" + 0.016*"disease" + 0.016*"infection" + 0.015*"health" + '
  '0.012*"coronavirus" + 0.010*"care" + 0.009*"clinical" + 0.008*"among" + '
  '0.008*"respiratory" + 0.007*"response"'),
 (1,
  '0.029*"pandemic" + 0.023*"study" + 0.011*"case" + 0.011*"review" + '
  '0.011*"impact" + 0.010*"analysis" + 0.009*"virus" + 0.008*"effect" + '
  '0.007*"using" + 0.006*"treatment"')]
