In [1]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import pandas as pd

In [2]:
nlp = spacy.load('en_core_web_sm')

In [9]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        keyword_list = []
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            keyword_list.append(key)
            #print(key + ' - ' + str(value))
            if i > number:
                return keyword_list
        
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

In [10]:
# tr4w = TextRank4Keyword()
# text = '''BACKGROUND: Rhinovirus, the most common cause of upper respiratory tract infections, has been implicated in asthma exacerbations and possibly asthma deaths. Although the method of transmission of rhinoviruses is disputed, several studies have demonstrated that aerosol transmission is a likely method of transmission among adults. As a first step in studies of possible airborne rhinovirus transmission, we developed methods to detect aerosolized rhinovirus by extending existing technology for detecting infectious agents in nasal specimens. METHODS: We aerosolized rhinovirus in a small aerosol chamber. Experiments were conducted with decreasing concentrations of rhinovirus. To determine the effect of UV irradiation on detection of rhinoviral aerosols, we also conducted experiments in which we exposed aerosols to a UV dose of 684 mJ/m(2). Aerosols were collected on Teflon filters and rhinovirus recovered in Qiagen AVL buffer using the Qiagen QIAamp Viral RNA Kit (Qiagen Corp., Valencia, California) followed by semi-nested RT-PCR and detection by gel electrophoresis. RESULTS: We obtained positive results from filter samples that had collected at least 1.3 TCID(50 )of aerosolized rhinovirus. Ultraviolet irradiation of airborne virus at doses much greater than those used in upper-room UV germicidal irradiation applications did not inhibit subsequent detection with the RT-PCR assay. CONCLUSION: The air sampling and extraction methodology developed in this study should be applicable to the detection of rhinovirus and other airborne viruses in the indoor air of offices and schools. This method, however, cannot distinguish UV inactivated virus from infectious viral particles.'''
# tr4w.analyze(text, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
# snig_list = []
# snig_list = tr4w.get_keywords(10)
# print(snig_list)

['rhinovirus', 'detection', 'Qiagen', 'transmission', 'UV', 'irradiation', 'method', 'RT', 'aerosols', 'air', 'studies', 'PCR']


In [11]:
dataset = pd.read_csv('info_ret_textrank_new.csv')


In [12]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41549 entries, 0 to 41548
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    41549 non-null  int64 
 1   title         41549 non-null  object
 2   abstract      41549 non-null  object
 3   publish_time  41549 non-null  object
 4   authors       41549 non-null  object
 5   url           41549 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.9+ MB


In [14]:
dataset['text'] = dataset['title'] + dataset['abstract']
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41549 entries, 0 to 41548
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    41549 non-null  int64 
 1   title         41549 non-null  object
 2   abstract      41549 non-null  object
 3   publish_time  41549 non-null  object
 4   authors       41549 non-null  object
 5   url           41549 non-null  object
 6   text          41549 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.2+ MB


In [15]:
dataset = np.array(dataset)

In [47]:
final_list = []
temp_list = []
tr4w = TextRank4Keyword()
for i in range (0, 41549):
    x = dataset[i,6]
    tr4w.analyze(str(x), candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
    temp_list = tr4w.get_keywords(10)
    if temp_list is None:
        temp_list = tr4w.get_keywords(5)
    final_list.append(temp_list)



In [48]:
print(final_list[41548])

['muscle', 'eIF3f', 'MAFbx', 'initiation', 'hypertrophy', 'conditions', 'role', 'atrogin-1', 'atrophy', 'subunit', 'protein', 'translation']


In [49]:
dataset = pd.DataFrame(dataset)
dataset = dataset.assign(keywords=final_list)

In [50]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41549 entries, 0 to 41548
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   0         41549 non-null  object
 1   1         41549 non-null  object
 2   2         41549 non-null  object
 3   3         41549 non-null  object
 4   4         41549 non-null  object
 5   5         41549 non-null  object
 6   6         41549 non-null  object
 7   keywords  41461 non-null  object
dtypes: object(8)
memory usage: 2.5+ MB


In [51]:
dataset.to_csv('keywords.csv', index=False)

In [52]:
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,keywords
0,0,Airborne rhinovirus detection and effect of ul...,"BACKGROUND: Rhinovirus, the most common cause ...",2003-01-13,"Myatt, Theodore A; Johnston, Sebastian L; Rudn...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,Airborne rhinovirus detection and effect of ul...,"[rhinovirus, detection, Qiagen, transmission, ..."
1,1,Discovering human history from stomach bacteria,Recent analyses of human pathogens have reveal...,2003-04-28,"Disotell, Todd R",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,Discovering human history from stomach bacteri...,"[pathogens, histories, Helicobacter, groups, p..."
2,2,A new recruit for the army of the men of death,"The army of the men of death, in John Bunyan's...",2003-06-27,"Petsko, Gregory A",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,A new recruit for the army of the men of death...,"[recruit, men, army, phrase, face, Bunyan, fear]"
3,3,Association of HLA class I with severe acute r...,BACKGROUND: The human leukocyte antigen (HLA) ...,2003-09-12,"Lin, Marie; Tseng, Hsiang-Kuang; Trejaut, Jean...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,Association of HLA class I with severe acute r...,"[SARS, HLA, group, coronavirus, P, Pc, risk, I..."
4,4,A double epidemic model for the SARS propagation,BACKGROUND: An epidemic of a Severe Acute Resp...,2003-09-10,"Ng, Tuen Wai; Turinici, Gabriel; Danchin, Antoine",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,A double epidemic model for the SARS propagati...,"[SARS, epidemic, model, spread, epidemics, dis..."
