# Text Rank

https://towardsdatascience.com/textrank-for-keyword-extraction-by-python-c0bae21bcec0#:~:text=TextRank%20is%20an%20algorithm%20based,Extraction%20with%20TextRank%2C%20NER%2C%20etc

In [10]:
!python -m spacy download en_core_web_sm

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


You should consider upgrading via the 'C:\Users\Tay\AppData\Local\Programs\Python\Python37\python.exe -m pip install --upgrade pip' command.


In [11]:
import spacy

In [15]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [16]:
content = '''
The Wan--dering Earth, described as China’s first big-budget science fiction thriller, quietly made it onto screens at AMC theaters in North America this weekend, and it shows a new side of Chinese filmmaking — one focused toward futuristic spectacles rather than China’s traditionally grand, massive historical epics. At the same time, The Wandering Earth feels like a throwback to a few familiar eras of American filmmaking. While the film’s cast, setting, and tone are all Chinese, longtime science fiction fans are going to see a lot on the screen that reminds them of other movies, for better or worse.
'''
doc = nlp(content)
for sents in doc.sents:
    print(sents.text)


The Wandering Earth, described as China’s first big-budget science fiction thriller, quietly made it onto screens at AMC theaters in North America this weekend, and it shows a new side of Chinese filmmaking —
one focused toward futuristic spectacles rather than China’s traditionally grand, massive historical epics.
At the same time, The Wandering Earth feels like a throwback to a few familiar eras of American filmmaking.
While the film’s cast, setting, and tone are all Chinese, longtime science fiction fans are going to see a lot on the screen that reminds them of other movies, for better or worse.



In [17]:
candidate_pos = ['NOUN', 'PROPN', 'VERB']
sentences = []

for sent in doc.sents:
    selected_words = []
    for token in sent:
        if token.pos_ in candidate_pos and token.is_stop is False:
            selected_words.append(token)
    sentences.append(selected_words)

print(sentences)

[[Wandering, Earth, described, China, budget, science, fiction, thriller, screens, AMC, theaters, North, America, weekend, shows, filmmaking], [focused, spectacles, China, epics], [time, Wandering, Earth, feels, throwback, eras, filmmaking], [film, cast, setting, tone, science, fiction, fans, going, lot, screen, reminds, movies]]


In [19]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in candidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            print(key + ' - ' + str(value))
            if i > number:
                break
        
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

In [21]:
text = '''
The Wandering Earth, described as China’s first big-budget science fiction thriller, quietly made it onto screens at AMC theaters in North America this weekend, and it shows a new side of Chinese filmmaking — one focused toward futuristic spectacles rather than China’s traditionally grand, massive historical epics. At the same time, The Wandering Earth feels like a throwback to a few familiar eras of American filmmaking. While the film’s cast, setting, and tone are all Chinese, longtime science fiction fans are going to see a lot on the screen that reminds them of other movies, for better or worse.
'''

tr4w = TextRank4Keyword()
tr4w.analyze(text, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
tr4w.get_keywords(10)

science - 1.798260689279155
fiction - 1.779131473009314
China - 1.4462558888396955
Earth - 1.4059858425980583
filmmaking - 1.1017439574314574
tone - 1.1013378017184836
fans - 1.1013378017184836
Wandering - 1.0937062139249636
budget - 1.049677380542437
North - 1.0467124368686869
theaters - 1.005387626262626
AMC - 1.0015524235865145


Extract Wikipedia page

In [22]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

import time
import re

In [24]:
chrome_driver_path = r'C:\Users\Tay\Downloads\Installers and Applications\chromedriver_win32\chromedriver.exe'
# chrome_driver_path = r'C:\Users\tay.yq.XTRAMAN\Downloads\Installers and Applications\chromedriver_win32\chromedriver.exe'

In [25]:
# Creation of a new instance of Chrome
driver = webdriver.Chrome(executable_path=chrome_driver_path)

In [26]:
def scrapeWikiPage(url):
    # scroll through the whole page in 5 scrolls
    def scroll_page():
        for i in range(5):
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight"
                + " * " + str(i+1) + "/5);")
            
    driver.get(url)
    time.sleep(2)
    scroll_page()
    
    corpus = driver.find_element_by_id("mw-content-text").text
    
    return corpus

**Various wikipedia pages:**
- Donald Trump: https://en.wikipedia.org/wiki/Donald_Trump
- Xi Jinping: https://en.wikipedia.org/wiki/Xi_Jinping

In [32]:
wikiCorpus = scrapeWikiPage(url=r"https://en.wikipedia.org/wiki/Donald_Trump")

In [33]:
def cleanCorpus(corpus):
    cleanedCorpus = corpus
    # replace newline character(s) with 1 blank space each
#     cleanedCorpus = re.sub('(\n+)', ' ', cleanedCorpus)
    # Remove words in round brackets ()
#     cleanedCorpus = re.sub('(\(.+\))', '', cleanedCorpus)
    # Remove words in square brackets []
    cleanedCorpus = re.sub('(\[.*\])', '', cleanedCorpus)
    
    return cleanedCorpus

In [34]:
cleanWikiCorpus = cleanCorpus(wikiCorpus)

In [35]:
cleanWikiCorpus



In [None]:
text = '''
The Wandering Earth, described as China’s first big-budget science fiction thriller, quietly made it onto screens at AMC theaters in North America this weekend, and it shows a new side of Chinese filmmaking — one focused toward futuristic spectacles rather than China’s traditionally grand, massive historical epics. At the same time, The Wandering Earth feels like a throwback to a few familiar eras of American filmmaking. While the film’s cast, setting, and tone are all Chinese, longtime science fiction fans are going to see a lot on the screen that reminds them of other movies, for better or worse.
'''

In [36]:
tr4w = TextRank4Keyword()
tr4w.analyze(cleanWikiCorpus, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
tr4w.get_keywords(10)

Trump - 160.76885331429446
Donald - 38.26458924455292
January - 23.40583844066193
March - 23.130308279832644
U.S. - 21.76986386618425
June - 21.428974594771855
April - 19.389351469983858
July - 19.029426469698567
b - 18.868967458064905
October - 18.464923416527576
September - 17.377986567221917
^ - 16.939919000663146
