In [24]:
import spacy

import os
import sys
import json
import pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pprint import pprint

# Add the parent directory to the import path; presumably this is where the
# project-local `utils` and `workloads` packages live (verify against repo layout).
sys.path.append("../")
from utils.parse_arxiv import parse_title
# NOTE(review): wildcard import. Names used later in this notebook that are not
# defined here (e.g. `remove_noise_from_keywords`) presumably come from this
# module; an explicit import list would make that visible.
from workloads.keyword_extractor import *

from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
# Fetch the 'punkt' NLTK data package (a no-op message is printed when it is
# already up to date); the NLTK tokenizers imported above rely on it.
nltk.download('punkt')

# Load the small English spaCy pipeline (tokenizer, tagger, parser and NER).
# `nlp` is a notebook-level global used by the cells and functions below.
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /home/zyang37/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
# The context manager guarantees the handle is closed even if loading fails.
with open('../data/cnn_news/filtered_dataCNN.pickle', 'rb') as file:
    data = pickle.load(file)

# Use the article at positional index 10 as the working example for the
# rest of the notebook, with surrounding whitespace removed.
text = data['Article text'].iloc[10].strip()
text

'London (CNN Business)City dwellers are used to switching between apps to decide the best way to get from A to B. Is it quickest to get the train or the bus? What about a taxi or a city bike? Which provider has the nearest e-scooter?It can be inconvenient and time consuming. Which is why Finnish startup MaaS Global decided to aggregate all these services into one app called Whim. Available in more than 10 cities across Europe and Asia, users can access taxis, buses, bikes, e-scooters and rental cars. "Whim\'s sole purpose is to compete against car ownership," CEO Sampo Hietanen tells CNN Business.According to the International Energy Agency, transport is responsible for 24% of global energy-related CO2 emissions, most of which come from passenger vehicles. If Whim can persuade users to trade their car keys for a single app offering multiple transport options, the environmental impact could be enormous, says Hietanen. Car competitionRead MoreHe admits this isn\'t an easy task. To succee

In [8]:
# Run the spaCy pipeline once over the sample article; `doc` is reused below.
doc = nlp(text)

# Syntax view: the text of every noun chunk, in document order.
print("Noun phrases:", [span.text for span in doc.noun_chunks])

# Blank separator line, then one "<entity text> <label>" line per named entity.
print()
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

Noun phrases: ['London', 'CNN Business)City dwellers', 'apps', 'the best way', 'A', 'B.', 'it', 'the train', 'the bus', 'a taxi', 'a city bike', 'Which provider', 'e', '-', 'scooter?It', 'Which', 'MaaS Global', 'all these services', 'one app', 'Whim', 'more than 10 cities', 'Europe', 'Asia', 'users', 'buses', 'bikes', 'e', '-', 'scooters', 'rental cars', "Whim's sole purpose", 'car ownership', 'CEO Sampo Hietanen', 'CNN Business', 'the International Energy Agency', 'transport', '24%', 'global energy-related CO2 emissions', 'which', 'passenger vehicles', 'Whim', 'users', 'their car', 'keys', 'a single app', 'multiple transport options', 'the environmental impact', 'Hietanen', 'Car competitionRead MoreHe', 'this', 'an easy task', 'Whim', 'a car', 'The car', 'freedom', 'mobility', 'Hietanen', 'a city dweller', 'it', 'they', 'it', 'a "freedom insurance', '"To', 'Whim', 'rental cars', 'taxis', 'Hietanen', 'users', 'public transport', 'micromobility', 'lightweight vehicles', 'bikes', 'e', '-

In [46]:
# Score a noun chunk against the article it was extracted from.
def compute_tfidf_score(article, noun_chunk):
    """Return the mean per-token score of ``noun_chunk`` with respect to ``article``.

    Each token from ``word_tokenize(noun_chunk)`` that occurs in ``article``
    contributes ``tf * idf`` where::

        tf  = article.count(token) / len(article)
        idf = log(len(article) / article.count(token))

    Tokens that do not occur in ``article`` are skipped.

    NOTE(review): when ``article`` is a ``str`` (as in this notebook),
    ``article.count`` counts substring occurrences and ``len(article)`` is the
    character count, and the "idf" is derived from the same single document.
    This is therefore a within-article heuristic rather than corpus-level
    TF-IDF -- confirm that this is intended before relying on the ranking.

    Args:
        article: The text (or other sequence supporting ``count``/``len``)
            the chunk is scored against.
        noun_chunk: The candidate phrase to score.

    Returns:
        float: The average score over the tokens found in ``article``, or
        ``0.0`` when ``article`` is empty or none of the tokens occur in it
        (instead of raising ``ZeroDivisionError`` or returning ``nan``).
    """
    article_len = len(article)
    if article_len == 0:
        # Nothing to score against; avoids dividing by zero below.
        return 0.0

    token_scores = []
    for token in word_tokenize(noun_chunk):
        occurrences = article.count(token)
        if occurrences == 0:
            # Token is not in the article: skip it explicitly rather than
            # relying on a swallowed ZeroDivisionError.
            continue
        tf = occurrences / article_len
        idf = np.log(article_len / occurrences)
        token_scores.append(tf * idf)

    if not token_scores:
        # np.mean([]) would yield nan (plus a RuntimeWarning), and nan scores
        # make any later sort of (score, chunk) tuples unreliable.
        return 0.0
    return float(np.mean(token_scores))

In [54]:
# Candidate keywords: the unique noun-chunk strings found in the parsed article.
noun_chunk_list = [chunk.text for chunk in doc.noun_chunks]
noun_chunk_list = list(set(noun_chunk_list))
# Drop chunks that have fewer than 3 tokens.
noun_chunk_list = [noun_chunk for noun_chunk in noun_chunk_list if len(word_tokenize(noun_chunk)) > 2]
noun_chunk_list = remove_noise_from_keywords(noun_chunk_list)

# Build [(score, chunk), ...] where the score is the chunk's average token
# score weighted by its character length, then sort by score, highest first.
noun_chunk_list = [(compute_tfidf_score(text, noun_chunk)*len(noun_chunk), noun_chunk) for noun_chunk in noun_chunk_list]
noun_chunk_list.sort(reverse=True)

# Print the top 20 noun chunks; slicing (rather than indexing range(20))
# avoids an IndexError when fewer than 20 chunks survive the filters.
for scored_chunk in noun_chunk_list[:20]:
    print(scored_chunk)

(1.2002119693478979, 'a wide public transport system')
(1.1892532075776696, 'a growing world population')
(0.9893733511552046, 'a company survey')
(0.8666137178165841, 'a city dweller')
(0.8165220785701209, 'a travel pass')
(0.7808617292637576, 'a single app')
(0.6862940752284352, 'a city bike')
(0.4782287442289171, 'the global mobility service market')
(0.4322144572483626, 'the International Transport Forum')
(0.4060196416575528, 'the International Energy Agency')
(0.4005554525773901, 'the more sustainable solution')
(0.39121906971601206, 'the environmental impact')
(0.3810195888775107, 'the earliest providers')
(0.35861748057301107, 'the complex technology')
(0.31174329999069056, 'the market matures')
(0.28275346614601493, 'the most popular ways')
(0.2780196156238534, 'several European and Asian cities')
(0.27226698116395825, 'an easy task')
(0.22821112400100704, 'the first step')
(0.20782886666046035, 'the best way')


In [64]:
def extract_keywords_spacy(text: str, score: bool = True, min_token: int = 3):
    """Extract ranked noun-chunk keywords from ``text`` using the spaCy pipeline.

    The text is stripped and parsed with the notebook-level ``nlp`` pipeline.
    Its distinct noun chunks are kept when they have at least ``min_token``
    tokens (per ``word_tokenize``) and are then passed through
    ``remove_noise_from_keywords``. Each remaining chunk is scored as
    ``compute_tfidf_score(text, chunk) * len(chunk)`` and the results are
    ordered from highest to lowest score.

    Args:
        text: The article text to extract keywords from.
        score: When True, return ``(score, chunk)`` tuples; otherwise return
            only the chunk strings, in the same order.
        min_token: Minimum number of tokens a chunk must have to be kept.

    Returns:
        list: ``[(score, chunk), ...]`` if ``score`` is True, else ``[chunk, ...]``.
    """
    text = text.strip()
    distinct_chunks = list({span.text for span in nlp(text).noun_chunks})

    # Keep only chunks that are long enough, then let the project helper
    # filter out noisy candidates.
    long_enough = [
        candidate
        for candidate in distinct_chunks
        if len(word_tokenize(candidate)) > (min_token - 1)
    ]
    candidates = remove_noise_from_keywords(long_enough)

    # Weight each chunk's score by its character length and rank descending.
    ranked = sorted(
        [(compute_tfidf_score(text, candidate) * len(candidate), candidate) for candidate in candidates],
        reverse=True,
    )

    if not score:
        return [chunk_text for _, chunk_text in ranked]
    return ranked

In [65]:
extract_keywords_spacy(text, score=True, min_token=3)

[(1.2002119693478979, 'a wide public transport system'),
 (1.1892532075776696, 'a growing world population'),
 (0.9893733511552046, 'a company survey'),
 (0.8666137178165841, 'a city dweller'),
 (0.8165220785701209, 'a travel pass'),
 (0.7808617292637576, 'a single app'),
 (0.6862940752284352, 'a city bike'),
 (0.4782287442289171, 'the global mobility service market'),
 (0.4322144572483626, 'the International Transport Forum'),
 (0.4060196416575528, 'the International Energy Agency'),
 (0.4005554525773901, 'the more sustainable solution'),
 (0.39121906971601206, 'the environmental impact'),
 (0.3810195888775107, 'the earliest providers'),
 (0.35861748057301107, 'the complex technology'),
 (0.31174329999069056, 'the market matures'),
 (0.28275346614601493, 'the most popular ways'),
 (0.2780196156238534, 'several European and Asian cities'),
 (0.27226698116395825, 'an easy task'),
 (0.22821112400100704, 'the first step'),
 (0.20782886666046035, 'the best way'),
 (0.2036757512408785, 'alt