In [1]:
import json
import math
import os
from collections import namedtuple

import spacy
import tqdm
from allennlp.predictors.predictor import Predictor

### AllenNLP sentiment model

In [3]:
# Location of the AllenNLP SST RoBERTa sentiment archive. It can be overridden
# with the SENTIMENT_MODEL_PATH environment variable so the notebook is not tied
# to one machine; the original local path is kept as the fallback.
sentiment_model = os.environ.get(
    "SENTIMENT_MODEL_PATH",
    "/Users/Xiao/Downloads/sst-roberta-large-2020.02.17.tar.gz",
)
# ner_model = "/Users/Xiao/Downloads/fine-grained-ner-model-elmo-2018.12.21.tar.gz"

In [4]:
sent = "Key Trump adviser claims tax cuts will still pay for themselves even as coronavirus drags on US economy"

In [5]:
# Build the predictor from the model archive path; `get_sentiment` below calls
# its `predict(sentence=...)` method.
sentiment_predictor = Predictor.from_path(sentiment_model)

In [6]:
sentiment_predictor.predict(sentence=sent)

{'logits': [-0.8061320781707764, -0.23963770270347595],
 'probs': [0.36204612255096436, 0.6379538774490356],
 'label': '0'}

In [7]:
def get_sentiment(text):
    """Classify `text` with the sentiment predictor and return its polarity.

    The predictor's `label` field is parsed as an integer: label 0 maps to -1,
    any other label maps to 1.
    """
    prediction = sentiment_predictor.predict(sentence=text)
    if int(prediction['label']) == 0:
        return -1
    return 1

### spaCy NER model

In [8]:
# Load the spaCy pipeline; the entity helpers below read `doc.ents` from the
# documents it produces.
nlp = spacy.load('en_core_web_sm')

In [9]:
doc = nlp(sent)

In [10]:
for ent in doc.ents:
    print(ent.text, ent.label_)

US GPE


In [11]:
def get_entities(text):
    """Return `(text, label)` pairs for the ORG, GPE, NORP and FAC entities found in `text`."""
    wanted_labels = ('ORG', 'GPE', 'NORP', 'FAC')
    found = []
    for entity in nlp(text).ents:
        if entity.label_ in wanted_labels:
            found.append((entity.text, entity.label_))
    return found

def get_text_entities(text):
    """Return `(text, label)` pairs for a wider set of entity types found in `text`.

    Kept labels: ORG, GPE, NORP, FAC, PERSON, PRODUCT, MONEY and EVENT.
    """
    wanted_labels = ('ORG', 'GPE', 'NORP', 'FAC',
                     'PERSON', 'PRODUCT', 'MONEY', 'EVENT')
    found = []
    for entity in nlp(text).ents:
        if entity.label_ in wanted_labels:
            found.append((entity.text, entity.label_))
    return found

### Extract information

In [3]:
# class Item:
#     def __init__(title,section,pdate,weburl,apiurl,ents,sentiment,text):
#         self.title = title
#         self.section = section
#         self.pdate = pdate
#         self.weburl = weburl
#         self.apiurl = apiurl
#         self.ents = ents
#         self.sentiment = sentiment
#         self.text=text

# Record for one analysed news article. The typename matches the variable it is
# bound to so that repr() reads `Item(...)` and the class can be found by name
# (which pickling relies on).
Item = namedtuple('Item', ('title', 'section', 'pdate', 'weburl', 'apiurl', 'text', 'ents', 'sentiment'))


def extract_info(text):
    """Run entity extraction and sentiment classification on `text`.

    Returns an `(entities, sentiment)` tuple built from `get_entities` and
    `get_sentiment`, called in that order.
    """
    return get_entities(text), get_sentiment(text)

def analyse(news_list):
    """Build one `Item` per record in `news_list`.

    Each record is unpacked as (title, section, date, web URL, API URL, body
    text). Entities and sentiment are computed from the title only; the body
    text is stored as-is. Iteration is wrapped in a tqdm progress bar.
    """
    def build_item(record):
        title, section, date, weburl, apiurl, bodytext = record
        entities, sentiment = extract_info(title)
        return Item(
            title=title,
            section=section,
            pdate=date,
            weburl=weburl,
            apiurl=apiurl,
            text=bodytext,
            ents=entities,
            sentiment=sentiment,
        )

    return [build_item(record) for record in tqdm.tqdm(news_list)]

In [13]:
# Read the raw news records. The encoding is given explicitly because the
# default used by open() depends on the platform, while JSON files are
# conventionally UTF-8.
with open('news.json', 'r', encoding='utf-8') as f:
    news_list = json.load(f)

In [14]:
analysis = analyse(news_list)

100%|██████████| 4435/4435 [31:39<00:00,  2.34it/s]    


In [15]:
# with open('news-text.json', 'w') as f:
#     json.dump(analysis, fp=f)

In [16]:
get_entities(analysis[300].text)

[('America', 'GPE'),
 ('Dow Jones', 'ORG'),
 ('Bear Market', 'ORG'),
 ('American', 'NORP'),
 ('the Federal Government', 'ORG'),
 ('CoronaVirus', 'ORG'),
 ('UK', 'GPE'),
 ('FTSE', 'ORG'),
 ('The Bank of England', 'ORG'),
 ('UK', 'GPE'),
 ('UK', 'GPE'),
 ('the European Central Bank', 'ORG'),
 ('EU', 'ORG'),
 ('Italy', 'GPE')]

In [18]:
analysis[10].text



### FastText embedding

In [229]:
from gensim.models.fasttext import FastText
import nltk

def process_sent(sent, remove_punct=True):
    """Tokenize a sentence into lower-cased tokens.

    Args:
        sent: Raw sentence text; surrounding whitespace is stripped first.
        remove_punct: When true, drop every token that contains no
            alphanumeric character. This removes single punctuation marks as
            well as multi-character ones (e.g. "...", "--" or two-character
            quote tokens), while keeping mixed tokens such as "covid-19".

    Returns:
        A list of lower-cased token strings (empty for empty input).
    """
    tokens = nltk.word_tokenize(sent.strip())
    if remove_punct:
        # A length check alone lets multi-character punctuation through, so
        # test for at least one letter or digit instead.
        tokens = [token for token in tokens if any(ch.isalnum() for ch in token)]
    return [token.lower() for token in tokens]
    
def flat_corpus(dataset, para_level=False, remove_punct=True):
    """Flatten articles into token lists plus an index back to their source.

    Each article's `text` is split on newlines into paragraphs. With
    `para_level=True` every paragraph becomes one corpus entry; otherwise each
    paragraph is split into sentences with NLTK and every sentence becomes one
    entry.

    Returns:
        (corpus, sid2art): `corpus[i]` is the token list from `process_sent`
        and `sid2art[i]` is `{'aid': article index, 'pid': paragraph index}`
        for that same entry.
    """
    corpus = []
    sid2art = {}
    for aid, article in enumerate(tqdm.tqdm(dataset)):
        for pid, para in enumerate(article.text.split("\n")):
            # One unit per paragraph, or one per sentence of the paragraph.
            units = [para] if para_level else nltk.sent_tokenize(para)
            for unit in units:
                # Keys are assigned consecutively, so they line up with
                # positions in `corpus`.
                sid2art[len(sid2art)] = {'aid': aid, 'pid': pid}
                corpus.append(process_sent(unit, remove_punct))
    return corpus, sid2art

In [230]:
# Reload the saved records (presumably de-duplicated, going by the file name)
# and rebuild the Item tuples. The encoding is explicit because the default
# used by open() depends on the platform.
with open('news-text-unique.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
    analysis = [Item._make(d) for d in data]

In [231]:
line_corpus, sid2aid = flat_corpus(analysis, para_level=False, remove_punct=True)

100%|██████████| 3171/3171 [00:31<00:00, 101.98it/s]


In [232]:
len(line_corpus), line_corpus[:10]

(113098,
 [['the',
   'wellcome',
   'trust',
   'is',
   'calling',
   'on',
   'big',
   'businesses',
   'to',
   'donate',
   '8bn',
   '£6.4bn',
   'for',
   'research',
   'into',
   'developing',
   'diagnostic',
   'tests',
   'therapies',
   'and',
   'vaccines',
   'to',
   'tackle',
   'the',
   'coronavirus',
   'pandemic'],
  ['jeremy',
   'farrar',
   'the',
   'director',
   'of',
   'the',
   'london-based',
   'medical',
   'research',
   'charity',
   'said',
   'a',
   'huge',
   'investment',
   'in',
   'scientific',
   'research',
   'was',
   'the',
   'only',
   'exit',
   'strategy',
   'to',
   'save',
   'millions',
   'of',
   'lives',
   'and',
   'drag',
   'the',
   'global',
   'economy',
   'out',
   'of',
   'an',
   'inevitable',
   'recession'],
  ['farrar',
   'said',
   'that',
   'while',
   'businesses',
   'and',
   'governments',
   'had',
   'acted',
   'fast',
   'to',
   'help',
   'support',
   'staff',
   'and',
   'keep',
   'the',
   'gl

In [223]:
# Train FastText word vectors on the sentence-level token lists in `line_corpus`.
# NOTE(review): `size` and `iter` look like the gensim 3.x keyword names;
# presumably gensim 4+ expects `vector_size` / `epochs` instead — confirm
# against the installed version before re-running.
ft_model = FastText(
    sentences=line_corpus, 
    size=100,
    window=10,
    min_count=5,
    word_ngrams=1,
    iter=5,
    negative=5,
    sample=1e-3,
    sg=0,
    alpha=0.025,
    ns_exponent=0.75,
    min_n=3,
    max_n=6)

In [224]:
ft_model.wv.similar_by_word('covid')

[('covid‑19', 0.9601863622665405),
 ('covid-19.', 0.9178797602653503),
 ('covid-19', 0.8939427733421326),
 ('covid-19-related', 0.8484106063842773),
 ('anti-coronavirus', 0.8391342759132385),
 ('coronavirus', 0.8288556337356567),
 ('coronavirus.', 0.8244268298149109),
 ('coronavirus-driven', 0.8238552808761597),
 ('coronavirus-hit', 0.8180310726165771),
 ('coronaviruses', 0.8164380788803101)]

### Sentence embedding

In [225]:
from fse.models import SIF
from fse import IndexedList

In [226]:
fse_model = SIF(ft_model)

In [227]:
indexed_corpus = IndexedList(line_corpus)

In [228]:
fse_model.train(indexed_corpus)

(113067, 2678085)

In [313]:
# Normalise the query with the same preprocessing used for the corpus, then ask
# the sentence-embedding model for the most similar corpus sentences.
query = "human right covid-19"
query = ' '.join(process_sent(query, remove_punct=True))
# As displayed in the next cell, each hit has the form
# ((tokens, sentence_id), sentence_id, similarity).
retrieved_sents = fse_model.sv.similar_by_sentence(nltk.word_tokenize(query), model=fse_model, indexable=indexed_corpus)

In [314]:
retrieved_sents[:5]

[((['basic',
    'compassion',
    'empathy',
    'and',
    'kindness',
    'for',
    'our',
    'fellow',
    'human',
    'beings',
    'right',
    'now',
    'this',
    'is',
    'what',
    'we',
    'can',
    'and',
    'should',
    'be',
    'spreading'],
   42318),
  42318,
  0.7477512359619141),
 ((['but',
    'vulnerable',
    'people',
    'should',
    'not',
    'be',
    'exposed',
    'to',
    'a',
    'virus',
    'right',
    'now',
    'in',
    'the',
    'service',
    'of',
    'a',
    'hypothetical',
    'future'],
   68915),
  68915,
  0.7323046922683716),
 ((['the',
    'directive',
    'states',
    'its',
    'purpose',
    'is',
    'to',
    'facilitate',
    'the',
    'most',
    'efficient',
    'and',
    'effective',
    'deployment',
    'of',
    'aps',
    'employees',
    'and',
    'expertise',
    'to',
    'meet',
    'the',
    'exceptional',
    'challenge',
    'posed',
    'by',
    'covid-19',
    'to',
    'australian',
    'society'

In [315]:
retrieved_sids = [s[1] for s in retrieved_sents]

In [316]:
retrieved_sids

[42318, 68915, 68960, 51595, 96286, 68067, 21769, 29441, 9820, 52029]

In [317]:
mapping = [sid2aid[sid] for sid in retrieved_sids]

In [318]:
mapping

[{'aid': 1258, 'pid': 2},
 {'aid': 1998, 'pid': 2},
 {'aid': 1999, 'pid': 2},
 {'aid': 1511, 'pid': 0},
 {'aid': 2777, 'pid': 1},
 {'aid': 1974, 'pid': 0},
 {'aid': 670, 'pid': 4},
 {'aid': 891, 'pid': 4},
 {'aid': 321, 'pid': 2},
 {'aid': 1522, 'pid': 1}]

In [319]:
def retrieve_paragraph(mappings, dataset):
    """Look up the paragraph text that each mapping points to.

    Each mapping is a dict with `aid` (index into `dataset`) and `pid` (index
    into that article's newline-separated paragraphs). The paragraphs are
    returned in the same order as `mappings`.
    """
    return [dataset[ref['aid']].text.split('\n')[ref['pid']] for ref in mappings]

In [320]:
r_paras = retrieve_paragraph(mapping, analysis)

In [321]:
r_paras

['To this day, I’m still hearing stories from my students – we have been Skyping regularly – about how people walk in the other direction when they see them. Sometimes strangers get aggressive and say things to them like: “This is your people’s fault.” They’re afraid to walk into places like Walmart or Target because of this. I fear for their safety beyond the coronavirus itself. One student, whose husband and kids are here in the US with her, just told me a story about going to a PTA meeting at her son’s elementary school. She and her husband sat in the back corner of the room so as not to upset or alarm other parents – it has happened so often, they have come to expect it. She told me other parents approached the person running the meeting and asked for “that Chinese couple in the back” to be asked to leave. Thankfully the request was not granted, but my student was still deeply upset by it, understandably. We are all having to go into isolation because of Covid-19, through no fault 

### TF-IDF similarity

In [254]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import DBSCAN, KMeans
from sklearn.neighbors import DistanceMetric
from sklearn import metrics
import numpy as np

In [255]:
rs = retrieved_sents[:100]

In [339]:
tfidf_vectorizer = TfidfVectorizer(binary=True, sublinear_tf=True, norm='l2', ngram_range=(1,2))

In [334]:
para_corpus, pid2aid = flat_corpus(analysis, para_level=True, remove_punct=True)

100%|██████████| 3171/3171 [00:23<00:00, 136.68it/s]


In [340]:
tfidf_vectorizer.fit([' '.join(line) for line in para_corpus])

TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=True, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [341]:
# Vectorise the query (row 0) followed by each retrieved sentence, and likewise
# the query followed by each retrieved paragraph, so that row 0 can be compared
# against the remaining rows in the cells below.
trans_rs = tfidf_vectorizer.transform([query]+[' '.join(line[0][0]) for line in retrieved_sents])
trans_para = tfidf_vectorizer.transform([query]+[' '.join(process_sent(para)) for para in r_paras])

In [342]:
np.array([query]+[' '.join(line[0][0]) for line in retrieved_sents])[[3,4,5,6,8]]

array(['the directive states its purpose is to facilitate the most efficient and effective deployment of aps employees and expertise to meet the exceptional challenge posed by covid-19 to australian society a task which has become the principal focus of aps endeavour',
       'are we doing covid-19 right',
       'the increasing concern from climate researchers that such a world is likely to be climatically unstable and incompatible with the survival of human civilisation as we know it is totally ignored',
       'this virus has exposed the fragility of our society',
       'is our destruction of nature responsible for covid-19'],
      dtype='<U259')

In [343]:
for i,t in enumerate([' '.join(line[0][0]) for line in retrieved_sents]):
    print(i+1, t)

1 basic compassion empathy and kindness for our fellow human beings right now this is what we can and should be spreading
2 but vulnerable people should not be exposed to a virus right now in the service of a hypothetical future
3 the directive states its purpose is to facilitate the most efficient and effective deployment of aps employees and expertise to meet the exceptional challenge posed by covid-19 to australian society a task which has become the principal focus of aps endeavour
4 are we doing covid-19 right
5 the increasing concern from climate researchers that such a world is likely to be climatically unstable and incompatible with the survival of human civilisation as we know it is totally ignored
6 this virus has exposed the fragility of our society
7 we believe that this is the right tool at the right time to save lives reeves said
8 is our destruction of nature responsible for covid-19
9 right now the red cross has a severe blood shortage and donors are needed urgently
10 

In [344]:
dense = trans_rs.todense()
dist = DistanceMetric.get_metric('euclidean')
dist.pairwise(dense[0], dense[1:])

array([[1.3651187 , 1.39079327, 1.38966265, 1.29959675, 1.3908998 ,
        1.41421356, 1.38943851, 1.35148233, 1.39251039, 1.2937802 ]])

In [345]:
metrics.pairwise.cosine_similarity(dense[0], dense[1:])

array([[0.06822547, 0.03284704, 0.03441886, 0.15552414, 0.03269888,
        0.        , 0.03473031, 0.08674775, 0.03045741, 0.1630664 ]])

In [346]:
dense = trans_para.todense()
dist = DistanceMetric.get_metric('euclidean')
dist.pairwise(dense[0], dense[1:])

array([[1.39167124, 1.41023711, 1.40642906, 1.392981  , 1.40863472,
        1.40490986, 1.39680641, 1.40557799, 1.41015161, 1.36621896]])

In [347]:
sims = metrics.pairwise.cosine_similarity(dense[0], dense[1:])

In [348]:
sims

array([[0.03162558, 0.00561565, 0.01097865, 0.02980197, 0.00787412,
        0.01311414, 0.02446593, 0.01217526, 0.00573622, 0.06672287]])

In [354]:
# Re-rank the retrieved sentences by the TF-IDF cosine similarity of their
# source paragraphs to the query, highest first. The hits are indexed directly:
# wrapping the nested, ragged result tuples in np.array is rejected by recent
# NumPy releases, and the loop no longer rebinds the notebook-level name `sent`.
sorted_rank = np.argsort(-sims[0])
for rank, hit_idx in enumerate(sorted_rank, start=1):
    tokens = retrieved_sents[hit_idx][0][0]
    print("{} {}".format(rank, ' '.join(tokens)))

1 that is a principle enshrined in un human rights and is a basic international human right as observed through human instruments and the international covenant of civil and political rights iccpr said contesse
2 basic compassion empathy and kindness for our fellow human beings right now this is what we can and should be spreading
3 are we doing covid-19 right
4 we believe that this is the right tool at the right time to save lives reeves said
5 this virus has exposed the fragility of our society
6 is our destruction of nature responsible for covid-19
7 the directive states its purpose is to facilitate the most efficient and effective deployment of aps employees and expertise to meet the exceptional challenge posed by covid-19 to australian society a task which has become the principal focus of aps endeavour
8 the increasing concern from climate researchers that such a world is likely to be climatically unstable and incompatible with the survival of human civilisation as we know it is 

In [149]:
count_vectorizer = CountVectorizer(binary=True)

In [150]:
count_vectorizer.fit([' '.join(line) for line in line_corpus])

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [151]:
# NOTE(review): the recorded outputs of this cell group come from an earlier
# run (execution counts 149-154) in which `query` was
# 'economic impacts of covid-19', as displayed below — not the query assigned
# in the sentence-embedding section. Re-run the notebook top to bottom before
# comparing these numbers with the TF-IDF results.
count_rs = count_vectorizer.transform([query]+[' '.join(line[0][0]) for line in retrieved_sents])

In [152]:
dense = count_rs.todense()
dist = DistanceMetric.get_metric('euclidean')
dist.pairwise(dense[0], dense[1:])

array([[2.23606798, 3.46410162, 3.74165739, 3.31662479, 4.89897949,
        2.23606798, 4.24264069, 4.24264069, 5.09901951, 5.74456265]])

In [153]:
query

'economic impacts of covid-19'

In [154]:
metrics.pairwise.cosine_similarity(dense[0], dense[1:])

array([[0.31622777, 0.1490712 , 0.13483997, 0.28284271, 0.18650096,
        0.31622777, 0.30779351, 0.30779351, 0.33218192, 0.15811388]])